# Health Professional Shortage Areas - Data Cleaning
---

In [1]:
import pandas as pd

In [2]:
# Import data as df
df = pd.read_excel('raw_data/BCD_HPSA_FCT_DET_PC.xlsx')

# Make copy of df called hpsa; hpsa will be transformed
hpsa = df.copy()

#### Subset data to only geographic areas, only single counties, having a HPSA status of Designated or Proposed for Withdrawal

In [3]:
# I only want to focus on HPSAs in geographic areas (NOT populations groups or facilities)
# Get info about the type of HPSAs in this dataset
hpsa['HPSA Population Type'].value_counts()

Geographic Population                                          27604
Low Income Population HPSA                                     22282
Medicaid Eligible Population HPSA                               5745
Low Income Migrant Farmworker Population HPSA                    669
Other Population HPSA                                            557
Low Income Homeless Population HPSA                              520
Low Income Homeless Migrant Farmworker Population HPSA           450
Homeless Population HPSA                                         443
Native American Population HPSA                                  327
Migrant Seasonal Worker Population HPSA                          180
Low Income Migrant Seasonal Worker Population HPSA                54
Low Income Homeless Migrant Seasonal Worker Population HPSA       44
Migrant Farmworker Population HPSA                                24
Name: HPSA Population Type, dtype: int64

In [4]:
# Keep only the rows that are of the Geographic Population type
hpsa = hpsa[hpsa['HPSA Population Type'] == 'Geographic Population']

# Check shape to make sure indexing worked
hpsa.shape

(27604, 65)

In [5]:
# I only want to focus on currently HPSAs that have NOT been withdrawn
# Get info about current HPSA status
hpsa['HPSA Status'].value_counts()

Withdrawn                  20950
Designated                  3646
Proposed For Withdrawal     3008
Name: HPSA Status, dtype: int64

In [6]:
# Keep only the rows that have a status of Designated or Proposed For Withdrawal
hpsa = hpsa[hpsa['HPSA Status'] != 'Withdrawn']

# Check shape to make sure indexing worked
hpsa.shape

(6654, 65)

In [7]:
# I only want to focus on single counties
# Get info about component type (i.e. type of geography)
hpsa['HPSA Component Type Description'].value_counts()

Census Tract          3891
County Subdivision    1651
Single County         1112
Name: HPSA Component Type Description, dtype: int64

In [8]:
# Keep only the rows that have a component type of Single County
hpsa = hpsa[hpsa['HPSA Component Type Description'] == 'Single County']

# Check shape to make sure indexing worked
hpsa.shape

(1112, 65)

In [9]:
# Check what states are represented
hpsa['Common State Name'].unique()

array(['Texas', 'Ohio', 'Wisconsin', 'Washington', 'South Dakota',
       'Oklahoma', 'North Carolina', 'Marshall Islands',
       'Federated States of Micronesia', 'American Samoa', 'Wyoming',
       'Virginia', 'Utah', 'Pennsylvania', 'Nevada', 'Mississippi',
       'Minnesota', 'Michigan', 'Kansas', 'Iowa', 'Indiana', 'Missouri',
       'Louisiana', 'New York', 'Georgia', 'Colorado', 'Alaska',
       'Illinois', 'Hawaii', 'Arkansas', 'Florida', 'Republic of Palau',
       'Northern Mariana Islands', 'Tennessee', 'Oregon', 'North Dakota',
       'West Virginia', 'South Carolina', 'Montana', 'New Mexico',
       'Nebraska', 'Kentucky', 'Massachusetts', 'California', 'Alabama',
       'Idaho', 'Maryland', 'U.S. Virgin Islands'], dtype=object)

In [10]:
# Exclude rows that are NOT the 50 US states (i.e. exclude US territories & insular areas)
excluded = ['Federated States of Micronesia', 'American Samoa', 'Northern Mariana Islands', 'U.S. Virgin Islands']

hpsa = hpsa[~hpsa['Common State Name'].isin(excluded)]

# Check states again to make sure indexing worked
hpsa['Common State Name'].unique()

array(['Texas', 'Ohio', 'Wisconsin', 'Washington', 'South Dakota',
       'Oklahoma', 'North Carolina', 'Marshall Islands', 'Wyoming',
       'Virginia', 'Utah', 'Pennsylvania', 'Nevada', 'Mississippi',
       'Minnesota', 'Michigan', 'Kansas', 'Iowa', 'Indiana', 'Missouri',
       'Louisiana', 'New York', 'Georgia', 'Colorado', 'Alaska',
       'Illinois', 'Hawaii', 'Arkansas', 'Florida', 'Republic of Palau',
       'Tennessee', 'Oregon', 'North Dakota', 'West Virginia',
       'South Carolina', 'Montana', 'New Mexico', 'Nebraska', 'Kentucky',
       'Massachusetts', 'California', 'Alabama', 'Idaho', 'Maryland'],
      dtype=object)

In [11]:
# Check the counties
hpsa['HPSA Name'].value_counts()

Republic of the Marshall Islands    31
Republic of Palau                   16
Washington County                    9
Lincoln County                       8
Clay County                          7
                                    ..
LaMoure County                       1
Wirt County                          1
Hampshire County                     1
Wyoming County                       1
Colusa County                        1
Name: HPSA Name, Length: 756, dtype: int64

In [12]:
# Why are there more than one value count for some of the counties? (e.g. Washington County)
# Check what the Washington County rows look like
hpsa[hpsa['HPSA Name'] == 'Washington County']

Unnamed: 0,HPSA Name,HPSA ID,Designation Type,HPSA Discipline Class,HPSA Score,PC MCTA Score,Primary State Abbreviation,HPSA Status,HPSA Designation Date,HPSA Designation Last Update Date,...,Provider Type,Rural Status Code,State Abbreviation,State and County Federal Information Processing Standard Code,State FIPS Code,State Name,U.S. - Mexico Border 100 Kilometer Indicator,U.S. - Mexico Border County Indicator,Data Warehouse Record Create Date,Data Warehouse Record Create Date Text
16895,Washington County,1174775647,Geographic HPSA,Primary Care,12,12.0,IL,Designated,1990-06-15,2021-09-09,...,Not Applicable,R,IL,17189,17,Illinois,N,N,2023-09-12,2023/09/12
17773,Washington County,1183826548,Geographic HPSA,Primary Care,9,15.0,IN,Designated,2022-03-14,2022-03-14,...,Not Applicable,R,IN,18175,18,Indiana,N,N,2023-09-12,2023/09/12
19877,Washington County,1124013273,High Needs Geographic HPSA,Primary Care,13,21.0,FL,Designated,2018-10-25,2021-09-10,...,Not Applicable,R,FL,12133,12,Florida,N,N,2023-09-12,2023/09/12
31124,Washington County,1284537078,High Needs Geographic HPSA,Primary Care,21,,MS,Proposed For Withdrawal,1980-01-17,2021-09-10,...,Not Applicable,R,MS,28151,28,Mississippi,N,N,2023-09-12,2023/09/12
34361,Washington County,1211849850,Geographic HPSA,Primary Care,11,12.0,KY,Designated,2022-02-18,2022-02-18,...,Not Applicable,R,KY,21229,21,Kentucky,N,N,2023-09-12,2023/09/12
39527,Washington County,1086860930,Geographic HPSA,Primary Care,16,14.0,CO,Designated,1979-07-10,2021-09-10,...,Not Applicable,R,CO,8121,8,Colorado,N,N,2023-09-12,2023/09/12
51041,Washington County,1371382160,High Needs Geographic HPSA,Primary Care,18,21.0,NC,Designated,1980-01-22,2021-09-08,...,Not Applicable,R,NC,37187,37,North Carolina,N,N,2023-09-12,2023/09/12
59508,Washington County,1219979489,Geographic HPSA,Primary Care,12,,KY,Proposed For Withdrawal,2020-01-10,2021-09-10,...,Not Applicable,R,KY,21229,21,Kentucky,N,N,2023-09-12,2023/09/12
61965,Washington County,1013727029,Geographic HPSA,Primary Care,19,16.0,AL,Designated,2022-04-05,2022-04-05,...,Not Applicable,R,AL,1129,1,Alabama,N,N,2023-09-12,2023/09/12


In [13]:
# Looks like they are just counties in different states that share the same name, no problem!

# Get shape of the hpsa dataframe in its current state
hpsa.shape

(1094, 65)

In [14]:
# Check for duplicated rows
hpsa.duplicated().sum()

0

#### Drop unnecessary columns

In [15]:
# Identify columns to be dropped
cols_to_drop = ['HPSA ID', 'HPSA Discipline Class', 'PC MCTA Score', 'Primary State Abbreviation', 'Metropolitan Indicator',
                'HPSA Geography Identification Number', 'HPSA Degree of Shortage', 'Withdrawn Date', 'HPSA Population Type',
                'Longitude', 'Latitude', 'BHCMIS Organization Identification Number', 'Break in Designation', 'Common Postal Code',
                'Common State FIPS Code', 'County or County Equivalent Federal Information Processing Standard Code',
                'Discipline Class Number', 'HPSA Address', 'HPSA Component Name', 'HPSA Component Source Identification Number',
                'HPSA Component State Abbreviation', 'HPSA Component Type Code', 'HPSA Component Type Description',
                'HPSA Designation Population Type Description', 'HPSA Metropolitan Indicator Code', 'HPSA Population Type Code',
                'HPSA Postal Code', 'HPSA Resident Civilian Population', 'HPSA Status Code', 'HPSA Type Code', 'HPSA City',
                'HPSA Withdrawn Date String', 'Primary State FIPS Code', 'Primary State Name', 'Provider Type', 'Rural Status Code',
                'State Abbreviation', 'State and County Federal Information Processing Standard Code', 'State FIPS Code',
                'State Name', 'U.S. - Mexico Border 100 Kilometer Indicator', 'U.S. - Mexico Border County Indicator',
                'Data Warehouse Record Create Date', 'Data Warehouse Record Create Date Text']

hpsa = hpsa.drop(columns = cols_to_drop)

In [16]:
# List all remaining column names
hpsa.columns

Index(['HPSA Name', 'Designation Type', 'HPSA Score', 'HPSA Status',
       'HPSA Designation Date', 'HPSA Designation Last Update Date',
       'HPSA FTE', 'HPSA Designation Population',
       '% of Population Below 100% Poverty', 'HPSA Formal Ratio',
       'Rural Status', 'Common County Name', 'Common Region Name',
       'Common State Abbreviation', 'Common State County FIPS Code',
       'Common State Name', 'County Equivalent Name',
       'HPSA Estimated Served Population',
       'HPSA Estimated Underserved Population', 'HPSA Provider Ratio Goal',
       'HPSA Shortage'],
      dtype='object')

In [17]:
# Get info about the hpsa transformed dataframe
hpsa.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1094 entries, 346 to 64846
Data columns (total 21 columns):
 #   Column                                 Non-Null Count  Dtype         
---  ------                                 --------------  -----         
 0   HPSA Name                              1094 non-null   object        
 1   Designation Type                       1094 non-null   object        
 2   HPSA Score                             1094 non-null   int64         
 3   HPSA Status                            1094 non-null   object        
 4   HPSA Designation Date                  1094 non-null   datetime64[ns]
 5   HPSA Designation Last Update Date      1094 non-null   datetime64[ns]
 6   HPSA FTE                               1094 non-null   float64       
 7   HPSA Designation Population            1094 non-null   float64       
 8   % of Population Below 100% Poverty     1094 non-null   float64       
 9   HPSA Formal Ratio                      870 non-null    objec