# Health Professional Shortage Areas - Exploratory Data Analysis
---
## Goal:
### Which states/counties in the U.S. are facing the most severe shortages of healthcare professionals?
---

In [1]:
import pandas as pd
import seaborn as sns

In [2]:
# Read the raw HPSA workbook into df, which is kept as the untouched reference frame
df = pd.read_excel('raw_data/BCD_HPSA_FCT_DET_PC.xlsx')

# hpsa is an independent (deep) copy of df; it is the frame that gets transformed
hpsa = df.copy(deep=True)

#### Subset data to geographic-area HPSAs covering single counties whose HPSA status is Designated or Proposed For Withdrawal

In [56]:
# The analysis targets HPSAs defined by geographic area (NOT population groups or facilities),
# so start by tallying how many rows fall under each HPSA population type
population_types = hpsa['HPSA Population Type']
population_types.value_counts()

Geographic Population                                          27604
Low Income Population HPSA                                     22282
Medicaid Eligible Population HPSA                               5745
Low Income Migrant Farmworker Population HPSA                    669
Other Population HPSA                                            557
Low Income Homeless Population HPSA                              520
Low Income Homeless Migrant Farmworker Population HPSA           450
Homeless Population HPSA                                         443
Native American Population HPSA                                  327
Migrant Seasonal Worker Population HPSA                          180
Low Income Migrant Seasonal Worker Population HPSA                54
Low Income Homeless Migrant Seasonal Worker Population HPSA       44
Migrant Farmworker Population HPSA                                24
Name: HPSA Population Type, dtype: int64

In [58]:
# Restrict hpsa to the rows whose population type is Geographic Population
is_geographic = hpsa['HPSA Population Type'].eq('Geographic Population')
hpsa = hpsa.loc[is_geographic]

# Inspect (rows, columns) to confirm the filter took effect
hpsa.shape

(27604, 65)

In [64]:
# Only current HPSAs (those that have NOT been withdrawn) are of interest,
# so tally how many rows carry each HPSA status
hpsa.loc[:, 'HPSA Status'].value_counts()

Withdrawn                  20950
Designated                  3646
Proposed For Withdrawal     3008
Name: HPSA Status, dtype: int64

In [65]:
# Remove the Withdrawn rows; per the status tally this leaves only the
# Designated and Proposed For Withdrawal statuses
not_withdrawn = hpsa['HPSA Status'].ne('Withdrawn')
hpsa = hpsa.loc[not_withdrawn]

# Inspect (rows, columns) to confirm the filter took effect
hpsa.shape

(6654, 65)

In [67]:
# Only single counties are of interest; tally the rows per component type
# (i.e. the type of geography each HPSA record covers)
component_types = hpsa['HPSA Component Type Description']
component_types.value_counts()

Census Tract          3891
County Subdivision    1651
Single County         1112
Name: HPSA Component Type Description, dtype: int64

In [69]:
# Restrict hpsa to the rows whose component type is Single County
is_single_county = hpsa['HPSA Component Type Description'].eq('Single County')
hpsa = hpsa.loc[is_single_county]

# Inspect (rows, columns) to confirm the filter took effect
hpsa.shape

(1112, 65)

In [104]:
# List the distinct state names that remain, in order of first appearance
pd.unique(hpsa['Common State Name'])

array(['Texas', 'Ohio', 'Wisconsin', 'Washington', 'South Dakota',
       'Oklahoma', 'North Carolina', 'Federated States of Micronesia',
       'American Samoa', 'Wyoming', 'Virginia', 'Utah', 'Pennsylvania',
       'Nevada', 'Mississippi', 'Minnesota', 'Michigan', 'Kansas', 'Iowa',
       'Indiana', 'Missouri', 'Louisiana', 'New York', 'Georgia',
       'Colorado', 'Alaska', 'Illinois', 'Hawaii', 'Arkansas', 'Florida',
       'Northern Mariana Islands', 'Tennessee', 'Oregon', 'North Dakota',
       'West Virginia', 'South Carolina', 'Montana', 'New Mexico',
       'Nebraska', 'Kentucky', 'Massachusetts', 'California', 'Alabama',
       'Idaho', 'Maryland', 'U.S. Virgin Islands'], dtype=object)

In [108]:
# Limit the data to the 50 US states by removing rows for US territories & insular areas
excluded = [
    'Federated States of Micronesia',
    'American Samoa',
    'Northern Mariana Islands',
    'U.S. Virgin Islands',
]

is_excluded = hpsa['Common State Name'].isin(excluded)
hpsa = hpsa.loc[~is_excluded]

# List the remaining state names to confirm the excluded areas are gone
hpsa['Common State Name'].unique()

array(['Texas', 'Ohio', 'Wisconsin', 'Washington', 'South Dakota',
       'Oklahoma', 'North Carolina', 'Wyoming', 'Virginia', 'Utah',
       'Pennsylvania', 'Nevada', 'Mississippi', 'Minnesota', 'Michigan',
       'Kansas', 'Iowa', 'Indiana', 'Missouri', 'Louisiana', 'New York',
       'Georgia', 'Colorado', 'Alaska', 'Illinois', 'Hawaii', 'Arkansas',
       'Florida', 'Tennessee', 'Oregon', 'North Dakota', 'West Virginia',
       'South Carolina', 'Montana', 'New Mexico', 'Nebraska', 'Kentucky',
       'Massachusetts', 'California', 'Alabama', 'Idaho', 'Maryland'],
      dtype=object)

In [109]:
# Tally how many rows carry each county name
county_names = hpsa['HPSA Name']
county_names.value_counts()

Washington County    9
Lincoln County       8
Franklin County      7
Clay County          7
Morgan County        7
                    ..
LaMoure County       1
Wirt County          1
Hampshire County     1
Wyoming County       1
Colusa County        1
Name: HPSA Name, Length: 754, dtype: int64

In [93]:
# Why do some county names (e.g. Washington County) have a count greater than one?
# Pull up every Washington County row to see what they look like
is_washington = hpsa['HPSA Name'].eq('Washington County')
hpsa.loc[is_washington]

Unnamed: 0,HPSA Name,HPSA ID,Designation Type,HPSA Discipline Class,HPSA Score,PC MCTA Score,Primary State Abbreviation,HPSA Status,HPSA Designation Date,HPSA Designation Last Update Date,...,Provider Type,Rural Status Code,State Abbreviation,State and County Federal Information Processing Standard Code,State FIPS Code,State Name,U.S. - Mexico Border 100 Kilometer Indicator,U.S. - Mexico Border County Indicator,Data Warehouse Record Create Date,Data Warehouse Record Create Date Text
16895,Washington County,1174775647,Geographic HPSA,Primary Care,12,12.0,IL,Designated,1990-06-15,2021-09-09,...,Not Applicable,R,IL,17189,17,Illinois,N,N,2023-09-12,2023/09/12
17773,Washington County,1183826548,Geographic HPSA,Primary Care,9,15.0,IN,Designated,2022-03-14,2022-03-14,...,Not Applicable,R,IN,18175,18,Indiana,N,N,2023-09-12,2023/09/12
19877,Washington County,1124013273,High Needs Geographic HPSA,Primary Care,13,21.0,FL,Designated,2018-10-25,2021-09-10,...,Not Applicable,R,FL,12133,12,Florida,N,N,2023-09-12,2023/09/12
31124,Washington County,1284537078,High Needs Geographic HPSA,Primary Care,21,,MS,Proposed For Withdrawal,1980-01-17,2021-09-10,...,Not Applicable,R,MS,28151,28,Mississippi,N,N,2023-09-12,2023/09/12
34361,Washington County,1211849850,Geographic HPSA,Primary Care,11,12.0,KY,Designated,2022-02-18,2022-02-18,...,Not Applicable,R,KY,21229,21,Kentucky,N,N,2023-09-12,2023/09/12
39527,Washington County,1086860930,Geographic HPSA,Primary Care,16,14.0,CO,Designated,1979-07-10,2021-09-10,...,Not Applicable,R,CO,8121,8,Colorado,N,N,2023-09-12,2023/09/12
51041,Washington County,1371382160,High Needs Geographic HPSA,Primary Care,18,21.0,NC,Designated,1980-01-22,2021-09-08,...,Not Applicable,R,NC,37187,37,North Carolina,N,N,2023-09-12,2023/09/12
59508,Washington County,1219979489,Geographic HPSA,Primary Care,12,,KY,Proposed For Withdrawal,2020-01-10,2021-09-10,...,Not Applicable,R,KY,21229,21,Kentucky,N,N,2023-09-12,2023/09/12
61965,Washington County,1013727029,Geographic HPSA,Primary Care,19,16.0,AL,Designated,2022-04-05,2022-04-05,...,Not Applicable,R,AL,1129,1,Alabama,N,N,2023-09-12,2023/09/12


In [113]:
# Most of the repeated names are different counties in different states that share a name,
# but not all of them: in the Washington County rows shown above, Kentucky (FIPS 21229)
# appears twice, once as Designated and once as Proposed For Withdrawal. A single county
# can therefore carry more than one HPSA record, so count the affected counties before
# doing any per-county aggregation.
fips_col = 'State and County Federal Information Processing Standard Code'
has_shared_fips = hpsa.duplicated(subset=fips_col, keep=False)
n_multi_record_counties = hpsa.loc[has_shared_fips, fips_col].nunique()
print(f'Counties with more than one HPSA record: {n_multi_record_counties}')

# Get info from the hpsa dataframe in its current state
hpsa.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1047 entries, 346 to 64846
Data columns (total 65 columns):
 #   Column                                                                    Non-Null Count  Dtype         
---  ------                                                                    --------------  -----         
 0   HPSA Name                                                                 1047 non-null   object        
 1   HPSA ID                                                                   1047 non-null   object        
 2   Designation Type                                                          1047 non-null   object        
 3   HPSA Discipline Class                                                     1047 non-null   object        
 4   HPSA Score                                                                1047 non-null   int64         
 5   PC MCTA Score                                                             796 non-null    float64       
 6   Prima

#### Drop unnecessary columns

In [None]:
# Identify columns to be dropped
# NOTE(review): this list was left unterminated, which is a SyntaxError. It is closed here
# with the two columns that were already listed; extend it as needed before dropping.
cols_to_drop = ['HPSA Discipline Class', 'PC MCTA Score']

In [76]:
# List the distinct primary state abbreviations, in order of first appearance
# NOTE(review): the saved output of this cell includes territory codes (e.g. 'AS', 'VI') and
# its execution count (76) is lower than that of the filtering cells above, so it appears to
# predate the territory filter. Re-run to refresh.
state_abbreviations = hpsa['Primary State Abbreviation']
state_abbreviations.unique()

array(['TX', 'OH', 'WI', 'WA', 'SD', 'OK', 'NC', 'MH', 'FM', 'AS', 'WY',
       'VA', 'UT', 'PA', 'NV', 'MS', 'MN', 'MI', 'KS', 'IA', 'IN', 'MO',
       'LA', 'NY', 'GA', 'CO', 'AK', 'IL', 'HI', 'AR', 'FL', 'PW', 'MP',
       'TN', 'OR', 'ND', 'WV', 'SC', 'MT', 'NM', 'NE', 'KY', 'MA', 'CA',
       'AL', 'ID', 'MD', 'VI'], dtype=object)