# Mississippi, Louisiana, & Alabama Health Professional Shortage Areas - Data Cleaning
---

In [1]:
import pandas as pd
from get_cleaned_hpsa import hpsa

In [2]:
# Subset hpsa to the rows whose state is Mississippi, Louisiana, or Alabama; the result is shpsa (i.e. South HPSAs)
top_three = ['Mississippi', 'Louisiana', 'Alabama']
shpsa = hpsa.loc[hpsa['Common State Name'].isin(top_three)]

In [3]:
# Summarize the subsetted dataframe: number of rows, column names, non-null counts, and dtypes
shpsa.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 139 entries, 7823 to 63646
Data columns (total 22 columns):
 #   Column                                 Non-Null Count  Dtype         
---  ------                                 --------------  -----         
 0   HPSA Name                              139 non-null    object        
 1   Designation Type                       139 non-null    object        
 2   HPSA Score                             139 non-null    int64         
 3   HPSA Status                            139 non-null    object        
 4   HPSA Designation Date                  139 non-null    datetime64[ns]
 5   HPSA Designation Last Update Date      139 non-null    datetime64[ns]
 6   HPSA FTE                               139 non-null    float64       
 7   HPSA Designation Population            139 non-null    float64       
 8   % of Population Below 100% Poverty     139 non-null    float64       
 9   HPSA Formal Ratio                      136 non-null    objec

### Making sure that each row in `shpsa` represents a unique county in Mississippi, Louisiana, & Alabama.

In [4]:
# Count how many rows each county has in shpsa, keyed by county FIPS code.
# value_counts sorts by count in descending order, so counties with repeated rows are listed first.
county_n = shpsa['Common State County FIPS Code'].value_counts()
county_n

28101    2
01111    2
22087    2
01133    2
22119    2
        ..
28157    1
28109    1
28107    1
28083    1
01045    1
Name: Common State County FIPS Code, Length: 130, dtype: int64

#### It looks like there are some counties that have more than one row.  Let's investigate this.

In [5]:
# Keep only the FIPS codes that appear in more than one row of shpsa
county_n.loc[county_n.gt(1)]

28101    2
01111    2
22087    2
01133    2
22119    2
22065    2
01075    2
01057    2
22027    2
Name: Common State County FIPS Code, dtype: int64

#### Investigate the first three counties that have more than one row in the `shpsa` dataframe.

In [6]:
# Show every shpsa row for the county with FIPS code 28101
shpsa.loc[shpsa['Common State County FIPS Code'].eq('28101')]

Unnamed: 0,HPSA Name,Designation Type,HPSA Score,HPSA Status,HPSA Designation Date,HPSA Designation Last Update Date,HPSA FTE,HPSA Designation Population,% of Population Below 100% Poverty,HPSA Formal Ratio,...,Common Region Name,Common State Abbreviation,Common State County FIPS Code,Common State Name,County Equivalent Name,HPSA Estimated Served Population,HPSA Estimated Underserved Population,HPSA Provider Ratio Goal,HPSA Shortage,Region Alias
7861,Newton County,High Needs Geographic HPSA,13,Designated,2022-04-15,2022-04-15,6.15,20702.0,25.6,3366:1,...,Region 4,MS,28101,Mississippi,Newton,18450.0,2252.0,3000:1,0.75,Southeast
31111,Newton County,Geographic HPSA,19,Proposed For Withdrawal,1978-06-30,2021-09-10,2.15,20971.0,23.3,9754:1,...,Region 4,MS,28101,Mississippi,Newton,7525.0,13446.0,3500:1,3.84,Southeast


In [7]:
# Show every shpsa row for the county with FIPS code 01111
shpsa.loc[shpsa['Common State County FIPS Code'].eq('01111')]

Unnamed: 0,HPSA Name,Designation Type,HPSA Score,HPSA Status,HPSA Designation Date,HPSA Designation Last Update Date,HPSA FTE,HPSA Designation Population,% of Population Below 100% Poverty,HPSA Formal Ratio,...,Common Region Name,Common State Abbreviation,Common State County FIPS Code,Common State Name,County Equivalent Name,HPSA Estimated Served Population,HPSA Estimated Underserved Population,HPSA Provider Ratio Goal,HPSA Shortage,Region Alias
61942,Randolph County,Geographic HPSA,13,Designated,2022-04-21,2022-04-21,3.8,22139.0,16.9,5826:1,...,Region 4,AL,1111,Alabama,Randolph,13300.0,8839.0,3500:1,2.53,Southeast
62017,Randolph County,High Needs Geographic HPSA,5,Proposed For Withdrawal,2018-08-02,2022-04-21,6.975,22139.0,16.9,3174:1,...,Region 4,AL,1111,Alabama,Randolph,20925.0,1214.0,3000:1,0.405,Southeast


In [8]:
# Show every shpsa row for the county with FIPS code 22087
shpsa.loc[shpsa['Common State County FIPS Code'].eq('22087')]

Unnamed: 0,HPSA Name,Designation Type,HPSA Score,HPSA Status,HPSA Designation Date,HPSA Designation Last Update Date,HPSA FTE,HPSA Designation Population,% of Population Below 100% Poverty,HPSA Formal Ratio,...,Common Region Name,Common State Abbreviation,Common State County FIPS Code,Common State Name,County Equivalent Name,HPSA Estimated Served Population,HPSA Estimated Underserved Population,HPSA Provider Ratio Goal,HPSA Shortage,Region Alias
11911,St. Bernard Parish,Geographic HPSA,12,Proposed For Withdrawal,2017-11-24,2021-09-10,6.75,40719.0,17.1,6032:1,...,Region 6,LA,22087,Louisiana,St. Bernard,23625.0,17094.0,3500:1,4.88,South Central
55858,St. Bernard Parish,High Needs Geographic HPSA,16,Designated,2022-03-08,2022-03-08,4.15,45857.0,21.1,11050:1,...,Region 6,LA,22087,Louisiana,St. Bernard,12450.0,33407.0,3000:1,11.14,South Central


#### It looks like the counties having two rows in `shpsa` have one row where their HPSA Status is Proposed For Withdrawal, and a second row where their HPSA Status is Designated (at a later date).  Let's confirm this.

In [9]:
# Collect the FIPS codes (the index labels of county_n) of counties that have more than one row in shpsa
multiple_fips = county_n.loc[county_n.gt(1)].index
multiple_fips

Index(['28101', '01111', '22087', '01133', '22119', '22065', '01075', '01057',
       '22027'],
      dtype='object')

In [10]:
# Hypothesis check: for every county with multiple rows, the Designated row carries a later
# HPSA Designation Date than the Proposed For Withdrawal row.
# Count the counties for which that holds, then compare the count against the number of counties checked.
is_designated_at_later_date = 0

for county in multiple_fips:
    county_rows = shpsa[shpsa['Common State County FIPS Code'] == county]
    proposed_withdrawal_row = county_rows[county_rows['HPSA Status'] == 'Proposed For Withdrawal']
    designated_row = county_rows[county_rows['HPSA Status'] == 'Designated']

    # A county that lacks either status cannot satisfy the hypothesis. Skip it so that it counts
    # as a failure (final result False) instead of raising IndexError on .iat[0] below.
    if proposed_withdrawal_row.empty or designated_row.empty:
        continue

    # Compare the first row found for each status
    proposed_date = proposed_withdrawal_row['HPSA Designation Date'].iat[0]
    designated_date = designated_row['HPSA Designation Date'].iat[0]

    if designated_date > proposed_date:
        is_designated_at_later_date += 1

# If the number of counties that were designated at a later date is equal to the number of counties in multiple_fips,
# then YES, it is the case that all counties with multiple rows in shpsa just received a recent Designated status
is_designated_at_later_date == len(multiple_fips)

True

#### Since we've confirmed our hypothesis about the multiple rows, we can drop the Proposed For Withdrawal row for those counties with multiple rows in `shpsa`.

In [11]:
# Flag the rows that belong to a county in multiple_fips and carry the Proposed For Withdrawal status,
# then remove all of them from shpsa in a single drop
in_multiple_fips = shpsa['Common State County FIPS Code'].isin(multiple_fips)
is_proposed_withdrawal = shpsa['HPSA Status'] == 'Proposed For Withdrawal'
shpsa = shpsa.drop(index = shpsa.loc[in_multiple_fips & is_proposed_withdrawal].index)

#### Do a final check to see that each row in `shpsa` represents a unique county.

In [12]:
# Recount the rows per county (by county FIPS code) after dropping the Proposed For Withdrawal rows
county_n = shpsa['Common State County FIPS Code'].value_counts()

# Fail loudly on a full re-run if any county still has more than one row
assert county_n.le(1).all(), 'shpsa still contains counties with more than one row'

# Display any counties that have more than one row (expected: an empty Series)
county_n[county_n > 1]

Series([], Name: Common State County FIPS Code, dtype: int64)

In [13]:
# Summarize the final cleaned dataframe: number of rows, column names, non-null counts, and dtypes
shpsa.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 130 entries, 7823 to 63646
Data columns (total 22 columns):
 #   Column                                 Non-Null Count  Dtype         
---  ------                                 --------------  -----         
 0   HPSA Name                              130 non-null    object        
 1   Designation Type                       130 non-null    object        
 2   HPSA Score                             130 non-null    int64         
 3   HPSA Status                            130 non-null    object        
 4   HPSA Designation Date                  130 non-null    datetime64[ns]
 5   HPSA Designation Last Update Date      130 non-null    datetime64[ns]
 6   HPSA FTE                               130 non-null    float64       
 7   HPSA Designation Population            130 non-null    float64       
 8   % of Population Below 100% Poverty     130 non-null    float64       
 9   HPSA Formal Ratio                      127 non-null    objec