**Program**: 1d_NSCH_AHR_combined_state_data<br>
**Class**: Fall 2025, Machine Learning, Project<br>
**Member**: Vanessa Thorsten<br>
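**Description**: This program merges the state-level NSCH and AHR data for the analyses and adds derived variables (percentage measures, an immunization coverage category, and region).<br>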
**Outputs**: NSCH_AHR_state.csv file of the state-level data
  
**Program History/Modifications**:<br>
09/06/2025    Initial Version

In [6]:
import pandas as pd

In [7]:
# Columns kept from the NSCH state file: the state identifier plus the
# 1.0 and 2.0 response-level proportions of each of the four indicators
nsch_columns = [
    'STATE',
    'nomChHlthSt_23_1.0_proportion', 'nomChHlthSt_23_2.0_proportion',
    'PrevMed_23_1.0_proportion', 'PrevMed_23_2.0_proportion',
    'K4Q01_1.0_proportion', 'K4Q01_2.0_proportion',
    'smAdeqIns_23_1.0_proportion', 'smAdeqIns_23_2.0_proportion',
]
NSCH_state = pd.read_csv("NSCH_state.csv")[nsch_columns]

# New variable name -> the 1.0-level proportion column it is derived from.
# 'healthy' is the percentage of children in the state that are in
# excellent or very good health.
percent_sources = {
    'healthy': 'nomChHlthSt_23_1.0_proportion',
    'preventative_care': 'PrevMed_23_1.0_proportion',
    'place_for_care': 'K4Q01_1.0_proportion',
    'insurance': 'smAdeqIns_23_1.0_proportion',
}

# Turn each proportion (0-1 scale) into a percentage rounded to one decimal
for new_name, source_name in percent_sources.items():
    NSCH_state[new_name] = (NSCH_state[source_name] * 100).round(1)

NSCH_state.head()

Unnamed: 0,STATE,nomChHlthSt_23_1.0_proportion,nomChHlthSt_23_2.0_proportion,PrevMed_23_1.0_proportion,PrevMed_23_2.0_proportion,K4Q01_1.0_proportion,K4Q01_2.0_proportion,smAdeqIns_23_1.0_proportion,smAdeqIns_23_2.0_proportion,healthy,preventative_care,place_for_care,insurance
0,AK,0.9245,0.0755,0.7562,0.2438,0.858,0.142,0.6341,0.3659,92.4,75.6,85.8,63.4
1,AL,0.9022,0.0978,0.7939,0.2061,0.8134,0.1866,0.7404,0.2596,90.2,79.4,81.3,74.0
2,AR,0.9153,0.0847,0.755,0.245,0.7856,0.2144,0.6842,0.3158,91.5,75.5,78.6,68.4
3,AZ,0.8792,0.1208,0.7597,0.2403,0.74,0.26,0.624,0.376,87.9,76.0,74.0,62.4
4,CA,0.8839,0.1161,0.7497,0.2503,0.7087,0.2913,0.7053,0.2947,88.4,75.0,70.9,70.5


In [18]:
# AHR source header -> short variable name used in this notebook.
# 'State' becomes 'STATE', the column name the later merge joins on.
ahr_column_names = {
    'State': 'STATE',
    'Childhood Immunizations': 'immunizations',
    'High School Completion': 'complete_hs',
    'Population - Age <18': 'population_LT18',
    'Primary Care Providers': 'primary_care_providers',
    'Public Health Funding': 'public_health',
    'Social Support and Engagement - Annual': 'support_engagement_annual',
}

# Load the AHR state file and apply all renames in one pass
AHR_state = pd.read_csv("AHR_state.csv").rename(columns=ahr_column_names)

AHR_state.head()

Unnamed: 0,immunizations,complete_hs,population_LT18,Poverty,primary_care_providers,public_health,support_engagement_annual,STATE,Uninsured
0,59.8,93.2,23.9,10.4,359.3,334.0,0.024,AK,10.4
1,62.6,89.1,22.1,16.0,241.0,137.0,-0.745,AL,8.5
2,66.9,89.8,21.7,12.7,283.4,124.0,,ALL,7.9
3,62.0,89.3,23.0,16.3,241.4,128.0,-1.02,AR,8.9
4,62.9,89.4,21.3,12.6,258.2,84.0,0.125,AZ,9.9


In [19]:
# Summary statistics (count, mean, std, min, quartiles, max) of the immunizations column.
# NOTE(review): AHR_state still contains the 'ALL' row shown by head(), so that row is
# part of these statistics - presumably a national aggregate; confirm it should be included.
AHR_state['immunizations'].describe()

count    52.000000
mean     67.880769
std       5.097867
min      57.800000
25%      64.300000
50%      67.500000
75%      70.150000
max      83.100000
Name: immunizations, dtype: float64

In [32]:
#Add variables for good vaccine coverage at 67.5% or more
# right=False makes each bin closed on the left: [0, 67.5) -> 'Low' and
# [67.5, inf) -> 'Medium or high'. A state sitting exactly at 67.5 is therefore
# counted as good coverage, matching the "67.5% or more" rule; with right-closed
# bins it would land in 'Low'.
# The open-ended upper edge keeps any value up to and including 100 inside the
# last bin instead of turning it into NaN.
immun_bins = [0, 67.5, float('inf')]
immun_labels = ['Low', 'Medium or high']
AHR_state['good_immun_cat'] = pd.cut(AHR_state['immunizations'], bins=immun_bins, labels=immun_labels, right=False)

In [33]:
# Number of rows in each immunization coverage category (check of the binning above)
AHR_state['good_immun_cat'].value_counts()

good_immun_cat
Low               26
Medium or high    26
Name: count, dtype: int64

## Merge Data Files

In [34]:
# Left-join the AHR measures onto the NSCH rows, matching on STATE
# (every NSCH row is kept; AHR rows without an NSCH match are dropped)
df = NSCH_state.merge(AHR_state, how='left', on='STATE')

# State abbreviations that make up each region
northeast = ['CT', 'ME', 'MA', 'NH', 'RI', 'VT', 'NJ', 'NY', 'PA']
midwest = ['IL', 'IN', 'MI', 'OH', 'WI', 'IA', 'KS', 'MN', 'MO', 'NE', 'ND', 'SD']
south = ['DE', 'FL', 'GA', 'MD', 'NC', 'SC', 'VA', 'DC', 'WV', 'AL', 'KY', 'MS', 'TN', 'AR', 'LA', 'OK', 'TX']
west = ['AZ', 'CO', 'ID', 'MT', 'NV', 'NM', 'UT', 'WY', 'AK', 'CA', 'HI', 'OR', 'WA']

# Abbreviation -> region name lookup, assembled from the four lists
region_map = {
    **dict.fromkeys(northeast, 'Northeast'),
    **dict.fromkeys(midwest, 'Midwest'),
    **dict.fromkeys(south, 'South'),
    **dict.fromkeys(west, 'West'),
}

# Label each row with its region; a STATE value missing from the lookup gets NaN
df['Region'] = df['STATE'].map(region_map)

df.head()

Unnamed: 0,STATE,nomChHlthSt_23_1.0_proportion,nomChHlthSt_23_2.0_proportion,PrevMed_23_1.0_proportion,PrevMed_23_2.0_proportion,K4Q01_1.0_proportion,K4Q01_2.0_proportion,smAdeqIns_23_1.0_proportion,smAdeqIns_23_2.0_proportion,healthy,...,immunizations,complete_hs,population_LT18,Poverty,primary_care_providers,public_health,support_engagement_annual,Uninsured,good_immun_cat,Region
0,AK,0.9245,0.0755,0.7562,0.2438,0.858,0.142,0.6341,0.3659,92.4,...,59.8,93.2,23.9,10.4,359.3,334.0,0.024,10.4,Low,West
1,AL,0.9022,0.0978,0.7939,0.2061,0.8134,0.1866,0.7404,0.2596,90.2,...,62.6,89.1,22.1,16.0,241.0,137.0,-0.745,8.5,Low,South
2,AR,0.9153,0.0847,0.755,0.245,0.7856,0.2144,0.6842,0.3158,91.5,...,62.0,89.3,23.0,16.3,241.4,128.0,-1.02,8.9,Low,South
3,AZ,0.8792,0.1208,0.7597,0.2403,0.74,0.26,0.624,0.376,87.9,...,62.9,89.4,21.3,12.6,258.2,84.0,0.125,9.9,Low,West
4,CA,0.8839,0.1161,0.7497,0.2503,0.7087,0.2913,0.7053,0.2947,88.4,...,59.8,84.8,21.7,12.0,229.7,173.0,0.199,6.4,Low,West


In [35]:
# Number of rows per Region; rows whose Region is NaN are left out of the counts by default
df['Region'].value_counts()

Region
South        17
West         13
Midwest      12
Northeast     9
Name: count, dtype: int64

In [36]:
# Save the merged state-level table.
# NOTE(review): to_csv writes the DataFrame index by default, so the file gets an
# extra unnamed first column - confirm downstream readers expect it, otherwise
# consider index=False.
df.to_csv("NSCH_AHR_state.csv")