In [39]:
import pandas as pd
import glob
import os

In [41]:
# Load Monthly Enrollment csv files
enrollment_path = "../../data/input/enrollment_2018/*/*.csv"
enrollment_files = glob.glob(enrollment_path, recursive=True)

dfs = []
for f in enrollment_files:
    try:
        df = pd.read_csv(f, encoding='utf-8')
    except UnicodeDecodeError:
        df = pd.read_csv(f, encoding='latin1')
    dfs.append(df)
   
# Combine all 12 months
enrollment = pd.concat(dfs, ignore_index=True)
print(f"Enrollment data shape before cleaning: {enrollment.shape}")

Enrollment data shape before cleaning: (27773081, 18)


In [43]:
# Cleaning Enrollment Data
enrollment.columns = [c.strip().replace(" ", "_") for c in enrollment.columns]

# Keeping necessary columns
enrollment = enrollment[['Contract_Number', 'Plan_ID', 'State', 'County', 'Enrollment']]
enrollment.head()

Unnamed: 0,Contract_Number,Plan_ID,State,County,Enrollment
0,,,,,
1,,801.0,,,
2,,801.0,,,
3,,801.0,,,
4,,1.0,,,


In [44]:
# Aggregate monthly enrollment into yearly totals
enrollment_2018 = (
    enrollment
    .groupby(['Contract_Number', 'Plan_ID', 'State', 'County'], as_index=False)
    .agg({'Enrollment': 'sum'})
)

# Adding Year column 
enrollment_2018['Year'] = 2018
print(enrollment_2018.shape)
enrollment_2018.head()

(2477563, 6)


Unnamed: 0,Contract_Number,Plan_ID,State,County,Enrollment,Year
0,E0654,801.0,AK,Aleutians East,************,2018
1,E0654,801.0,AK,Aleutians West,************,2018
2,E0654,801.0,AK,Anchorage,140140138138146140138140145146146141,2018
3,E0654,801.0,AK,Bethel,************,2018
4,E0654,801.0,AK,Bristol Bay,************,2018


In [45]:
# Load Service Area data
service_files = glob.glob('../../data/input/service_area_2018/*/*.csv')
print(f"Found {len(service_files)} service area files: {service_files[:5]}")

service_dfs = []

for f in service_files:
    try:
        df = pd.read_csv(f, encoding='latin1')  # latin1 helps avoid Unicode errors
        service_dfs.append(df)
    except Exception as e:
        print(f"Error reading {f}: {e}")

# Combining service area data into a dataframe
service_area = pd.concat(service_dfs, ignore_index=True)
print(f"Service area data shape before cleaning: {service_area.shape}")
service_area.head()

Found 12 service area files: ['../../data/input/service_area_2018/MA_Cnty_SA_2018_06/MA_Cnty_SA_2018_06.csv', '../../data/input/service_area_2018/MA_Cnty_SA_2018_01/MA_Cnty_SA_2018_01.csv', '../../data/input/service_area_2018/MA_Cnty_SA_2018_08/MA_Cnty_SA_2018_08.csv', '../../data/input/service_area_2018/MA_Cnty_SA_2018_09/MA_Cnty_SA_2018_09.csv', '../../data/input/service_area_2018/MA_Cnty_SA_2018_07/MA_Cnty_SA_2018_07.csv']
Service area data shape before cleaning: (3986459, 11)


Unnamed: 0,Contract ID,Organization Name,Organization Type,Plan Type,Partial,EGHP,SSA,FIPS,County,State,Notes
0,90091,UNITED MINE WORKERS OF AMERICA HLTH & RETIREMENT,HCPP - 1833 Cost,HCPP - 1833 Cost,,,,,,,"Covers the entire US, all States and Counties"
1,H0022,"BUCKEYE COMMUNITY HEALTH PLAN, INC.",Demo,Medicare-Medicaid Plan HMO/HMOPOS,,,36110.0,39023.0,Clark,OH,
2,H0022,"BUCKEYE COMMUNITY HEALTH PLAN, INC.",Demo,Medicare-Medicaid Plan HMO/HMOPOS,,,36170.0,39035.0,Cuyahoga,OH,
3,H0022,"BUCKEYE COMMUNITY HEALTH PLAN, INC.",Demo,Medicare-Medicaid Plan HMO/HMOPOS,,,36260.0,39051.0,Fulton,OH,
4,H0022,"BUCKEYE COMMUNITY HEALTH PLAN, INC.",Demo,Medicare-Medicaid Plan HMO/HMOPOS,,,36280.0,39055.0,Geauga,OH,


In [48]:
# Cleaning service area data 
service_area.columns = [c.strip().replace(" ", "_") for c in service_area.columns]

# Keeping necessary columns
service_area = service_area[['Contract_ID', 'State', 'County']]

# Get rid of duplicates
service_area = service_area.drop_duplicates()

# Changing Contract_ID to match enrollment data header
service_area = service_area.rename(columns={'Contract_ID': 'Contract_Number'})
service_area.head()

Unnamed: 0,Contract_Number,State,County
0,90091,,
1,H0022,OH,Clark
2,H0022,OH,Cuyahoga
3,H0022,OH,Fulton
4,H0022,OH,Geauga


In [49]:
enrollment_2018.head()

Unnamed: 0,Contract_Number,Plan_ID,State,County,Enrollment,Year
0,E0654,801.0,AK,Aleutians East,************,2018
1,E0654,801.0,AK,Aleutians West,************,2018
2,E0654,801.0,AK,Anchorage,140140138138146140138140145146146141,2018
3,E0654,801.0,AK,Bethel,************,2018
4,E0654,801.0,AK,Bristol Bay,************,2018


In [50]:
# Merging enrollment data with service area data (on Contract_Number, State, County)
enrollment_service_2018 = pd.merge(
    enrollment_2018,
    service_area,
    on=['Contract_Number', 'State', 'County'],
    how='inner'
)
print(f"Merged enrollment and service area data shape: {enrollment_service_2018.shape}")
enrollment_service_2018.head()

Merged enrollment and service area data shape: (1369234, 6)


Unnamed: 0,Contract_Number,Plan_ID,State,County,Enrollment,Year
0,H0022,1.0,OH,Clark,622605595587558598638636591578584589,2018
1,H0022,1.0,OH,Cuyahoga,365735493586362835963586382938283610367337013593,2018
2,H0022,1.0,OH,Fulton,126119112115107117125126113111107112,2018
3,H0022,1.0,OH,Geauga,807577758077848472768068,2018
4,H0022,1.0,OH,Greene,601569567560539573618605553557548563,2018


In [52]:
output_path = '../../data/output/enrollment_service_2018.csv'
enrollment_service_2018.to_csv(output_path, index=False)
print(f"Saved merged dataset to {output_path}")

Saved merged dataset to ../../data/output/enrollment_service_2018.csv
