In [4]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

In [5]:
# Show the kernel's working directory; the relative paths used in this
# notebook ("../../data/...") resolve against it.
os.getcwd()

'/Users/valeriehernandez/Documents/GitHub/homework1/submission1/data-code'

In [12]:
# Input locations, expressed relative to the notebook's working directory.
BASE_DIR = Path("../../data/input")
ENROLL_DIR, SA_DIR = (
    BASE_DIR.joinpath("enrollment_2018"),    # monthly enrollment + contract CSVs
    BASE_DIR.joinpath("service_area_2018"),  # service-area CSVs
)

In [13]:
# Sanity check: confirm the enrollment folder exists and peek at its first
# five entries (in whatever order the filesystem lists them).
ENROLL_DIR.exists(), list(ENROLL_DIR.iterdir())[:5]

(True,
 [PosixPath('../../data/input/enrollment_2018/.DS_Store'),
  PosixPath('../../data/input/enrollment_2018/CPSC_Enrollment_Info_2018_08.csv'),
  PosixPath('../../data/input/enrollment_2018/CPSC_Enrollment_Info_2018_09.csv'),
  PosixPath('../../data/input/enrollment_2018/CPSC_Contract_Info_2018_09.csv'),
  PosixPath('../../data/input/enrollment_2018/CPSC_Contract_Info_2018_08.csv')])

In [14]:
# Keep only regular files with a .csv extension (case-insensitive); this
# drops directory clutter such as .DS_Store.
files = list(
    filter(
        lambda path: path.is_file() and path.suffix.lower() == ".csv",
        ENROLL_DIR.iterdir(),
    )
)

len(files)

24

In [15]:
# Partition the CSV paths by report type, keyed on a substring of the filename.
enrollment_csvs, contract_csvs = (
    [path for path in files if report in path.name]
    for report in ("Enrollment", "Contract")
)

len(enrollment_csvs), len(contract_csvs)

(12, 12)

In [17]:
# Monthly enrollment CSVs for 2018. The suffix check keeps non-CSV files whose
# name happens to contain "Enrollment" out of the load, and sorting makes the
# load order reproducible (Path.iterdir yields entries in arbitrary order).
enrollment_files = sorted(
    f for f in ENROLL_DIR.iterdir()
    if f.is_file() and f.suffix.lower() == ".csv" and "Enrollment" in f.name
)
if not enrollment_files:
    raise FileNotFoundError(f"No enrollment CSV files found in {ENROLL_DIR}")

enroll_cols = [
    "Contract Number",
    "Plan ID",
    "FIPS State County Code",
    "Enrollment"
]

# NOTE(review): CMS CPSC enrollment files mark suppressed (small) counts with
# "*" -- confirm against the raw files. Reading that marker as missing keeps
# "Enrollment" numeric; if the column loads as strings, the sum below does not
# produce a numeric total.
enroll = pd.concat(
    [
        pd.read_csv(f, usecols=enroll_cols, na_values={"Enrollment": ["*"]})
        for f in enrollment_files
    ],
    ignore_index=True
)

# Rename for consistency
enroll = enroll.rename(columns={
    "Contract Number": "CONTRACT_ID",
    "Plan ID": "PLAN_ID",
    "FIPS State County Code": "COUNTY_CODE",
    "Enrollment": "ENROLLMENT"
})

# Collapse monthly → annual (plan–county–year)
# NOTE(review): summing over the monthly files gives total member-months per
# plan–county, not an average or point-in-time count -- confirm that this is
# the intended annual measure.
# min_count=1 leaves the total missing (instead of 0) when every monthly value
# for a plan–county is missing.
enroll_2018 = (
    enroll
    .groupby(["CONTRACT_ID", "PLAN_ID", "COUNTY_CODE"], as_index=False)
    ["ENROLLMENT"]
    .sum(min_count=1)
)


In [20]:
# Load contract info
# Sorting fixes the load order (Path.iterdir yields entries in arbitrary
# order). That matters because drop_duplicates below keeps the FIRST row seen
# for each plan, so an unsorted list makes the retained plan attributes depend
# on how the filesystem happens to enumerate the folder.
contract_files = sorted(
    f for f in ENROLL_DIR.iterdir()
    if f.is_file() and f.suffix.lower() == ".csv" and "Contract" in f.name
)
if not contract_files:
    raise FileNotFoundError(f"No contract CSV files found in {ENROLL_DIR}")

contract_cols = [
    "Contract ID",
    "Plan ID",
    "Plan Type",
    "SNP Plan",
    "EGHP"
]

contract = pd.concat(
    [
        pd.read_csv(
            f,
            usecols=contract_cols,
            encoding="latin-1"
        )
        for f in contract_files
    ],
    ignore_index=True
)


contract = contract.rename(columns={
    "Contract ID": "CONTRACT_ID",
    "Plan ID": "PLAN_ID",
    "Plan Type": "PLAN_TYPE",
    "SNP Plan": "SNP_PLAN",
    "EGHP": "EGHP_FLAG"
})

# One row per plan: with the sorted file list this is the record from the
# first file (by name) in which the plan appears.
contract = contract.drop_duplicates(["CONTRACT_ID", "PLAN_ID"])


In [21]:
# Attach the plan-level attributes to every plan–county enrollment row.
# A left join keeps enrollment rows even when no contract record matches.
enroll_contract = pd.merge(
    enroll_2018,
    contract,
    how="left",
    on=["CONTRACT_ID", "PLAN_ID"],
)

In [24]:
# Load service area info
# Build the file list from SA_DIR here so the cell runs on a fresh kernel
# (Restart & Run All). Sorting makes the load order reproducible.
# NOTE(review): every .csv in SA_DIR is treated as a service-area file --
# confirm the folder holds nothing else.
sa_files = sorted(
    f for f in SA_DIR.iterdir()
    if f.is_file() and f.suffix.lower() == ".csv"
)
if not sa_files:
    raise FileNotFoundError(f"No service area CSV files found in {SA_DIR}")

sa_cols = [
    "Contract ID",
    "FIPS"
]

service_area = pd.concat(
    [
        pd.read_csv(
            f,
            usecols=sa_cols,
            encoding="latin-1"
        )
        for f in sa_files
    ],
    ignore_index=True
)

service_area = service_area.rename(columns={
    "Contract ID": "CONTRACT_ID",
    "FIPS": "COUNTY_CODE"
})

# The same contract–county pair repeats across the stacked files; keep one row
# per pair so the merge on these keys does not multiply enrollment rows.
service_area = service_area.drop_duplicates()


In [25]:
# Restrict the enrollment+contract rows to contract–county pairs that appear
# in the service-area table (inner join drops everything else).
enroll_final = pd.merge(
    enroll_contract,
    service_area,
    how="inner",
    on=["CONTRACT_ID", "COUNTY_CODE"],
)

In [26]:
# Write the merged table to the output folder, creating the folder if needed.
OUTPUT_DIR = Path("../../data/output")
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

output_file = OUTPUT_DIR.joinpath("ma_enrollment_service_area_2018.csv")
enroll_final.to_csv(output_file, index=False)

# Report where the file landed and the size of the table that was written.
n_rows, n_cols = enroll_final.shape
print(f"Saved file to: {output_file.resolve()}")
print("Rows:", n_rows)
print("Columns:", n_cols)


Saved file to: /Users/valeriehernandez/Documents/GitHub/homework1/data/output/ma_enrollment_service_area_2018.csv
Rows: 1366487
Columns: 7
