In [9]:
# Imports
import re
import warnings
from pathlib import Path

import numpy as np
import pandas as pd

In [10]:
# Widen pandas' console rendering: show up to 200 columns and wrap at 140 characters.
for _opt_name, _opt_value in {"display.max_columns": 200, "display.width": 140}.items():
    pd.set_option(_opt_name, _opt_value)

In [13]:
# Directory layout. BASE climbs three levels up from the working directory;
# every other location is derived from it.
BASE = Path("../../..")

# Monthly input files.
ENROLL_DIR = BASE.joinpath("data", "input", "enrollment_2018")
SERVICE_AREA_DIR = BASE.joinpath("data", "input", "service_area_2018")

# Output location; created (with any missing parents) if it does not exist yet.
OUT_DIR = BASE.joinpath("data", "output")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Year being processed and its twelve zero-padded month codes ("01" .. "12").
YEAR = 2018
MONTHS = [format(month_no, "02d") for month_no in range(1, 13)]

In [18]:
# Read enrollment and contract 2018 data
def _clean_colnames(cols):
    """Normalize column labels to lower-case identifiers.

    Each label is converted to ``str`` and stripped of surrounding whitespace;
    every run of characters other than ASCII letters and digits becomes a
    single underscore; leading/trailing underscores are removed; the result is
    lower-cased. Returns a new list in the same order as ``cols``.
    """
    non_alnum = re.compile(r"[^0-9a-zA-Z]+")
    return [
        non_alnum.sub("_", str(label).strip()).strip("_").lower()
        for label in cols
    ]

def read_contract(path: Path) -> pd.DataFrame:
    """Read a single monthly Contract Info file (CPSC_Contract_Info_2018_MM.csv).

    The file is first read positionally: its first line is skipped as the
    header and, when exactly twelve columns are found, the canonical names in
    ``expected`` are assigned.

    If the column count differs, or the positional read cannot be parsed, the
    file is re-read using its own first line as the header. Those names are
    normalized with ``_clean_colnames`` and the ``contract_id`` / ``plan_id``
    spellings are mapped to ``contractid`` / ``planid`` so downstream code
    finds the merge keys.

    Parameters
    ----------
    path : Path
        Location of the CSV file. It is always decoded as latin-1.

    Returns
    -------
    pd.DataFrame
        Every column read as a string, except ``planid`` (when present), which
        is converted to numeric with unparseable values set to NaN.

    Raises
    ------
    Whatever ``pd.read_csv`` raises for the header-based re-read (for example
    a missing or completely empty file); those errors are not swallowed.
    """
    expected = [
        "contractid", "planid", "org_type", "plan_type", "partd", "snp", "eghp",
        "org_name", "org_marketing_name", "plan_name", "parent_org", "contract_date",
    ]

    try:
        positional = pd.read_csv(
            path, skiprows=1, header=None, dtype=str, encoding="latin1"
        )
    except (pd.errors.EmptyDataError, pd.errors.ParserError):
        # Only parse failures trigger the fallback; I/O errors propagate as-is.
        positional = None

    if positional is not None and positional.shape[1] == len(expected):
        positional.columns = expected
        df = positional
    else:
        # Unexpected layout: trust the file's own header line. Re-reading with
        # skiprows=1 here would promote the first data row to column names and
        # drop it from the data, so the header is read from line 1 instead.
        df = pd.read_csv(path, dtype=str, encoding="latin1")
        df.columns = _clean_colnames(df.columns)

        # Harmonize key spellings, but never create a duplicate column name.
        aliases = {"contract_id": "contractid", "plan_id": "planid"}
        df = df.rename(columns={
            old: new
            for old, new in aliases.items()
            if old in df.columns and new not in df.columns
        })

    if "planid" in df.columns:
        df["planid"] = pd.to_numeric(df["planid"], errors="coerce")

    return df

In [22]:
def read_enroll(path: Path) -> pd.DataFrame:
    """Read a single monthly Enrollment Info file.

    The CSV is read as latin-1 with every column as a string, skipping the
    first line; if that read raises, it is retried without skipping. Column
    names are normalized with ``_clean_colnames`` and common spelling variants
    are renamed to ``contractid``, ``planid``, ``county``, ``state`` and
    ``fips``. When present, ``planid`` and ``fips`` are converted to numeric,
    with unparseable values becoming NaN.
    """
    csv_opts = {"dtype": str, "encoding": "latin1"}

    # NOTE(review): with skiprows=1 and an inferred header, pandas uses the
    # file's *second* line as the header - confirm the input files carry a
    # title line above the real header.
    try:
        frame = pd.read_csv(path, skiprows=1, **csv_opts)
    except Exception:
        frame = pd.read_csv(path, **csv_opts)

    frame.columns = _clean_colnames(frame.columns)

    # Map known variants onto the canonical names; labels that are not
    # present are simply ignored by ``rename``.
    canonical = {
        "contract_id": "contractid",
        "contract":    "contractid",
        "plan_id":     "planid",
        "plan":        "planid",
        "county_name": "county",
        "state_abbr":  "state",
        "fips_state_county_code": "fips",
    }
    frame = frame.rename(columns=canonical)

    # Numeric identifiers: values that cannot be parsed become NaN.
    for numeric_col in ("planid", "fips"):
        if numeric_col in frame.columns:
            frame[numeric_col] = pd.to_numeric(frame[numeric_col], errors="coerce")

    return frame


def load_month(m: str, y: int) -> pd.DataFrame:
    """Load contract + enrollment for one month and merge them.

    Parameters
    ----------
    m : str
        Month code as used in the file names (e.g. ``"01"``).
    y : int
        Year as used in the file names.

    Returns
    -------
    pd.DataFrame
        The enrollment rows, left-joined with the de-duplicated contract rows
        on ``(contractid, planid)`` when both frames carry those columns, plus
        integer ``month`` and ``year`` columns. Contract columns whose names
        clash with enrollment columns get the ``_contract`` suffix.

    Warns
    -----
    UserWarning
        When either frame lacks the key columns, so the contract merge is
        skipped and only the enrollment rows are returned.
    """
    keys = ["contractid", "planid"]

    # File names: CPSC_Contract_Info_2018_01.csv, CPSC_Enrollment_Info_2018_01.csv
    c_path = ENROLL_DIR / f"CPSC_Contract_Info_{y}_{m}.csv"
    e_path = ENROLL_DIR / f"CPSC_Enrollment_Info_{y}_{m}.csv"

    contract = read_contract(c_path)
    contract_has_keys = set(keys).issubset(contract.columns)

    # Keep one row per (contractid, planid) - mirrors R distinct(..., .keep_all=TRUE)
    if contract_has_keys:
        contract = contract.drop_duplicates(subset=keys, keep="first")

    enroll = read_enroll(e_path)
    enroll_has_keys = set(keys).issubset(enroll.columns)

    if enroll_has_keys and contract_has_keys:
        # Merge contract info into enrollment
        df = enroll.merge(contract, on=keys, how="left", suffixes=("", "_contract"))
    else:
        # Keep the best-effort behavior, but make the missing merge visible
        # instead of silently returning enrollment-only rows.
        warnings.warn(
            f"Skipping contract merge for {y}-{m}: key columns {keys} missing "
            f"(enrollment has keys: {enroll_has_keys}, "
            f"contract has keys: {contract_has_keys}).",
            stacklevel=2,
        )
        df = enroll.copy()

    df["month"] = int(m)
    df["year"]  = int(y)
    return df