In [1]:
import pandas as pd
import numpy as np

# Helper: read & uppercase columns
def read_upper(path, parse_dates=()):
    """Read a CSV file and return it with upper-cased column names.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.
    parse_dates : str or iterable of str, optional
        Column name(s) to convert to datetimes. Names are matched
        case-insensitively (columns are upper-cased before matching), a bare
        string is treated as a single column name, and names that are not
        present in the file are skipped.

    Returns
    -------
    pandas.DataFrame
        The loaded frame. Requested date columns are converted with
        ``errors="coerce"``, so unparseable values become ``NaT`` instead of
        raising.
    """
    df = pd.read_csv(path, low_memory=False)
    df.columns = df.columns.str.upper()

    # A bare string would otherwise be iterated character by character and
    # silently match nothing; None would raise on iteration.
    if parse_dates is None:
        parse_dates = ()
    elif isinstance(parse_dates, str):
        parse_dates = (parse_dates,)

    for name in parse_dates:
        # Columns were just upper-cased, so compare the requested name in the same case.
        col = str(name).upper()
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors="coerce")
    return df

# Encounter log; START/STOP are parsed so stay boundaries can be compared as timestamps.
enc = read_upper(path="encounters.csv", parse_dates=("START", "STOP"))
# Patient demographics, loaded so age/sex are available; BIRTHDATE is parsed as a timestamp.
pts = read_upper(path="patients.csv", parse_dates=("BIRTHDATE",))

In [2]:
# Cohort: inpatient encounters that have both a START and a STOP timestamp.
# NOTE(review): no age restriction is applied in this cell even though the cohort has been
# described as "adult"; `pts` is not used here. Add an age filter if adults-only is required.
is_inpatient = enc["ENCOUNTERCLASS"].str.lower().eq("inpatient")
enc = enc.loc[is_inpatient].dropna(subset=["START", "STOP"]).copy()

# Expose the stay boundaries under standard names.
enc = enc.assign(admit_time=enc["START"], discharge_time=enc["STOP"])

# Cost source, in order of preference: TOTAL_CLAIM_COST, then BASE_ENCOUNTER_COST, then zeros.
if "TOTAL_CLAIM_COST" in enc.columns:
    raw_cost = enc["TOTAL_CLAIM_COST"]
elif "BASE_ENCOUNTER_COST" in enc.columns:
    raw_cost = enc["BASE_ENCOUNTER_COST"]
else:
    raw_cost = pd.Series(0.0, index=enc.index)
# Non-numeric or missing costs are treated as 0.
enc["TOTAL_CLAIM_COST"] = pd.to_numeric(raw_cost, errors="coerce").fillna(0.0)

In [3]:
# Index rows: one record per inpatient discharge. Each row gets its own id ("index_row")
# to use as the join/group key: (PATIENT, discharge_time) is not guaranteed to be unique,
# and keying on that pair would pool the future encounters of duplicate index rows and
# double-count their cost.
cost = enc[["PATIENT","admit_time","discharge_time"]].reset_index(drop=True)
index_rows = cost.rename_axis("index_row").reset_index()

# Future encounters of ANY class within 30 days for the same patient.
# `enc` has already been narrowed to inpatient stays, so building this table from `enc`
# would only ever count inpatient readmission costs. Re-read the encounter file so that
# every encounter type contributes to the 30-day cost.
all_enc = read_upper("encounters.csv", parse_dates=("START",))
all_enc = all_enc[all_enc["START"].notna()]

# Cost source, in order of preference: TOTAL_CLAIM_COST, then BASE_ENCOUNTER_COST, then zeros.
cost_source = next(
    (c for c in ("TOTAL_CLAIM_COST", "BASE_ENCOUNTER_COST") if c in all_enc.columns),
    None,
)
fut = pd.DataFrame({
    "PATIENT": all_enc["PATIENT"],
    "next_start": all_enc["START"],
    "TOTAL_CLAIM_COST": (
        pd.to_numeric(all_enc[cost_source], errors="coerce").fillna(0.0)
        if cost_source is not None else 0.0
    ),
})

# Pair each index discharge with every encounter of the same patient, then keep only the
# encounters that start strictly after discharge and no later than 30 days after it.
fut_cost = index_rows.merge(fut, on="PATIENT", how="left")
window_end = fut_cost["discharge_time"] + pd.Timedelta(days=30)
mask = (fut_cost["next_start"] >  fut_cost["discharge_time"]) & \
       (fut_cost["next_start"] <= window_end)
fut_cost = fut_cost[mask]

# Aggregate total cost incurred in the next 30 days, per index row
y_cost = (fut_cost
          .groupby(["index_row","PATIENT","discharge_time"], as_index=False)["TOTAL_CLAIM_COST"]
          .sum()
          .rename(columns={"TOTAL_CLAIM_COST":"cost_30d"}))

# Join back to index rows; discharges with no encounter in the window -> 0 cost
reg_df = (index_rows
          .merge(y_cost[["index_row","cost_30d"]], on="index_row", how="left")
          .drop(columns="index_row"))
reg_df["cost_30d"] = reg_df["cost_30d"].fillna(0.0)
reg_df.head()

Unnamed: 0,PATIENT,admit_time,discharge_time,cost_30d
0,396e9c69-3923-ef41-6374-922cd0a495dc,1992-05-07 21:33:31+00:00,1992-05-08 21:33:31+00:00,0.0
1,396e9c69-3923-ef41-6374-922cd0a495dc,2011-06-27 14:21:33+00:00,2011-06-29 15:09:14+00:00,0.0
2,e2b370cd-f4a6-0265-9dcf-a5c4ccf849ed,2021-10-13 08:30:35+00:00,2021-10-16 03:59:35+00:00,0.0
3,c26a046f-ff54-fa35-3f03-e27ebbef088a,2020-07-14 14:24:36+00:00,2020-07-18 15:11:17+00:00,0.0
4,c26a046f-ff54-fa35-3f03-e27ebbef088a,2020-08-28 13:55:34+00:00,2020-09-03 14:49:43+00:00,0.0


In [4]:
# Order stays chronologically within each patient.
enc = enc.sort_values(by=["PATIENT", "admit_time"])

# Pair every inpatient admission (as a candidate "next start") with every discharge of the
# same patient. A pair counts as a readmission when the admission starts strictly after
# the discharge and at most 30 days later => label 1.
candidate_starts = enc[["PATIENT", "admit_time"]].rename(columns={"admit_time": "next_start"})
discharges = enc[["PATIENT", "discharge_time"]]
nexts = candidate_starts.merge(discharges, on="PATIENT", how="left")

window_close = nexts["discharge_time"] + pd.Timedelta(days=30)
m = nexts["next_start"].gt(nexts["discharge_time"]) & nexts["next_start"].le(window_close)

# Earliest qualifying admission for each (patient, discharge).
next_admit = (nexts.loc[m]
              .groupby(["PATIENT", "discharge_time"], as_index=False)["next_start"]
              .min())

# Discharges without a qualifying admission keep next_start = NaT and get label 0.
cls_df = enc.merge(next_admit, on=["PATIENT", "discharge_time"], how="left")
cls_df["readmit_30d"] = cls_df["next_start"].notna().astype(int)
cls_df["readmit_30d"].mean()  # prevalence check

np.float64(0.1044776119402985)

In [5]:
# Sanity: class balance of the readmission label
n_total = cls_df.shape[0]
n_pos   = int(cls_df["readmit_30d"].sum())
class_counts = {"total": n_total, "positives": n_pos, "prevalence": n_pos/n_total}
print(class_counts)

# Labels are driven by inpatient encounters only, because enc was filtered to inpatient
# before the label was built.

# Same-timestamp transfers are excluded: the label uses a strict > comparison against the
# discharge time, not >=.

# How many discharges have ≥30d follow-up available?
# The latest inpatient admission in enc is used as the end of the observed data.
data_end = enc["admit_time"].max()
followup_cutoff = data_end - pd.Timedelta(days=30)
has_followup = cls_df["discharge_time"].le(followup_cutoff)
followup_summary = {"with_≥30d_followup": int(has_followup.sum()), "share": has_followup.mean()}
print(followup_summary)

{'total': 134, 'positives': 14, 'prevalence': 0.1044776119402985}
{'with_≥30d_followup': 132, 'share': np.float64(0.9850746268656716)}
