In [None]:
import pandas as pd
import numpy as np

# 1. Load the raw ANC measurements
df = pd.read_csv("ANC and transfusion data/anc_45days.csv")

# Parse each timestamp column: unparseable values become NaT and any timezone
# information is stripped so that all three columns are tz-naive.
for ts_col in ('charttime', 'dischtime', 'admittime'):
    df[ts_col] = pd.to_datetime(df[ts_col], errors='coerce').dt.tz_localize(None)

# 2. Keep only rows that carry a numeric value
df = df.dropna(subset=['valuenum'])

# 3. Per-measurement aplasia flag (1 when the value is <= 0.5, otherwise 0)
#    and the calendar day (midnight) on which the measurement was charted
df['aplasia_flag'] = (df['valuenum'] <= 0.5).astype(int)
df['event_date'] = df['charttime'].dt.normalize()

# 4. Collapse to one row per subject / admission / day:
#    the day is flagged if any measurement was flagged, and the lowest value
#    of the day is retained.
daily = df.groupby(
    ['subject_id', 'chemo_hadm_id', 'dischtime', 'admittime', 'event_date'],
    as_index=False,
).agg(
    aplasia_flag=pd.NamedAgg(column='aplasia_flag', aggfunc='max'),
    daily_min_anc=pd.NamedAgg(column='valuenum', aggfunc='min'),
)

# Order so that, within a subject and day, the row with the latest discharge
# time comes first ...
daily = daily.sort_values(
    by=['subject_id', 'event_date', 'dischtime'],
    ascending=[True, True, False],
)

# ... and keep a single row per subject and day (the first one in that order).
daily_nodup = daily.groupby(['subject_id', 'event_date'], as_index=False).first()

# 5. Create calendar skeleton (0 to 45 days post discharge)
# One row per distinct admission that has a discharge time, ordered by
# subject and discharge time.
admissions = daily_nodup[['subject_id', 'chemo_hadm_id', 'dischtime', 'admittime']]
admissions = admissions.dropna(subset=['dischtime']).drop_duplicates()
admissions = admissions.sort_values(['subject_id', 'dischtime']).reset_index(drop=True)

# Discharge time of the same subject's following admission (NaT for the last one).
admissions['next_dischtime'] = admissions.groupby('subject_id')['dischtime'].shift(-1)

# For every admission, lay out 46 consecutive calendar days starting on the
# discharge date (offsets 0 through 45); the admission attributes are
# repeated on each of those rows.
all_rows = []
for adm in admissions.itertuples(index=False):
    window_start = adm.dischtime.normalize()
    all_rows.append(pd.DataFrame({
        'subject_id': adm.subject_id,
        'chemo_hadm_id': adm.chemo_hadm_id,
        'dischtime': adm.dischtime,
        'admittime': adm.admittime,
        'next_dischtime': adm.next_dischtime,
        'event_date': pd.date_range(window_start, periods=46, freq='D'),
    }))

full_cal = pd.concat(all_rows, ignore_index=True)

# Attach the observed daily values to the skeleton; days without a
# measurement keep NaN in both value columns.
full = pd.merge(
    full_cal,
    daily_nodup[['subject_id', 'chemo_hadm_id', 'event_date', 'aplasia_flag', 'daily_min_anc']],
    how='left',
    on=['subject_id', 'chemo_hadm_id', 'event_date'],
)

# Days elapsed since the discharge date; rows dated before discharge are dropped.
full['discharge_date'] = full['dischtime'].dt.normalize()
full['day_offset'] = (full['event_date'] - full['discharge_date']).dt.days
full = full.loc[full['day_offset'].ge(0)].reset_index(drop=True)

# 6. Forward Fill Logic (Fills gaps <= 7 days with 2)
def forward_fill_between_ones(g, max_gap_days=7):
    """Mark unmeasured days lying between two nearby flagged days.

    The frame is sorted by ``event_date`` and re-indexed from 0, and a new
    column ``aplasia_flag_filled`` is created as a copy of ``aplasia_flag``.
    For each pair of consecutive rows whose ``aplasia_flag`` equals 1, the
    rows strictly between them that have a missing ``aplasia_flag`` receive
    the value 2 in ``aplasia_flag_filled``, provided that

    * the two dates are between 1 and ``max_gap_days`` days apart, and
    * no row between them has ``aplasia_flag`` equal to 0.

    Parameters
    ----------
    g : pandas.DataFrame
        Must contain the columns ``event_date`` and ``aplasia_flag``.
    max_gap_days : int, default 7
        Largest allowed difference in days between the two flagged dates.

    Returns
    -------
    pandas.DataFrame
        A sorted, re-indexed copy of ``g`` with ``aplasia_flag_filled`` added.
        The input frame is not modified.
    """
    out = g.sort_values('event_date').copy().reset_index(drop=True)
    out['aplasia_flag_filled'] = out['aplasia_flag'].copy()

    flag = out['aplasia_flag']
    day = out['event_date']
    unmeasured = flag.isna()
    one_positions = out.index[flag == 1].tolist()

    # Walk over neighbouring pairs of flagged rows; with fewer than two
    # flagged rows there are no pairs and the copy is returned as is.
    for left, right in zip(one_positions, one_positions[1:]):
        start, stop = day.at[left], day.at[right]
        span = (stop - start).days
        if not 1 <= span <= max_gap_days:
            continue

        inside = (day > start) & (day < stop)
        # An explicit 0 between the two flagged days blocks the fill.
        if flag.loc[inside].eq(0).any():
            continue

        out.loc[inside & unmeasured, 'aplasia_flag_filled'] = 2

    return out

# Run the gap fill separately for every admission and stack the results.
# The groups are iterated explicitly instead of going through GroupBy.apply:
# apply is deprecated from operating on the grouping columns, and this step
# needs 'subject_id' and 'chemo_hadm_id' to remain in the result because
# later steps group on them again. Each group frame obtained by iteration
# always carries all of its columns.
full = pd.concat(
    [
        forward_fill_between_ones(admission_days, max_gap_days=7)
        for _, admission_days in full.groupby(['subject_id', 'chemo_hadm_id'])
    ],
    ignore_index=True,
)

# 7. Episode Detection
def detect_episodes(g):
    """Number the runs of consecutive aplasia rows within one group.

    The frame is sorted by ``event_date``. A row counts as aplasia when its
    ``aplasia_flag_filled`` value is 1 or 2. Every maximal run of consecutive
    aplasia rows receives its own ``episode_id`` (1, 2, ... in date order);
    rows that are not aplasia get ``episode_id`` 0.

    Parameters
    ----------
    g : pandas.DataFrame
        Must contain the columns ``event_date`` and ``aplasia_flag_filled``.

    Returns
    -------
    pandas.DataFrame
        A sorted copy of ``g`` with the added columns ``is_aplasia`` (bool),
        ``prev_is_aplasia`` (bool), ``episode_start`` (0/1) and
        ``episode_id`` (int). The input frame is not modified.
    """
    g = g.sort_values('event_date').copy()

    g['is_aplasia'] = g['aplasia_flag_filled'].isin([1, 2])

    # Shift with an explicit fill value so the column stays boolean.
    # shift(1).fillna(False) goes through an object-dtype column and depends
    # on pandas silently downcasting it back to bool, which is deprecated;
    # without that downcast the ``~`` below would not be a logical negation.
    g['prev_is_aplasia'] = g['is_aplasia'].shift(1, fill_value=False)

    # An episode starts on an aplasia row whose predecessor is not aplasia
    # (the first row has no predecessor, so it starts one if it is aplasia).
    g['episode_start'] = (g['is_aplasia'] & ~g['prev_is_aplasia']).astype(int)

    # The running count of starts labels every row of the current episode ...
    g['episode_id'] = g['episode_start'].cumsum()

    # ... and rows outside any episode are reset to 0.
    g.loc[~g['is_aplasia'], 'episode_id'] = 0

    return g

# Label episodes separately for every admission and stack the results.
# The groups are iterated explicitly instead of going through GroupBy.apply:
# apply is deprecated from operating on the grouping columns, and the
# episode summary that follows groups on 'subject_id' and 'chemo_hadm_id',
# so both columns must remain in the result. Each group frame obtained by
# iteration always carries all of its columns.
full_ep = pd.concat(
    [
        detect_episodes(admission_days)
        for _, admission_days in full.groupby(['subject_id', 'chemo_hadm_id'])
    ],
    ignore_index=True,
)

# Only the days that belong to an episode (observed flag 1 or gap-filled 2).
aplasia_days = full_ep.loc[full_ep['aplasia_flag_filled'].isin([1, 2])].copy()

# 8. Summarise each episode: first and last day, inclusive length in days,
#    and the lowest daily minimum value seen during the episode.
episodes = aplasia_days.groupby(
    ['subject_id', 'chemo_hadm_id', 'episode_id'],
    as_index=False,
).agg(
    start_date=pd.NamedAgg(column='event_date', aggfunc='min'),
    end_date=pd.NamedAgg(column='event_date', aggfunc='max'),
    duration_days=pd.NamedAgg(
        column='event_date',
        aggfunc=lambda s: (s.max() - s.min()).days + 1,
    ),
    minimum_anc_across_episode=pd.NamedAgg(column='daily_min_anc', aggfunc='min'),
)
episodes = episodes.sort_values(['subject_id', 'chemo_hadm_id', 'episode_id'])

# Final ordering: chronological within each subject.
episodes = episodes.sort_values(
    by=['subject_id', 'start_date'],
    ascending=[True, True],
).reset_index(drop=True)

# 9. Merge discharge info back
# Distinct admission rows with no missing value in any of the four columns.
discharge_map = daily_nodup[['subject_id', 'chemo_hadm_id', 'dischtime', 'admittime']]
discharge_map = discharge_map.dropna().drop_duplicates()

episodes = pd.merge(
    episodes,
    discharge_map,
    how='left',
    on=['subject_id', 'chemo_hadm_id'],
)


# Final Selection
episodes = episodes[[
    'subject_id', 'chemo_hadm_id', 'admittime', 'dischtime',
    'episode_id', 'start_date', 'end_date', 'duration_days', 'minimum_anc_across_episode',
]]


  .apply(lambda gg: forward_fill_between_ones(gg, max_gap_days=7))
  g['prev_is_aplasia'] = g['is_aplasia'].shift(1).fillna(False)
  g['prev_is_aplasia'] = g['is_aplasia'].shift(1).fillna(False)
  g['prev_is_aplasia'] = g['is_aplasia'].shift(1).fillna(False)
  g['prev_is_aplasia'] = g['is_aplasia'].shift(1).fillna(False)
  g['prev_is_aplasia'] = g['is_aplasia'].shift(1).fillna(False)
  g['prev_is_aplasia'] = g['is_aplasia'].shift(1).fillna(False)
  g['prev_is_aplasia'] = g['is_aplasia'].shift(1).fillna(False)
  g['prev_is_aplasia'] = g['is_aplasia'].shift(1).fillna(False)
  g['prev_is_aplasia'] = g['is_aplasia'].shift(1).fillna(False)
  g['prev_is_aplasia'] = g['is_aplasia'].shift(1).fillna(False)
  g['prev_is_aplasia'] = g['is_aplasia'].shift(1).fillna(False)
  g['prev_is_aplasia'] = g['is_aplasia'].shift(1).fillna(False)
  g['prev_is_aplasia'] = g['is_aplasia'].shift(1).fillna(False)
  g['prev_is_aplasia'] = g['is_aplasia'].shift(1).fillna(False)
  g['prev_is_aplasia'] = g['is_aplasi