In [None]:
import pickle
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_absolute_error


# Feature table shared by every fold. NOTE: the path is machine-specific.
df_mine = pd.read_csv("/Users/utkarshbansal/Desktop/APLASIA/final.csv")

# Regression target and the id column used to match rows to fold membership.
target_col = 'target_first_episode_duration'
id_col_name = 'chemo_hadm_id'

# Fail fast when a required column is absent. The id column is needed for
# fold filtering just as much as the target is needed for training, so both
# are checked here instead of surfacing later as a bare KeyError.
for required_col in (target_col, id_col_name):
    if required_col not in df_mine.columns:
        raise ValueError(f"Column '{required_col}' not found!")

# One pickle per cross-validation fold, resolved against the working directory.
fold_files = [f'fold_{i}.pkl' for i in range(5)]
fold_maes = []  # per-fold test MAE, appended to by the fold loop

def filter_my_data(original_pairs):
    """Return the rows of ``df_mine`` whose id appears in *original_pairs*.

    Element ``[1]`` of every entry in *original_pairs* is taken as the id to
    match against ``df_mine[id_col_name]`` (presumably entries are
    ``(subject_id, hadm_id)`` pairs -- confirm against the code that wrote
    the fold pickles).
    """
    valid_hadm_ids = {pair[1] for pair in original_pairs}
    return df_mine[df_mine[id_col_name].isin(valid_hadm_ids)]


def _duration_weights(durations):
    """Return per-row training weights: >20 -> 10.0, >7 -> 5.0, otherwise 1.0."""
    values = durations.to_numpy()
    # np.select picks the first matching condition, mirroring the original
    # "x > 20, else x > 7, else" cascade without a Python-level loop.
    weights = np.select([values > 20, values > 7], [10.0, 5.0], default=1.0)
    return pd.Series(weights, index=durations.index)


def _report_fold_errors(fold_num, test_ids, actual_days, predicted_days):
    """Print the test-target summary and the ten worst / best predictions."""
    print(f"ERROR ANALYSIS FOR FOLD {fold_num}")

    fold_results = pd.DataFrame({
        id_col_name: test_ids,
        'Actual_Days': actual_days,
        'Predicted_Days': predicted_days,
        'Absolute_Error': np.abs(actual_days - predicted_days),
        # Signed error: positive means the model over-predicted.
        'Raw_Residual': predicted_days - actual_days,
    })

    print("\nActual Days Distribution (Test Set):")
    print(fold_results['Actual_Days'].describe()[['mean', 'std', 'min', '50%', 'max']])

    worst_10 = fold_results.sort_values(by='Absolute_Error', ascending=False).head(10)
    print("\n>>> TOP 10 WORST PREDICTIONS (Highest Error):")
    print(worst_10.to_string(index=False))

    best_10 = fold_results.sort_values(by='Absolute_Error', ascending=True).head(10)
    print("\n>>> TOP 10 BEST PREDICTIONS (Lowest Error):")
    print(best_10.to_string(index=False))


# Train and evaluate one XGBoost regressor per fold. The model is fitted on
# log1p(target); predictions are mapped back with expm1 before scoring.
for fold_num, file_path in enumerate(fold_files):
    # NOTE: pickle.load can execute arbitrary code contained in the file;
    # only load fold files that come from a trusted source.
    with open(file_path, 'rb') as f:
        original_train_ids, original_val_ids, original_test_ids = pickle.load(f)

    df_train = filter_my_data(original_train_ids)
    df_val = filter_my_data(original_val_ids)
    df_test = filter_my_data(original_test_ids)

    print("\n")
    print(f" PROCESSING FOLD {fold_num} ".center(60, '#'))
    print("\n")
    print(f"Samples -> Train: {len(df_train)}, Val: {len(df_val)}, Test: {len(df_test)}")

    # Guard clauses: skip the fold before building anything when a split is
    # empty. Train/val is checked first so the message priority is unchanged.
    if len(df_train) == 0 or len(df_val) == 0:
        print(f"Fold {fold_num}: Not enough train/val samples.")
        continue
    if len(df_test) == 0:
        # Nothing to evaluate, so fitting a model would be wasted work.
        print(f"Fold {fold_num}: No test samples found.")
        continue

    # Everything except the id and the target is used as a feature.
    drop_cols = [id_col_name, target_col]
    X_train = df_train.drop(columns=drop_cols)
    X_val = df_val.drop(columns=drop_cols)
    X_test = df_test.drop(columns=drop_cols)

    train_weights = _duration_weights(df_train[target_col])

    # Train/validate in log space; keep the test target on the original scale
    # so the reported MAE is in the target's own units.
    y_train = np.log1p(df_train[target_col])
    y_val = np.log1p(df_val[target_col])
    y_test_original = df_test[target_col].to_numpy()

    model = xgb.XGBRegressor(
        n_estimators=1200,
        max_depth=5,
        learning_rate=0.03,
        colsample_bytree=0.4,
        subsample=0.8,
        gamma=0.5,
        reg_alpha=5,
        reg_lambda=1,
        random_state=42,
        eval_metric='mae',
        objective='reg:pseudohubererror',
        early_stopping_rounds=50,
    )

    # The validation split (log-space target) drives early stopping.
    model.fit(
        X_train, y_train,
        sample_weight=train_weights,
        eval_set=[(X_val, y_val)],
        verbose=False,
    )

    log_preds = model.predict(X_test)
    real_preds = np.expm1(log_preds)  # invert log1p

    mae = mean_absolute_error(y_test_original, real_preds)
    fold_maes.append(mae)
    print(f"Fold {fold_num} MAE: {mae:.4f}")

    _report_fold_errors(
        fold_num,
        df_test[id_col_name].to_numpy(),
        y_test_original,
        real_preds,
    )

# Cross-fold summary; printed only when at least one fold produced a test MAE.
if fold_maes:
    print(f" FINAL SUBSET AVERAGE MAE: {np.mean(fold_maes):.4f} ".center(60, '='))



#################### PROCESSING FOLD 0 #####################


Samples -> Train: 733, Val: 79, Test: 198
Fold 0 MAE: 2.8825
ERROR ANALYSIS FOR FOLD 0

Actual Days Distribution (Test Set):
mean     4.045455
std      5.838169
min      1.000000
50%      2.000000
max     41.000000
Name: Actual_Days, dtype: float64

>>> TOP 10 WORST PREDICTIONS (Highest Error):
 chemo_hadm_id  Actual_Days  Predicted_Days  Absolute_Error  Raw_Residual
      21176832         41.0        5.691403       35.308597    -35.308597
      22409075         41.0       11.536332       29.463668    -29.463668
      27576498         25.0        3.705463       21.294537    -21.294537
      24952654         19.0        4.503679       14.496321    -14.496321
      20237153          2.0       16.241247       14.241247     14.241247
      25841662         24.0        9.892419       14.107581    -14.107581
      22984596         18.0        7.179803       10.820197    -10.820197
      23809416         13.0        2.600300    