In [1]:
import pandas as pd

In [None]:
import pickle
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor  
from sklearn.metrics import mean_absolute_error


# Load the full feature table that every fold is sliced from.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where this exact file exists. Consider a path relative to a configurable
# data directory.
df_mine = pd.read_csv("/Users/utkarshbansal/Desktop/APLASIA/final.csv")

# Regression target, and the identifier column used to match rows of the table
# against the ids stored in the fold files.
target_col = 'target_first_episode_duration'
id_col_name = 'chemo_hadm_id'

# Fail fast when a required column is absent. The id column is checked as well,
# because the fold loop filters on it and would otherwise die with a KeyError.
missing_cols = [col for col in (target_col, id_col_name) if col not in df_mine.columns]
if missing_cols:
    raise ValueError(f"Column '{missing_cols[0]}' not found!")

# One pickle per cross-validation fold (opened relative to the working
# directory), and the list that collects one MAE per evaluated fold.
fold_files = [f'fold_{i}.pkl' for i in range(5)]
fold_maes = []

def filter_my_data(original_pairs):
    """Return the rows of ``df_mine`` whose id occurs in ``original_pairs``.

    Only the element at index 1 of each pair is used; it is matched against
    the ``id_col_name`` column. Rows of ``df_mine`` keep their original order
    and index. Defined once, ahead of the fold loop, rather than being
    re-created on every iteration.
    """
    valid_hadm_ids = {pair[1] for pair in original_pairs}
    return df_mine[df_mine[id_col_name].isin(valid_hadm_ids)]


for fold_num, file_path in enumerate(fold_files):

    # NOTE(security): pickle.load can execute arbitrary code while loading.
    # Only open fold files that come from a trusted source.
    with open(file_path, 'rb') as f:
        original_train_ids, original_val_ids, original_test_ids = pickle.load(f)

    df_train = filter_my_data(original_train_ids)
    df_val = filter_my_data(original_val_ids)
    # The validation rows are not used for model selection in this loop, so
    # they are appended to the training rows and used for fitting.
    df_train_full = pd.concat([df_train, df_val])
    df_test = filter_my_data(original_test_ids)

    print("\n")
    print(f" PROCESSING FOLD {fold_num} (RANDOM FOREST) ".center(60, '#'))
    print("\n")
    print(f"Samples -> Train : {len(df_train_full)}, Test: {len(df_test)}")

    # Guard against evaluation leakage: an id present in both the fitting rows
    # and the test rows would make the reported MAE optimistic.
    leaked_ids = set(df_train_full[id_col_name]) & set(df_test[id_col_name])
    if leaked_ids:
        print(f"WARNING: {len(leaked_ids)} id(s) appear in both train and test for fold {fold_num}.")

    # Every column except the id and the target is used as a feature.
    drop_cols = [id_col_name, target_col]
    X_train_full = df_train_full.drop(columns=drop_cols)
    y_train_full = df_train_full[target_col]

    X_test = df_test.drop(columns=drop_cols)
    y_test_original = df_test[target_col]

    # Up-weight long targets during fitting: > 20 -> 10.0, > 7 -> 5.0,
    # everything else (including NaN) -> 1.0. np.select takes the first
    # matching condition, which mirrors the original nested conditional.
    target_values = y_train_full.to_numpy()
    train_weights = pd.Series(
        np.select([target_values > 20, target_values > 7], [10.0, 5.0], default=1.0),
        index=y_train_full.index,
    )

    # A fresh model per fold, so nothing learned on one fold carries over.
    model = RandomForestRegressor(
        n_estimators=200,
        max_depth=None,
        min_samples_split=5,
        min_samples_leaf=2,
        max_features='sqrt',
        random_state=42,
        n_jobs=-1
    )

    if len(X_train_full) == 0:
        print(f"Fold {fold_num}: Not enough train/val samples.")
        continue

    model.fit(
        X_train_full,
        y_train_full,
        sample_weight=train_weights
    )

    if len(X_test) == 0:
        print(f"Fold {fold_num}: No test samples found.")
        continue

    real_preds = model.predict(X_test)

    mae = mean_absolute_error(y_test_original, real_preds)
    fold_maes.append(mae)
    print(f"Fold {fold_num} MAE: {mae:.4f}")

    print("-" * 40)
    print(f"ERROR ANALYSIS FOR FOLD {fold_num}")

    # Per-sample table. The residual is prediction minus actual, so a negative
    # value means the model predicted too short a duration.
    actual_days = y_test_original.values
    fold_results = pd.DataFrame({
        'chemo_hadm_id': df_test[id_col_name],
        'Actual_Days': actual_days,
        'Predicted_Days': real_preds,
        'Absolute_Error': np.abs(actual_days - real_preds),
        'Raw_Residual': real_preds - actual_days
    })

    print("\nActual Days Distribution (Test Set):")
    print(fold_results['Actual_Days'].describe()[['mean', 'std', 'min', '50%', 'max']])

    worst_10 = fold_results.sort_values(by='Absolute_Error', ascending=False).head(10)
    print("\n>>> TOP 10 WORST PREDICTIONS (Highest Error):")
    print(worst_10.to_string(index=False))

    best_10 = fold_results.sort_values(by='Absolute_Error', ascending=True).head(10)
    print("\n>>> TOP 10 BEST PREDICTIONS (Lowest Error):")
    print(best_10.to_string(index=False))

    # Mean signed residual: positive means predictions run high on average.
    bias = fold_results['Raw_Residual'].mean()
    print(f"\nAverage Bias (Pred - Actual): {bias:.4f}")
    status = "OVERESTIMATING" if bias > 0 else "UNDERESTIMATING"
    print(f"Status: Model is generally {status} stays in this fold.")

# Report the mean MAE over the folds that actually produced a test-set score;
# nothing is printed when no fold was evaluated.
if fold_maes:
    mean_mae = np.mean(fold_maes)
    summary_banner = f" FINAL SUBSET AVERAGE MAE: {mean_mae:.4f} "
    print(summary_banner.center(60, '='))
    



############ PROCESSING FOLD 0 (RANDOM FOREST) #############


Samples -> Train : 812, Test: 198
Fold 0 MAE: 3.0900
----------------------------------------
ERROR ANALYSIS FOR FOLD 0

Actual Days Distribution (Test Set):
mean     4.045455
std      5.838169
min      1.000000
50%      2.000000
max     41.000000
Name: Actual_Days, dtype: float64

>>> TOP 10 WORST PREDICTIONS (Highest Error):
 chemo_hadm_id  Actual_Days  Predicted_Days  Absolute_Error  Raw_Residual
      21176832         41.0        9.128403       31.871597    -31.871597
      22409075         41.0       12.926401       28.073599    -28.073599
      27576498         25.0        7.726401       17.273599    -17.273599
      25841662         24.0        7.134997       16.865003    -16.865003
      24952654         19.0        4.931572       14.068428    -14.068428
      28310302          2.0       12.508239       10.508239     10.508239
      27253820         16.0        6.022815        9.977185     -9.977185
      20237153