In [None]:
# install fastkaggle if not available
try: import fastkaggle
except ModuleNotFoundError:
    !pip install -Uq fastkaggle

from fastkaggle import *

## Getting set up

In [None]:
# Kaggle competition slug; reused below when submitting and when pushing the notebook
comp = 'playground-series-s3e26'
# presumably returns the competition data directory (downloading it if needed) — TODO confirm against fastkaggle docs
path = setup_comp(comp, install='')

In [None]:
path

Path('playground-series-s3e26')

In [None]:
trn_path = path/'train.csv'

In [None]:
# First run only: uncomment to fetch the original dataset that preprocess() reads as cirrhosis.csv
# (presumably downloaded into `path` — TODO confirm)
# get_dataset(path, 'joebeachcapital/cirrhosis-patient-survival-prediction', force=True)

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, FunctionTransformer, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, cross_val_score, cross_validate, KFold
from sklearn.ensemble import HistGradientBoostingRegressor, HistGradientBoostingClassifier
from sklearn.metrics import make_scorer, mean_absolute_error, classification_report, log_loss
from sklearn.utils import resample
from scipy.stats import loguniform

from lightgbm import LGBMRegressor, LGBMClassifier, log_evaluation, early_stopping
from xgboost import XGBRegressor, XGBClassifier
from tqdm import tqdm

In [None]:
def preprocess(df, train=False, orig_path=None):
    """Prepare a competition frame for modelling.

    Adds an ``is_gen`` column ('Y' for the rows of `df`, 'N' for rows taken from
    the original dataset) and merges the Edema level 'S' into 'Y'.

    Parameters
    ----------
    df : pandas.DataFrame
        Competition train or test frame. It is not modified.
    train : bool, default False
        When True, the rows of the original (non-synthetic) dataset are appended.
    orig_path : path-like, optional
        CSV file with the original dataset. Defaults to ``path/'cirrhosis.csv'``
        (``path`` is the notebook-level data directory); only read when `train`
        is True.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index when `train` is True, otherwise
        with the index of `df`.
    """
    df_ = df.copy()
    df_['is_gen'] = 'Y'
    if train:
        if orig_path is None:
            orig_path = path/'cirrhosis.csv'
        df1 = pd.read_csv(orig_path) # original data based on which the dataset is synthesized
        df1 = pd.concat([df1.drop('Status', axis=1), df1['Status']], axis=1) # move status to last col, same as df_
        df1['is_gen'] = 'N'
        # columns are matched by position; pandas raises ValueError if the counts differ
        df1.columns = df_.columns
        df_ = pd.concat([df_, df1], axis=0).reset_index(drop=True)
    # Map AFTER the concat so the appended original rows get the same S -> Y merge
    # as the competition rows. Series.map turns any value missing from the dict
    # into NaN, hence every expected key is listed.
    df_['Edema'] = df_['Edema'].map({'S':'Y', 'N':'N', 'Y':'Y'})
    return df_
    
def cv(X,y,cv=10):
    """Cross-validate a scale/one-hot + HistGradientBoostingClassifier pipeline.

    Numeric columns are standardised, object columns are one-hot encoded and
    any remaining column is passed through unchanged. Folds are scored with
    'neg_log_loss', and the fitted pipeline of every fold is kept in the result
    under the 'estimator' key.

    Parameters
    ----------
    X : feature frame
    y : target labels
    cv : forwarded unchanged to cross_validate as its `cv` argument

    Returns
    -------
    dict
        The dictionary returned by sklearn's cross_validate.
    """
    numeric_cols = make_column_selector(dtype_include = np.number)
    object_cols = make_column_selector(dtype_include=object)
    encoder = OneHotEncoder(drop='if_binary', handle_unknown='ignore')
    preprocessor = make_column_transformer(
        (StandardScaler(), numeric_cols),
        (encoder, object_cols),
        remainder = 'passthrough')
    # small learning rate with a large iteration budget; early stopping picks the actual count
    booster = HistGradientBoostingClassifier(
        learning_rate=0.003,
        max_iter=10000,
        max_depth=9,
        l2_regularization=0,
        early_stopping=True,
        n_iter_no_change=80)
    pipeline = make_pipeline(preprocessor, booster)
    return cross_validate(pipeline, X, y, cv=cv, scoring='neg_log_loss', return_estimator=True)

In [None]:
# training frame: competition train.csv plus the rows of the original cirrhosis.csv (appended because train=True)
df = preprocess(pd.read_csv(trn_path),train=True)

In [None]:
def oversample(df, factor=5, random_state=None):
    """Return a new frame in which the rows with Status == 'CL' are oversampled.

    The 'CL' rows are replaced by ``len(CL) * factor`` rows drawn from them with
    replacement; all other rows are kept once, in their original order, followed
    by the resampled 'CL' rows. Original index labels are kept, so the result
    can contain duplicate index values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Status`` column. It is not modified.
    factor : int, default 5
        Multiplier applied to the number of 'CL' rows.
    random_state : int, numpy Generator/RandomState or None, default None
        Seed for the resampling. Pass a value to make the result reproducible.

    Returns
    -------
    pandas.DataFrame
    """
    is_cl = df.Status == 'CL'
    minority = df[is_cl]
    if minority.empty:
        # nothing to oversample: hand back an independent copy
        return df.copy()
    boosted = minority.sample(n=len(minority)*factor, replace=True, random_state=random_state)
    return pd.concat([df[~is_cl], boosted], axis=0)

In [None]:
# Keep the oversampled frame under its own name: rebinding `df` would make this cell
# compound the oversampling (8x, then 64x, ...) every time it is re-run.
# NOTE: oversampling happens before the CV split, so duplicated CL rows can land in both
# the train and the test part of a fold (see the Conclusion section).
df_over = oversample(df,8)
# features: drop the target, then skip the first column by position; target: Status
X, y = df_over.drop('Status', axis=1).iloc[:,1:], df_over['Status']

In [None]:
# run cv() with cv=10; the result also holds the fitted pipeline of every fold (return_estimator=True in cv)
cv_output = cv(X,y,10)

In [None]:
# est[-1] is the last pipeline step (the HistGradientBoostingClassifier); list its n_iter_ for each fold
[est[-1].n_iter_ for est in cv_output['estimator']]

[4457, 4538, 3383, 5048, 4775, 3958, 3888, 4967, 3788, 4230]

In [None]:
# fold scores are negative log loss (scoring='neg_log_loss' in cv), so the mean is negated to read as a loss;
# the second value is the standard deviation across folds
-cv_output['test_score'].mean(), cv_output['test_score'].std()

(0.4250276975424835, 0.3062302852316274)

## Submitting to Kaggle

In [None]:
ss = pd.read_csv(path/'sample_submission.csv')
ss.head()

Unnamed: 0,id,Status_C,Status_CL,Status_D
0,7905,0.628084,0.034788,0.337128
1,7906,0.628084,0.034788,0.337128
2,7907,0.628084,0.034788,0.337128
3,7908,0.628084,0.034788,0.337128
4,7909,0.628084,0.034788,0.337128


In [None]:
# same preprocessing as for training, but without appending the original data (train defaults to False)
tst = preprocess(pd.read_csv(path/'test.csv'))
tst.head()

Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,is_gen
0,7905,3839,D-penicillamine,19724,F,N,Y,N,N,1.2,546.0,3.37,65.0,1636.0,151.9,90.0,430.0,10.6,2.0,Y
1,7906,2468,D-penicillamine,14975,F,N,N,N,N,1.1,660.0,4.22,94.0,1257.0,151.9,155.0,227.0,10.0,2.0,Y
2,7907,51,Placebo,13149,F,N,Y,N,Y,2.0,151.0,2.96,46.0,961.0,69.75,101.0,213.0,13.0,4.0,Y
3,7908,2330,D-penicillamine,20510,F,N,N,N,N,0.6,293.0,3.85,40.0,554.0,125.55,56.0,270.0,10.6,2.0,Y
4,7909,1615,D-penicillamine,21904,F,N,Y,N,N,1.4,277.0,2.97,121.0,1110.0,125.0,126.0,221.0,9.8,1.0,Y


In [None]:
# predict with each fold's fitted pipeline (first column of tst skipped by position) and average over the folds
tst_pred = np.stack([est.predict_proba(tst.iloc[:,1:]) for est in cv_output['estimator']]).mean(0)

In [None]:
# positional assignment into every column after the first
# assumes predict_proba's column order matches Status_C, Status_CL, Status_D — TODO confirm via classes_
ss.iloc[:,1:] = tst_pred

In [None]:
ss.to_csv('subm.csv', index=False)
!head subm.csv

id,Status_C,Status_CL,Status_D
7905,0.5137040434978078,0.016504467415852618,0.4697914890863396
7906,0.36336544808845916,0.21474530459780947,0.2071439427159219
7907,0.009832137138413138,0.0009652784006709124,0.989202584460916
7908,0.9668180274769981,0.0048868638880181225,0.028295108634983756
7909,0.8808286279670143,0.04488065362433995,0.07429071840864558
7910,0.9986942117008211,9.590560076788086e-05,0.0012098826984110248
7911,0.9914207119821441,0.0004561529535495925,0.008123135064306241
7912,0.04475098250017007,0.03286943085267366,0.922379586647156
7913,0.0006175051879040318,8.818633318171742e-06,0.9993736761787776


In [None]:
# submit through the Kaggle API only when `iskaggle` is falsy
if not iskaggle:
    from kaggle import api
    # NOTE(review): the message says 5X, but the oversample call above uses factor 8 — confirm which run this was
    api.competition_submit_cli('subm.csv', 'refactored code oversampling CL 5X', comp)

100%|█████████████████████| 340k/340k [00:00<00:00, 383kB/s]


In [None]:
tst_labels = tst_pred.argmax(axis=1)

In [None]:
mask_cl = (tst_labels==1)

In [None]:
# halve column 1 (the Status_CL probability) on the rows where it is the argmax; this mutates
# tst_pred in place, so re-running the cell halves the values again.
# NOTE(review): tst_pred is not read again in the cells visible below.
tst_pred[mask_cl,1]/=2

In [None]:
# Re-read the submission file from disk, halve the Status_CL column on the rows where it is
# the largest probability, and write the result back to the same file.
# NOTE(review): this reads and overwrites 'subm.csv', so re-running the cell can halve values again.
best_so_far = pd.read_csv('subm.csv', index_col='id').to_numpy()
best_so_far_mask = (best_so_far.argmax(axis=1) == 1)
best_so_far[best_so_far_mask,1]/=2  # the modified rows are not renormalised afterwards
ss.iloc[:,1:] = best_so_far
ss.to_csv('subm.csv', index=False)
!head subm.csv

id,Status_C,Status_CL,Status_D
7905,0.6585967923278477,0.0236105280569404,0.3177926796152118
7906,0.6583920450680791,0.1489046351929898,0.1927033197389311
7907,0.0297697604846401,0.0076887934700313,0.9625414460453284
7908,0.947984252090274,0.0054536217775073,0.0465621261322187
7909,0.8485589127577186,0.0201340869956581,0.1313070002466234
7910,0.989160120181628,0.0010090498024869,0.0098308300158848
7911,0.971822705243557,0.0022693255443414,0.0259079692121016
7912,0.24390630777922,0.0152947745463511,0.740798917674429
7913,0.0139341569112578,0.0010213726670546,0.9850444704216874


In [None]:
# same submission step as before, now for the file with the halved Status_CL values
if not iskaggle:
    from kaggle import api
    api.competition_submit_cli('subm.csv', 'refactored code oversampling CL 8X MOD', comp)

100%|█████████████████████| 320k/320k [00:01<00:00, 324kB/s]


## Conclusion

No help to PB. The decrease in CV loss is an artifact of oversampling before splitting: resampling with replacement puts identical rows in both the train and the test part of each split (8X here; the decrease in loss is less significant when resampling 5X).  

## Addendum

In [None]:
# publish this notebook to Kaggle (skipped when `iskaggle` is truthy)
# NOTE(review): the slug, title and file name all mention a "minmax transform" model, and the output
# below warns that the title does not resolve to the id — confirm they were not copied from another notebook.
if not iskaggle:
    push_notebook('xy', 'histgbr-minmax-transform',
                  title='Minmax transform and HistGBR model',
                  file='01-histgbr-minmax-transform.ipynb',
                  competition=comp, private=False, gpu=False)

Your kernel title does not resolve to the specified id. This may result in surprising behavior. We suggest making your title something that resolves to the specified id. See https://en.wikipedia.org/wiki/Clean_URL#Slug for more information on how slugs are determined.
Kernel version 1 successfully pushed.  Please check progress at https://www.kaggle.com/code/xiaochuanyang/minmax-transform-and-histgbr-model
