In [None]:
# install fastkaggle if not available
try: import fastkaggle
except ModuleNotFoundError:
    !pip install -Uq fastkaggle

from fastkaggle import *

## Getting set up

In [None]:
# Kaggle competition slug; reused below for the submission and the notebook push.
comp = 'playground-series-s3e26'
# setup_comp comes from the fastkaggle star import and returns the competition data
# directory (a Path, as the next cell shows).
# NOTE(review): presumably it downloads the data when it is missing locally — confirm in the fastkaggle docs.
path = setup_comp(comp, install='')

In [None]:
# Show the data directory returned by setup_comp.
path

Path('playground-series-s3e26')

In [None]:
# Location of the competition training file inside the data directory.
trn_path = path/'train.csv'

In [None]:
!kaggle datasets download -d joebeachcapital/cirrhosis-patient-survival-prediction
!unzip cirrhosis-patient-survival-prediction.zip

cirrhosis-patient-survival-prediction.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  cirrhosis-patient-survival-prediction.zip
replace cirrhosis.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [None]:
import pandas as pd
# Load the competition training set.
df0 = pd.read_csv(trn_path)

In [None]:
def process(df):
    """Return a copy of ``df`` with the ``Edema`` level ``'S'`` merged into ``'Y'``.

    Only the value ``'S'`` is recoded. Every other entry — ``'N'``, ``'Y'``,
    missing values, and any category not anticipated here — is passed through
    unchanged, so an unexpected label is never silently turned into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``Edema`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input ``df`` is left unmodified.
    """
    df_ = df.copy()
    # replace() touches only the listed key, unlike map(), which yields NaN for
    # every value missing from the mapping.
    df_['Edema'] = df['Edema'].replace({'S': 'Y'})
    return df_

In [None]:
# Original cirrhosis dataset; cirrhosis.csv is extracted from the zip downloaded earlier.
df1 = pd.read_csv('cirrhosis.csv')

In [None]:
# Compare the two schemas: same fields, but the original data uses 'ID' and places 'Status' third.
df0.columns, df1.columns

(Index(['id', 'N_Days', 'Drug', 'Age', 'Sex', 'Ascites', 'Hepatomegaly',
        'Spiders', 'Edema', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper',
        'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin',
        'Stage', 'Status'],
       dtype='object'),
 Index(['ID', 'N_Days', 'Status', 'Drug', 'Age', 'Sex', 'Ascites',
        'Hepatomegaly', 'Spiders', 'Edema', 'Bilirubin', 'Cholesterol',
        'Albumin', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets',
        'Prothrombin', 'Stage'],
       dtype='object'))

In [None]:
# Move the target column to the end so the original data has the same
# column order as the competition training frame.
df2 = df1.drop(columns='Status').assign(Status=df1['Status'])

In [None]:
# Positional rename: after the reordering above the two frames list the same fields
# in the same order, so this effectively only turns 'ID' into 'id'.
df2.columns = df0.columns

In [None]:
# Append the original dataset to the competition rows. ignore_index=True gives the
# combined frame a fresh 0..n-1 index instead of repeating the labels of both inputs.
# NOTE: this cell rebinds df0, so running it twice appends the original rows twice;
# re-run the cell that loads df0 first.
df0 = pd.concat([df0, df2], axis=0, ignore_index=True)

In [None]:
# Apply the Edema recoding to the combined training data.
df = process(df0)

In [None]:
# Inspect the combined, processed training frame.
df

Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status
0,0,999,D-penicillamine,21532,M,N,N,N,N,2.3,316.0,3.35,172.0,1601.0,179.80,63.0,394.0,9.7,3.0,D
1,1,2574,Placebo,19237,F,N,N,N,N,0.9,364.0,3.54,63.0,1440.0,134.85,88.0,361.0,11.0,3.0,C
2,2,3428,Placebo,13727,F,N,Y,Y,Y,3.3,299.0,3.55,131.0,1029.0,119.35,50.0,199.0,11.7,4.0,D
3,3,2576,Placebo,18460,F,N,N,N,N,0.6,256.0,3.50,58.0,1653.0,71.30,96.0,269.0,10.7,3.0,C
4,4,788,Placebo,16658,F,N,Y,N,N,1.1,346.0,3.65,63.0,1181.0,125.55,96.0,298.0,10.6,4.0,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,414,681,,24472,F,,,,N,1.2,,2.96,,,,,174.0,10.9,3.0,D
414,415,1103,,14245,F,,,,N,0.9,,3.83,,,,,180.0,11.2,4.0,C
415,416,1055,,20819,F,,,,N,1.6,,3.42,,,,,143.0,9.9,3.0,C
416,417,691,,21185,F,,,,N,0.8,,3.75,,,,,269.0,10.4,3.0,C


In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, FunctionTransformer, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, cross_val_score, cross_validate
from sklearn.ensemble import HistGradientBoostingRegressor, HistGradientBoostingClassifier
from sklearn.metrics import make_scorer, mean_absolute_error, classification_report, log_loss
from scipy.stats import loguniform

In [None]:
from lightgbm import LGBMRegressor, LGBMClassifier, log_evaluation, early_stopping
from xgboost import XGBRegressor, XGBClassifier
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, GroupKFold
from tqdm import tqdm
from functools import partial

In [None]:
# Features are every column except the leading 'id' and the trailing 'Status' target.
X = df.drop(columns=['id', 'Status'])
y = df['Status']

In [None]:
# Single seed shared by the classifier and the fold splitter so a re-run reproduces the scores.
SEED = 42

# max_iter is only an upper bound: early stopping ends training after
# n_iter_no_change=50 rounds without improvement on the internal validation split.
# random_state fixes that validation split.
# NOTE(review): the regularization / learning-rate values look like the result of a
# hyper-parameter search — confirm their provenance.
clf = HistGradientBoostingClassifier(
    max_iter=10000, early_stopping=True, max_depth=9, n_iter_no_change=50,
    l2_regularization=0.0026834539316593453, learning_rate=0.00806838423213153,
    random_state=SEED)

# Standardize numeric columns and one-hot encode object (string) columns;
# anything of another dtype is passed through untouched.
ct = make_column_transformer(
    (StandardScaler(), make_column_selector(dtype_include=np.number)),
    (OneHotEncoder(drop='if_binary', handle_unknown='ignore'), make_column_selector(dtype_include=object)),
    remainder='passthrough')
model = make_pipeline(ct, clf)

# Shuffle before splitting: the original-dataset rows were appended at the end of the
# frame, so unshuffled folds would not spread them evenly across the splits.
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)

# return_estimator=True keeps the ten fitted pipelines for the test-time ensemble below.
cv_scores = cross_validate(model, X, y, cv=folds, scoring='neg_log_loss', return_estimator=True)



In [None]:
# Number of boosting iterations each fold's classifier (the last pipeline step) ended up with.
[fold_model.steps[-1][1].n_iter_ for fold_model in cv_scores['estimator']]

[541, 514, 708, 464, 381, 420, 357, 408, 432, 509]

In [None]:
# The scorer is 'neg_log_loss', so flip the sign of the mean to report the log loss itself.
fold_neg_log_loss = cv_scores['test_score']
-fold_neg_log_loss.mean(), fold_neg_log_loss.std()

(0.44651905703962536, 0.035083286621313256)

## Submitting to Kaggle

In [None]:
# Sample submission: supplies the test ids and the expected probability columns.
ss = pd.read_csv(path/'sample_submission.csv')
ss

Unnamed: 0,id,Status_C,Status_CL,Status_D
0,7905,0.628084,0.034788,0.337128
1,7906,0.628084,0.034788,0.337128
2,7907,0.628084,0.034788,0.337128
3,7908,0.628084,0.034788,0.337128
4,7909,0.628084,0.034788,0.337128
...,...,...,...,...
5266,13171,0.628084,0.034788,0.337128
5267,13172,0.628084,0.034788,0.337128
5268,13173,0.628084,0.034788,0.337128
5269,13174,0.628084,0.034788,0.337128


In [None]:
# Load the test features and give them the same Edema recoding as the training data.
tst = process(pd.read_csv(path/'test.csv'))
tst.head()

Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,7905,3839,D-penicillamine,19724,F,N,Y,N,N,1.2,546.0,3.37,65.0,1636.0,151.9,90.0,430.0,10.6,2.0
1,7906,2468,D-penicillamine,14975,F,N,N,N,N,1.1,660.0,4.22,94.0,1257.0,151.9,155.0,227.0,10.0,2.0
2,7907,51,Placebo,13149,F,N,Y,N,Y,2.0,151.0,2.96,46.0,961.0,69.75,101.0,213.0,13.0,4.0
3,7908,2330,D-penicillamine,20510,F,N,N,N,N,0.6,293.0,3.85,40.0,554.0,125.55,56.0,270.0,10.6,2.0
4,7909,1615,D-penicillamine,21904,F,N,Y,N,N,1.4,277.0,2.97,121.0,1110.0,125.0,126.0,221.0,9.8,1.0


In [None]:
# Ensemble: average the class probabilities predicted by the ten fold pipelines.
tst_features = tst.iloc[:, 1:]  # drop the leading id column
fold_probas = [fold_model.predict_proba(tst_features) for fold_model in cv_scores['estimator']]
tst_pred = np.mean(fold_probas, axis=0)

In [None]:
# Bind every probability column to its class label rather than to its position, so a
# different column order in the submission template cannot silently swap classes.
class_labels = cv_scores['estimator'][0][-1].classes_
proba_cols = [f'Status_{label}' for label in class_labels]
assert set(proba_cols) <= set(ss.columns), 'submission columns do not match the model classes'
ss[proba_cols] = tst_pred

In [None]:
# Check the filled-in submission frame.
ss

Unnamed: 0,id,Status_C,Status_CL,Status_D
0,7905,0.694994,0.019751,0.285255
1,7906,0.634915,0.159313,0.205772
2,7907,0.031412,0.008980,0.959608
3,7908,0.946618,0.004740,0.048642
4,7909,0.855398,0.028063,0.116539
...,...,...,...,...
5266,13171,0.860927,0.061325,0.077748
5267,13172,0.977628,0.002074,0.020297
5268,13173,0.900276,0.007608,0.092116
5269,13174,0.986254,0.002850,0.010896


In [None]:
# Write the submission without the DataFrame index, then preview the first lines of the file.
ss.to_csv('subm.csv', index=False)
!head subm.csv

id,Status_C,Status_CL,Status_D
7905,0.694994469622238,0.019750617481820255,0.2852549128959419
7906,0.6349152498695394,0.15931323497437186,0.20577151515608888
7907,0.03141198828776941,0.008980403225388029,0.9596076084868426
7908,0.9466175392665971,0.004740434769078907,0.04864202596432404
7909,0.8553981750471602,0.028063267062559988,0.11653855789027981
7910,0.9887751779850156,0.0012339135428827322,0.009990908472101743
7911,0.97213183100451,0.0033194520387885467,0.024548716956701407
7912,0.2368601150310734,0.011184719038122726,0.7519551659308037
7913,0.010964160733391709,0.000968689843495888,0.9880671494231125


In [None]:
# iskaggle comes from the fastkaggle star import.
# NOTE(review): presumably it is True when the notebook runs on Kaggle, so the
# submission is only sent from a local run — confirm.
if not iskaggle:
    # Imported here because the Kaggle API client is only needed on this branch.
    from kaggle import api
    # Arguments: submission file, submission message, competition slug.
    api.competition_submit_cli('subm.csv', 'avg cv + origin data', comp)

100%|█████████████████████| 336k/336k [00:00<00:00, 373kB/s]


In [None]:
# !kaggle competitions submit -c playground-series-s3e26 -f subm.csv -m 'avg cv estimators'

## Conclusion

## Addendum

In [None]:
# Publish this notebook to Kaggle (skipped when iskaggle is truthy).
# NOTE(review): Kaggle warned that this title does not resolve to the id
# 'histgbr-minmax-transform'; the kernel was published under the slug derived from the
# title instead. The title and file name also mention a min-max transform, while the
# pipeline in this notebook uses StandardScaler — confirm these arguments belong here.
if not iskaggle:
    push_notebook('xy', 'histgbr-minmax-transform',
                  title='Minmax transform and HistGBR model',
                  file='01-histgbr-minmax-transform.ipynb',
                  competition=comp, private=False, gpu=False)

Your kernel title does not resolve to the specified id. This may result in surprising behavior. We suggest making your title something that resolves to the specified id. See https://en.wikipedia.org/wiki/Clean_URL#Slug for more information on how slugs are determined.
Kernel version 1 successfully pushed.  Please check progress at https://www.kaggle.com/code/xiaochuanyang/minmax-transform-and-histgbr-model
