In [None]:
# install fastkaggle if not available
try: import fastkaggle
except ModuleNotFoundError:
    !pip install -Uq fastkaggle

from fastkaggle import *

## Getting set up

In [None]:
# Competition slug; also passed to the submit and push_notebook calls further down.
comp = 'playground-series-s3e26'
# setup_comp comes from the fastkaggle star import and returns the data directory.
# NOTE(review): presumably it downloads the competition files when they are missing — confirm.
path = setup_comp(comp, install='')

In [None]:
# Display the data directory returned by setup_comp.
path

Path('playground-series-s3e26')

In [None]:
# Training CSV inside the competition data directory.
trn_path = path/'train.csv'

In [None]:
# One-off download of the external dataset; process1 reads 'cirrhosis.csv' from the working directory.
# NOTE(review): presumably that file is what this archive unpacks to — confirm before relying on it.
# !kaggle datasets download -d joebeachcapital/cirrhosis-patient-survival-prediction
# !unzip cirrhosis-patient-survival-prediction.zip

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, FunctionTransformer, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, cross_val_score, cross_validate
from sklearn.ensemble import HistGradientBoostingRegressor, HistGradientBoostingClassifier
from sklearn.metrics import make_scorer, mean_absolute_error, classification_report, log_loss
from scipy.stats import loguniform

from lightgbm import LGBMRegressor, LGBMClassifier, log_evaluation, early_stopping
from xgboost import XGBRegressor, XGBClassifier
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, GroupKFold
from tqdm import tqdm
from functools import partial

In [None]:
def process0(df):
    """Return a copy of ``df`` whose ``Edema`` column is collapsed to Y/N.

    'S' is folded into 'Y'; 'N' and 'Y' are kept. The mapping has to list
    every expected value because ``Series.map`` turns unlisted values into NaN.
    The input frame is left untouched.
    """
    edema_binary = df.Edema.map({'S':'Y', 'N':'N', 'Y':'Y'})
    return df.assign(Edema=edema_binary)

def process1(df, real_path='cirrhosis.csv'):
    """Append the original (real) dataset to ``df`` and split into features/target.

    Parameters
    ----------
    df : pandas.DataFrame
        Competition training frame. Its first column is treated as an
        identifier and its last column as the target.
    real_path : str or path-like, default 'cirrhosis.csv'
        CSV with the real data (contains NaNs). It must have a ``Status``
        column and the same number of columns as ``df``.

    Returns
    -------
    (X, y) : (pandas.DataFrame, pandas.Series)
        ``X`` holds every column except the first (identifier) and the last
        (target); ``y`` is the last column. Both cover the rows of ``df``
        followed by the rows of the real data, on a fresh 0..n-1 index.
    """
    real = pd.read_csv(real_path)  # real data with nan
    # Move Status to the end so the column order lines up with df, then adopt
    # df's column names so the two frames concatenate column-for-column.
    real = real[[c for c in real.columns if c != 'Status'] + ['Status']]
    real.columns = df.columns
    # drop=True discards the old index instead of inserting it as a new first
    # column; otherwise the 1:-1 slice below would strip that helper column and
    # leave the identifier among the features.
    combined = pd.concat([df, real], axis=0).reset_index(drop=True)
    return combined.iloc[:, 1:-1], combined.iloc[:, -1]

def cv(X, y, cv=10, random_state=0):
    """Cross-validate a preprocessing + HistGradientBoostingClassifier pipeline.

    Parameters
    ----------
    X : pandas.DataFrame
        Features. Numeric columns are standardised, object columns are
        one-hot encoded, and any remaining columns are passed through.
    y : array-like
        Class labels.
    cv : int or CV splitter, default 10
        Forwarded unchanged to ``cross_validate``. Inside this function the
        name refers to this argument, not to the function itself.
    random_state : int or None, default 0
        Seed for the classifier. With early stopping enabled the classifier
        holds out part of each training fold for validation; fixing the seed
        makes scores and ``n_iter_`` repeatable. Pass ``None`` for an
        unseeded run.

    Returns
    -------
    dict
        Output of ``cross_validate`` scored with ``neg_log_loss``; the fitted
        pipeline of each fold is available under the ``'estimator'`` key.
    """
    clf = HistGradientBoostingClassifier(
        max_iter=10000, early_stopping=True, max_depth=9, n_iter_no_change=80,
        l2_regularization=0, learning_rate=0.003, random_state=random_state)
    ct = make_column_transformer(
        (StandardScaler(), make_column_selector(dtype_include=np.number)),
        # handle_unknown='ignore' keeps prediction from failing on categories
        # that were absent from a training fold.
        (OneHotEncoder(drop='if_binary', handle_unknown='ignore'),
         make_column_selector(dtype_include=object)),
        remainder='passthrough')
    model = make_pipeline(ct, clf)
    return cross_validate(model, X, y, cv=cv, scoring='neg_log_loss', return_estimator=True)

In [None]:
# Load the competition training data, recode Edema (process0), then append the
# rows from cirrhosis.csv and split into features/target (process1).
df0 = pd.read_csv(trn_path)
X, y = process1(process0(df0))
# The third argument is the `cv` setting that cv() hands to cross_validate (2 here).
cv_output = cv(X,y,2)



In [None]:
# n_iter_ of the last pipeline step (the HistGradientBoostingClassifier) for each fold's fitted pipeline.
[est[-1].n_iter_ for est in cv_output['estimator']]

[989, 830]

In [None]:
# Scores are neg_log_loss, so the mean is negated to report log loss; the std needs no sign flip.
-cv_output['test_score'].mean(), cv_output['test_score'].std()

(0.4637160754318698, 0.010345952676830528)

## Submitting to Kaggle

In [None]:
# Submission template read from the competition data directory.
ss = pd.read_csv(path/'sample_submission.csv')

In [None]:
# Load the test set and apply the same Edema recoding (process0) used on the training data.
tst = pd.read_csv(path/'test.csv')
tst = process0(tst)
tst.head()

Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,7905,3839,D-penicillamine,19724,F,N,Y,N,N,1.2,546.0,3.37,65.0,1636.0,151.9,90.0,430.0,10.6,2.0
1,7906,2468,D-penicillamine,14975,F,N,N,N,N,1.1,660.0,4.22,94.0,1257.0,151.9,155.0,227.0,10.0,2.0
2,7907,51,Placebo,13149,F,N,Y,N,Y,2.0,151.0,2.96,46.0,961.0,69.75,101.0,213.0,13.0,4.0
3,7908,2330,D-penicillamine,20510,F,N,N,N,N,0.6,293.0,3.85,40.0,554.0,125.55,56.0,270.0,10.6,2.0
4,7909,1615,D-penicillamine,21904,F,N,Y,N,N,1.4,277.0,2.97,121.0,1110.0,125.0,126.0,221.0,9.8,1.0


In [None]:
# Average the class probabilities predicted by each fold's fitted pipeline.
# The cross-validation result is stored in `cv_output`, and the test features
# are selected by name from X so they match the columns the pipelines were fitted on.
tst_pred = np.stack([est.predict_proba(tst[X.columns]) for est in cv_output['estimator']]).mean(0)

In [None]:
# Overwrite every column after the first (id) with the averaged probabilities.
# NOTE(review): assumes the template's class columns are in the same order as the classifier's classes_ — confirm.
ss.iloc[:,1:] = tst_pred

In [None]:
# Display the filled-in submission frame.
ss

Unnamed: 0,id,Status_C,Status_CL,Status_D
0,7905,0.658597,0.023611,0.317793
1,7906,0.658392,0.148905,0.192703
2,7907,0.029770,0.007689,0.962541
3,7908,0.947984,0.005454,0.046562
4,7909,0.848559,0.020134,0.131307
...,...,...,...,...
5266,13171,0.858965,0.052650,0.088385
5267,13172,0.973912,0.002076,0.024013
5268,13173,0.908381,0.006511,0.085108
5269,13174,0.986082,0.004135,0.009782


In [None]:
# Write the submission without the index column, then preview the first lines of the file.
ss.to_csv('subm.csv', index=False)
!head subm.csv

id,Status_C,Status_CL,Status_D
7905,0.6585967923278477,0.0236105280569404,0.31779267961521185
7906,0.6583920450680791,0.1489046351929898,0.19270331973893112
7907,0.02976976048464016,0.007688793470031397,0.9625414460453283
7908,0.9479842520902741,0.005453621777507351,0.04656212613221872
7909,0.8485589127577186,0.0201340869956581,0.1313070002466234
7910,0.989160120181628,0.001009049802486992,0.009830830015884882
7911,0.971822705243557,0.00226932554434143,0.025907969212101617
7912,0.24390630777922007,0.015294774546351109,0.740798917674429
7913,0.01393415691125784,0.001021372667054665,0.9850444704216874


In [None]:
# Submit through the Kaggle API only when running outside Kaggle
# (iskaggle comes from the fastkaggle star import).
if not iskaggle:
    from kaggle import api
    # NOTE(review): positional arguments appear to be file, message, competition — confirm against the kaggle API.
    api.competition_submit_cli('subm.csv', 'decrease lr', comp)

100%|█████████████████████| 336k/336k [00:01<00:00, 334kB/s]


In [None]:
# !kaggle competitions submit -c playground-series-s3e26 -f subm.csv -m 'avg cv estimators'

## Conclusion

## Addendum

In [None]:
# Publish this notebook as a Kaggle kernel when running outside Kaggle.
# NOTE(review): the recorded output warns that the title does not resolve to the
# id 'histgbr-minmax-transform', and the title mentions a min-max transform while
# cv() in this notebook scales with StandardScaler — confirm title and id are intended.
if not iskaggle:
    push_notebook('xy', 'histgbr-minmax-transform',
                  title='Minmax transform and HistGBR model',
                  file='01-histgbr-minmax-transform.ipynb',
                  competition=comp, private=False, gpu=False)

Your kernel title does not resolve to the specified id. This may result in surprising behavior. We suggest making your title something that resolves to the specified id. See https://en.wikipedia.org/wiki/Clean_URL#Slug for more information on how slugs are determined.
Kernel version 1 successfully pushed.  Please check progress at https://www.kaggle.com/code/xiaochuanyang/minmax-transform-and-histgbr-model
