In [19]:
# install fastkaggle if not available
try: import fastkaggle
except ModuleNotFoundError:
    !pip install -Uq fastkaggle

from fastkaggle import *

## Getting set up

In [20]:
# Kaggle competition slug; reused later when submitting and when pushing the notebook.
comp = 'playground-series-s3e26'
# Directory that holds the competition files.
# NOTE(review): setup_comp presumably downloads the data when it is missing
# locally, and install='' presumably means "no extra packages" — confirm
# against the fastkaggle documentation.
path = setup_comp(comp, install='')

In [21]:
# Display the data directory returned by setup_comp.
path

Path('playground-series-s3e26')

In [22]:
import pandas as pd

# Locations of the three competition files inside the data directory.
trn_path, tst_path, ss_path = (
    path/fname for fname in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Read each CSV into its own DataFrame, in the same order as the paths above.
trn, tst, ss = (pd.read_csv(csv_path) for csv_path in (trn_path, tst_path, ss_path))

# Quick sanity check on the row/column counts of train and test.
print(f'{trn.shape=}, {tst.shape=}')

trn.shape=(7905, 20), tst.shape=(5271, 19)


In [23]:
# Print column names, non-null counts and dtypes of the training frame.
trn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7905 entries, 0 to 7904
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             7905 non-null   int64  
 1   N_Days         7905 non-null   int64  
 2   Drug           7905 non-null   object 
 3   Age            7905 non-null   int64  
 4   Sex            7905 non-null   object 
 5   Ascites        7905 non-null   object 
 6   Hepatomegaly   7905 non-null   object 
 7   Spiders        7905 non-null   object 
 8   Edema          7905 non-null   object 
 9   Bilirubin      7905 non-null   float64
 10  Cholesterol    7905 non-null   float64
 11  Albumin        7905 non-null   float64
 12  Copper         7905 non-null   float64
 13  Alk_Phos       7905 non-null   float64
 14  SGOT           7905 non-null   float64
 15  Tryglicerides  7905 non-null   float64
 16  Platelets      7905 non-null   float64
 17  Prothrombin    7905 non-null   float64
 18  Stage   

In [24]:
# Summary statistics (count, mean, std, quantiles) of the numeric training columns.
trn.describe()

Unnamed: 0,id,N_Days,Age,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
count,7905.0,7905.0,7905.0,7905.0,7905.0,7905.0,7905.0,7905.0,7905.0,7905.0,7905.0,7905.0,7905.0
mean,3952.0,2030.173308,18373.14649,2.594485,350.561923,3.548323,83.902846,1816.74525,114.604602,115.340164,265.228969,10.629462,3.032511
std,2282.121272,1094.233744,3679.958739,3.81296,195.379344,0.346171,75.899266,1903.750657,48.790945,52.530402,87.465579,0.781735,0.866511
min,0.0,41.0,9598.0,0.3,120.0,1.96,4.0,289.0,26.35,33.0,62.0,9.0,1.0
25%,1976.0,1230.0,15574.0,0.7,248.0,3.35,39.0,834.0,75.95,84.0,211.0,10.0,2.0
50%,3952.0,1831.0,18713.0,1.1,298.0,3.58,63.0,1181.0,108.5,104.0,265.0,10.6,3.0
75%,5928.0,2689.0,20684.0,3.0,390.0,3.77,102.0,1857.0,137.95,139.0,316.0,11.0,4.0
max,7904.0,4795.0,28650.0,28.0,1775.0,4.64,588.0,13862.4,457.25,598.0,563.0,18.0,4.0


## Preprocessing data

In [25]:
# Fetch the original cirrhosis dataset into the competition directory;
# preprocess() reads it from path/'cirrhosis.csv' when train=True.
# NOTE(review): force=True presumably re-downloads on every run — confirm
# against the fastkaggle documentation.
get_dataset(path, 'joebeachcapital/cirrhosis-patient-survival-prediction', force=True) # filename= cirrhosis.csv

In [56]:
def preprocess(df, train=True, dropna=False):
    """Tag rows by origin and, for training data, append the original dataset.

    Parameters
    ----------
    df : DataFrame to process; it is copied, never modified in place.
    train : when True, rows from path/'cirrhosis.csv' are appended and the
        Status column is mapped to integer codes. When False, only the
        `is_gen` flag is added.
    dropna : only consulted when train=True; drops every row containing a
        missing value after the two sources are concatenated.

    Returns
    -------
    A new DataFrame with an extra `is_gen` column ('Y' for rows of `df`,
    'N' for rows taken from the original dataset).
    """
    out = df.copy()
    # Disabled experiment, kept for reference (Series.map must list every key):
    # out.Edema = out.Edema.map({'S':'Y', 'N':'N', 'Y':'Y'})
    out['is_gen'] = 'Y'

    if not train:
        return out

    # Original data from which the competition dataset was synthesized.
    original = pd.read_csv(path/'cirrhosis.csv')
    # Move Status to the last position so the column order lines up with `out`.
    status_col = original['Status']
    original = pd.concat([original.drop('Status', axis=1), status_col], axis=1)
    original['is_gen'] = 'N'
    # Columns are matched by position, so adopt the competition column names.
    original.columns = out.columns

    out = pd.concat([out, original], axis=0).reset_index(drop=True)
    if dropna:
        out = out.dropna()

    # Encode the target: C -> 0, CL -> 1, D -> 2.
    out['Status'] = out.Status.map({'C':0, 'CL':1,'D':2})
    return out

## Modelling

In [43]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, FunctionTransformer, OneHotEncoder,PowerTransformer, LabelEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, cross_val_score, cross_validate, KFold
from sklearn.ensemble import HistGradientBoostingRegressor, HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import make_scorer, mean_absolute_error, classification_report, log_loss
from sklearn.linear_model import LogisticRegression

from scipy.stats import loguniform
from lightgbm import LGBMRegressor, LGBMClassifier, log_evaluation, early_stopping
from xgboost import XGBRegressor, XGBClassifier
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

### Models that do not support NaN

In [36]:
# Training frame with every incomplete row removed (dropna=True).
df = preprocess(pd.read_csv(trn_path), train=True, dropna=True)
# Target is Status; features are all remaining columns except the first one.
y = df['Status']
X = df.drop(columns='Status').iloc[:, 1:]

In [37]:
# 10-fold stratified splitter shared by the cross-validation cells below.
skf = StratifiedKFold(n_splits=10)

# Column selectors: numeric dtypes vs. object (string) dtypes.
numeric_selector = make_column_selector(dtype_include=np.number)
object_selector = make_column_selector(dtype_include=object)

# Power-transform numeric columns, one-hot encode object columns
# (binary ones collapse to a single column), pass anything else through.
ct = make_column_transformer(
    (PowerTransformer(), numeric_selector),
    (OneHotEncoder(drop='if_binary', handle_unknown='ignore'), object_selector),
    remainder='passthrough',
)

In [13]:
%%time
# Logistic regression on the transformed features, scored by negative log loss over the skf folds.
logit_pipe = make_pipeline(ct, LogisticRegression(max_iter=1000))
logit_cv = cross_val_score(logit_pipe, X, y, scoring='neg_log_loss', cv=skf, n_jobs=-1)
print(f'logitstic regression {-logit_cv.mean()=}')

logitstic regression -logit_cv.mean()=0.5110572273752632
CPU times: user 132 ms, sys: 113 ms, total: 245 ms
Wall time: 1.95 s




In [14]:
%%time
# Random forest hyperparameters, kept together so they are easy to read and tweak.
rf_params = dict(
    n_estimators=1000,
    criterion='log_loss',
    max_depth=14,
    min_samples_split=3,
    min_samples_leaf=1,
    max_features=4,
    random_state=1,
    n_jobs=-1,
)
rf_pipe = make_pipeline(ct, RandomForestClassifier(**rf_params))
# Same folds and metric as the other models so the scores are comparable.
RF_cv = cross_val_score(rf_pipe, X, y, scoring='neg_log_loss', cv=skf, n_jobs=-1)
print(f"random forrest {-RF_cv.mean()=}")



random forrest -RF_cv.mean()=0.44559989137605854
CPU times: user 150 ms, sys: 14.3 ms, total: 164 ms
Wall time: 16 s


### Models that support NaN

In [44]:
# Training frame with missing values kept (dropna=False).
df = preprocess(pd.read_csv(trn_path), train=True, dropna=False)
# Target is Status; features are all remaining columns except the first one.
y = df['Status']
X = df.drop(columns='Status').iloc[:, 1:]

In [45]:
%%time
# Histogram gradient boosting hyperparameters.
hb_params = dict(
    l2_regularization=8.876168706639714,
    early_stopping=False,
    learning_rate=0.009956485590638034,
    max_iter=500,
    max_depth=16,
    max_bins=255,
    min_samples_leaf=16,
    max_leaf_nodes=18,
    random_state=3,
)
hb_pipe = make_pipeline(ct, HistGradientBoostingClassifier(**hb_params))
# Same folds and metric as the other models so the scores are comparable.
HB_cv = cross_val_score(hb_pipe, X, y, scoring='neg_log_loss', cv=skf, n_jobs=-1)

print(f"histGB {-HB_cv.mean()=}")



histGB -HB_cv.mean()=0.43771002014457927
CPU times: user 166 ms, sys: 59.1 ms, total: 225 ms
Wall time: 14.6 s


In [47]:
%%time
# LightGBM hyperparameters.
lgbm_params = dict(
    n_estimators=1000,
    learning_rate=0.013657589160895923,
    max_depth=17,
    reg_alpha=1.9791969860931342,
    reg_lambda=1.2857088172765347,
    num_leaves=37,
    subsample=0.6351453342675659,
    colsample_bytree=0.2644509924064132,
)
lgbm_pipe = make_pipeline(ct, LGBMClassifier(**lgbm_params))
# Same folds and metric as the other models so the scores are comparable.
LGBM_cv = cross_val_score(lgbm_pipe, X, y, scoring='neg_log_loss', cv=skf, n_jobs=-1)

print(f"Light GBM  {-LGBM_cv.mean()=}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003430 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1947
[LightGBM] [Info] Number of data points in the train set: 7491, number of used features: 20
[LightGBM] [Info] Start training from score -0.471045
[LightGBM] [Info] Start training from score -3.323036
[LightGBM] [Info] Start training from score -1.079965
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002783 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1980
[LightGBM] [Info] Number of data points in the train set: 7491, number of used features: 29
[LightGBM] [Info] Start training from score -0.470832
[LightGBM] [Info] Start training from score -3.323036
[LightGBM] [Info] Start tr



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004919 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1991
[LightGBM] [Info] Number of data points in the train set: 7491, number of used features: 29
[LightGBM] [Info] Start training from score -0.470832
[LightGBM] [Info] Start training from score -3.323036
[LightGBM] [Info] Start training from score -1.080358
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009404 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1990
[LightGBM] [Info] Number of data points in the train set: 7490, number of used features: 29
[LightGBM] [Info] Start training from score -0.470912
[LightGBM] [Info] Start training from score -3.322902
[LightGBM] [Info] Start training from score -1.080224
[LightGBM] [Info] Auto-choosing row-

In [46]:
%%time
# XGBoost hyperparameters.
xgb_params = dict(
    max_depth=7,
    learning_rate=0.03570188608151033,
    n_estimators=1000,
    gamma=0.6440001307764849,
    min_child_weight=2,
    colsample_bytree=0.27034458854562116,
    subsample=0.8435412915999765,
)
xgb_pipe = make_pipeline(ct, XGBClassifier(**xgb_params))
# error_score='raise' surfaces a failing fold instead of recording it as a score.
XGB_cv = cross_val_score(xgb_pipe, X, y, scoring='neg_log_loss', cv=skf,
                         n_jobs=-1, error_score='raise')

print(f"XGBoost {-XGB_cv.mean()=}")

  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


XGBoost -XGB_cv.mean()=0.4339636450901064
CPU times: user 90.1 ms, sys: 46.8 ms, total: 137 ms
Wall time: 19 s


In [50]:
def cv(X,y,cv=10):
    """Cross-validate the tuned LightGBM pipeline and keep every fitted fold model.

    Parameters
    ----------
    X : feature table. Numeric columns are power-transformed and object
        columns are one-hot encoded before reaching the classifier.
    y : target labels.
    cv : forwarded unchanged as the `cv` argument of sklearn's
        cross_validate (default 10).

    Returns
    -------
    The dict produced by cross_validate, scored with 'neg_log_loss'. It
    contains an 'estimator' entry with the fitted pipelines because
    return_estimator=True.
    """
    lgbm_params = {
        'n_estimators': 1000,
        'learning_rate': 0.013657589160895923,
        'max_depth': 17,
        'reg_alpha': 1.9791969860931342,
        'reg_lambda': 1.2857088172765347,
        'num_leaves': 37,
        'subsample': 0.6351453342675659,
        'colsample_bytree': 0.2644509924064132,
        # Silence the per-fold "[LightGBM] [Info] ..." lines that otherwise
        # flood the cell output; this does not affect the fitted model.
        'verbose': -1,
    }
    classifier = LGBMClassifier(**lgbm_params)

    # Built locally so the function does not depend on a global transformer.
    preprocessor = make_column_transformer(
        (PowerTransformer(), make_column_selector(dtype_include=np.number)),
        (OneHotEncoder(drop='if_binary', handle_unknown='ignore'),
         make_column_selector(dtype_include=object)),
        remainder='passthrough')

    pipeline = make_pipeline(preprocessor, classifier)
    return cross_validate(pipeline, X, y, cv=cv, scoring='neg_log_loss',
                          return_estimator=True)

In [51]:
%%time
# Run 10-fold cross-validation and keep the scores plus the fitted pipelines.
cv_output = cv(X, y, cv=10)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004832 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1988
[LightGBM] [Info] Number of data points in the train set: 7490, number of used features: 29
[LightGBM] [Info] Start training from score -0.470912
[LightGBM] [Info] Start training from score -3.322902
[LightGBM] [Info] Start training from score -1.080224
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000223 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1990
[LightGBM] [Info] Number of data points in the train set: 7490, number of used features: 29
[LightGBM] [Info] Start training from score -0.470912
[LightGBM] [Info] Start training from score -3.322902
[LightGBM] [Info] Start tr

In [52]:
# Iteration count reported by the final step (the classifier) of each fold's pipeline.
[fold_pipe.steps[-1][1].n_iter_ for fold_pipe in cv_output['estimator']]

[1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000]

In [53]:
# Scores are negative log loss, so negate the mean to read it as a log loss.
fold_scores = cv_output['test_score']
-fold_scores.mean(), fold_scores.std()

(0.42275781396747264, 0.034352559629100596)

## Submitting to Kaggle

In [54]:
# Reload the sample submission so `ss` is a fresh template before it is overwritten with predictions.
ss = pd.read_csv(path/'sample_submission.csv')
ss.head()

Unnamed: 0,id,Status_C,Status_CL,Status_D
0,7905,0.628084,0.034788,0.337128
1,7906,0.628084,0.034788,0.337128
2,7907,0.628084,0.034788,0.337128
3,7908,0.628084,0.034788,0.337128
4,7909,0.628084,0.034788,0.337128


In [57]:
# With train=False, preprocess only adds the is_gen='Y' flag: no original
# rows are appended and no Status mapping is applied.
tst = preprocess(pd.read_csv(path/'test.csv'), train=False)
tst.head()

Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,is_gen
0,7905,3839,D-penicillamine,19724,F,N,Y,N,N,1.2,546.0,3.37,65.0,1636.0,151.9,90.0,430.0,10.6,2.0,Y
1,7906,2468,D-penicillamine,14975,F,N,N,N,N,1.1,660.0,4.22,94.0,1257.0,151.9,155.0,227.0,10.0,2.0,Y
2,7907,51,Placebo,13149,F,N,Y,N,Y,2.0,151.0,2.96,46.0,961.0,69.75,101.0,213.0,13.0,4.0,Y
3,7908,2330,D-penicillamine,20510,F,N,N,N,N,0.6,293.0,3.85,40.0,554.0,125.55,56.0,270.0,10.6,2.0,Y
4,7909,1615,D-penicillamine,21904,F,N,Y,N,N,1.4,277.0,2.97,121.0,1110.0,125.0,126.0,221.0,9.8,1.0,Y


In [58]:
# Drop the first column (same slicing as used for the training features),
# predict with each fold's fitted pipeline, then average across folds.
tst_features = tst.iloc[:, 1:]
fold_probas = [fold_pipe.predict_proba(tst_features) for fold_pipe in cv_output['estimator']]
tst_pred = np.stack(fold_probas).mean(0)

In [59]:
# Overwrite every column after the first with the averaged probabilities.
# Relies on the class order from preprocess (C=0, CL=1, D=2) lining up
# positionally with the submission's probability columns.
ss.iloc[:,1:] = tst_pred

In [60]:
# Write the submission without the DataFrame index, then preview its first lines.
ss.to_csv('subm.csv', index=False)
!head subm.csv

id,Status_C,Status_CL,Status_D
7905,0.3145367168898071,0.024978122408722478,0.6604851607014703
7906,0.45310963606197746,0.16286635720440776,0.38402400673361475
7907,0.03294870589681721,0.009322964161887395,0.9577283299412953
7908,0.9786218789660575,0.0030776439225697035,0.01830047711137262
7909,0.8432855080843726,0.055813139108385035,0.1009013528072424
7910,0.989848458886341,0.0011861465504909897,0.008965394563168088
7911,0.981747117721089,0.0022701169601863653,0.015982765318724685
7912,0.08951367494067484,0.0230850146366774,0.8874013104226478
7913,0.008120581444055277,0.001640488655418932,0.9902389299005259


In [61]:
# Submit only when iskaggle is falsy.
# NOTE(review): iskaggle presumably comes from the fastkaggle star import and
# flags a Kaggle-hosted session — confirm.
if not iskaggle:
    from kaggle import api
    # Upload subm.csv to the competition named by `comp`; 'lgbm 10fold avg' is passed as the message argument.
    api.competition_submit_cli('subm.csv', 'lgbm 10fold avg', comp)

100%|███████████████████████████████████████████████████████| 337k/337k [00:01<00:00, 302kB/s]


## Conclusion

## Addendum

In [None]:
# Publish this notebook to Kaggle when iskaggle is falsy.
# NOTE(review): the recorded output of this cell warns that the title does not
# resolve to the given slug ('histgbr-minmax-transform'). The title and file
# name also mention HistGBR/minmax, while the submission above comes from a
# LightGBM 10-fold average — confirm slug, title and file before pushing again.
if not iskaggle:
    push_notebook('xy', 'histgbr-minmax-transform',
                  title='Minmax transform and HistGBR model',
                  file='01-histgbr-minmax-transform.ipynb',
                  competition=comp, private=False, gpu=False)

Your kernel title does not resolve to the specified id. This may result in surprising behavior. We suggest making your title something that resolves to the specified id. See https://en.wikipedia.org/wiki/Clean_URL#Slug for more information on how slugs are determined.
Kernel version 1 successfully pushed.  Please check progress at https://www.kaggle.com/code/xiaochuanyang/minmax-transform-and-histgbr-model
