In [1]:
# install fastkaggle if not available
try: import fastkaggle
except ModuleNotFoundError:
    !pip install -Uq fastkaggle

from fastkaggle import *

## Getting set up

In [2]:
comp = 'playground-series-s3e26'
path = setup_comp(comp, install='')

In [3]:
path

Path('playground-series-s3e26')

In [4]:
trn_path = path/'train.csv'

In [5]:
import pandas as pd
df = pd.read_csv(trn_path)

In [6]:
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, FunctionTransformer, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import HistGradientBoostingRegressor, HistGradientBoostingClassifier
from sklearn.metrics import make_scorer, mean_absolute_error, classification_report, log_loss
from scipy.stats import loguniform

In [42]:
X = df.iloc[:,1:-1]
y = df.iloc[:,-1]
X_tr, X_dev, y_tr, y_dev = train_test_split(X,y,test_size=0.2, stratify=y) # XXyy

In [43]:
ct = make_column_transformer(
    (StandardScaler(), make_column_selector(dtype_include = np.number)),
    (OneHotEncoder(), make_column_selector(dtype_include=object)), 
    remainder = 'drop'
)

pipe = make_pipeline(ct, HistGradientBoostingClassifier(max_iter=2000, early_stopping=True))

In [45]:
param_distributions = {
    (pipe[-1].__class__.__name__).lower()+'__learning_rate': loguniform(0.001, 0.1),
}
search = RandomizedSearchCV(pipe, param_distributions)

In [46]:
search

In [47]:
%%time
search.fit(X_tr,y_tr)

In [48]:
search.best_estimator_

In [49]:
y_pred = search.predict(X_dev)
print(classification_report(y_dev,y_pred))

              precision    recall  f1-score   support

           C       0.82      0.92      0.87       993
          CL       0.60      0.11      0.18        55
           D       0.79      0.67      0.73       533

    accuracy                           0.81      1581
   macro avg       0.74      0.57      0.59      1581
weighted avg       0.80      0.81      0.80      1581



In [50]:
y_pred_proba = search.predict_proba(X_dev)
log_loss(y_dev, y_pred_proba)

0.4555136554613228

In [28]:
# y_pred_proba[y_dev=='CL'] = np.array([1/3,1/3,1/3])

In [29]:
# log_loss(y_dev, y_pred_proba)

## Submitting to Kaggle

In [51]:
ss = pd.read_csv(path/'sample_submission.csv')
ss

Unnamed: 0,id,Status_C,Status_CL,Status_D
0,7905,0.628084,0.034788,0.337128
1,7906,0.628084,0.034788,0.337128
2,7907,0.628084,0.034788,0.337128
3,7908,0.628084,0.034788,0.337128
4,7909,0.628084,0.034788,0.337128
...,...,...,...,...
5266,13171,0.628084,0.034788,0.337128
5267,13172,0.628084,0.034788,0.337128
5268,13173,0.628084,0.034788,0.337128
5269,13174,0.628084,0.034788,0.337128


In [52]:
tst = pd.read_csv(path/'test.csv')
tst.head()

Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,7905,3839,D-penicillamine,19724,F,N,Y,N,N,1.2,546.0,3.37,65.0,1636.0,151.9,90.0,430.0,10.6,2.0
1,7906,2468,D-penicillamine,14975,F,N,N,N,N,1.1,660.0,4.22,94.0,1257.0,151.9,155.0,227.0,10.0,2.0
2,7907,51,Placebo,13149,F,N,Y,N,Y,2.0,151.0,2.96,46.0,961.0,69.75,101.0,213.0,13.0,4.0
3,7908,2330,D-penicillamine,20510,F,N,N,N,N,0.6,293.0,3.85,40.0,554.0,125.55,56.0,270.0,10.6,2.0
4,7909,1615,D-penicillamine,21904,F,N,Y,N,N,1.4,277.0,2.97,121.0,1110.0,125.0,126.0,221.0,9.8,1.0


In [53]:
tst_pred = search.predict_proba(tst.iloc[:,1:])

In [33]:
# tst_class = pipe.predict(tst.iloc[:,1:])

In [81]:
# tst_pred[tst_class=='CL'] = np.array([1,1,1])

In [54]:
ss.iloc[:,1:] = tst_pred

In [55]:
ss

Unnamed: 0,id,Status_C,Status_CL,Status_D
0,7905,0.699760,0.057087,0.243153
1,7906,0.688024,0.205571,0.106405
2,7907,0.031379,0.020331,0.948290
3,7908,0.933012,0.007271,0.059718
4,7909,0.664607,0.036730,0.298663
...,...,...,...,...
5266,13171,0.782085,0.034523,0.183392
5267,13172,0.962817,0.003345,0.033838
5268,13173,0.893547,0.007918,0.098535
5269,13174,0.968747,0.012885,0.018368


In [56]:
ss.to_csv('subm.csv', index=False)
!head subm.csv

id,Status_C,Status_CL,Status_D
7905,0.6997603285409046,0.05708679918224086,0.24315287227685456
7906,0.6880239569209483,0.20557063575296755,0.10640540732608414
7907,0.0313788504120616,0.020331499802633664,0.9482896497853047
7908,0.9330116345588714,0.007270675445870098,0.059717689995258526
7909,0.6646065456219631,0.03673033258561511,0.2986631217924217
7910,0.9760211287948904,0.004219501343093043,0.019759369862016367
7911,0.9660811400619403,0.0042202543457821986,0.029698605592277546
7912,0.5570632648131515,0.04224087360205146,0.4006958615847969
7913,0.0381067032363778,0.0034210030649375017,0.9584722936986846


In [57]:
if not iskaggle:
    from kaggle import api
    api.competition_submit_cli('subm.csv', 'uniformise CL preds', comp)

100%|███████████████████████████████████████████████████████| 335k/335k [00:01<00:00, 291kB/s]


## Conclusion

## Addendum

In [26]:
if not iskaggle:
    push_notebook('xy', 'histgbr-minmax-transform',
                  title='Minmax transform and HistGBR model',
                  file='01-histgbr-minmax-transform.ipynb',
                  competition=comp, private=False, gpu=False)

Your kernel title does not resolve to the specified id. This may result in surprising behavior. We suggest making your title something that resolves to the specified id. See https://en.wikipedia.org/wiki/Clean_URL#Slug for more information on how slugs are determined.
Kernel version 1 successfully pushed.  Please check progress at https://www.kaggle.com/code/xiaochuanyang/minmax-transform-and-histgbr-model
