In [None]:
# install fastkaggle if not available
try: import fastkaggle
except ModuleNotFoundError:
    !pip install -Uq fastkaggle

from fastkaggle import *

## Getting set up

In [None]:
# Kaggle competition slug; reused below for the submission and the notebook push.
comp = 'playground-series-s3e26'
# setup_comp comes from the fastkaggle star import; it returns the data directory
# (shown as Path('playground-series-s3e26') in the next cell's output).
path = setup_comp(comp, install='')

In [None]:
# Display the competition data directory.
path

Path('playground-series-s3e26')

In [None]:
# Location of the training CSV inside the competition directory.
trn_path = path/'train.csv'

In [None]:
import pandas as pd
# Raw training frame; kept untouched so process() can be re-run from it.
df0 = pd.read_csv(trn_path)

In [None]:
def process(df):
    """Return a copy of ``df`` with the Edema category 'S' merged into 'Y'.

    The input frame is not modified. Only the value 'S' is rewritten; every
    other value in the ``Edema`` column (including missing or unexpected
    categories) is passed through unchanged instead of silently becoming NaN,
    which is what ``Series.map`` does for keys absent from its dict.
    """
    df_ = df.copy()
    # Item assignment is used instead of attribute assignment so the column is
    # always the thing being written.
    df_['Edema'] = df_['Edema'].replace({'S': 'Y'})
    return df_

In [None]:
# Processed training frame (Edema 'S' folded into 'Y'); df0 stays as loaded.
df = process(df0)

In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, FunctionTransformer, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import HistGradientBoostingRegressor, HistGradientBoostingClassifier
from sklearn.metrics import make_scorer, mean_absolute_error, classification_report, log_loss
from scipy.stats import loguniform

In [None]:
from lightgbm import LGBMRegressor, LGBMClassifier, log_evaluation, early_stopping
from xgboost import XGBRegressor, XGBClassifier
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, GroupKFold
from tqdm import tqdm

In [None]:
def train_model(train_data, model, features, n_splits, kfold_seed, include_orig):
    """Cross-validate ``model`` with stratified K-fold and average its test predictions.

    This function reads several notebook-level globals that are not defined in
    the cells shown around it: ``test`` and ``original`` (frames indexed with
    ``[features]`` / ``[TARGET]``), ``TARGET`` (target column name) and the
    ``GREEN_TXT`` / ``RESET_TXT`` strings wrapped around the printed scores.

    Parameters
    ----------
    train_data : DataFrame holding both ``features`` and the ``TARGET`` column.
    model : estimator exposing ``fit`` and ``predict_proba``. It is used as a
        template: a fresh clone is fitted for every fold, so the object passed
        in is left unfitted.
    features : list of column names used as model inputs.
    n_splits : number of stratified folds.
    kfold_seed : ``random_state`` for the fold shuffling.
    include_orig : when true, rows of ``original`` are appended to each fold's
        training part (never to the validation part).

    Returns
    -------
    (oof_full, test_preds, models)
        ``oof_full``  - out-of-fold class probabilities, one row per training row.
        ``test_preds`` - mean of the per-fold probabilities predicted for ``test``.
        ``models``    - list with one fitted estimator per fold.
    """
    # Local import so the cell stays self-contained.
    from sklearn.base import clone

    # The class name drives the fit-signature dispatch below. type().__name__
    # also works for estimators whose str() is not "ClassName(...)".
    model_name = type(model).__name__
    # NOTE: the probability buffers are hard-coded for a three-class target.
    n_classes = 3
    test_preds = np.zeros((len(test), n_classes))
    oof_full = np.zeros((len(train_data), n_classes))
    val_scores, models = [], []

    print(model_name)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=kfold_seed)
    X_all, y_all = train_data[features], train_data[TARGET]

    for i, (train_idx, val_idx) in enumerate(tqdm(skf.split(X_all, y_all), total=n_splits)):

        # split() yields positional indices, so select with .iloc; .loc would
        # only be correct for a default 0..n-1 index.
        X_train, X_val = X_all.iloc[train_idx], X_all.iloc[val_idx]
        y_train, y_val = y_all.iloc[train_idx], y_all.iloc[val_idx]

        if include_orig:
            X_train = pd.concat([X_train, original[features]], ignore_index = True)
            y_train = pd.concat([y_train, original[TARGET]], ignore_index = True)

        # Fit a fresh copy per fold; refitting one shared object would leave
        # `models` holding n references to the last fold's fit.
        fold_model = clone(model)

        if model_name in ["LGBMRegressor", "LGBMClassifier"]:
            callbacks = [early_stopping(stopping_rounds=50)]
            fold_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=callbacks)
        elif model_name in ["XGBClassifier", "CatBoostClassifier"]:
            fold_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=10000)
        else:
            fold_model.fit(X_train, y_train)

        oof_preds = fold_model.predict_proba(X_val)
        test_preds += fold_model.predict_proba(test[features]) / n_splits

        # val_idx is positional, matching the row order of oof_full.
        oof_full[val_idx] = oof_preds
        score = log_loss(y_val, oof_preds)

        models.append(fold_model)
        val_scores.append(score)

        print(f"{GREEN_TXT}FOLD {i + 1} log_loss: {round(score, 4)}{RESET_TXT}")

    print(f'{GREEN_TXT}mean log_loss across all folds: {np.mean(val_scores):.5f}{RESET_TXT}')
    print(f'{GREEN_TXT}std of log_loss across all folds: {np.std(val_scores):.5f}{RESET_TXT}')

    return oof_full, test_preds, models

In [None]:
# Features are every column except the first and the last; the target is the
# last column. (The first column is presumably the row id - confirm on train.csv.)
X = df.iloc[:,1:-1]
y = df.iloc[:,-1]
# Stratified 80/20 hold-out. random_state is fixed so the dev set, and every
# metric computed on it, is the same on each Restart & Run All.
X_tr, X_dev, y_tr, y_dev = train_test_split(X,y,test_size=0.2, stratify=y, random_state=42) # XXyy

In [None]:
# Preprocessing: standardize numeric columns and one-hot encode object columns
# (a two-category column becomes a single indicator); any other column is dropped.
ct = make_column_transformer(
    (StandardScaler(), make_column_selector(dtype_include = np.number)),
    (OneHotEncoder(drop='if_binary'), make_column_selector(dtype_include=object)),
    remainder = 'drop'
)

# Gradient boosting with early stopping. random_state pins the internal
# validation split used for early stopping, so refits are reproducible.
pipe = make_pipeline(
    ct,
    HistGradientBoostingClassifier(
        max_iter=10000, early_stopping=True, max_depth=9,
        n_iter_no_change=50, random_state=42,
    ),
)

In [None]:
# make_pipeline names each step after its lower-cased class name; read the name
# of the final step back from the pipeline instead of rebuilding it by hand.
model_step = pipe.steps[-1][0]
param_distributions = {
    f'{model_step}__learning_rate': loguniform(0.001, 0.1),
    f'{model_step}__l2_regularization': loguniform(0.001, 1),
}
# Score candidates with log loss, the metric this notebook evaluates on the dev
# set, rather than the classifier's default accuracy. random_state makes the
# sampled hyper-parameters reproducible.
search = RandomizedSearchCV(pipe, param_distributions, n_iter=64,
                            scoring='neg_log_loss', random_state=42)

In [None]:
# Display the configured (not yet fitted) search object.
search

In [None]:
%%time
# Run the randomized hyper-parameter search on the training split only;
# the dev split stays unseen for the evaluation cells below.
search.fit(X_tr,y_tr)

CPU times: user 13min 42s, sys: 2.28 s, total: 13min 45s
Wall time: 3min 44s


In [None]:
# Display the pipeline refitted with the best sampled hyper-parameters.
search.best_estimator_

In [None]:
# Hard-label predictions on the held-out dev split, summarized per class.
y_pred = search.predict(X_dev)
print(classification_report(y_dev,y_pred))

              precision    recall  f1-score   support

           C       0.84      0.91      0.88       993
          CL       0.67      0.15      0.24        55
           D       0.78      0.73      0.76       533

    accuracy                           0.82      1581
   macro avg       0.76      0.60      0.62      1581
weighted avg       0.82      0.82      0.81      1581



In [None]:
# Class probabilities on the dev split and their log loss (lower is better).
y_pred_proba = search.predict_proba(X_dev)
log_loss(y_dev, y_pred_proba)

0.44783527567390213

In [None]:
# y_pred_proba[y_dev=='CL'] = np.array([1/3,1/3,1/3])

In [None]:
# log_loss(y_dev, y_pred_proba)

## Submitting to Kaggle

In [None]:
# Sample submission; used as the template whose probability columns are
# overwritten further down.
ss = pd.read_csv(path/'sample_submission.csv')
ss

Unnamed: 0,id,Status_C,Status_CL,Status_D
0,7905,0.628084,0.034788,0.337128
1,7906,0.628084,0.034788,0.337128
2,7907,0.628084,0.034788,0.337128
3,7908,0.628084,0.034788,0.337128
4,7909,0.628084,0.034788,0.337128
...,...,...,...,...
5266,13171,0.628084,0.034788,0.337128
5267,13172,0.628084,0.034788,0.337128
5268,13173,0.628084,0.034788,0.337128
5269,13174,0.628084,0.034788,0.337128


In [None]:
# Load the test set and apply the same preprocessing as the training frame.
tst = pd.read_csv(path/'test.csv')
tst = process(tst)
tst.head()

Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,7905,3839,D-penicillamine,19724,F,N,Y,N,N,1.2,546.0,3.37,65.0,1636.0,151.9,90.0,430.0,10.6,2.0
1,7906,2468,D-penicillamine,14975,F,N,N,N,N,1.1,660.0,4.22,94.0,1257.0,151.9,155.0,227.0,10.0,2.0
2,7907,51,Placebo,13149,F,N,Y,N,Y,2.0,151.0,2.96,46.0,961.0,69.75,101.0,213.0,13.0,4.0
3,7908,2330,D-penicillamine,20510,F,N,N,N,N,0.6,293.0,3.85,40.0,554.0,125.55,56.0,270.0,10.6,2.0
4,7909,1615,D-penicillamine,21904,F,N,Y,N,N,1.4,277.0,2.97,121.0,1110.0,125.0,126.0,221.0,9.8,1.0


In [None]:
# Predict class probabilities for the test set; column 0 (id) is skipped.
tst_pred = search.predict_proba(tst.iloc[:,1:])

In [None]:
# tst_class = pipe.predict(tst.iloc[:,1:])

In [None]:
# tst_pred[tst_class=='CL'] = np.array([1,1,1])

In [None]:
# predict_proba columns follow search.classes_; write each one to the
# submission column named after its class (Status_C, Status_CL, Status_D)
# instead of relying on the two column orders happening to agree.
ss[[f'Status_{c}' for c in search.classes_]] = tst_pred

In [None]:
# Display the filled-in submission frame.
ss

Unnamed: 0,id,Status_C,Status_CL,Status_D
0,7905,0.669011,0.024550,0.306440
1,7906,0.753854,0.107930,0.138216
2,7907,0.030853,0.011395,0.957752
3,7908,0.920769,0.012650,0.066582
4,7909,0.892450,0.011833,0.095717
...,...,...,...,...
5266,13171,0.835347,0.045681,0.118972
5267,13172,0.970132,0.003434,0.026435
5268,13173,0.875431,0.005931,0.118638
5269,13174,0.975827,0.013152,0.011021


In [None]:
# Write the submission without the DataFrame index, then preview its first lines.
ss.to_csv('subm.csv', index=False)
!head subm.csv

id,Status_C,Status_CL,Status_D
7905,0.6690106578475011,0.024549659330753502,0.30643968282174544
7906,0.7538537777524282,0.1079299787278806,0.1382162435196911
7907,0.030852668808825482,0.011394872899841457,0.957752458291333
7908,0.9207688568493329,0.012649558557008883,0.06658158459365832
7909,0.8924496996711181,0.011832800740265045,0.09571749958861697
7910,0.981509474537495,0.005453636292899222,0.01303688916960579
7911,0.9711565456309311,0.002741875049917623,0.026101579319151295
7912,0.3093598425696906,0.018618983213987153,0.6720211742163221
7913,0.011594340499128265,0.0005439154617245497,0.9878617440391471


In [None]:
# Submit through the Kaggle API only when `iskaggle` is falsy
# (presumably provided by the fastkaggle star import - confirm).
if not iskaggle:
    # Imported here so the kaggle package is only required on this branch.
    from kaggle import api
    # Arguments: submission file, submission message, competition slug.
    api.competition_submit_cli('subm.csv', 'process edema ohe drop binary', comp)

## Conclusion

## Addendum

In [None]:
# Push this notebook to Kaggle when `iskaggle` is falsy.
# NOTE(review): the recorded output warns that the title does not resolve to the
# id 'histgbr-minmax-transform'; consider aligning the title with the id.
# NOTE(review): the title mentions "Minmax" and "HistGBR", but the pipeline above
# uses StandardScaler and HistGradientBoostingClassifier - confirm the intended name.
if not iskaggle:
    push_notebook('xy', 'histgbr-minmax-transform',
                  title='Minmax transform and HistGBR model',
                  file='01-histgbr-minmax-transform.ipynb',
                  competition=comp, private=False, gpu=False)

Your kernel title does not resolve to the specified id. This may result in surprising behavior. We suggest making your title something that resolves to the specified id. See https://en.wikipedia.org/wiki/Clean_URL#Slug for more information on how slugs are determined.
Kernel version 1 successfully pushed.  Please check progress at https://www.kaggle.com/code/xiaochuanyang/minmax-transform-and-histgbr-model
