In [1]:
import pickle
import time

import numpy as np
import optuna
import pandas as pd
from catboost import CatBoostClassifier, CatBoostRegressor
from lightgbm import LGBMRegressor
from scipy.optimize import minimize
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import make_scorer, cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from tqdm import tqdm
from xgboost import XGBRegressor

In [2]:

# --- Configuration ---------------------------------------------------------
# Feature tables read by this notebook (paths are relative to the notebook).
TRAIN_LONG_DF = '../data/features/train_wide.csv'
TEST_LONG_DF = '../data/features/test_wide.csv'

SEED = 42    # random seed handed to the CV splitters and the models below
TRAILS = 2   # number of Optuna trials run for each model's study

# --- Load the training table -----------------------------------------------
df = pd.read_csv(TRAIN_LONG_DF)
y = df['sii']                         # target column
X = df.drop(columns=['sii', 'id'])    # every other column except the id is a feature
# test_df = pd.read_csv(TEST_LONG_DF)
df.head()

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,...,stat_110,stat_111,stat_112,stat_113,stat_114,stat_115,stat_116,stat_117,stat_118,stat_119
0,-0.361407,-0.757178,0.534609,-0.176658,0.01448,-0.148472,1.618361e-15,-0.743453,1.156487,-0.007807,...,-0.3606,0.226256,0.260711,-1.364576,0.33155,0.08984,-0.200212,0.131316,0.332056,0.071358
1,0.805674,1.320694,-1.387545,0.668686,0.506766,0.579658,1.618361e-15,-0.743453,-0.672362,-0.923205,...,-0.800228,-0.49434,0.177486,0.732828,0.292056,-0.054929,0.117144,0.131316,0.332056,0.140799
2,1.097444,-0.757178,0.260016,-0.414877,1.491337,0.479718,1.618361e-15,-0.743453,-0.824766,0.724512,...,0.39969,0.251541,0.294279,0.732828,0.306761,0.073754,0.117144,0.131316,-0.531463,-0.472595
3,0.513904,-0.757178,-0.655296,3.192956,0.647419,2.145374,1.618361e-15,-0.209968,-0.062746,0.114246,...,-0.632352,0.290468,-0.123746,0.732828,0.350768,0.041583,0.117144,0.131316,1.195574,-0.206405
4,-0.361407,-0.757178,-0.472233,-0.431727,-0.379348,-0.453048,1.618361e-15,-0.591029,0.470669,0.358352,...,0.488918,1.804691,-0.150545,-1.364576,0.31456,-0.247953,0.117144,0.131316,1.195574,-0.889239


In [3]:
# Dtype census of the feature columns (everything except the target and the id).
# DataFrame.drop keeps the original column order, unlike a set difference whose
# ordering depends on string hashing.
df.drop(columns=['sii', 'id']).dtypes.value_counts(dropna=False)

float64    260
int64       10
Name: count, dtype: int64

In [4]:
# Class balance of the target.
df['sii'].value_counts()

sii
0    583
1    266
2    137
3     10
Name: count, dtype: int64

In [5]:
def custom_cohen_kappa_scorer(y_true, y_pred):
    """Quadratic-weighted Cohen's kappa of `y_pred` after tuning three cut points.

    The continuous predictions are binned into four ordinal classes with
    ``np.digitize``. The three bin edges start at 0.5 / 1.5 / 2.5 and are
    adjusted with Nelder-Mead to maximise the kappa against `y_true`.

    Note that the cut points are fitted on the very labels the score is then
    computed against, so the returned value is an upper-leaning estimate.

    Args:
        y_true: true ordinal labels.
        y_pred: continuous predictions, one per label.

    Returns:
        The quadratic-weighted kappa obtained with the tuned cut points.
    """

    def _kappa_at(cutoffs):
        # digitize needs monotonically ordered bin edges, and the optimiser
        # is free to propose them in any order.
        binned = np.digitize(y_pred, np.sort(cutoffs))
        return cohen_kappa_score(y_true, binned, weights='quadratic')

    def _negated_kappa(cutoffs):
        # minimize() minimises, so flip the sign to maximise kappa.
        return -_kappa_at(cutoffs)

    search = minimize(_negated_kappa, [0.5, 1.5, 2.5], method='Nelder-Mead')
    return _kappa_at(search.x)

def catboost_objective(trial):
    """Optuna objective for CatBoost: mean 5-fold threshold-tuned kappa.

    Args:
        trial: the Optuna trial used to draw `iterations`, `depth` and
            `learning_rate`.

    Returns:
        Mean of the per-fold scores from `custom_cohen_kappa_scorer`.
    """
    n_iterations = trial.suggest_int('iterations', 10, 1000)
    tree_depth = trial.suggest_int('depth', 4, 10)
    step_size = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)

    regressor = CatBoostRegressor(
        iterations=n_iterations,
        depth=tree_depth,
        learning_rate=step_size,
        random_seed=SEED,
        verbose=False,
    )

    # Folds are stratified on the target classes.
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    kappa_scorer = make_scorer(custom_cohen_kappa_scorer, greater_is_better=True)

    fold_scores = cross_val_score(
        regressor, X, y, cv=folds, scoring=kappa_scorer, n_jobs=-1
    )
    return fold_scores.mean()

# Seed the TPE sampler: without it every run draws different hyper-parameters,
# so the search is not repeatable even though the CV splits and models are
# seeded. The same sampler object is reused by the later studies in this notebook.
gaussian_sampler = optuna.samplers.TPESampler(multivariate=True, seed=SEED)
study = optuna.create_study(direction='maximize', sampler=gaussian_sampler)
study.optimize(catboost_objective, n_trials=TRAILS)

print("Best params for CatBoost:", study.best_params)

# train and save model
# Refit on the full training set with the best trial's parameters plus the
# fixed settings that the objective also used.
CATBOOST_MODEL = '../models/catboost_model.pkl'
params = {**study.best_params, 'random_seed': SEED, 'verbose': False}
model = CatBoostRegressor(**params)
model.fit(X, y)
# NOTE(review): save_model is CatBoost's own serializer, so the '.pkl' suffix
# probably does not mean a pickle file — confirm what the loading code expects.
model.save_model(CATBOOST_MODEL)

[I 2024-11-08 23:31:45,969] A new study created in memory with name: no-name-ec6eb607-ab35-4866-b694-7922396ff69d
[I 2024-11-08 23:31:59,494] Trial 0 finished with value: 0.36663518170458104 and parameters: {'iterations': 808, 'depth': 6, 'learning_rate': 0.1428810092223044}. Best is trial 0 with value: 0.36663518170458104.
[I 2024-11-08 23:32:01,215] Trial 1 finished with value: 0.3420160354894879 and parameters: {'iterations': 19, 'depth': 7, 'learning_rate': 0.1734149988010827}. Best is trial 0 with value: 0.36663518170458104.


Best params for CatBoost: {'iterations': 808, 'depth': 6, 'learning_rate': 0.1428810092223044}


In [6]:
# def lgbm_objective(trial):
#     params = {
#         'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
#         'max_depth': -1,
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
#         'random_state': SEED,
#         'verbosity': -1
#     }

#     model = LGBMRegressor(**params)

#     kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

#     score = cross_val_score(model, X, y, cv=kf, scoring=make_scorer(custom_cohen_kappa_scorer, greater_is_better=True), n_jobs=-1)
#     return score.mean()
# study = optuna.create_study(direction='maximize', sampler=gaussian_sampler)
# study.optimize(lgbm_objective, n_trials=TRAILS)
# print("Best params for LGBM:", study.best_params)

# # train and save model
# LGBM_MODEL = '../models/lgbm_model.pkl'
# params = study.best_params
# params['random_state'] = SEED
# params['verbosity'] = -1
# params['max_depth'] = -1
# model = LGBMRegressor(**params)
# model.fit(X, y)
# model.booster_.save_model(LGBM_MODEL)

In [7]:

def xgb_objective(trial):
    """Optuna objective for XGBoost: mean 5-fold threshold-tuned kappa.

    Args:
        trial: the Optuna trial used to draw `n_estimators`, `max_depth` and
            `learning_rate`.

    Returns:
        Mean of the per-fold scores from `custom_cohen_kappa_scorer`.
    """
    n_trees = trial.suggest_int('n_estimators', 10, 1000)
    tree_depth = trial.suggest_int('max_depth', 4, 10)
    step_size = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)

    regressor = XGBRegressor(
        n_estimators=n_trees,
        max_depth=tree_depth,
        learning_rate=step_size,
        random_state=SEED,
    )

    # Folds are stratified on the target classes.
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    kappa_scorer = make_scorer(custom_cohen_kappa_scorer, greater_is_better=True)

    fold_scores = cross_val_score(
        regressor, X, y, cv=folds, scoring=kappa_scorer, n_jobs=-1
    )
    return fold_scores.mean()
# Hyper-parameter search for XGBoost, reusing the sampler created for the
# CatBoost study.
study = optuna.create_study(direction='maximize', sampler=gaussian_sampler)
study.optimize(xgb_objective, n_trials=TRAILS)
print("Best params for XGB:", study.best_params)

# train and save model
# Refit on the full training set with the best trial's parameters plus the
# fixed seed that the objective also used.
XGB_MODEL = '../models/xgb_model.pkl'
params = {**study.best_params, 'random_state': SEED}
model = XGBRegressor(**params)
model.fit(X, y)
# NOTE(review): save_model is XGBoost's own serializer, so the '.pkl' suffix
# probably does not mean a pickle file — confirm what the loading code expects.
model.save_model(XGB_MODEL)

[I 2024-11-08 23:32:07,378] A new study created in memory with name: no-name-ef4cb7ec-e181-4a41-9576-1f4db672bac4
[I 2024-11-08 23:32:22,718] Trial 0 finished with value: 0.39252788366532093 and parameters: {'n_estimators': 453, 'max_depth': 6, 'learning_rate': 0.023918845317147202}. Best is trial 0 with value: 0.39252788366532093.
[I 2024-11-08 23:32:29,598] Trial 1 finished with value: 0.37938758719361587 and parameters: {'n_estimators': 689, 'max_depth': 4, 'learning_rate': 0.16637411641895952}. Best is trial 0 with value: 0.39252788366532093.


Best params for XGB: {'n_estimators': 453, 'max_depth': 6, 'learning_rate': 0.023918845317147202}




In [10]:
def rf_objective(trial):
    """Optuna objective for a random forest: mean 5-fold threshold-tuned kappa.

    Args:
        trial: the Optuna trial used to draw `n_estimators` and `max_depth`.

    Returns:
        Mean of the per-fold scores from `custom_cohen_kappa_scorer`.
    """
    n_trees = trial.suggest_int('n_estimators', 10, 1000)
    tree_depth = trial.suggest_int('max_depth', 4, 10)

    regressor = RandomForestRegressor(
        n_estimators=n_trees,
        max_depth=tree_depth,
        random_state=SEED,
    )

    # Folds are stratified on the target classes.
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    kappa_scorer = make_scorer(custom_cohen_kappa_scorer, greater_is_better=True)

    fold_scores = cross_val_score(
        regressor, X, y, cv=folds, scoring=kappa_scorer, n_jobs=-1
    )
    return fold_scores.mean()
# Hyper-parameter search for the random forest, reusing the sampler created
# for the CatBoost study.
study = optuna.create_study(direction='maximize', sampler=gaussian_sampler)
study.optimize(rf_objective, n_trials=TRAILS)
print("Best params for RF:", study.best_params)

# train and save model
# Refit on the full training set with the best trial's parameters plus the
# fixed seed that the objective also used.
RF_MODEL = '../models/rf_model.pkl'
params = {**study.best_params, 'random_state': SEED}
model = RandomForestRegressor(**params)
model.fit(X, y)
# Use the public `pickle` module (imported with the other imports at the top
# of the notebook) rather than the private `_pickle` accelerator module.
with open(RF_MODEL, 'wb') as f:
    pickle.dump(model, f)

[I 2024-11-08 23:37:19,218] A new study created in memory with name: no-name-7f4104c5-8489-4c38-8b31-27b6b4a1cc84
[I 2024-11-08 23:37:39,349] Trial 0 finished with value: 0.35976635033558957 and parameters: {'n_estimators': 389, 'max_depth': 10}. Best is trial 0 with value: 0.35976635033558957.
[I 2024-11-08 23:38:02,094] Trial 1 finished with value: 0.3528795531587369 and parameters: {'n_estimators': 592, 'max_depth': 7}. Best is trial 0 with value: 0.35976635033558957.


Best params for RF: {'n_estimators': 389, 'max_depth': 10}
