In [1]:
# https://www.kaggle.com/competitions/playground-series-s3e24/overview

In [2]:
from warnings import filterwarnings
filterwarnings('ignore')

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
import catboost as cb
import xgboost as xg
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from gc import collect
import joblib
import os
from tqdm import tqdm_notebook
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import GridSearchCV

In [4]:
train=pd.read_csv('/home/ivan_pronin/IT/kaggle/smoker/data/train.csv')
test=pd.read_csv('/home/ivan_pronin/IT/kaggle/smoker/data/test.csv')

In [6]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 25)

In [12]:
def calculate_dmi(df, features):
    df['IMT'] = df['weight(kg)']/(df['height(cm)']/100)**2

    rule_dict = {
        'cat_1': df['IMT'] < 16.5,
        'cat_2': (df['IMT'] >= 16.5) & (df['IMT'] < 18.4),
        'cat_3': (df['IMT'] >= 18.4) & (df['IMT'] < 24.9),
        'cat_4': (df['IMT'] >= 24.9) & (df['IMT'] < 30),
        'cat_5': (df['IMT'] >= 30) & (df['IMT'] < 34.9),
        'cat_6': (df['IMT'] >= 34.9) & (df['IMT'] < 40),
        'cat_7': (df['IMT'] >= 40)
    }

    for i, key in enumerate(rule_dict):
        df.loc[rule_dict[key], 'IMT'] = i+1
    df['IMT'] = df['IMT'].astype(float)

    features.append('IMT')

    return df, features

In [73]:
def create_special_features(df):
    # order the ears
    best = np.where(df['hearing(left)'] < df['hearing(right)'],
                    df['hearing(left)'],  df['hearing(right)'])
    worst = np.where(df['hearing(left)'] < df['hearing(right)'],
                     df['hearing(right)'],  df['hearing(left)'])
    df['hearing(left)'] = best - 1
    df['hearing(right)'] = worst - 1

    # order the eyes - eyesight is worst to best, and 9+ should be worst!
    df['eyesight(left)'] = np.where(
        df['eyesight(left)'] > 9, 0, df['eyesight(left)'])
    df['eyesight(right)'] = np.where(
        df['eyesight(right)'] > 9, 0, df['eyesight(right)'])
    best = np.where(df['eyesight(left)'] < df['eyesight(right)'],
                    df['eyesight(left)'],  df['eyesight(right)'])
    worst = np.where(df['eyesight(left)'] < df['eyesight(right)'],
                     df['eyesight(right)'],  df['eyesight(left)'])
    df['eyesight(left)'] = best
    df['eyesight(right)'] = worst
    ##
    df['Gtp'] = np.clip(df['Gtp'], 0, 300)
    df['HDL'] = np.clip(df['HDL'], 0, 110)
    df['LDL'] = np.clip(df['LDL'], 0, 200)
    df['ALT'] = np.clip(df['ALT'], 0, 150)
    df['AST'] = np.clip(df['AST'], 0, 100)
    df['serum creatinine'] = np.clip(df['serum creatinine'], 0, 3)

In [105]:
def generate_features(df):

    col_to_del = ['id', 'smoking']

    for col in col_to_del:
        if col in df.columns:
            df.drop(columns=[col], inplace=True)

    # all features available in dataset before feature engineering
    features = sorted(set(df.columns.to_list()))

    # Make new features
    
    # df, features = calculate_dmi(df, features)

    create_special_features(df)

    return df[features]

In [106]:
TRAINING = True
if TRAINING:
    # upload train dataset
    train = pd.read_csv('/home/ivan_pronin/IT/kaggle/smoker/data/train.csv')

    y = train['smoking'].to_numpy()  # make target
    train = generate_features(train)  # make features

    os.system('mkdir models')

    print(f'train dataset was preproccessed for training')
else:
    print(f'This is not training')

collect()

train dataset was preproccessed for training


mkdir: cannot create directory ‘models’: File exists


2107

In [100]:
GPU_mode = 'Y'

model_dict = {
    'lgbm': lgb.LGBMClassifier(
        boosting_type='gbdt',
        n_estimators=3000,
        learning_rate=0.0171,
        reg_alpha=1.10961,
        reg_lambda=0.0037,

        max_depth=13,
        colsample_bytree=0.2364,
        subsample=0.8764,
        num_leaves=150,
        min_child_weight=46,

        n_jobs=-1,
        metric='auc',
        objective='binary',
        early_stopping_round=50,
        verbosity=1
    ),
    'xgb': xg.XGBClassifier(
        booster='gbtree',
        learning_rate=0.027,
        n_estimators=3000,
        reg_alpha=3.05,
        reg_lambda=0.12,

        max_depth=11,
        colsample_bylevel=0.82,
        colsample_bytree=0.20,
        subsample=0.97,

        objective='binary:logistic',
        eval_metric='auc',
        early_stopping_rounds=50,
        tree_method='gpu_hist' if GPU_mode == 'Y' else 'auto',
        verbosity=0
    ),
    'cb': cb.CatBoostClassifier(
        n_estimators=3000,
        learning_rate=0.02,
        l2_leaf_reg=1.2,

        max_depth=8,

        eval_metric='AUC',
        task_type='GPU' if GPU_mode == 'Y' else 'CPU',
        early_stopping_rounds=50,
        metric_period=200,
        # verbose=False
    ),
    'rf': RandomForestClassifier(
        n_estimators=150,
        max_depth=20,
        min_samples_leaf=10,
        min_samples_split=5,
        n_jobs=-1
    ),
    'logreg': LogisticRegression(
        l1_ratio=0.001,
        solver='saga',
        max_iter=200,
        penalty='l2',
        n_jobs=-1
    ),
    'knn': KNeighborsClassifier(
        n_neighbors=50,
        weights='distance'
    )

}

In [94]:
# from mlxtend.evaluate import bias_variance_decomp

def train_func(train_data, y_data, model_name, n_splits, models, oof_score):
    if TRAINING:

        splitter = StratifiedKFold(
            n_splits=n_splits)

        model = model_dict[model_name]

        for fold_id, (train_idx, test_idx) in tqdm_notebook(enumerate(splitter.split(train_data, y_data))):
            x_train, x_test = train_data.iloc[train_idx], train_data.iloc[test_idx]
            y_train, y_test = y_data[train_idx], y_data[test_idx]

            if model_name == 'lgbm':
                model.fit(
                    x_train, y_train,
                    eval_set=[(x_test, y_test)]
                )

            elif model_name == 'cb':
                train_dataset = cb.Pool(x_train, y_train,
                                        )
                test_dataset = cb.Pool(x_test, y_test,
                                       )
                model.fit(
                    train_dataset,
                    eval_set=test_dataset,
                )

            elif model_name == 'xgb':
                model.fit(
                    x_train, y_train,
                    eval_set=[(x_test, y_test)],
                    verbose=200
                )

            else:
                model.fit(
                    x_train, y_train)

            models.append(model)
            joblib.dump(model, f'./models/{model_name}_{fold_id}_fold.model')

            # Creating OOF scores:-
            auc_score = roc_auc_score(y_test, model.predict(x_test))
            oof_score.append(auc_score)
            print(f'for {model_name} fold:{fold_id} auc {auc_score}')

            del auc_score, x_train, x_test, y_train, y_test
            collect()

    else:
        for i in range(n_splits):
            try:
                models.append(joblib.load(
                    f'/models/{model_name}_{i}_fold.model'))
                print(
                    f'{model_name}_{i}_fold.model from working dir successfully uploaded')
            except Exception as e:
                print(
                    f'uploading {model_name}_{i}_fold.model from working dir failed with {e}')

In [101]:
basic_models = []
basic_oof_score = []
n_splits = 10

for model in model_dict:
    if model in ['xgb', 'lgbm']:
        train_func(train, y, model, n_splits,
                   basic_models, basic_oof_score)
print(f"OOF score for all basic_models: {np.mean(basic_oof_score):.4f}")
best_basic_model_idx = np.argmax(basic_oof_score)
best_basic_model = basic_models[best_basic_model_idx]
print(best_basic_model, best_basic_model_idx,
      basic_oof_score[best_basic_model_idx])

0it [00:00, ?it/s]

[LightGBM] [Info] Number of positive: 62687, number of negative: 80643
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003780 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2039
[LightGBM] [Info] Number of data points in the train set: 143330, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.437361 -> initscore=-0.251878
[LightGBM] [Info] Start training from score -0.251878
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1625]	valid_0's auc: 0.872758
for lgbm fold:0 auc 0.7885049289149337
[LightGBM] [Info] Number of positive: 62687, number of negative: 80643
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001173 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `

0it [00:00, ?it/s]

[0]	validation_0-auc:0.69450
[200]	validation_0-auc:0.85972
[400]	validation_0-auc:0.86824
[600]	validation_0-auc:0.87077
[800]	validation_0-auc:0.87186
[1000]	validation_0-auc:0.87240
[1189]	validation_0-auc:0.87247
for xgb fold:0 auc 0.7877869485972684
[0]	validation_0-auc:0.69429
[200]	validation_0-auc:0.85591
[400]	validation_0-auc:0.86532
[600]	validation_0-auc:0.86873
[800]	validation_0-auc:0.87000
[1000]	validation_0-auc:0.87076
[1200]	validation_0-auc:0.87111
[1400]	validation_0-auc:0.87125
[1600]	validation_0-auc:0.87135
[1605]	validation_0-auc:0.87135
for xgb fold:1 auc 0.7902512298254789
[0]	validation_0-auc:0.69893
[200]	validation_0-auc:0.86115
[400]	validation_0-auc:0.86988
[600]	validation_0-auc:0.87242
[800]	validation_0-auc:0.87366
[1000]	validation_0-auc:0.87446
[1200]	validation_0-auc:0.87497
[1400]	validation_0-auc:0.87516
[1459]	validation_0-auc:0.87512
for xgb fold:2 auc 0.7933762298254788
[0]	validation_0-auc:0.69180
[200]	validation_0-auc:0.85608
[400]	validatio

In [26]:
# #check distribution by bootstrap

# from sklearn.utils import resample

# roc_auc = []
# n_iterations = 1000
# for i in tqdm_notebook(range(n_iterations)):
#     X_bs, y_bs = resample(train, y, replace=True)
#     # make predictions
#     y_hat = best_model.predict(X_bs)
#     # evaluate model
#     score = roc_auc_score(y_bs, y_hat)
#     roc_auc.append(score)

# # plot distribution of roc_auc
# sns.kdeplot(roc_auc)
# plt.title("Accuracy across 1000 bootstrap samples of the held-out test set")
# plt.xlabel("Accuracy")
# plt.show()

# lower = np.percentile(roc_auc, 2.5)
# upper = np.percentile(roc_auc, 97.5)

# print("95%_of convidence level: [{:.4f}, {:.4f}]".format(lower, upper))

In [27]:
# df = pd.DataFrame()

# for model in models:
#     if type(model).__name__ == 'XGBClassifier':
#         row=model.get_booster().get_score(importance_type='weight')
#         df_transform=pd.DataFrame(row.items(), columns=['feature', f'XGBoost'])
#         df_transform.set_index('feature', inplace=True)
#         df = pd.concat([df, df_transform], axis=1)
#     elif type(model).__name__ == 'LGBMClassifier':
#         row=model.feature_importances_
#         df_transform=pd.DataFrame(row, columns=[f'LGBM'], index=train.columns)
#         df = pd.concat([df, df_transform], axis=1)
#     elif type(model).__name__ == 'CatBoostClassifier':
#         row=model.get_feature_importance()
#         df_transform=pd.DataFrame(row, columns=[f'Catboost'], index=train.columns)
#         df = pd.concat([df, df_transform], axis=1)

# df=df.T.drop_duplicates().T
# df

In [28]:
# lgbm_features_to_del=df['LGBM']
# lower_bound = np.percentile(lgbm_features_to_del, 25)
# lgbm_features_to_del.loc[
#     (lgbm_features_to_del < lower_bound)] = np.nan
# lgbm_features_to_del.dropna(inplace=True)
# lgbm_list_after_del=lgbm_features_to_del.index.tolist()

In [112]:
# download test dataset
test = pd.read_csv('/home/ivan_pronin/IT/kaggle/smoker/data/test.csv')

# generate feature for test dataset
test = generate_features(test)

# make a predictions (2 variants)

# predictions = basic_models[7].predict_proba(test)*0.3+basic_models[8].predict_proba(test)*0.3+\
#     basic_models[17].predict_proba(test)*0.2+basic_models[18].predict_proba(test)*0.2

predictions = np.mean([model.predict_proba(test) for model in basic_models], 0)

predictions

array([[0.30904611, 0.69095389],
       [0.75313631, 0.2468637 ],
       [0.50518261, 0.49481739],
       ...,
       [0.57783241, 0.4221676 ],
       [0.91886459, 0.0811354 ],
       [0.97934413, 0.02065586]])

In [113]:
test = pd.read_csv('/home/ivan_pronin/IT/kaggle/smoker/data/test.csv')
output = pd.DataFrame({'id': test.id, 'smoking': predictions[:, 1]})
output.to_csv('my_answer5.csv', index=False)