<a href="https://colab.research.google.com/github/ykoba84/tabular-playground-series-oct-2021/blob/main/code/TPS_Oct_LightGBM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

In [None]:
import random
import gc
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold,StratifiedKFold
from lightgbm import LGBMClassifier

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn import preprocessing
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

!pip install datatable
import datatable as dt

Collecting datatable
  Downloading datatable-1.0.0-cp37-cp37m-manylinux_2_12_x86_64.whl (96.9 MB)
[K     |████████████████████████████████| 96.9 MB 41 kB/s 
[?25hInstalling collected packages: datatable
Successfully installed datatable-1.0.0


## Memory Reduction

In [None]:
def reduce_memory_usage(df, verbose=True, use_float16=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    Each column whose dtype is one of the plain NumPy integer/float dtypes
    listed below is inspected: its minimum and maximum are compared against
    the representable range of progressively wider dtypes and the column is
    cast to the first one that can hold both. Columns of any other dtype
    (bool, object, category, ...) are left untouched.

    The frame is modified in place and also returned, so both
    ``reduce_memory_usage(df)`` and ``df = reduce_memory_usage(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be downcast.
    verbose : bool, default True
        Print the final memory footprint and the relative reduction.
    use_float16 : bool, default True
        Allow float columns to be stored as ``float16``. Only the value
        *range* is checked, not the precision, so this cast is lossy
        (float16 keeps roughly three significant decimal digits). Pass
        ``False`` to stop at ``float32``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    numerics = {"int8", "int16", "int32", "int64", "float16", "float32", "float64"}
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = str(df[col].dtypes)
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.startswith("int"):
            # Bounds are inclusive: a value equal to a dtype's min/max still
            # fits in that dtype. If nothing matches (e.g. an empty column,
            # whose min/max are NaN) the column is left as it is.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Fall back to float64 when no narrower float can hold the range
            # (this also covers all-NaN columns and columns containing inf,
            # for which the range comparisons are False).
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the percentage against a zero-byte starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

In [None]:
%%time
train = dt.fread('/content/drive/MyDrive/data/train.csv').to_pandas().drop('id', axis=1)
train = reduce_memory_usage(train)
test = dt.fread('/content/drive/MyDrive/data/test.csv').to_pandas().drop('id', axis=1)
test = reduce_memory_usage(test)
sub = dt.fread('/content/drive/MyDrive/data/sample_submission.csv').to_pandas()
sub = reduce_memory_usage(sub)

Mem. usage decreased to 501.63 Mb (73.2% reduction)
Mem. usage decreased to 250.34 Mb (73.3% reduction)
Mem. usage decreased to 2.86 Mb (50.0% reduction)
CPU times: user 2min 4s, sys: 12.1 s, total: 2min 16s
Wall time: 2min 45s


In [None]:
# Keep the label column as its own Series; it is used as the stratification
# key and the fit target in the cross-validation loop.
# NOTE(review): this is taken before the bool -> int conversion of `train`,
# so `y` keeps whatever dtype `target` had after loading — confirm intended.
y = train['target']

In [None]:
# Positional indices of every boolean-typed column in `train`.
bool_cols_train = [
    position
    for position, name in enumerate(train.columns)
    if train[name].dtypes == bool
]

In [None]:
# Positional indices of every boolean-typed column in `test`.
# The dtype is read from `test` itself, so the indices stay correct even if
# a column's dtype differs between the train and test frames.
bool_cols_test = [
    position
    for position, name in enumerate(test.columns)
    if test[name].dtypes == bool
]

In [None]:
# Convert the boolean columns to integers.
# The columns are replaced by label rather than written through
# `.iloc[:, positions] = ...`: positional assignment of values with a
# different dtype behaves differently across pandas versions (newer releases
# try to write into the existing bool columns in place), whereas label-based
# assignment always swaps in the new integer columns.
train_bool_names = train.columns[bool_cols_train]
test_bool_names = test.columns[bool_cols_test]
train[train_bool_names] = train[train_bool_names].astype(int)
test[test_bool_names] = test[test_bool_names].astype(int)

In [None]:
# Preview the first five rows of the training frame after preprocessing.
train.head(n=5)

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,f31,f32,f33,f34,f35,f36,f37,f38,f39,...,f246,f247,f248,f249,f250,f251,f252,f253,f254,f255,f256,f257,f258,f259,f260,f261,f262,f263,f264,f265,f266,f267,f268,f269,f270,f271,f272,f273,f274,f275,f276,f277,f278,f279,f280,f281,f282,f283,f284,target
0,0.205933,0.410889,0.176758,0.223633,0.423584,0.476074,0.413574,0.611816,0.534668,0.147339,0.026184,0.106628,0.200928,0.713379,0.155762,0.557129,0.341797,0.285645,0.230347,0.203979,0.509766,0.707031,1,0.007793,0.247803,0.263672,0.259521,0.231689,0.138428,0.197876,0.054382,0.194214,0.281494,0.034821,0.02533,0.114441,0.13916,0.246216,0.251465,0.70166,...,1,0,0,0,0,0,0,1,0,1,1,0,0,0,1,0,0,1,1,0,0,0,1,1,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,1
1,0.18103,0.473145,0.011734,0.213623,0.619629,0.44165,0.230347,0.686035,0.281982,0.238525,0.493408,0.1073,0.231812,0.457031,0.395264,0.617188,0.459473,0.209229,0.20105,0.199341,0.366699,0.585938,1,0.2854,0.400391,0.162476,0.24939,0.141113,0.133667,0.247925,0.139282,0.216431,0.10968,0.03302,0.017456,0.189331,0.168823,0.184204,0.202759,0.218506,...,1,0,0,1,0,1,0,1,1,0,0,0,1,0,0,0,0,1,1,0,1,1,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,1
2,0.182617,0.307373,0.325928,0.207153,0.605469,0.309814,0.493408,0.750977,0.536133,0.286865,0.139526,0.107239,0.247803,0.631836,0.347412,0.64209,0.257812,0.162598,0.327393,0.193604,0.495361,0.636719,0,0.007133,0.309814,0.221069,0.284912,0.230835,0.138306,0.199707,0.060394,0.146729,0.20813,0.03598,0.022629,0.113525,0.274902,0.182739,0.151489,0.569824,...,1,1,0,0,1,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,1
3,0.180298,0.494629,0.008369,0.223633,0.760742,0.439209,0.432129,0.776367,0.483887,0.260986,0.147095,0.105408,0.287842,0.455811,0.247925,0.616699,0.335938,0.336914,0.239136,0.176147,0.538086,0.706543,1,0.008827,0.35376,0.219971,0.266846,0.14563,0.13855,0.234985,0.059814,0.140869,0.205078,0.319336,0.009972,0.112305,0.288818,0.33252,0.140869,0.473877,...,1,1,0,1,1,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1
4,0.177124,0.495605,0.014259,0.548828,0.625488,0.5625,0.117188,0.561035,0.077087,0.158325,0.260254,0.102539,0.265381,0.503906,0.269775,0.545898,0.31958,0.278564,0.214966,0.200195,0.534668,0.728516,0,0.004841,0.323486,0.16626,0.2854,0.208618,0.200439,0.19812,0.414795,0.251953,0.193359,0.034485,0.016464,0.197266,0.207397,0.255859,0.139893,0.321045,...,0,0,0,1,1,1,0,1,0,1,0,0,1,1,0,1,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,1,1,0,1,0,0,1,0,1


In [None]:
# Preview the first five rows of the test frame after preprocessing.
test.head(n=5)

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,f31,f32,f33,f34,f35,f36,f37,f38,f39,...,f245,f246,f247,f248,f249,f250,f251,f252,f253,f254,f255,f256,f257,f258,f259,f260,f261,f262,f263,f264,f265,f266,f267,f268,f269,f270,f271,f272,f273,f274,f275,f276,f277,f278,f279,f280,f281,f282,f283,f284
0,0.178223,0.435547,0.010231,0.202026,0.390137,0.324219,0.22168,0.73877,0.58252,0.34375,0.028076,0.102905,0.263916,0.607422,0.172119,0.490967,0.326172,0.251953,0.222412,0.242432,0.470947,0.723145,0,0.151245,0.22644,0.30542,0.269775,0.139526,0.141602,0.200439,0.059479,0.283691,0.244385,0.033875,0.010559,0.112122,0.193481,0.182739,0.188354,0.418945,...,0,1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,1,1,0,0,1,1,0,0,0,0,0,1,1,1,0
1,0.181274,0.476562,0.022415,0.283203,0.598145,0.349609,0.283447,0.72168,0.269043,0.208374,0.27832,0.107605,0.402588,0.594238,0.193359,0.592285,0.345947,0.257812,0.201172,0.215942,0.449707,0.627441,1,0.133179,0.438477,0.21582,0.24292,0.156738,0.143921,0.697754,0.210938,0.269043,0.209229,0.040741,0.014839,0.113464,0.187622,0.206665,0.389404,0.535645,...,1,1,1,0,0,1,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0.159668,0.451172,0.259766,0.365234,0.594727,0.413574,0.249268,0.642578,0.411133,0.246948,0.029236,0.105408,0.230469,0.622559,0.190552,0.573242,0.53125,0.169678,0.224121,0.204346,0.453125,0.723633,0,0.008881,0.45752,0.199585,0.30127,0.217529,0.139282,0.200439,0.057343,0.488525,0.116272,0.040161,0.015915,0.113159,0.137085,0.182739,0.250244,0.572266,...,1,1,1,0,1,0,1,0,1,0,1,1,0,1,0,0,0,0,0,1,1,1,1,0,0,0,1,1,1,0,0,0,0,0,0,0,1,0,0,0
3,0.182373,0.520996,0.095337,0.327637,0.741699,0.358643,0.27002,0.601562,0.297852,0.25293,0.549805,0.225586,0.199341,0.667969,0.298096,0.598145,0.332275,0.240845,0.209106,0.210693,0.529785,0.755371,0,0.007473,0.340332,0.181519,0.449463,0.178467,0.144531,0.198975,0.057861,0.135376,0.525391,0.312744,0.011681,0.201904,0.198608,0.234619,0.329102,0.539551,...,0,1,1,0,1,0,1,0,1,0,0,1,1,1,1,0,0,0,0,1,0,0,1,0,1,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0
4,0.22937,0.336426,0.023514,0.301025,0.668945,0.481689,0.545898,0.667969,0.545898,0.202759,0.271973,0.107361,0.342529,0.634766,0.299072,0.62207,0.427979,0.275391,0.230347,0.192871,0.442871,0.650879,1,0.008278,0.296143,0.165649,0.250977,0.145752,0.140869,0.239624,0.055756,0.358643,0.118713,0.037323,0.013741,0.229614,0.13562,0.244751,0.243042,0.417725,...,1,1,0,0,0,0,0,0,1,0,1,0,0,1,1,0,0,0,0,0,0,0,1,1,1,0,1,0,1,1,0,0,0,0,1,0,0,1,0,0


## Modeling

In [None]:
# Feature name lists. f0..f241 are treated as continuous except f22 and f43,
# which are grouped with f242..f284 as categorical. `cols` is the full
# feature order used for training: continuous first, then categorical.
continous_cols = [f'f{i}' for i in range(242) if i not in (22, 43)]
categorical_cols = [f'f{i}' for i in range(242, 285)]
categorical_cols.extend(['f22', 'f43'])
cols = [*continous_cols, *categorical_cols]

In [None]:
# Keyword arguments passed to LGBMClassifier for every fold.
# NOTE(review): per the LightGBM docs, `subsample` only takes effect when
# `subsample_freq` is > 0, and `cat_smooth` only applies to features declared
# as categorical — neither is set here; confirm whether that is intended.
params = dict(
    reg_alpha=8.784125077358365,
    reg_lambda=0.0025286925777068953,
    colsample_bytree=0.2,
    subsample=0.5,
    learning_rate=0.025,
    max_depth=100,
    num_leaves=7,
    min_child_samples=185,
    cat_smooth=54,
    objective='binary',
    random_state=48,
    n_estimators=20000,  # upper bound; the training loop uses early stopping
    n_jobs=-1,
)

In [None]:
# 10-fold stratified cross-validation.
# Each fold trains a fresh LGBMClassifier with early stopping on the held-out
# part, records the validation AUC, and adds its test-set probabilities
# (divided by the number of folds) to `preds`, so `preds` ends up as the
# average prediction over all folds.
preds = np.zeros(test.shape[0])
kf = StratifiedKFold(n_splits=10, random_state=48, shuffle=True)
auc = []  # validation AUC of each fold, in fold order

# Select the feature columns once instead of re-slicing the full frames
# inside every fold.
X = train[cols]
X_test = test[cols]

for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y), start=1):
    X_tr, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    model = LGBMClassifier(**params)
    # `verbose` must be the boolean False: the string "False" is truthy and
    # therefore does not mean "quiet".
    # NOTE(review): `early_stopping_rounds` / `verbose` are fit() arguments of
    # the LightGBM 2.x/3.x API; LightGBM >= 4 expects callbacks instead.
    model.fit(
        X_tr,
        y_tr,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=100,
        eval_metric="auc",
        verbose=False,
    )

    preds += model.predict_proba(X_test)[:, 1] / kf.n_splits
    auc.append(roc_auc_score(y_val, model.predict_proba(X_val)[:, 1]))
    gc.collect()
    print(f"fold: {fold}, auc: {auc[-1]}")

In [15]:
# Average validation AUC across all folds.
np.asarray(auc).mean()

0.857029870154953

In [17]:
# Put the fold-averaged test predictions into the submission frame and
# write it out without the pandas index.
sub = sub.assign(target=preds)
sub.to_csv('/content/drive/MyDrive/data/submission.csv', index=False)