In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

In [2]:
import random
import gc
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold,StratifiedKFold
from lightgbm import LGBMClassifier

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn import preprocessing
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

#!pip install datatable
#import datatable as dt

In [3]:
# Load the three competition files in one pass: training set, test set and
# the submission template (in that order).
train, test, sub = map(
    pd.read_csv,
    ('../input/train.csv', '../input/test.csv', '../input/sample_submission.csv'),
)
# Labels are kept as a separate Series for the CV splitter and the model.
y = train['target']

In [4]:
# Print a summary of the training frame: row range, column count, dtypes
# and memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Columns: 287 entries, id to target
dtypes: float64(240), int64(47)
memory usage: 2.1 GB


## Modeling

In [5]:
# Feature groups. f0..f241 are handled as continuous, except f22 and f43,
# which are grouped with f242..f284 as categorical.
continous_cols = [
    name
    for name in map('f{}'.format, range(242))
    if name not in {'f22', 'f43'}
]
categorical_cols = list(map('f{}'.format, range(242, 285))) + ['f22', 'f43']

# Full model input: continuous features first, then categorical ones.
cols = [*continous_cols, *categorical_cols]

In [6]:
from sklearn.preprocessing import RobustScaler

# Learn the scaling statistics from the training data only, then apply the
# same fitted transform to both frames so the test set never influences them.
scaler = RobustScaler().fit(train[continous_cols])
train[continous_cols] = scaler.transform(train[continous_cols])
test[continous_cols] = scaler.transform(test[continous_cols])

In [7]:
# Keyword arguments forwarded verbatim to LGBMClassifier(**params).
params = dict(
    # Regularisation
    reg_alpha=8.784125077358365,
    reg_lambda=0.0025286925777068953,
    # Column / row sampling
    colsample_bytree=0.2,
    # NOTE(review): LightGBM documents row bagging as active only when
    # subsample_freq > 0; none is set here, so this may have no effect — confirm.
    subsample=0.5,
    # Boosting schedule
    learning_rate=0.025,
    # Tree shape
    # NOTE(review): with num_leaves=7 the depth limit of 100 looks non-binding — confirm.
    max_depth=100,
    num_leaves=7,
    min_child_samples=185,
    cat_smooth=54,
    # Task and reproducibility
    objective='binary',
    random_state=48,
    # Upper bound on boosting rounds.
    n_estimators=20000,
    n_jobs=-1,
)

In [8]:
# Stratified 10-fold cross-validation.
# Each fold trains a fresh LGBMClassifier with early stopping on the held-out
# part, records its validation AUC in `auc`, and adds 1/n_splits of its test
# probabilities to `preds`, so `preds` ends up as the fold-averaged prediction.
preds = np.zeros(test.shape[0])
kf = StratifiedKFold(n_splits=10, random_state=48, shuffle=True)
auc = []  # validation AUC of each fold, in fold order

# Selecting a column list copies the frame, so do it once up front instead of
# re-materialising the same data several times on every fold.
X_all = train[cols]
X_test = test[cols]

for fold, (trn_idx, val_idx) in enumerate(kf.split(X_all, y), start=1):
    X_tr, X_val = X_all.iloc[trn_idx], X_all.iloc[val_idx]
    y_tr, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    model = LGBMClassifier(**params)
    # Stop once validation AUC has not improved for 100 rounds.
    # NOTE(review): `early_stopping_rounds` / `verbose` as fit() keyword
    # arguments are reported removed in LightGBM 4.0 in favour of callbacks
    # (early_stopping / log_evaluation) — confirm and migrate when upgrading.
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=100,
        eval_metric="auc",
        verbose=False,
    )

    # Positive-class probability, weighted so the folds sum to an average.
    preds += model.predict_proba(X_test)[:, 1] / kf.n_splits
    auc.append(roc_auc_score(y_val, model.predict_proba(X_val)[:, 1]))
    print(f"fold: {fold}, auc: {auc[-1]}")

    # Release the per-fold copies before the next iteration allocates new ones.
    gc.collect()

fold: 1, auc: 0.8573776914255349
fold: 2, auc: 0.8573726570206998
fold: 3, auc: 0.8576769133129075
fold: 4, auc: 0.8569299989955711
fold: 5, auc: 0.8583327159427404
fold: 6, auc: 0.8558183007221458
fold: 7, auc: 0.8558685487684545
fold: 8, auc: 0.8559062244031764
fold: 9, auc: 0.8560541217394786
fold: 10, auc: 0.8587173073938704


In [9]:
# Average of the per-fold validation AUC values collected in `auc`.
np.mean(auc)

0.8570054479724579

In [11]:
# Put the fold-averaged test predictions into the submission template and
# write it to disk without the index column.
sub['target'] = preds

# to_csv does not create missing parent directories, so make sure the output
# folder exists; exist_ok keeps this a no-op when it is already there.
os.makedirs('../output', exist_ok=True)
sub.to_csv('../output/submission.csv', index=False)

# Quick visual check of the first few submitted rows.
sub.head()

Unnamed: 0,id,target
0,1000000,0.763309
1,1000001,0.237555
2,1000002,0.901139
3,1000003,0.867541
4,1000004,0.248017
