In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import sys
from pathlib import Path

import matplotlib.pyplot as plt
%matplotlib inline

import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import gc
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [2]:
# Read the train and test tables (in that order) from their CSV files under ../data.
train, test = map(pd.read_csv, ("../data/train.csv", "../data/test.csv"))

In [None]:
# Split the test rows into "real" and "synthetic" ones.
# For every feature column, find the values that occur exactly once in that
# column and flag the row holding each of them. A row with at least one flag is
# listed in real_samples_indexes; a row with no flag at all is listed in
# synthetic_data_indices.
# Credits: https://www.kaggle.com/yag320/list-of-fake-samples-and-public-private-lb-split
df_test = test.drop(['ID_code'], axis=1).values  # drop() already returns a new frame

unique_samples = []  # never filled in this cell; kept so the name stays defined
unique_count = np.zeros_like(df_test)  # same shape as the feature matrix, all zeros
# Plain range(): tqdm is not imported anywhere in this notebook, so wrapping the
# iterator in it raised NameError on a fresh kernel.
for feature in range(df_test.shape[1]):
    # index_ holds the first row position of each distinct value, count_ its frequency.
    _, index_, count_ = np.unique(df_test[:, feature], return_counts=True, return_index=True)
    unique_count[index_[count_ == 1], feature] += 1

# Number of column-unique values per row, computed once and reused for both masks.
unique_per_row = np.sum(unique_count, axis=1)
real_samples_indexes = np.argwhere(unique_per_row > 0)[:, 0]
synthetic_data_indices = np.argwhere(unique_per_row == 0)[:, 0]

In [3]:
def _variable_number(column_name):
    """Return the integer that follows the first underscore in ``column_name``."""
    return int(column_name.split('_')[1])


# Feature columns are every train column except the id and the label,
# ordered by the integer embedded in the column name.
raw_columns = list(np.setdiff1d(train.columns, ["ID_code", "target"]))
raw_columns.sort(key=_variable_number)

In [4]:
# True for every test row whose index value is listed in synthetic_data_indices.
is_synthetic_row = test.index.isin(synthetic_data_indices)

# Each partition gets its own fresh 0..n-1 index.
real_test = test.loc[~is_synthetic_row].reset_index(drop=True)
fake_test = test.loc[is_synthetic_row].reset_index(drop=True)

In [5]:
# Display the (rows, columns) shapes of the real and the synthetic test partitions.
real_test.shape, fake_test.shape

((100000, 201), (100000, 201))

In [6]:
# Stack the real test rows underneath train, under one fresh 0..n-1 index,
# so later cells can work on both sets as a single frame.
panel = pd.concat(
    [train, real_test],
    ignore_index=True,
    sort=False,
)

In [7]:
# Names of every derived column, in creation order (four per raw feature).
engineered_columns = []
for col in raw_columns:
    # Rewrite a single status line with the feature currently being processed.
    sys.stdout.write('\r{}'.format(col))
    sys.stdout.flush()

    count_name = f"{col}_count"
    ratio_name = f"{col}_count_ratio"
    mult_name = f"{col}_count_mult"
    diff_name = f"{col}_val_count_diff"

    values = panel[col]
    # For each row: how many rows of panel carry exactly the same value in this feature.
    panel[count_name] = values.map(values.value_counts())
    occurrences = panel[count_name]

    panel[ratio_name] = values / occurrences
    panel[mult_name] = values * occurrences
    # Descending percentile rank of the value minus descending percentile rank of its count.
    panel[diff_name] = values.rank(ascending=False, pct=True) - occurrences.rank(ascending=False, pct=True)

    engineered_columns.extend([count_name, ratio_name, mult_name, diff_name])

var_199

In [8]:
# Separate panel back into its two sources using the prefix of ID_code,
# giving each part a fresh 0..n-1 index.
id_codes = panel["ID_code"].astype(str)

x_train = panel.loc[id_codes.str.startswith("train")].reset_index(drop=True)
x_test = panel.loc[id_codes.str.startswith("test")].reset_index(drop=True)

In [11]:
# Full test frame: the real test rows taken from panel first, then the synthetic
# rows, under one fresh index. fake_test comes straight from `test`, so any column
# that exists only in x_test is filled with NaN for those rows by the concat.
x_test_final = pd.concat(
    [x_test, fake_test],
    ignore_index=True,
    sort=False,
)

In [26]:
# LightGBM training parameters passed to lgb.train.
# The original cell also carried these disabled/alternative settings as comments:
#   boost_from_average='false', max_depth=6, max_bin=1023,
#   and 0.75 / 0.5 as other feature_fraction values.
params = dict(
    bagging_freq=5,
    bagging_fraction=0.8,
    boost='gbdt',
    feature_fraction=0.35,
    learning_rate=0.01,
    metric='auc',
    min_data_in_leaf=255,
    min_sum_hessian_in_leaf=20.0,
    num_leaves=3,
    reg_alpha=0.95,
    reg_lambda=0.95,
    num_threads=-1,
    tree_learner='serial',
    objective='binary',
    verbosity=1,
)

In [27]:
# Cross-validation setup: cap on boosting rounds, number of folds, and a
# shuffled stratified splitter with a fixed seed.
num_rounds, kfold = 100_000, 5
folds = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=42)

# Accumulators for the training loop: one out-of-fold slot per training row,
# one prediction slot per test row, and a dict that will hold the fold models.
oof = np.zeros(x_train.shape[0])
predictions = np.zeros(x_test_final.shape[0])
model_objects = dict()

In [28]:
# Train one LightGBM model per stratified fold. Each fold stores its model in
# model_objects, writes its validation predictions into oof, and adds
# 1/n_splits of its test predictions to `predictions`.

# Start from clean accumulators: `predictions` is built with +=, so re-running
# this cell without re-creating the arrays would stack a second set of fold
# averages on top of the first.
oof[:] = 0
predictions[:] = 0
model_objects.clear()

# Same feature list for every fold, so build it once.
cols_for_model = raw_columns + engineered_columns

for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train, x_train["target"])):
    fold_num = f"Fold_{fold_}"
    print(fold_num)
    print(len(cols_for_model))

    # split() yields positional row numbers, so select rows by position.
    build = x_train.iloc[trn_idx]
    val = x_train.iloc[val_idx]

    lbuild = lgb.Dataset(build[cols_for_model], label=build['target'])
    lval = lgb.Dataset(val[cols_for_model], label=val['target'], reference=lbuild)
    # NOTE(review): verbose_eval / early_stopping_rounds are keyword arguments of
    # older lgb.train releases; newer LightGBM versions appear to expect callbacks
    # instead — confirm against the installed version.
    clf = lgb.train(params, lbuild, num_rounds, valid_sets=[lbuild, lval], verbose_eval=500, early_stopping_rounds=1000)
    model_objects[fold_num] = clf

    # Score with the best iteration found by early stopping.
    val_preds = clf.predict(val[cols_for_model], num_iteration=clf.best_iteration)
    oof[val_idx] = val_preds
    predictions += clf.predict(x_test_final[cols_for_model], num_iteration=clf.best_iteration) / folds.n_splits

Fold_0
1000
Training until validation scores don't improve for 1000 rounds.
[500]	training's auc: 0.799575	valid_1's auc: 0.790849
[1000]	training's auc: 0.834762	valid_1's auc: 0.8268
[1500]	training's auc: 0.854291	valid_1's auc: 0.845596
[2000]	training's auc: 0.86708	valid_1's auc: 0.857645
[2500]	training's auc: 0.876278	valid_1's auc: 0.865884
[3000]	training's auc: 0.883607	valid_1's auc: 0.87269
[3500]	training's auc: 0.889621	valid_1's auc: 0.878127
[4000]	training's auc: 0.894258	valid_1's auc: 0.882574
[4500]	training's auc: 0.898474	valid_1's auc: 0.886185
[5000]	training's auc: 0.901624	valid_1's auc: 0.888773
[11500]	training's auc: 0.923555	valid_1's auc: 0.907027
[12000]	training's auc: 0.924473	valid_1's auc: 0.907834
[12500]	training's auc: 0.925301	valid_1's auc: 0.908437
[13000]	training's auc: 0.926137	valid_1's auc: 0.909061
[13500]	training's auc: 0.926932	valid_1's auc: 0.909643
[14000]	training's auc: 0.927644	valid_1's auc: 0.91011
[14500]	training's auc: 0.92

In [29]:
# Average, over the stored fold models, of each model's best "auc" on "valid_1"
# (the name the training log gives to the second entry of valid_sets).
np.mean([m.best_score["valid_1"]["auc"] for m in model_objects.values()])

0.9174341029325582

In [30]:
# Submission table: the test ids next to the fold-averaged model output.
submit = pd.DataFrame(
    data={
        "ID_code": x_test_final["ID_code"],
        "target": predictions,
    }
)

In [31]:
# Preview the first rows of the submission table.
submit.head()

Unnamed: 0,ID_code,target
0,test_3,0.175899
1,test_7,0.083944
2,test_11,0.03212
3,test_15,0.01251
4,test_16,0.461827


In [32]:
# Write the submission as a gzip-compressed CSV without the index column.
# Create the output folder first, so a missing ../submissions directory cannot
# make the write fail after the long training run.
submission_path = Path("../submissions/version_1000featuresNew_cv91744.csv.gz")
submission_path.parent.mkdir(parents=True, exist_ok=True)
submit.to_csv(submission_path, index=False, compression="gzip")

In [33]:
# Display the (rows, columns) shape of the submission table.
submit.shape

(200000, 2)