In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
# Register tqdm with pandas so that `progress_apply` becomes available.
from tqdm import tqdm
tqdm.pandas()

# Show up to 202 columns and 300 rows when a frame is displayed.
pd.set_option("display.max_columns", 202)
pd.set_option("display.max_rows", 300)

In [2]:
# Load the pickled train and test frames.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
train, test = map(pd.read_pickle, ('data/train_original.pkl', 'data/test_original.pkl'))

In [3]:
# Partition the columns by dtype: float64 columns are the model features,
# every other column is "special" (kept out of the feature matrix).
dtype_by_col = train.dtypes
special_cols = [name for name, dt in dtype_by_col.items() if dt != np.float64]
feature_cols = [name for name, dt in dtype_by_col.items() if dt == np.float64]

In [4]:
# Flatten all feature cells of each frame (row by row) into one long Series,
# so values can be compared irrespective of the column they came from.
trn_vals = pd.Series(train[feature_cols].values.ravel())
tst_vals = pd.Series(test[feature_cols].values.ravel())

In [5]:
# Share of distinct values among all feature cells: (train, test).
tuple(vals.nunique() / len(vals) for vals in (trn_vals, tst_vals))

(0.02072085, 0.01857135)

In [6]:
# Fraction of cells whose value also occurs in the other set:
# (train cells seen in test, test cells seen in train).
tuple(
    vals.isin(other).sum() / len(vals)
    for vals, other in ((trn_vals, tst_vals), (tst_vals, trn_vals))
)

(0.99359275, 0.9970897)

In [7]:
import gc

# The flattened copies are no longer needed: drop them and run a collection.
del trn_vals
del tst_vals
gc.collect()

11

### CountVec: transform features into per-value count vectors

In [8]:
# Replace every feature cell with the number of times its value occurs in the
# stacked train+test feature table, then save the encoded frames.
# NOTE(review): counts are pooled over ALL feature columns, not computed per
# column — confirm this is intended.
# NOTE(review): the filter below is a strict `>`, so a value seen exactly
# `min_count` times is masked too — confirm this is intended.
min_count = 10

# Stack train on top of test and flatten row by row: one entry per cell.
all_values = pd.Series(
    pd.concat([train[feature_cols], test[feature_cols]], axis=0)
    .reset_index(drop=True)
    .values
    .ravel()
)

# Occurrence count of each distinct value, restricted to the frequent ones.
all_values_vc = all_values.value_counts().loc[lambda counts: counts > min_count]

# Look up each cell's count; cells without a retained count get -999.
# Then restore the (rows, features) layout.
all_values = (
    all_values.map(all_values_vc)
    .fillna(-999)
    .values
    .reshape((-1, len(feature_cols)))
)

# The first train.shape[0] rows are train, the remainder is test.
train_df = pd.DataFrame(data=all_values[:train.shape[0]], columns=feature_cols)
test_df = pd.DataFrame(data=all_values[train.shape[0]:], columns=feature_cols)
train_df.to_pickle('count vec data/train_cv.pkl')
test_df.to_pickle('count vec data/test_cv.pkl')

In [9]:
# Largest entry of the encoded matrix (quick sanity check of the counts).
np.max(all_values)

2592.0

In [10]:
def to_texts(row):
    """Render one row as a single space-separated string.

    Every element is converted with ``str`` and each ``-`` is replaced by
    ``_`` (so ``-999.0`` becomes ``_999.0``) — presumably so the sign stays
    part of the token when the text is tokenized later.
    """
    # Replacing after the join is equivalent to replacing per element,
    # because the separator (a space) never contains "-".
    return " ".join(map(str, row)).replace("-", "_")

from sklearn.feature_extraction.text import CountVectorizer

# One text "document" per row of the stacked (train first, then test) frame.
all_df = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)
all_texts = all_df.progress_apply(to_texts, axis=1)

# Token-count matrix over those documents, stored as float32.
cv = CountVectorizer(dtype=np.float32)
all_texts_sparse = cv.fit_transform(all_texts)

# Split the rows back by position: the first train.shape[0] belong to train.
n_train_docs = train.shape[0]
pd.to_pickle(all_texts_sparse[:n_train_docs], 'count vec data/train_cv_sparse.pkl')
pd.to_pickle(all_texts_sparse[n_train_docs:], 'count vec data/test_cv_sparse.pkl')

100%|████████████████████████████████████████████████████████████████████| 400000/400000 [00:46<00:00, 8639.31it/s]


### Train a 10-fold LGB

In [11]:
# Reload the dense count-encoded frames written by the count-encoding cell.
# The sparse CountVectorizer variants are stored alongside them as
# 'count vec data/train_cv_sparse.pkl' and 'count vec data/test_cv_sparse.pkl'.
train_df, test_df = map(pd.read_pickle, ('count vec data/train_cv.pkl', 'count vec data/test_cv.pkl'))

In [12]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import time

version = 'kh_lgb_10fold_vc_ver1'

# LightGBM hyper-parameters.
# A former 'subsample': 0.81 entry was removed: 'subsample' is an alias of
# 'bagging_fraction', which is set explicitly below and takes precedence, so
# the alias was ignored and only obscured which value is actually used.
# NOTE(review): early stopping monitors 'binary_logloss' while folds are
# scored with ROC AUC further down — confirm the mismatch is intended.
params = {'num_leaves': 8,
          'min_data_in_leaf': 42,
          'objective': 'binary',
          'max_depth': 16,
          'learning_rate': 0.03,
          'boosting': 'gbdt',
          'bagging_freq': 5,
          'bagging_fraction': 0.8,
          'feature_fraction': 0.8201,
          'bagging_seed': 11,
          'reg_alpha': 1.7289,
          'reg_lambda': 4.984,
          'random_state': 42,
          'metric': 'binary_logloss',
          'verbosity': -1,
          'min_gain_to_split': 0.01,
          'min_child_weight': 19.428,
          'num_threads': 2}

# These arrays are identical for every fold, so build them once instead of
# re-extracting them from the frames inside the loop.
X_all = train_df[feature_cols].values
X_test = test_df[feature_cols].values
y_all = train.target.values

oof = np.zeros(len(train))         # out-of-fold predictions for the train rows
prediction = np.zeros(len(test))   # test predictions averaged over the folds

n_fold = 10
folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)
# Per-fold importance frames are collected here and concatenated once at the
# end, rather than growing a DataFrame (starting from an empty one) per fold.
fold_importances = []

for fold_n, (train_index, valid_index) in enumerate(folds.split(X_all, y_all)):

    print('Fold', fold_n, 'started at', time.ctime())
    X_train, X_valid = X_all[train_index], X_all[valid_index]
    y_train, y_valid = y_all[train_index], y_all[valid_index]

    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid)

    # NOTE: `verbose_eval` / `early_stopping_rounds` were removed from
    # `lgb.train` in LightGBM 4.0; on such versions pass
    # callbacks=[lgb.early_stopping(200), lgb.log_evaluation(100)] instead.
    model = lgb.train(params, train_data, num_boost_round=3200,
                      valid_sets=[train_data, valid_data], verbose_eval=100,
                      early_stopping_rounds=200)

    fold_importances.append(pd.DataFrame({
        "Feature": feature_cols,
        "importance": model.feature_importance(importance_type='gain'),
        "fold": fold_n + 1,
    }))

    # Score the held-out rows and add this fold's share of the test average.
    oof[valid_index] = model.predict(X_valid, num_iteration=model.best_iteration)
    prediction += model.predict(X_test, num_iteration=model.best_iteration) / n_fold
    gc.collect()
    print(roc_auc_score(y_valid, oof[valid_index]))

feature_importance_df = pd.concat(fold_importances, axis=0)

full_auc = roc_auc_score(y_all, oof)
print(full_auc)
# baseline: raw features 10 fold: 0.8984
# kh_lgb_10fold_vc_ver1: 0.7896

Fold 0 started at Thu Mar 14 19:59:33 2019
Training until validation scores don't improve for 200 rounds.
[100]	training's binary_logloss: 0.324797	valid_1's binary_logloss: 0.325842
[200]	training's binary_logloss: 0.323642	valid_1's binary_logloss: 0.325697
[300]	training's binary_logloss: 0.322568	valid_1's binary_logloss: 0.325552
[400]	training's binary_logloss: 0.321577	valid_1's binary_logloss: 0.325544
[500]	training's binary_logloss: 0.320579	valid_1's binary_logloss: 0.32553
Early stopping, best iteration is:
[326]	training's binary_logloss: 0.322306	valid_1's binary_logloss: 0.32551
0.5295218366507743
Fold 1 started at Thu Mar 14 20:00:49 2019
Training until validation scores don't improve for 200 rounds.


KeyboardInterrupt: 

In [None]:
# Persist the out-of-fold and test predictions, then write the submission
# CSV; its file name embeds the overall AUC with '.' replaced by '_'.
out_prefix = 'oof+submission/' + version
pd.to_pickle(oof, out_prefix + '_oof_train')
pd.to_pickle(prediction, out_prefix + '_oof_test')

sub = pd.DataFrame({"ID_code": test.ID_code.values, "target": prediction})
auc_tag = str(full_auc).replace('.', '_')
sub.to_csv(out_prefix + '_' + auc_tag + ".csv", index=False)

In [None]:
# Average gain importance per feature over the folds, largest first.
mean_gain = (
    feature_importance_df
    .groupby('Feature')[['importance']]
    .mean()
    .sort_values('importance', ascending=False)
)
mean_gain