In [1]:
import os
import sys
import collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import preprocessing, model_selection, metrics, ensemble
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
import warnings
warnings.simplefilter("ignore")

In [2]:
# Location of the competition CSVs; the leak targets file is read from the
# working directory instead.
data_dir = 'data/'
print('Data Files are: ', os.listdir(data_dir))

train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
test = pd.read_csv(os.path.join(data_dir, 'test.csv'))

# Zeros in the leak file are turned into NaN so that they read as "missing"
# when the final cell checks `leak_target` with notnull().
test_leak = pd.read_csv('test_leak_targets.csv')
test_leak = test_leak.replace(0, np.nan)

print('Train DataFrame Shape: ', train.shape)
print('Test DataFrame Shape: ', test.shape)
print('Leak DataFrame Shape: ', test_leak.shape)
print(test_leak.head())
train.head()

Data Files are:  ['sample_submission.csv.zip', 'test.csv', 'test.csv.zip', 'train.csv', 'train.csv.zip']
Train DataFrame Shape:  (4459, 4993)
Test DataFrame Shape:  (49342, 4992)
Leak DataFrame Shape:  (49342, 2)
          ID  leak_target
0  000137c73          NaN
1  00021489f          NaN
2  0004d7953          NaN
3  00056a333          NaN
4  00056d8eb          NaN


Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


In [3]:
# Ordered list of 40 column names.
# NOTE(review): presumably the column sequence the leak targets were derived
# from -- confirm. No other cell visible in this notebook references it.
leak_cols = [
    'f190486d6', '58e2e02e6', 'eeb9cd3aa', '9fd594eec',
    '6eef030c1', '15ace8c9f', 'fb0f5dbfe', '58e056e12',
    '20aa07010', '024c577b9', 'd6bb78916', 'b43a7cfd5',
    '58232a6fb', '1702b5bf0', '324921c7b', '62e59a501',
    '2ec5b290f', '241f0f867', 'fb49e4212', '66ace2992',
    'f74e8f13d', '5c6487af1', '963a49cdc', '26fc93eb7',
    '1931ccfdd', '703885424', '70feb1494', '491b9ee45',
    '23310aa6f', 'e176a204a', '6619d81fc', '1db387535',
    'fc99f9426', '91f701ba2', '0572565c2', '190db8488',
    'adb64ff71', 'c47340d97', 'c5a231d81', '0ff32eb98',
]

## Calculate Feature Importance with Leak Features

In [4]:
# Every column after the first two (ID, target) is treated as a feature.
c = train.columns.tolist()[2:]

# Spearman rank correlation between the raw target and each feature column;
# element [0] of spearmanr's result is the correlation coefficient.
correlation = [spearmanr(train['target'], train[feature])[0] for feature in c]

corr = pd.DataFrame({'col_name': c, 'col_values': correlation})
corr = corr.sort_values(by='col_values')
corr.head() # The correlations are small

Unnamed: 0,col_name,col_values
216,77eb013ca,-0.116095
1908,a60027bb4,-0.115835
1378,3adf5e2b5,-0.114185
220,186b87c05,-0.113428
2232,f8b733d3f,-0.113011


In [5]:
# Regression target on the log1p scale (predictions are mapped back with
# expm1 in the later cells).
y_train = np.log1p(train['target'])

# Feature matrices: train drops its first two columns (ID, target),
# test drops its first column (ID).
x_train = train[train.columns[2:]]
x_test = test[test.columns[1:]]

In [None]:
# Rank features by ExtraTrees importance, fitted against the log1p target.
model = ensemble.ExtraTreesRegressor(
    n_estimators=200,
    max_depth=20,
    max_features=0.5,
    n_jobs=-1,
    random_state=0,
)
model.fit(x_train, y_train)

importances = model.feature_importances_
# Column positions of the 50 most important features, most important first.
idx = np.argsort(importances)[::-1][:50]
# Show the names of the top 10.
x_train.columns.values[idx][:10]

In [None]:
def _with_nonzero_row_stats(frame, feature_names):
    """Return `frame[feature_names]` plus per-row statistics of its non-zero values.

    Parameters
    ----------
    frame : pandas.DataFrame
        Feature matrix containing at least the columns in `feature_names`.
    feature_names : list of str
        Columns to keep and to aggregate over.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) holding the selected columns
        followed by 'mean', 'sum', 'min', 'max', 'var', 'std', 'skew' and
        'kurtosis', with remaining NaN values replaced by 0.
    """
    stat_names = ['mean', 'sum', 'min', 'max', 'var', 'std', 'skew', 'kurtosis']
    selected = frame[feature_names]
    # Mask zeros to NaN so the reductions below skip them, then round the
    # remaining values to 2 decimals. Only the selected columns are masked,
    # instead of the full-width frame.
    nonzero = np.round(selected[selected != 0], 2)
    stats = pd.DataFrame(
        {name: getattr(nonzero, name)(axis=1) for name in stat_names},
        columns=stat_names,
    )
    # Building a new frame (rather than assigning columns onto a slice) means
    # pandas' chained-assignment check no longer has to be switched off.
    return pd.concat([selected, stats], axis=1).fillna(0)


# Names of the columns picked by the importance ranking (`idx`).
chosen_features = list(x_train.columns.values[idx])

# NOTE: x_train / x_test are rebound to the reduced frames here, so this cell
# must be re-run together with the cells that define x_train, x_test and idx.
x_train = _with_nonzero_row_stats(x_train, chosen_features)
x_test = _with_nonzero_row_stats(x_test, chosen_features)

chosen_features = chosen_features + ['mean', 'sum', 'min', 'max', 'var', 'std', 'skew', 'kurtosis']
x_train.head()

In [None]:
# LightGBM regressor for a single train/validation split
def lgb_model(x_train, x_test, y_train, y_test):
    """Train a LightGBM regression booster with early stopping.

    Parameters
    ----------
    x_train, y_train :
        Features and target used for fitting.
    x_test, y_test :
        Features and target of the held-out part; used only as the
        validation set that drives early stopping.

    Returns
    -------
    The booster returned by `lgb.train` (at most 1000 rounds, early stopping
    after 100 rounds without improvement on the validation RMSE).
    """
    params = {
        "objective": "regression",
        "metric": "rmse",
        "num_leaves": 40,
        "learning_rate": 0.01,
        "bagging_fraction": 0.8,
        "feature_fraction": 0.8,
        # The LightGBM parameter is `bagging_freq`; the previous key
        # "bagging_frequency" is not a recognised name, so bagging was never
        # enabled and bagging_fraction / bagging_seed had no effect.
        "bagging_freq": 5,
        "bagging_seed": 42,
        "verbosity": -1,
        "max_depth": 7,
    }
    # Local names avoid shadowing the notebook-level `train` / `test` frames.
    train_set = lgb.Dataset(x_train, label=y_train)
    valid_set = lgb.Dataset(x_test, label=y_test)
    model = lgb.train(params, train_set, 1000, valid_sets=[valid_set], early_stopping_rounds=100, verbose_eval=200)
    return model

In [None]:
# Using KFold to train multiple LightGBM models and average their test predictions
num_kfold = 5
kf = model_selection.KFold(n_splits=num_kfold, shuffle=True, random_state=666)

target = pd.DataFrame(columns=['ID', 'target'])
target['ID'] = test['ID']
target['target'] = 0.0
for t_idx, v_idx in kf.split(x_train):
    # KFold.split yields positional row numbers, so select by position
    # (.iloc); label-based .loc is only equivalent for a default RangeIndex.
    t_x, v_x = x_train.iloc[t_idx], x_train.iloc[v_idx]
    t_y, v_y = y_train.iloc[t_idx], y_train.iloc[v_idx]
    model = lgb_model(t_x, v_x, t_y, v_y)
    # The model is fitted on log1p(target); map predictions back before summing.
    target['target'] += np.expm1(model.predict(x_test))

# Average over the folds.
target['target'] /= float(num_kfold)
target.to_csv('submission1.csv', index=False)

In [None]:
def xgb_model(x_train, x_test, y_train, y_test):
    """Train an XGBoost regression booster with early stopping.

    `x_train` / `y_train` are used for fitting; `x_test` / `y_test` are the
    held-out part, listed last in the watchlist under the name 'valid'.
    Trains for at most 8000 rounds with early stopping after 100 rounds and
    returns the booster produced by `xgb.train`.
    """
    booster_params = dict(
        objective='reg:linear',
        eval_metric='rmse',
        eta=0.001,
        max_depth=8,
        subsample=0.6,
        colsample_bytree=0.7,
        alpha=0.001,
        random_state=42,
        silent=True,
    )
    dtrain = xgb.DMatrix(x_train, label=y_train)
    dvalid = xgb.DMatrix(x_test, label=y_test)
    return xgb.train(
        booster_params,
        dtrain,
        num_boost_round=8000,
        evals=[(dtrain, 'train'), (dvalid, 'valid')],
        maximize=False,
        early_stopping_rounds=100,
        verbose_eval=1000,
    )

In [None]:
# Using KFold to train multiple XGBoost models and average their test predictions
num_kfold = 5
kf = model_selection.KFold(n_splits=num_kfold, shuffle=True, random_state=666)

target = pd.DataFrame(columns=['ID', 'target'])
target['ID'] = test['ID']
target['target'] = 0.0
for t_idx, v_idx in kf.split(x_train):
    # KFold.split yields positional row numbers, so select by position
    # (.iloc); label-based .loc is only equivalent for a default RangeIndex.
    t_x, v_x = x_train.iloc[t_idx], x_train.iloc[v_idx]
    t_y, v_y = y_train.iloc[t_idx], y_train.iloc[v_idx]
    model = xgb_model(t_x, v_x, t_y, v_y)
    # The model is fitted on log1p(target); map predictions back before summing.
    target['target'] += np.expm1(model.predict(xgb.DMatrix(x_test)))

# Average over the folds.
target['target'] /= float(num_kfold)
target.to_csv('submission2.csv', index=False)

In [None]:
def cat_model(x_train, x_test, y_train, y_test):
    """Fit a CatBoostRegressor on one split and return the fitted model.

    `x_train` / `y_train` are used for fitting; `x_test` / `y_test` are passed
    as the evaluation set, and `use_best_model=True` is requested for it.
    """
    settings = dict(
        iterations=1000,
        learning_rate=0.02,
        depth=8,
        eval_metric='RMSE',
        random_seed=66,
        bagging_temperature=0.2,
        od_type='Iter',
        metric_period=50,
        od_wait=20,
    )
    model = CatBoostRegressor(**settings)
    model.fit(
        x_train,
        y_train,
        eval_set=(x_test, y_test),
        use_best_model=True,
        verbose=True,
    )
    return model

In [None]:
# Using KFold to train multiple CatBoost models and average their test predictions
num_kfold = 5
kf = model_selection.KFold(n_splits=num_kfold, shuffle=True, random_state=666)

target = pd.DataFrame(columns=['ID', 'target'])
target['ID'] = test['ID']
# Float accumulator, consistent with the other K-fold cells.
target['target'] = 0.0
for t_idx, v_idx in kf.split(x_train):
    # KFold.split yields positional row numbers, so select by position
    # (.iloc); label-based .loc is only equivalent for a default RangeIndex.
    t_x, v_x = x_train.iloc[t_idx], x_train.iloc[v_idx]
    t_y, v_y = y_train.iloc[t_idx], y_train.iloc[v_idx]
    # This cell produces the CatBoost submission; it previously called
    # lgb_model, so submission3.csv was a second LightGBM run and cat_model
    # was never used.
    model = cat_model(t_x, v_x, t_y, v_y)
    # The model is fitted on log1p(target); map predictions back before summing.
    target['target'] += np.expm1(model.predict(x_test))

# Average over the folds.
target['target'] /= float(num_kfold)
target.to_csv('submission3.csv', index=False)

In [None]:
# Reload the three per-model submissions written by the K-fold cells.
lgb_data = pd.read_csv('submission1.csv')
xgb_data = pd.read_csv('submission2.csv')
cat_data = pd.read_csv('submission3.csv')

# Equal-weight blend of the three models (all three files were written with
# the same ID column, so a row-wise average is used).
target = pd.DataFrame(columns=['ID', 'target'])
target['ID'] = lgb_data['ID']
target['target'] = (lgb_data['target'] + xgb_data['target'] + cat_data['target']) / 3

# Overwrite predictions with the leaked targets where one is available.
# The lookup is keyed on ID rather than on row position, so a differently
# ordered leak file cannot write values onto the wrong rows.
known_leaks = (
    test_leak.dropna(subset=['leak_target'])
    .drop_duplicates(subset='ID')
    .set_index('ID')['leak_target']
)
leak_for_row = target['ID'].map(known_leaks)
target['target'] = leak_for_row.fillna(target['target'])
target.to_csv('submission_final_small.csv', index=False)