In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import time

from pathlib import Path

%matplotlib inline

In [2]:
from elo_helpers import *

In [93]:
train = pd.read_feather('./data/train_v4.feather')
test = pd.read_feather('./data/test_v4.feather')

In [7]:
train.head()

Unnamed: 0,first_active_month,card_id,target,feature_1_mean_enc,feature_2_mean_enc,feature_3_mean_enc,feature_1_1,feature_1_2,feature_1_3,feature_1_4,...,new_purchase_amount_max,new_purchase_amount_min,new_purchase_amount_std,new_installments_sum,new_installments_median,new_installments_max,new_installments_min,new_installments_std,new_month_lag_min,new_month_lag_max
0,6,C_ID_92a2005557,-0.820283,-0.501972,-0.342233,-0.429176,0,0,0,0,...,-0.296112,-0.724368,0.135812,0.0,0.0,0.0,0.0,0.0,1.0,2.0
1,1,C_ID_3d0044924f,0.392913,-0.377079,-0.389225,-0.349962,0,0,0,1,...,-0.701858,-0.73941,0.014326,6.0,1.0,1.0,1.0,0.0,1.0,2.0
2,8,C_ID_d639edf6cd,0.688056,-0.349808,-0.342233,-0.349962,0,1,0,0,...,-0.700326,-0.700326,,0.0,0.0,0.0,0.0,,2.0,2.0
3,9,C_ID_186d6a6901,0.142495,-0.377079,-0.512248,-0.349962,0,0,0,1,...,-0.56674,-0.734135,0.065882,5.0,1.0,1.0,-1.0,0.755929,1.0,2.0
4,11,C_ID_cdbd2c0db2,-0.159749,-0.305659,-0.512248,-0.349962,1,0,0,0,...,0.450886,-0.739395,0.223821,35.0,1.0,2.0,-1.0,0.376913,1.0,2.0


In [4]:
import xgboost as xgb
import lightgbm as lgb

In [5]:
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV, train_test_split

In [94]:
X = train.drop(['card_id', 'target'], axis=1)
y = train.target

X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.35, random_state=11)

Overall strategy here will be to run the training set through a randomized search combined with KFolds for both XGBoost and LightGBM. Then, once I have the suggested parameters from the randomized search, repeat the process (maybe with KFold and maybe without) on a grid search of parameters that are close to those found by the randomized search. 

After the parameters are chosen, then I will submit predictions using each algorithm independently before trying mixes of the two algorithms' predictions (is this similar to regressing the two algo's preds onto the target?). 

- *Still need to look into outliers. Create clusters?*
- *am I still overfitting with too many trees?*

## XGBoost

In [7]:
xgb_params_rand = {
    'learning_rate': [0.03, 0.01, 0.005, 0.001],
    'max_depth': [5, 7, 10],
    'subsample': [0.6, 0.8, 0.95],
    'colsample_bytree': [0.6, 0.8, 0.9, 1],
    'colsample_bylevel': [0.75, 1],
    'n_estimators': [100, 175, 250],
    'min_child_weight': [2, 3, 6]
#     'objective': 'reg:linear', 
#     'eval_metric': 'rmse', 
#     'silent': True
}

In [85]:
xgb_reg = xgb.XGBRegressor(n_jobs=-1, silent=True)

xgb_random = RandomizedSearchCV(xgb_reg,
                               param_distributions=xgb_params_rand,
                               scoring= 'neg_mean_squared_error', #calc_rmse,
                               cv=5, n_jobs=-1)

In [20]:
xgb_random.fit(X, y)

RandomizedSearchCV(cv=5, error_score='raise',
          estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=-1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
          fit_params=None, iid=True, n_iter=10, n_jobs=-1,
          param_distributions={'learning_rate': [0.03, 0.01, 0.005, 0.001], 'max_depth': [5, 7, 10], 'subsample': [0.6, 0.8, 0.95], 'colsample_bytree': [0.6, 0.8, 0.9, 1], 'colsample_bylevel': [0.75, 1], 'n_estimators': [100, 175, 250], 'min_child_weight': [2, 3, 6]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='neg_mean_squared_error',
          verbose=0)

In [24]:
xgb_random.best_params_

{'subsample': 0.8,
 'n_estimators': 175,
 'min_child_weight': 6,
 'max_depth': 5,
 'learning_rate': 0.03,
 'colsample_bytree': 0.8,
 'colsample_bylevel': 1}

In [37]:
xgb_random.best_score_

-13.88111955179334

In [26]:
xgb_params_grid = {
    'subsample': [0.75, 0.8, 0.85],
    'n_estimators': [150, 175, 200],
    'min_child_weight': [5, 6],
    'max_depth': [5, 6],
    'learning_rate': [0.03, 0.01],
    'colsample_bytree': [0.75, 0.8, 0.85]
}

In [28]:
xgb_grid = GridSearchCV(xgb_reg,
                       param_grid=xgb_params_grid,
                       scoring='neg_mean_squared_error',
                       cv=5, n_jobs=-1)

In [34]:
current = time.time()
xgb_grid.fit(X, y)
time.time() - current

33990.286532878876

In [38]:
xgb_grid.best_params_

{'colsample_bytree': 0.8,
 'learning_rate': 0.03,
 'max_depth': 6,
 'min_child_weight': 6,
 'n_estimators': 200,
 'subsample': 0.85}

In [36]:
xgb_grid.best_score_

-13.793381969214288

In [10]:
13.79**0.5

3.713488925525428

In [43]:
xgb_best = xgb_grid.best_estimator_

In [52]:
xgb_best.save_model('./models/xgb_best_grid.model')

In [97]:
xgb_best = xgb.sklearn.XGBRegressor({'nthread':4})
xgb_best.load_model('./models/xgb_best_grid.model')

In [99]:
xgb_preds = xgb_best.predict(test.drop('card_id', axis=1))

In [49]:
xgb_preds_df = pd.DataFrame(np.array([test.card_id, xgb_preds]).T, columns=['card_id', 'target'])
xgb_preds_df.head()

Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,-1.62725
1,C_ID_130fd0cbdd,-0.0981798
2,C_ID_b709037bc5,-0.710223
3,C_ID_d27d835a9f,0.0165643
4,C_ID_2b5e3df5c2,-1.12372


In [50]:
xgb_preds_df.to_csv('./submissions/sub_5_xgb.csv', index=False)

## LightGBM

In [19]:
lgb_params_rand = {
    'learning_rate': [0.03, 0.01, 0.005, 0.001],
    'n_estimators': [100, 175, 250],
    'max_depth': [5, 7, 9],
    'min_child_samples':[6, 12, 20],
    'num_leaves': [31, 64],
    'subsample': [0.6, 0.8, 1],
    'colsample_bytree': [0.6, 0.8, 1],
    'reg_alpha':[0, 0.5, 1],
    'reg_lambda':[0, 0.5, 1]
}

In [20]:
lgb_reg = lgb.LGBMRegressor(n_jobs=-1, silent=True)

lgb_rand = RandomizedSearchCV(lgb_reg, param_distributions=lgb_params_rand,
                             scoring='neg_mean_squared_error',
                             cv=5, n_jobs=-1)

In [21]:
lgb_rand.fit(X, y)

RandomizedSearchCV(cv=5, error_score='raise',
          estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
       n_jobs=-1, num_leaves=31, objective=None, random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=0),
          fit_params=None, iid=True, n_iter=10, n_jobs=-1,
          param_distributions={'learning_rate': [0.03, 0.01, 0.005, 0.001], 'n_estimators': [100, 175, 250], 'max_depth': [5, 7, 9], 'min_child_samples': [6, 12, 20], 'num_leaves': [31, 64], 'subsample': [0.6, 0.8, 1], 'colsample_bytree': [0.6, 0.8, 1], 'reg_alpha': [0, 0.5, 1], 'reg_lambda': [0, 0.5, 1]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='neg_mean_squared_error',
          verbose=0)

In [22]:
lgb_rand.best_score_

-13.761889376565035

In [23]:
lgb_rand.best_params_

{'subsample': 1,
 'reg_lambda': 1,
 'reg_alpha': 0.5,
 'num_leaves': 64,
 'n_estimators': 250,
 'min_child_samples': 6,
 'max_depth': 7,
 'learning_rate': 0.03,
 'colsample_bytree': 1}

In [31]:
lgb_params_grid = {
    'n_estimators': [200],
    'subsample': [0.75, 0.85],
    'reg_lambda': [0.9, 1],
    'reg_alpha': [0.5, 0.8],
    'num_leaves': [64],
    'min_child_samples': [6, 10],
    'max_depth': [6, 7, 8],
    'learning_rate': [0.03, 0.01],
    'colsample_bytree': [0.75, 0.85]
}

In [32]:
lgb_grid = GridSearchCV(lgb_reg,
                       param_grid=lgb_params_grid,
                       scoring='neg_mean_squared_error',
                       cv=5, n_jobs=-1)

In [34]:
current = time.time()
lgb_grid.fit(X, y)
time.time() - current

7619.442352294922

In [35]:
lgb_grid.best_score_

-13.741341283499333

In [36]:
lgb_grid.best_params_

{'colsample_bytree': 0.75,
 'learning_rate': 0.03,
 'max_depth': 8,
 'min_child_samples': 10,
 'n_estimators': 200,
 'num_leaves': 64,
 'reg_alpha': 0.8,
 'reg_lambda': 0.9,
 'subsample': 0.75}

In [37]:
lgb_best = lgb_grid.best_estimator_

In [38]:
lgb_best.booster_.save_model('./models/lgb_best_grid.model')

In [39]:
foo = lgb.Booster(model_file='./models/lgb_best_grid.model')

In [47]:
lgb_preds = lgb_best.predict(test.drop('card_id', axis=1))

lgb_preds_df = pd.DataFrame(np.array([test.card_id, lgb_preds]).T,
                            columns=['card_id', 'target'])

In [48]:
lgb_preds_df.head()

Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,-1.50155
1,C_ID_130fd0cbdd,-0.122454
2,C_ID_b709037bc5,-0.439529
3,C_ID_d27d835a9f,-0.0363408
4,C_ID_2b5e3df5c2,-1.33748


In [49]:
lgb_preds_df.to_csv('./submissions/sub_6_lgb.csv', index=False)

## Ensembling/Stacking!

In [60]:
xgb_preds_df = pd.read_csv('./submissions/sub_5_xgb.csv')

In [67]:
preds_df = lgb_preds_df.merge(xgb_preds_df, on='card_id', suffixes=('_lgb', '_xgb'))

In [68]:
mean = 0.5*preds_df.target_lgb + 0.5*preds_df.target_xgb
quarter = 0.25*preds_df.target_lgb + 0.75*preds_df.target_xgb
three_quarter = 0.75*preds_df.target_lgb + 0.25*preds_df.target_xgb

In [70]:
preds_df['mean'] = mean
preds_df['quarter'] = quarter
preds_df['three_quarter'] = three_quarter

In [74]:
preds_df[['card_id', 'mean']].rename(columns={'mean':'target'}).to_csv('./submissions/sub_7_xgblgb_mean.csv', index=False)

In [75]:
preds_df[['card_id', 'three_quarter']].rename(columns={'three_quarter':'target'}).to_csv('./submissions/sub_8_xgblgb_34.csv', 
                                                                                index=False)

In [76]:
preds_df[['card_id', 'quarter']].rename(columns={'quarter':'target'}).to_csv('./submissions/sub_9_xgblgb_14.csv', 
                                                                                index=False)

In [77]:
from sklearn.linear_model import LinearRegression

In [101]:
xgb_tr_preds = xgb_best.predict(X)
lgb_tr_preds = lgb_best.predict(X)

In [105]:
lr = LinearRegression()
lr.fit(np.array([xgb_tr_preds, lgb_tr_preds]).T, train.target)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [106]:
stack_preds = lr.predict(np.array([xgb_preds, lgb_preds]).T)

In [116]:
stack_preds_df = pd.DataFrame(np.array([test.card_id, stack_preds]).T, 
                             columns=['card_id', 'target'])

stack_preds_df.to_csv('./submissions/sub_10_xgblgb_stack.csv', index=False)