In [1]:
import pandas as pd
import glob
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
import numpy as np
# https://scikit-learn.org/stable/modules/ensemble.html#extremely-randomized-trees

### Loading Data

In [2]:
# Read the train and test summary tables from the local Elo competition folder.
train_data, test_data = [
    pd.read_csv(csv_path)
    for csv_path in (
        "C:\\Kaggle competetion datasets\\Elo Merchent Category Recommendation\\Elo Feb 18th\\summary_card\\df_train_summary.csv",
        "C:\\Kaggle competetion datasets\\Elo Merchent Category Recommendation\\Elo Feb 18th\\summary_card\\df_test_summary.csv",
    )
]

In [3]:
# Quick sanity check of the loaded tables: the train shape is printed, the test
# shape is shown as the cell's output value.
print(train_data.shape)
test_data.shape

(201917, 95)


(123623, 93)

### Functions to concatenate features, kfold splits

In [4]:
def feature_concat(train_data1, filepath):
    """Left-join every feature CSV onto the base table by ``card_id``.

    Parameters
    ----------
    train_data1 : pandas.DataFrame
        Base table; must contain a ``card_id`` column.
    filepath : iterable of str
        Paths of CSV files, each of which must also contain ``card_id``.
        Files are merged in the order given.

    Returns
    -------
    pandas.DataFrame
        The base table with the columns of every file appended. Rows of the
        base table are kept even when a file has no matching ``card_id``.
        When ``filepath`` is empty the input frame itself is returned.
    """
    merged = train_data1
    for csv_path in filepath:
        merged = merged.merge(pd.read_csv(csv_path), how='left', on='card_id')
    return merged

In [5]:
def kfold_split(splitcount, train_data1, ignore_cols, random_state=2018):
    """Build stratified K-fold train/validation splits of the numeric features.

    Parameters
    ----------
    splitcount : int
        Number of folds.
    train_data1 : pandas.DataFrame
        Must contain a ``target`` column (returned as the label) and a
        ``target_bin`` column (the class label the folds are stratified on).
    ignore_cols : collection of str
        Column names excluded from the feature matrix.
    random_state : int, default 2018
        Seed passed to the shuffled ``StratifiedKFold``.

    Returns
    -------
    dict
        ``{fold_number: [dev_X, val_X, dev_y, val_y]}`` for fold numbers
        ``0 .. splitcount - 1``. ``dev_y`` and ``val_y`` are one-column
        frames holding ``target``.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    input_features = [col for col in train_data1.columns if col not in ignore_cols]
    # Only columns with one of the listed numeric dtypes are kept as features.
    train_X = train_data1[input_features].select_dtypes(include=numerics)
    train_y = train_data1[['target']]

    kf = StratifiedKFold(n_splits=splitcount, random_state=random_state, shuffle=True)
    splits = {}
    fold_indices = kf.split(train_X, train_data1['target_bin'])
    for fold, (dev_index, val_index) in enumerate(fold_indices):
        # split() yields row *positions*, not index labels, so select with
        # iloc; loc would fail or pick the wrong rows on any frame whose index
        # is not the default 0..n-1 range (e.g. after filtering or sorting).
        splits[fold] = [
            train_X.iloc[dev_index],
            train_X.iloc[val_index],
            train_y.iloc[dev_index],
            train_y.iloc[val_index],
        ]
    return splits

### Concatenate features

In [6]:
%%time
file_list = glob.glob("C:/Kaggle competetion datasets/Elo Merchent Category Recommendation/Elo Feb 18th/features_feb21st/*.csv")
train_features = feature_concat(train_data,file_list)

Wall time: 8.29 s


In [7]:
# Shape of the training table after the extra feature files were merged in.
train_features.shape

(201917, 123)

In [8]:
# train_features=train_data.copy()

In [9]:
# # Create correlation matrix
# corr_matrix = train_features.corr().abs()

# # Select upper triangle of correlation matrix
# upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# # Find index of feature columns with correlation greater than 0.80
# to_drop = [column for column in upper.columns if any(upper[column] > 0.80)]

# train_features=train_features1.drop(to_drop, axis=1).copy()

### Binning target variable

In [8]:
# Bucket the continuous target by its distance from zero, measured in standard
# deviations, so the folds can later be stratified on the bucket.
# Bin edges: [min-1, -3s, -2s, -1s, +1s, +2s, +3s, max+1]. pd.cut needs unique
# labels, so mirrored buckets get signed labels that abs() then merges
# (-3/3 and -2/2); the lowest tail (4) and highest tail (5) stay separate.
target_std = train_features['target'].std()
max_target = train_features['target'].max() + 1
min_target = train_features['target'].min() - 1
bins = [min_target] + [k * target_std for k in (-3, -2, -1, 1, 2, 3)] + [max_target]
labels = [-4, -3, -2, 0, 2, 3, 5]
train_features['target_bin'] = (
    pd.cut(train_features['target'], bins=bins, labels=labels)
    .astype(int)
    .abs()
)
# Row count per bucket.
train_features.groupby(['target_bin']).size()


target_bin
0    191093
2      8200
3       363
4      2237
5        24
dtype: int64

### KFold Splits

In [9]:
%%time
kfolds = kfold_split(5,train_features,['first_active_month', 'card_id','target','target_bin','year_nunique_y','year_nunique_x', 'outliers', 'Unnamed: 0'])

Wall time: 1.62 s


### LGBM

In [10]:
def run_lgb(train_X, train_y, val_X, val_y):
    """Train a LightGBM regressor with early stopping on one validation set.

    Parameters
    ----------
    train_X, train_y
        Features and label used to fit the booster.
    val_X, val_y
        Features and label of the single validation set that drives early
        stopping and metric logging.

    Returns
    -------
    (model, evals_result)
        The object returned by ``lgb.train`` and the dict that LightGBM fills
        with the recorded validation metric.
    """
    param = {'num_leaves': 30,
             'min_data_in_leaf': 177,
             'objective': 'regression',
             'max_depth': 9,
             'learning_rate': 0.01,
             "boosting": "gbdt",
             # "feature_fraction": 0.7,  (disabled by the author)
             "bagging_freq": 1,
             "bagging_fraction": 0.7,
             "bagging_seed": 11,
             "metric": 'rmse',
             "lambda_l1": 0.1,
             "random_state": 133,
             "verbosity": -1}
    num_rounds = 2000   # upper bound on boosting iterations
    patience = 100      # stop after this many rounds without improvement
    log_every = 100     # print the validation metric every N rounds

    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    evals_result = {}

    if hasattr(lgb, "log_evaluation"):
        # The early_stopping_rounds / verbose_eval / evals_result keyword
        # arguments were removed from lgb.train in LightGBM 4.0; callbacks are
        # the supported way to request the same behaviour.
        callbacks = [
            lgb.early_stopping(patience),
            lgb.log_evaluation(log_every),
            lgb.record_evaluation(evals_result),
        ]
        model = lgb.train(param, lgtrain, num_rounds, valid_sets=[lgval],
                          callbacks=callbacks)
    else:
        # Older LightGBM without the log_evaluation callback: keep the legacy
        # keyword arguments.
        model = lgb.train(param, lgtrain, num_rounds, valid_sets=[lgval],
                          early_stopping_rounds=patience, verbose_eval=log_every,
                          evals_result=evals_result)
    return model, evals_result

In [11]:
# Train LightGBM per fold, keeping the best iteration/score and the feature
# importances of each trained model.
# NOTE(review): range(0, 1) trains fold 0 only although kfolds was built with
# 5 folds - confirm whether that is intended.
best_list, imp_list = [], []
for split in range(0, 1):
    dev_X, val_X, dev_y, val_y = kfolds[split]
    model, evals_result = run_lgb(dev_X, dev_y, val_X, val_y)

    best = {
        'best_iter': model.best_iteration,
        'best_score': model.best_score['valid_0']['rmse'],
    }
    best_list.append(best)

    # Importances indexed by feature name; after reset_index the names are in
    # column 'index' and the importance values in column 0.
    feature_imp = (
        pd.DataFrame(model.feature_importance(), dev_X.columns)
        .sort_values(0, ascending=False)
        .reset_index()
    )
    feature_imp['split'] = split
    imp_list.append(feature_imp)
    
    

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.69953
[200]	valid_0's rmse: 3.66555
[300]	valid_0's rmse: 3.65195
[400]	valid_0's rmse: 3.64547
[500]	valid_0's rmse: 3.64204
[600]	valid_0's rmse: 3.63991
[700]	valid_0's rmse: 3.63907
[800]	valid_0's rmse: 3.63865
[900]	valid_0's rmse: 3.63845
Early stopping, best iteration is:
[842]	valid_0's rmse: 3.63837


In [14]:
# Show the per-fold feature-importance tables collected by the training loop.
imp_list

[                                     index     0  split
 0                     hist_month_diff_mean  1287      0
 1             new_hist_purchase_amount_max   921      0
 2                hist_authorized_flag_mean   819      0
 3           new_hist_purchase_date_uptonow   762      0
 4                                     qua2   760      0
 5                      hist_category_1_sum   668      0
 6                  new_hist_month_lag_mean   588      0
 7                                     qua3   529      0
 8              new_hist_purchase_date_diff   509      0
 9                hist_purchase_amount_mean   485      0
 10                  hist_purchase_date_max   482      0
 11                      hist_month_nunique   480      0
 12           new_hist_purchase_amount_mean   478      0
 13              hist_purchase_date_uptonow   439      0
 14                    hist_category_1_mean   433      0
 15                  hist_new_trx_date_diff   392      0
 16            new_hist_purchas

In [13]:
# Stack the per-fold importance tables and average them per feature name
# (column 'index'). The 'split' column is averaged along with the importance.
imp_list_df = (
    pd.concat(imp_list)
    .groupby('index')
    .mean()
)
imp_list_df

Unnamed: 0_level_0,0,split
index,Unnamed: 1_level_1,Unnamed: 2_level_1
card_id_total,87,0
city_count_<lambda>,11,0
city_count_mean,166,0
city_count_mean_x_diff,242,0
city_count_var,166,0
dayofweek,63,0
elapsed_time,174,0
feature_1,151,0
feature_2,93,0
feature_3,43,0


In [13]:
# %matplotlib inline 
# import matplotlib.pyplot as plt
# fig, ax = plt.subplots(figsize=(12,10))
# lgb.plot_importance(model, max_num_features=50, height=0.8, ax=ax)
# ax.grid(False)
# plt.title("LightGBM - Feature Importance", fontsize=15)
# plt.show()

### With Mean Feature_Importance

In [12]:
# import seaborn as sns
# fig, ax = plt.subplots(figsize=(12,20))
# sns.barplot(x="Mean", y="Col_Names", data = feature_imp, orient='h',ax=ax)
# plt.show()

### Random Forest

In [None]:
# NOTE(review): this cell sits under the "Random Forest" heading but redefines
# run_lgb with the same LightGBM settings as the definition in the LGBM
# section; running it simply replaces that function. Presumably a
# random-forest trainer was meant to go here - confirm.
def run_lgb(train_X, train_y, val_X, val_y):
    """Train a LightGBM regressor with early stopping on one validation set.

    Parameters
    ----------
    train_X, train_y
        Features and label used to fit the booster.
    val_X, val_y
        Features and label of the single validation set that drives early
        stopping and metric logging.

    Returns
    -------
    (model, evals_result)
        The object returned by ``lgb.train`` and the dict that LightGBM fills
        with the recorded validation metric.
    """
    param = {'num_leaves': 30,
             'min_data_in_leaf': 177,
             'objective': 'regression',
             'max_depth': 9,
             'learning_rate': 0.01,
             "boosting": "gbdt",
             # "feature_fraction": 0.7,  (disabled by the author)
             "bagging_freq": 1,
             "bagging_fraction": 0.7,
             "bagging_seed": 11,
             "metric": 'rmse',
             "lambda_l1": 0.1,
             "random_state": 133,
             "verbosity": -1}
    num_rounds = 2000   # upper bound on boosting iterations
    patience = 100      # stop after this many rounds without improvement
    log_every = 100     # print the validation metric every N rounds

    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    evals_result = {}

    if hasattr(lgb, "log_evaluation"):
        # The early_stopping_rounds / verbose_eval / evals_result keyword
        # arguments were removed from lgb.train in LightGBM 4.0; callbacks are
        # the supported way to request the same behaviour.
        callbacks = [
            lgb.early_stopping(patience),
            lgb.log_evaluation(log_every),
            lgb.record_evaluation(evals_result),
        ]
        model = lgb.train(param, lgtrain, num_rounds, valid_sets=[lgval],
                          callbacks=callbacks)
    else:
        # Older LightGBM without the log_evaluation callback: keep the legacy
        # keyword arguments.
        model = lgb.train(param, lgtrain, num_rounds, valid_sets=[lgval],
                          early_stopping_rounds=patience, verbose_eval=log_every,
                          evals_result=evals_result)
    return model, evals_result