In [1]:
import pandas as pd
import glob
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
import numpy as np
# https://scikit-learn.org/stable/modules/ensemble.html#extremely-randomized-trees

### Loading  Data

In [2]:
# Directory holding the per-card summary tables. Kept in one place so the
# notebook can be pointed at another location by editing a single line.
SUMMARY_DIR = r"C:\Kaggle competetion datasets\Elo Merchent Category Recommendation\Elo Feb 18th\summary_card"

train_data = pd.read_csv(SUMMARY_DIR + r"\df_train_summary.csv")
test_data = pd.read_csv(SUMMARY_DIR + r"\df_test_summary.csv")

In [3]:
# Print the training frame's shape, then show the test frame's shape as the
# cell result (a bare last expression is displayed by the notebook).
print(train_data.shape)
test_data.shape

(201917, 95)


(123623, 93)

### Functions to concatenate features, kfold splits

In [4]:
def feature_concat(train_data1, filepath):
    """Left-join every CSV listed in ``filepath`` onto ``train_data1`` by ``card_id``.

    Parameters
    ----------
    train_data1 : pandas.DataFrame
        Base table; must contain a ``card_id`` column.
    filepath : iterable of str
        Paths of CSV files, each with a ``card_id`` column. The files are
        read and merged one after another, in iteration order.

    Returns
    -------
    pandas.DataFrame
        The result of the successive left merges. When ``filepath`` is empty
        the input frame itself is returned.
    """
    merged = train_data1
    for csv_path in filepath:
        extra_features = pd.read_csv(csv_path)
        merged = merged.merge(extra_features, how='left', on='card_id')
    return merged

In [5]:
def kfold_split(splitcount,train_data1,ignore_cols):
    """Build stratified train/validation folds from a feature table.

    Parameters
    ----------
    splitcount : int
        Number of folds, passed to ``StratifiedKFold`` as ``n_splits``.
    train_data1 : pandas.DataFrame
        Table containing the feature columns plus a ``target`` column (the
        regression label) and a ``target_bin`` column (the stratification label).
    ignore_cols : collection of str
        Column names to leave out of the model inputs.

    Returns
    -------
    dict
        Maps the fold number (0 .. splitcount-1) to the list
        ``[dev_X, val_X, dev_y, val_y]``. The X frames hold only numeric
        columns; the y frames are single-column frames holding ``target``.
    """
    output_feature = ['target']
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

    input_features = [col for col in train_data1.columns if col not in ignore_cols]
    # Keep only numeric inputs; any remaining object/date columns are dropped.
    train_X = train_data1[input_features].select_dtypes(include=numerics)
    train_y = train_data1[output_feature]

    kf = StratifiedKFold(n_splits=splitcount, random_state=2018, shuffle=True)
    splits = {}
    fold_indices = kf.split(train_X, train_data1['target_bin'])
    for counter, (dev_index, val_index) in enumerate(fold_indices):
        # split() yields positional row numbers, so select with iloc. Label-based
        # loc only gives the same rows when the frame has a default 0..n-1 index.
        splits[counter] = [
            train_X.iloc[dev_index],
            train_X.iloc[val_index],
            train_y.iloc[dev_index],
            train_y.iloc[val_index],
        ]
    return splits

### Concatenate features

In [7]:
%%time
# glob returns matches in arbitrary order. Sort them so the merge order, and
# therefore the column order of train_features, is the same on every run.
file_list = sorted(glob.glob("C:/Kaggle competetion datasets/Elo Merchent Category Recommendation/Elo Feb 18th/monthly_trans_diff_feb21st/*.csv"))
train_features = feature_concat(train_data,file_list)

Wall time: 4.14 s


In [8]:
# Shape of the training table after the extra feature files were merged on.
train_features.shape

(201917, 101)

In [8]:
# train_features=train_data.copy()

In [9]:
# # Create correlation matrix
# corr_matrix = train_features.corr().abs()

# # Select upper triangle of correlation matrix
# upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# # Find index of feature columns with correlation greater than 0.80
# to_drop = [column for column in upper.columns if any(upper[column] > 0.80)]

# train_features=train_features1.drop(to_drop, axis=1).copy()

### Binning target variable

In [9]:
# Build the label used to stratify the folds: bucket `target` into bands whose
# edges sit at -3, -2, -1, +1, +2 and +3 standard deviations. The outermost
# edges are padded by 1 so the observed min and max fall inside a band.
target_std = train_features['target'].std()
min_target = train_features['target'].min() - 1
max_target = train_features['target'].max() + 1
bins = [min_target] + [k * target_std for k in (-3, -2, -1, 1, 2, 3)] + [max_target]

# Labels are signed so that abs() merges the matching negative/positive bands
# (2 and 3) while the two outermost bands stay distinct (4 = lowest, 5 = highest).
labels = [-4, -3, -2, 0, 2, 3, 5]
train_features['target_bin'] = (
    pd.cut(train_features['target'], bins=bins, labels=labels)
    .astype(int)
    .abs()
)

# Row count per bin.
train_features.groupby(['target_bin']).size()


target_bin
0    191093
2      8200
3       363
4      2237
5        24
dtype: int64

### KFold Splits

In [10]:
%%time
# Five stratified folds; the listed identifier/label columns are kept out of the inputs.
kfolds = kfold_split(
    5,
    train_features,
    ['first_active_month', 'card_id', 'target', 'target_bin',
     'year_nunique_y', 'year_nunique_x', 'outliers', 'Unnamed: 0'],
)

Wall time: 2.05 s


### LGBM

In [13]:
def run_lgb(train_X, train_y, val_X, val_y):
    """Train a LightGBM regressor on one fold, early-stopping on the validation part.

    Parameters
    ----------
    train_X, train_y
        Features and target used for fitting.
    val_X, val_y
        Features and target of the validation set that is monitored
        (reported as ``valid_0`` in the training log).

    Returns
    -------
    tuple
        ``(model, evals_result)``: the object returned by ``lgb.train`` and
        the dict that was passed to it as ``evals_result``.

    NOTE(review): ``early_stopping_rounds``, ``verbose_eval`` and
    ``evals_result`` are passed as keyword arguments to ``lgb.train``; recent
    LightGBM releases expect callbacks instead - confirm against the
    installed version before upgrading.
    """
    lgb_params = {
        'num_leaves': 30,
        'min_data_in_leaf': 177,
        'objective': 'regression',
        'max_depth': 9,
        'learning_rate': 0.01,
        'boosting': 'gbdt',
        # 'feature_fraction': 0.7,  (left disabled, as in the original run)
        'bagging_freq': 1,
        'bagging_fraction': 0.7,
        'bagging_seed': 11,
        'metric': 'rmse',
        'lambda_l1': 0.1,
        'random_state': 133,
        'verbosity': -1,
    }

    history = {}
    booster = lgb.train(
        lgb_params,
        lgb.Dataset(train_X, label=train_y),
        2000,  # upper bound on boosting rounds
        valid_sets=[lgb.Dataset(val_X, label=val_y)],
        early_stopping_rounds=100,
        verbose_eval=100,
        evals_result=history,
    )
    return booster, history

In [14]:
# Train one LightGBM model per fold, collecting its best iteration/score and
# its feature-importance table.
best_list = []
imp_list = []
# Iterate over the folds actually present instead of a hard-coded count of 5,
# so this cell stays correct if kfold_split is called with another fold count.
for split in range(len(kfolds)):
    dev_X, val_X, dev_y, val_y = kfolds[split]
    model, evals_result = run_lgb(dev_X, dev_y, val_X, val_y)

    best = {
        'best_iter': model.best_iteration,
        'best_score': model.best_score['valid_0']['rmse'],
    }
    best_list.append(best)

    # One row per feature, most important first. After reset_index the feature
    # name is in column 'index' and the importance value in column 0.
    feature_imp = (
        pd.DataFrame(model.feature_importance(), dev_X.columns)
        .sort_values(0, ascending=False)
        .reset_index()
    )
    feature_imp['split'] = split
    imp_list.append(feature_imp)
    
    

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.69864
[200]	valid_0's rmse: 3.66395
[300]	valid_0's rmse: 3.65067
[400]	valid_0's rmse: 3.64392
[500]	valid_0's rmse: 3.64056
[600]	valid_0's rmse: 3.63831
[700]	valid_0's rmse: 3.63699
[800]	valid_0's rmse: 3.63602
[900]	valid_0's rmse: 3.63585
[1000]	valid_0's rmse: 3.63555
[1100]	valid_0's rmse: 3.63517
[1200]	valid_0's rmse: 3.63519
Early stopping, best iteration is:
[1177]	valid_0's rmse: 3.63495
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.72313
[200]	valid_0's rmse: 3.69203
[300]	valid_0's rmse: 3.68093
[400]	valid_0's rmse: 3.67586
[500]	valid_0's rmse: 3.67254
[600]	valid_0's rmse: 3.66974
[700]	valid_0's rmse: 3.66839
[800]	valid_0's rmse: 3.66742
[900]	valid_0's rmse: 3.66716
[1000]	valid_0's rmse: 3.66712
[1100]	valid_0's rmse: 3.66701
Early stopping, best iteration is:
[1076]	valid_0's rmse: 3.66683
Training until validation scores don't improve 

In [16]:
# Best iteration and validation RMSE recorded for each fold.
best_list

[{'best_iter': 1177, 'best_score': 3.634954270755675},
 {'best_iter': 1076, 'best_score': 3.666827054262234},
 {'best_iter': 1124, 'best_score': 3.642304375806671},
 {'best_iter': 1079, 'best_score': 3.669506548376446},
 {'best_iter': 1082, 'best_score': 3.644483348851669}]

In [17]:
# Stack the per-fold importance tables into one long frame for inspection.
pd.concat(imp_list, axis=0)

Unnamed: 0,index,0,split
0,hist_month_diff_mean,1937,0
1,hist_authorized_flag_mean,1267,0
2,new_hist_purchase_date_uptonow,1172,0
3,new_hist_purchase_amount_max,1143,0
4,hist_category_1_sum,1052,0
5,hist_purchase_date_max,829,0
6,hist_purchase_date_diff,797,0
7,hist_category_1_mean,758,0
8,new_hist_purchase_amount_mean,739,0
9,new_hist_month_lag_mean,732,0


In [20]:
# Average each feature's importance over the folds. The 'split' column is
# numeric too, so it is averaged along with the importance column and carries
# no meaning in the result.
imp_list_df = (
    pd.concat(imp_list, axis=0)
    .groupby('index')
    .mean()
)
imp_list_df

Unnamed: 0_level_0,0,split
index,Unnamed: 1_level_1,Unnamed: 2_level_1
card_id_total,154.0,2
city_count_mean_x_diff,367.6,2
dayofweek,123.8,2
elapsed_time,246.0,2
feature_1,261.0,2
feature_2,115.8,2
feature_3,57.4,2
hist_authorized_flag_mean,1176.6,2
hist_authorized_flag_sum,203.2,2
hist_card_id_size,150.8,2


In [21]:
# %matplotlib inline 
# import matplotlib.pyplot as plt
# fig, ax = plt.subplots(figsize=(12,10))
# lgb.plot_importance(model, max_num_features=50, height=0.8, ax=ax)
# ax.grid(False)
# plt.title("LightGBM - Feature Importance", fontsize=15)
# plt.show()

### With Mean Feature_Importance

In [22]:
# import seaborn as sns
# fig, ax = plt.subplots(figsize=(12,20))
# sns.barplot(x="Mean", y="Col_Names", data = feature_imp, orient='h',ax=ax)
# plt.show()

### Random Forest

In [35]:
# Zero-filled version of the feature table for the random forest cells below.
# fillna returns a new frame, so train_features itself is left untouched.
train_features1 = train_features.fillna(0)

In [39]:
# Same fold setup as for LightGBM, but built from the zero-filled table.
kfolds1 = kfold_split(
    5,
    train_features1,
    ['first_active_month', 'card_id', 'target', 'target_bin',
     'year_nunique_y', 'year_nunique_x', 'outliers', 'Unnamed: 0'],
)

In [40]:
# RandomForestRegressor is already imported in the first cell; the import is
# repeated here. This default-configured estimator is only a placeholder:
# rf_model is rebound with explicit settings in the next cell.
from sklearn.ensemble import RandomForestRegressor
rf_model = RandomForestRegressor()

In [65]:
# Configure the random forest used by the cells below.
# NOTE(review): verbose=-1 does not silence the fit - the recorded output still
# shows [Parallel(...)] progress lines. Use verbose=0 if silence was intended.
rf_model = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=14,
           min_samples_leaf=177,
           n_estimators=100, n_jobs=-1,
           oob_score=True, random_state=133, verbose=-1)

# NOTE(review): `split` is not assigned in this cell; it holds whatever value
# the most recently executed fold loop left behind.
# The fold stores its target as a single-column frame; flatten it so fit()
# receives a 1-D y rather than a column vector.
rf_model.fit(kfolds1[split][0], np.asarray(kfolds1[split][2]).ravel())

  
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


KeyboardInterrupt: 

In [51]:
# Predict on the validation part of fold `split`.
# NOTE(review): `split` is the variable left over from an earlier fold loop.
y_cap = rf_model.predict(kfolds1[split][1])

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.2s finished


In [52]:
from math import sqrt
from sklearn.metrics import mean_squared_error

In [53]:
# Validation RMSE of the predictions in y_cap against the targets of fold `split`.
sqrt(mean_squared_error(kfolds1[split][3], y_cap))

3.6793628815961403

In [59]:
def run_rf(train_X, train_y, val_X, val_y):
    """Fit the notebook-level ``rf_model`` on one fold and score it on the validation part.

    Parameters
    ----------
    train_X, train_y
        Features and target used for fitting.
    val_X, val_y
        Features and target used for scoring.

    Returns
    -------
    tuple
        ``(model, y_pred, rmse_score)``: the value returned by
        ``rf_model.fit``, its predictions for ``val_X``, and the root mean
        squared error of those predictions against ``val_y``.

    Notes
    -----
    The estimator is the global ``rf_model``, so every call refits that one
    object; read anything needed from the returned model before the next call.
    """
    # A single-column target (shape (n, 1)) is flattened so fit() receives a
    # 1-D y instead of a column vector. Other shapes are passed through as-is.
    y_fit = np.asarray(train_y)
    if y_fit.ndim == 2 and y_fit.shape[1] == 1:
        y_fit = y_fit.ravel()

    model = rf_model.fit(train_X, y_fit)
    y_pred = model.predict(val_X)
    rmse_score = sqrt(mean_squared_error(val_y, y_pred))
    return model, y_pred, rmse_score

In [64]:
# Fit the random forest on every fold, collecting the validation RMSE and the
# feature-importance table of each fit.
best_list_rf = []
imp_list_rf = []
rmse_score = []
# Iterate over the folds actually present instead of a hard-coded count of 5.
for split in range(len(kfolds1)):
    dev_X, val_X, dev_y, val_y = kfolds1[split]
    # run_rf returns (model, validation predictions, rmse). Despite its name,
    # evals_result_rf therefore holds the predictions for this fold.
    model_rf, evals_result_rf, score = run_rf(dev_X, dev_y, val_X, val_y)
    rmse_score.append(score)

    # One row per feature, most important first. After reset_index the feature
    # name is in column 'index' and the importance value in column 0.
    feature_imp_rf = (
        pd.DataFrame(model_rf.feature_importances_, dev_X.columns)
        .sort_values(0, ascending=False)
        .reset_index()
    )
    feature_imp_rf['split'] = split
    imp_list_rf.append(feature_imp_rf)

  
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.6min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.1s finished
  
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  4.1min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.2s finished
  
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


KeyboardInterrupt: 