In [1]:
import pandas as pd
import glob
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
import numpy as np
# https://scikit-learn.org/stable/modules/ensemble.html#extremely-randomized-trees

  from numpy.core.umath_tests import inner1d


### Loading data

In [2]:
%%time
# Read the pre-aggregated train and test summary tables (one CSV each).
train_data, test_data = (
    pd.read_csv(summary_csv)
    for summary_csv in (
        "../fresh_code/ultimate/train_test/df_train_summary.csv",
        "../fresh_code/ultimate/train_test/df_test_summary.csv",
    )
)

Wall time: 21.5 s


In [3]:
# Quick sanity check of the loaded training frame: (n_rows, n_columns).
train_data.shape

(201917, 95)

### Helper functions: feature concatenation, stratified k-fold splits, LightGBM training

In [3]:
def feature_concat(train_data1,filepath):
    """Left-join the feature table stored in each CSV onto ``train_data1`` by ``card_id``.

    Parameters
    ----------
    train_data1 : pandas.DataFrame
        Base frame; must contain a ``card_id`` column. It is not modified.
    filepath : str, os.PathLike, or iterable of those
        One CSV path, or several. A single path is wrapped in a list so that
        it is not iterated character by character. Every file must contain a
        ``card_id`` column.

    Returns
    -------
    pandas.DataFrame
        ``train_data1`` with the columns of every feature file appended.
        Rows of ``train_data1`` without a match get NaN in the new columns.

    Notes
    -----
    ``drop_duplicates()`` only removes rows that are identical in every
    column. If a feature file still holds several distinct rows for one
    ``card_id``, the left join repeats the matching base row for each of them.
    """
    import os

    if isinstance(filepath, (str, os.PathLike)):
        filepath = [filepath]

    merged = train_data1
    for csv_path in filepath:
        features = pd.read_csv(csv_path).drop_duplicates()
        merged = pd.merge(merged, features, how='left', on='card_id')
    return merged

In [4]:
def kfold_split(splitcount,train_data1,ignore_cols):
    """Build stratified K-fold development/validation splits of ``train_data1``.

    Parameters
    ----------
    splitcount : int
        Number of folds passed to ``StratifiedKFold``.
    train_data1 : pandas.DataFrame
        Must contain a ``target`` column (the regression label) and a
        ``target_bin`` column (the class used for stratification).
    ignore_cols : collection of str
        Column names excluded from the model inputs.

    Returns
    -------
    dict
        Maps the fold number (0 .. splitcount-1) to the list
        ``[dev_X, val_X, dev_y, val_y]``. The X frames hold only the numeric
        columns that are not in ``ignore_cols``; the y frames are
        single-column DataFrames holding ``target``.
    """
    output_feature = ['target']
    input_features = [x for x in train_data1.columns if x not in ignore_cols]
    # Only these numeric dtypes are kept as model inputs.
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    train_X = train_data1[input_features].select_dtypes(include=numerics)
    train_y = train_data1[output_feature]

    kf = StratifiedKFold(n_splits=splitcount, random_state=2018, shuffle=True)
    splits = {}
    for fold, (dev_index, val_index) in enumerate(kf.split(train_X, train_data1['target_bin'])):
        # split() yields *positional* row indices, so select with .iloc.
        # Label-based .loc only gives the same rows when the frame has the
        # default 0..n-1 index; with any other index it picks wrong rows or raises.
        splits[fold] = [
            train_X.iloc[dev_index],
            train_X.iloc[val_index],
            train_y.iloc[dev_index],
            train_y.iloc[val_index],
        ]
    return splits

In [7]:
def run_lgb(train_X, train_y, val_X, val_y):
    """Train a LightGBM regressor with early stopping on a validation set.

    Parameters
    ----------
    train_X, train_y
        Features and label used to fit the booster.
    val_X, val_y
        Features and label of the single validation set that is monitored
        for early stopping.

    Returns
    -------
    tuple
        ``(model, evals_result)``: the value returned by ``lgb.train`` and the
        dict that ``lgb.train`` fills with the recorded validation metric.
    """
    # feature_fraction / bagging_* are not set here, so LightGBM's defaults apply.
    lgb_params = dict(
        num_leaves=75,
        min_data_in_leaf=200,
        objective='regression',
        max_depth=14,
        learning_rate=0.01,
        boosting="gbdt",
        metric='rmse',
        lambda_l1=0.9,
        random_state=133,
        verbosity=-1,
    )

    dtrain = lgb.Dataset(train_X, label=train_y)
    dval = lgb.Dataset(val_X, label=val_y)

    # Up to 10000 rounds; stop once the validation metric has not improved
    # for 100 consecutive rounds. Progress is logged every 1000 rounds.
    history = {}
    booster = lgb.train(
        params=lgb_params,
        train_set=dtrain,
        num_boost_round=10000,
        valid_sets=[dval],
        early_stopping_rounds=100,
        verbose_eval=1000,
        evals_result=history,
    )
    return booster, history

### Evaluate each feature file with 5-fold LightGBM cross-validation

In [10]:
# %%time
# Score every candidate feature file on its own: merge it onto the base
# training frame, build stratified folds, train one LightGBM model per fold,
# and record the metrics in ``model_results`` (one dict per file and fold).
N_SPLITS = 5
IGNORE_COLS = ['Unnamed: 0', 'outliers', 'first_active_month', 'card_id',
               'target', 'target_bin', 'year_nunique_y', 'year_nunique_x']

# glob returns files in filesystem-dependent order; sort so runs are comparable.
file_list = sorted(glob.glob("../fresh_code/ultimate/*.csv"))
model_results = []
for file in file_list:
    print(file)
    train_features1 = feature_concat(train_data, [file])

    # Bin the numeric target into classes used only for stratifying the folds.
    # Bin edges sit at +/-1, 2 and 3 standard deviations of the target, with
    # the outer edges padded by 1 so the min and max fall inside a bin.
    target_std = train_features1['target'].std()
    max_target = train_features1['target'].max() + 1
    min_target = train_features1['target'].min() - 1
    bins = [min_target, -3 * target_std, -2 * target_std, -1 * target_std,
            target_std * 1, target_std * 2, target_std * 3, max_target]
    labels = [-4, -3, -2, 0, 2, 3, 5]
    train_features1['target_bin'] = pd.cut(train_features1['target'], bins=bins, labels=labels).astype(int)
    # abs() folds the -2/+2 and -3/+3 bins together; the lowest bin (4) and
    # the highest bin (5) remain separate classes.
    train_features1['target_bin'] = train_features1['target_bin'].abs()

    kfolds = kfold_split(N_SPLITS, train_features1, IGNORE_COLS)

    for split in range(N_SPLITS):
        dev_X, val_X, dev_y, val_y = kfolds[split]
        # Hold out 10% of the development fold: the validation fold drives
        # early stopping, so this extra slice gives a score on data that the
        # training procedure never looked at.
        X_train, X_test, y_train, y_test = train_test_split(dev_X, dev_y, test_size=0.1, random_state=42)
        model, evals_result = run_lgb(X_train, y_train, val_X, val_y)
        pred = model.predict(X_test)
        # Named fold_result rather than eval so the builtin eval() is not shadowed.
        fold_result = {
            'best_iteration': model.best_iteration,
            'best_score': model.best_score['valid_0']['rmse'],
            'test_score': np.sqrt(mean_squared_error(y_test, pred)),
            'file_name': file,
            'split': split,
        }
        model_results.append(fold_result)
        print(fold_result)

../fresh_code/ultimate\card_merc_count_pivot_svd.csv
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.69642
[200]	valid_0's rmse: 3.6642
[300]	valid_0's rmse: 3.65281
[400]	valid_0's rmse: 3.6468
[500]	valid_0's rmse: 3.64502
Early stopping, best iteration is:
[497]	valid_0's rmse: 3.6449
{'best_iteration': 497, 'best_score': 3.6449034828182585, 'test_score': 3.4830274803554953, 'file_name': '../fresh_code/ultimate\\card_merc_count_pivot_svd.csv', 'split': 0}


In [12]:
# Turn the list of per-fold result dicts into a table and save it as CSV
# (the row index is not written).
model_result = pd.DataFrame(data=model_results)
model_result.to_csv(path_or_buf="model_file_results.csv", index=False)

Unnamed: 0,best_iteration,best_score,file_name,split,test_score
0,497,3.644903,../fresh_code/ultimate\card_merc_count_pivot_s...,0,3.483027
