In [1]:
import glob
import inspect
import pickle

from sklearn.model_selection import StratifiedKFold

In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
# Load the training set; the code below requires a numeric 'target' column.
train_data = pd.read_csv("../data/train.csv")

# Bucket the continuous target by how many standard deviations it lies from
# zero. The outermost edges are padded by 1 so the observed min and max fall
# strictly inside the first and last bins.
target_std = train_data['target'].std()
max_target = train_data['target'].max() + 1
min_target = train_data['target'].min() - 1
bins = [min_target] + [k * target_std for k in (-3, -2, -1, 1, 2, 3)] + [max_target]
labels = [-4, -3, -2, 0, 2, 3, 4]

# abs() folds the mirrored negative/positive buckets together, leaving the
# classes 0, 2, 3 and 4.
train_data['target_bin'] = (
    pd.cut(train_data['target'], bins=bins, labels=labels).astype(int).abs()
)

# Class sizes of the resulting buckets (displayed as the cell output).
train_data.groupby(['target_bin']).size()

target_bin
0    191093
2      8200
3       363
4      2261
dtype: int64

In [3]:
def run_lgb(train_X, train_y, val_X, val_y,
            num_boost_round=2000, early_stopping_rounds=100, verbose_eval=100):
    """Train a LightGBM regressor with early stopping on one validation set.

    Parameters
    ----------
    train_X, train_y : features and label passed to ``lgb.Dataset`` for training.
    val_X, val_y : features and label passed to ``lgb.Dataset`` for validation;
        this is the only entry in ``valid_sets`` and drives early stopping.
    num_boost_round : int, default 2000
        Upper bound on the number of boosting iterations.
    early_stopping_rounds : int or None, default 100
        Stop when the validation metric has not improved for this many
        rounds. A falsy value disables early stopping.
    verbose_eval : int or bool, default 100
        Print the validation metric every ``verbose_eval`` rounds
        (``True`` means every round, a falsy value silences the log).

    Returns
    -------
    (model, evals_result)
        ``model`` is whatever ``lgb.train`` returns; ``evals_result`` is the
        dict handed to LightGBM for recording the evaluation history.
    """
    param = {'num_leaves': 300,
             'min_data_in_leaf': 100,
             'objective': 'regression',
             'max_depth': 15,
             'learning_rate': 0.01,
             "boosting": "gbdt",
             "feature_fraction": 0.9,
             "bagging_freq": 1,
             "bagging_fraction": 0.8,
             "bagging_seed": 11,
             "metric": 'rmse',
             # "lambda_l1": 0.2634,
             "random_state": 133,
             "verbosity": -1}

    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    evals_result = {}

    # LightGBM 4.0 removed the early_stopping_rounds / verbose_eval /
    # evals_result keyword arguments of train() in favour of callbacks.
    # Probe the installed signature so the notebook runs on both old and
    # new releases without changing what old releases print.
    if "early_stopping_rounds" in inspect.signature(lgb.train).parameters:
        model = lgb.train(param, lgtrain, num_boost_round, valid_sets=[lgval],
                          early_stopping_rounds=early_stopping_rounds,
                          verbose_eval=verbose_eval,
                          evals_result=evals_result)
    else:
        callbacks = [lgb.record_evaluation(evals_result)]
        if early_stopping_rounds:
            callbacks.append(lgb.early_stopping(early_stopping_rounds,
                                                verbose=bool(verbose_eval)))
        if verbose_eval:
            # True means "log every round", matching the legacy argument.
            period = 1 if verbose_eval is True else int(verbose_eval)
            callbacks.append(lgb.log_evaluation(period))
        model = lgb.train(param, lgtrain, num_boost_round, valid_sets=[lgval],
                          callbacks=callbacks)
    return model, evals_result

In [14]:
def model_run(filepath,train_data1):
    """Cross-validate the LightGBM model on the training data plus extra features.

    Every CSV listed in ``filepath`` is left-joined onto ``train_data1`` on
    ``card_id``. All numeric columns except ``target`` and ``target_bin`` are
    used as model inputs and ``target`` is the label. The model is evaluated
    with a 5-fold ``StratifiedKFold`` stratified on ``target_bin``; each fold
    is trained through ``run_lgb``.

    Parameters
    ----------
    filepath : list of str
        Paths of feature CSV files to merge in. May be empty, in which case
        only the columns already present in ``train_data1`` are used.
    train_data1 : pandas.DataFrame
        Must contain ``target`` and ``target_bin`` (and ``card_id`` when
        ``filepath`` is non-empty). The caller's frame is not modified;
        merging rebinds the local name only.

    Returns
    -------
    dict
        ``'filepath'`` (first entry of ``filepath``, or ``None`` when it is
        empty) plus ``'best_iteration<k>'`` and ``'best_score<k>'`` (validation
        RMSE) for each fold ``k`` starting at 0.

    Raises
    ------
    KeyError
        If a feature file has no ``card_id`` column to join on.
    """
    results={}
    # Only the first path is recorded, as before; an empty list is a baseline run.
    results['filepath'] = filepath[0] if filepath else None
    for file in filepath:
        data_features = pd.read_csv(file)
        if 'card_id' not in data_features.columns:
            # Same exception type pd.merge would raise, but naming the file.
            raise KeyError("no 'card_id' column in {}".format(file))
        train_data1=pd.merge(train_data1,data_features,how='left',on='card_id')

    output_feature=['target']
    excluded = ['first_active_month', 'card_id','target','target_bin']
    input_features=[x for x in train_data1.columns if x not in excluded]
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    train_X = train_data1[input_features].select_dtypes(include=numerics)
    train_y = train_data1[output_feature]

    kf = StratifiedKFold(n_splits=5, random_state=2018, shuffle=True)
    folds = kf.split(train_X, train_data1['target_bin'])
    for fold, (dev_index, val_index) in enumerate(folds):
        # split() yields positional indices, so select rows with .iloc;
        # .loc would look them up as index labels.
        dev_X, val_X = train_X.iloc[dev_index], train_X.iloc[val_index]
        dev_y, val_y = train_y.iloc[dev_index], train_y.iloc[val_index]
        model, evals_result = run_lgb(dev_X, dev_y, val_X, val_y)
        results['best_iteration'+str(fold)]=model.best_iteration
        results['best_score'+str(fold)]=model.best_score['valid_0']['rmse']
    return results

In [15]:
# Glob patterns locating the candidate feature CSV files.
pathlist = [
    "../Fresh/*.csv",
    "../model features/features_csv/*.csv",
    "../model features/diffhistfuture/*.csv",
    "../model features/merchant_pivot/*.csv",
    "../model features/merchant_pivot/new/*.csv",
    "../model features/merchant_pivot/history/*.csv",
    "../model features/new_merchant/*.csv",
    "../model features/transaction amount and dates/*.csv",
]

In [16]:
# Evaluate every feature file on its own: each CSV matched by a pattern in
# `pathlist` is passed to model_run together with the training data.
errors=[]        # paths of the files whose evaluation raised
rmse_results=[]  # one result dict per successfully evaluated file
for path in pathlist:
    # glob order depends on the filesystem; sort so runs are reproducible.
    filelist=sorted(glob.glob(path))
    for file in filelist:
        try:
            results=model_run([file],train_data)
        except Exception as e:
            # Best-effort sweep: keep going, but report which file failed and
            # why (a bare print(e) shows e.g. only 'card_id').
            print("{}: {}: {}".format(file, type(e).__name__, e))
            errors.append(file)
        else:
            rmse_results.append(results)

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.83756
Early stopping, best iteration is:
[76]	valid_0's rmse: 3.83746
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.85322
Early stopping, best iteration is:
[91]	valid_0's rmse: 3.85309
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.85503
Early stopping, best iteration is:
[60]	valid_0's rmse: 3.85478
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.84854
[200]	valid_0's rmse: 3.84927
Early stopping, best iteration is:
[122]	valid_0's rmse: 3.8485
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.84603
Early stopping, best iteration is:
[75]	valid_0's rmse: 3.84584
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.83804
[200]	valid_0's rmse: 3.83941
Early stopping, best iteration is:
[103]	valid_0's rmse: 3.838

Early stopping, best iteration is:
[41]	valid_0's rmse: 3.85581
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.8516
Early stopping, best iteration is:
[40]	valid_0's rmse: 3.85143
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.8477
Early stopping, best iteration is:
[67]	valid_0's rmse: 3.84742
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.83963
Early stopping, best iteration is:
[54]	valid_0's rmse: 3.8393
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.85561
Early stopping, best iteration is:
[31]	valid_0's rmse: 3.85511
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.85595
Early stopping, best iteration is:
[69]	valid_0's rmse: 3.85574
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.85169
Early stopping, best iteration is:
[62]	valid_0's rmse: 3.851

[100]	valid_0's rmse: 3.83868
[200]	valid_0's rmse: 3.8391
Early stopping, best iteration is:
[106]	valid_0's rmse: 3.83867
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.85389
[200]	valid_0's rmse: 3.85422
Early stopping, best iteration is:
[108]	valid_0's rmse: 3.85388
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.85503
Early stopping, best iteration is:
[96]	valid_0's rmse: 3.85501
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.85031
[200]	valid_0's rmse: 3.85037
Early stopping, best iteration is:
[134]	valid_0's rmse: 3.85022
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.84704
Early stopping, best iteration is:
[97]	valid_0's rmse: 3.84702
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.83708
Early stopping, best iteration is:
[80]	valid_0's rmse: 3.83696
Training until validation sco

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.85517
Early stopping, best iteration is:
[60]	valid_0's rmse: 3.85468
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.85
Early stopping, best iteration is:
[78]	valid_0's rmse: 3.84981
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.84638
Early stopping, best iteration is:
[68]	valid_0's rmse: 3.84608
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.82017
[200]	valid_0's rmse: 3.81875
Early stopping, best iteration is:
[173]	valid_0's rmse: 3.8185
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.83277
[200]	valid_0's rmse: 3.83015
[300]	valid_0's rmse: 3.83065
Early stopping, best iteration is:
[222]	valid_0's rmse: 3.82995
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.83922
[200]	valid_0's rmse: 3.83946
Earl

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.85159
Early stopping, best iteration is:
[91]	valid_0's rmse: 3.85152
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.84956
Early stopping, best iteration is:
[69]	valid_0's rmse: 3.84922
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.84379
Early stopping, best iteration is:
[77]	valid_0's rmse: 3.84376
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.8383
Early stopping, best iteration is:
[72]	valid_0's rmse: 3.83804
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.85422
Early stopping, best iteration is:
[56]	valid_0's rmse: 3.85381
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.85484
Early stopping, best iteration is:
[75]	valid_0's rmse: 3.85458
Training until validation scores don't improve for 100 roun

[100]	valid_0's rmse: 3.85365
Early stopping, best iteration is:
[78]	valid_0's rmse: 3.85332
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.85004
Early stopping, best iteration is:
[58]	valid_0's rmse: 3.84979
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.8455
Early stopping, best iteration is:
[82]	valid_0's rmse: 3.8453
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.82363
[200]	valid_0's rmse: 3.81961
[300]	valid_0's rmse: 3.81904
[400]	valid_0's rmse: 3.81958
Early stopping, best iteration is:
[314]	valid_0's rmse: 3.8189
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.84016
[200]	valid_0's rmse: 3.83739
[300]	valid_0's rmse: 3.83751
Early stopping, best iteration is:
[248]	valid_0's rmse: 3.83709
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.84132
[200]	valid_0's rmse: 3.83944
[300]	

[200]	valid_0's rmse: 3.83888
Early stopping, best iteration is:
[138]	valid_0's rmse: 3.83806
'card_id'


  if self.run_code(code, result):


'card_id'
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.83095
[200]	valid_0's rmse: 3.83013
Early stopping, best iteration is:
[169]	valid_0's rmse: 3.8301
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.84549
[200]	valid_0's rmse: 3.84393
[300]	valid_0's rmse: 3.84431
Early stopping, best iteration is:
[215]	valid_0's rmse: 3.84385
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.8487
[200]	valid_0's rmse: 3.84874
Early stopping, best iteration is:
[151]	valid_0's rmse: 3.84835
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.84297
[200]	valid_0's rmse: 3.84193
Early stopping, best iteration is:
[198]	valid_0's rmse: 3.84191
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.83932
[200]	valid_0's rmse: 3.83839
Early stopping, best iteration is:
[189]	valid_0's rmse: 3.83834
Training until validat

[100]	valid_0's rmse: 3.83121
[200]	valid_0's rmse: 3.82988
Early stopping, best iteration is:
[152]	valid_0's rmse: 3.82959
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.83402
[200]	valid_0's rmse: 3.83366
Early stopping, best iteration is:
[137]	valid_0's rmse: 3.83307
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.82868
[200]	valid_0's rmse: 3.82865
Early stopping, best iteration is:
[152]	valid_0's rmse: 3.82789
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.82372
[200]	valid_0's rmse: 3.82253
Early stopping, best iteration is:
[179]	valid_0's rmse: 3.82232
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.83887
[200]	valid_0's rmse: 3.83841
[300]	valid_0's rmse: 3.83818
[400]	valid_0's rmse: 3.83805
[500]	valid_0's rmse: 3.83799
[600]	valid_0's rmse: 3.83795
[700]	valid_0's rmse: 3.83795
[800]	valid_0's rmse: 3.83793
[900]	va

[200]	valid_0's rmse: 3.84569
[300]	valid_0's rmse: 3.84571
Early stopping, best iteration is:
[222]	valid_0's rmse: 3.84563
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.84246
[200]	valid_0's rmse: 3.84071
[300]	valid_0's rmse: 3.84049
Early stopping, best iteration is:
[294]	valid_0's rmse: 3.84049
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.83827
[200]	valid_0's rmse: 3.83645
[300]	valid_0's rmse: 3.83627
Early stopping, best iteration is:
[274]	valid_0's rmse: 3.83626
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.81705
[200]	valid_0's rmse: 3.81474
Early stopping, best iteration is:
[193]	valid_0's rmse: 3.81466
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.83405
[200]	valid_0's rmse: 3.83261
Early stopping, best iteration is:
[164]	valid_0's rmse: 3.83241
Training until validation scores don't improve for 100 rounds.


[200]	valid_0's rmse: 3.83771
Early stopping, best iteration is:
[108]	valid_0's rmse: 3.83729
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.85385
Early stopping, best iteration is:
[75]	valid_0's rmse: 3.85382
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.8545
Early stopping, best iteration is:
[77]	valid_0's rmse: 3.85445
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.8496
[200]	valid_0's rmse: 3.8498
Early stopping, best iteration is:
[132]	valid_0's rmse: 3.84955
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.84536
[200]	valid_0's rmse: 3.84544
Early stopping, best iteration is:
[140]	valid_0's rmse: 3.84525
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.82626
[200]	valid_0's rmse: 3.82506
Early stopping, best iteration is:
[192]	valid_0's rmse: 3.82504
Training until validation scor

  if self.run_code(code, result):


Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.83887
[200]	valid_0's rmse: 3.83841
[300]	valid_0's rmse: 3.83818
[400]	valid_0's rmse: 3.83805
[500]	valid_0's rmse: 3.83799
[600]	valid_0's rmse: 3.83795
[700]	valid_0's rmse: 3.83795
[800]	valid_0's rmse: 3.83793
[900]	valid_0's rmse: 3.83791
[1000]	valid_0's rmse: 3.83792
Early stopping, best iteration is:
[910]	valid_0's rmse: 3.83791
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.8545
[200]	valid_0's rmse: 3.85419
[300]	valid_0's rmse: 3.8541
[400]	valid_0's rmse: 3.85409
Early stopping, best iteration is:
[338]	valid_0's rmse: 3.85408
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.85565
[200]	valid_0's rmse: 3.85554
[300]	valid_0's rmse: 3.85555
Early stopping, best iteration is:
[209]	valid_0's rmse: 3.85553
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.85133
[200]	valid_0'

[100]	valid_0's rmse: 3.83887
[200]	valid_0's rmse: 3.83841
[300]	valid_0's rmse: 3.83818
[400]	valid_0's rmse: 3.83805
[500]	valid_0's rmse: 3.83799
[600]	valid_0's rmse: 3.83795
[700]	valid_0's rmse: 3.83795
[800]	valid_0's rmse: 3.83793
[900]	valid_0's rmse: 3.83791
[1000]	valid_0's rmse: 3.83792
Early stopping, best iteration is:
[910]	valid_0's rmse: 3.83791
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.8545
[200]	valid_0's rmse: 3.85419
[300]	valid_0's rmse: 3.8541
[400]	valid_0's rmse: 3.85409
Early stopping, best iteration is:
[338]	valid_0's rmse: 3.85408
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.85565
[200]	valid_0's rmse: 3.85554
[300]	valid_0's rmse: 3.85555
Early stopping, best iteration is:
[209]	valid_0's rmse: 3.85553
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.85133
[200]	valid_0's rmse: 3.85104
[300]	valid_0's rmse: 3.85094
[400]	valid_0's r

Early stopping, best iteration is:
[56]	valid_0's rmse: 3.85441
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.85482
Early stopping, best iteration is:
[60]	valid_0's rmse: 3.85448
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.85114
Early stopping, best iteration is:
[54]	valid_0's rmse: 3.85065
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.8476
Early stopping, best iteration is:
[40]	valid_0's rmse: 3.84697
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.83384
[200]	valid_0's rmse: 3.83527
Early stopping, best iteration is:
[106]	valid_0's rmse: 3.83379
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.8512
Early stopping, best iteration is:
[68]	valid_0's rmse: 3.85106
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.85158
Early stopping, best iteratio

Early stopping, best iteration is:
[56]	valid_0's rmse: 3.83947
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.8553
Early stopping, best iteration is:
[55]	valid_0's rmse: 3.85503
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.85612
Early stopping, best iteration is:
[50]	valid_0's rmse: 3.85575
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.85101
Early stopping, best iteration is:
[79]	valid_0's rmse: 3.85092
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.84799
Early stopping, best iteration is:
[51]	valid_0's rmse: 3.84755
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.83929
Early stopping, best iteration is:
[88]	valid_0's rmse: 3.83918
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.85494
Early stopping, best iteration is:
[65]	valid_0's rmse: 3.8

  if self.run_code(code, result):


Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.83887
[200]	valid_0's rmse: 3.83841
[300]	valid_0's rmse: 3.83818
[400]	valid_0's rmse: 3.83805
[500]	valid_0's rmse: 3.83799
[600]	valid_0's rmse: 3.83795
[700]	valid_0's rmse: 3.83795
[800]	valid_0's rmse: 3.83793
[900]	valid_0's rmse: 3.83791
[1000]	valid_0's rmse: 3.83792
Early stopping, best iteration is:
[910]	valid_0's rmse: 3.83791
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.8545
[200]	valid_0's rmse: 3.85419
[300]	valid_0's rmse: 3.8541
[400]	valid_0's rmse: 3.85409
Early stopping, best iteration is:
[338]	valid_0's rmse: 3.85408
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.85565
[200]	valid_0's rmse: 3.85554
[300]	valid_0's rmse: 3.85555
Early stopping, best iteration is:
[209]	valid_0's rmse: 3.85553
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.85133
[200]	valid_0'

[100]	valid_0's rmse: 3.84634
Early stopping, best iteration is:
[70]	valid_0's rmse: 3.84611
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.83693
Early stopping, best iteration is:
[90]	valid_0's rmse: 3.83674
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.85456
Early stopping, best iteration is:
[71]	valid_0's rmse: 3.85392
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.85477
Early stopping, best iteration is:
[55]	valid_0's rmse: 3.85404
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.85007
Early stopping, best iteration is:
[68]	valid_0's rmse: 3.8498
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.8465
Early stopping, best iteration is:
[63]	valid_0's rmse: 3.84613
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.8396
Early stopping, best iteration 