In [1]:
import numpy as np
import pandas as pd
import datetime
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
import os
import warnings
warnings.filterwarnings('ignore')

In [2]:
%%time
path="D:\Python\Elo"
df_train = pd.read_csv(os.path.join(path,'main.csv'))
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df
df_train=reduce_mem_usage(df_train)
hist_data=pd.read_csv(os.path.join(path,'hard_data_hist.csv'))
df_train=df_train.merge(hist_data,how='left',on='card_id')
new_data=pd.read_csv(os.path.join(path,'hard_data_new.csv'))
df_train=df_train.merge(new_data,how='left',on='card_id')

hist_data=pd.read_csv(os.path.join(path,'hist_last_update.csv'))
df_train=df_train.merge(hist_data,how='left',on='card_id')
new_data=pd.read_csv(os.path.join(path,'new_last_update.csv'))
df_train=df_train.merge(new_data,how='left',on='card_id')
def ratio_month(df):
    cl_one=[a for a in df.columns if 'hist_month' in str(a)]
    cl_two=[a for a in df.columns if 'new_month' in str(a)]
    for one,two in zip(cl_one,cl_two):
        df['new_'+one+'_ratio']=df[two]/df[one]
    return df
df_train=ratio_month(df_train)
df_train.to_csv(os.path.join(path,'result.csv'))
#selector=['hist_month_lag_mean', 'hist_month_lag_var', 'hist_month_diff_mean',
#       'hist_fathers_day_2017_mean', 'hist_duration_min',
#       'hist_purchase_date_diff', 'hist_purchase_date_uptonow','card_id','target']
#df_train=df_train[selector]
#del hist_data; new_data

Mem. usage decreased to 151.74 Mb (66.3% reduction)
Wall time: 3min 35s


In [31]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import LabelEncoder
lb=LabelEncoder()
obj=list(df_train.select_dtypes(include=['object']).columns)[2:]
for o in obj:
    df_train[o]=lb.fit_transform(df_train[o])
df_train=df_train.replace([np.inf, -np.inf], np.nan)
df_train=df_train.fillna(0)
df_train['outliers'] = 0
df_train.loc[df_train['target'] < -30, 'outliers'] = 1
all_df_train_columns = [c for c in df_train.columns if c not in ['card_id', 'first_active_month','target','outliers']]
best_factor=[all_df_train_columns[1]]
best_score=9000
for fact in all_df_train_columns[2:]:
    df_train_columns=best_factor.copy()
    df_train_columns.append(fact)
    rskf=StratifiedKFold(5,shuffle=True,random_state=4590)
    val_pr=np.zeros(len(df_train))
    for train_index,val_index in rskf.split(df_train,df_train['outliers'].values):
    #    train_data=lgb.Dataset(df_train[df_train_columns].loc[train_index],label=df_train['target'].loc[train_index])
    #    val_data=lgb.Dataset(df_train[df_train_columns].loc[val_index],label=df_train['target'].loc[val_index])
        model=Ridge()
        model.fit(df_train[df_train_columns].loc[train_index],df_train['target'].loc[train_index])
        val_pr[val_index]=model.predict(df_train[df_train_columns].loc[val_index])
    score=np.sqrt(mean_squared_error(val_pr,df_train['target']))
    if score<best_score:
        best_factor.append(fact)
        best_score=score

In [5]:
df_train['outliers'] = 0
df_train.loc[df_train['target'] < -30, 'outliers'] = 1
obj=list(df_train.select_dtypes(include=['object']).columns)[2:]
from sklearn.preprocessing import LabelEncoder
lb=LabelEncoder()
for o in obj:
    df_train[o]=lb.fit_transform(df_train[o]) 
df_train_columns = [c for c in df_train.columns if c not in ['card_id', 'first_active_month','target','outliers','Unnamed: 0','Unnamed: 0_x']]
param = {'objective': 'regression_l2', 
    'metric': 'rmse',
            'boosting_type': 'gbdt', 
            'n_jobs': 4, 'max_depth': 18, 
            'n_estimators': 6100, 
            'subsample_freq': 2, 
            'min_data_per_group': 100, 
            'max_cat_to_onehot': 4, 
            'cat_l2': 10.0, 
            'cat_smooth': 10.0, 
            'max_cat_threshold': 32, 
            'metric_freq': 10, 
            'verbosity': -1, 
            'colsample_bytree': 0.5, 
            'learning_rate': 0.0061033234451294376, 
            'min_child_samples': 20, 
            'min_child_weight': 9.0, 
            'min_split_gain': 1e-06, 
            'num_leaves': 36, 
            'reg_alpha': 40.0, 
            'reg_lambda': 13.3, 
            'subsample': 0.9}
rskf=StratifiedKFold(5,shuffle=True,random_state=4590)
val_pr=np.zeros(len(df_train))
feature_importance_df = pd.DataFrame()
#test_pr=np.zeros(len(df_test))
for train_index,val_index in rskf.split(df_train,df_train['outliers'].values):
    train_data=lgb.Dataset(df_train[df_train_columns].loc[train_index],label=df_train['target'].loc[train_index])
    val_data=lgb.Dataset(df_train[df_train_columns].loc[val_index],label=df_train['target'].loc[val_index])
    num_round = 10000
    model=lgb.train(param,train_data,num_round,valid_sets=[train_data,val_data],verbose_eval=100,early_stopping_rounds=400)
    fold_importance_df=pd.DataFrame()
    fold_importance_df['feature'] = df_train_columns
    fold_importance_df['importance']=model.feature_importance()
    feature_importance_df=pd.concat([feature_importance_df,fold_importance_df],axis=0)
    
    val_pr[val_index]=model.predict(df_train[df_train_columns].loc[val_index],num_iteration=model.best_iteration)
np.sqrt(mean_squared_error(val_pr,df_train['target']))

Training until validation scores don't improve for 400 rounds.
[100]	training's rmse: 3.7993	valid_1's rmse: 3.81975
[200]	training's rmse: 3.77654	valid_1's rmse: 3.80353
[300]	training's rmse: 3.76332	valid_1's rmse: 3.79703
[400]	training's rmse: 3.75406	valid_1's rmse: 3.79459
[500]	training's rmse: 3.74649	valid_1's rmse: 3.7937
[600]	training's rmse: 3.73988	valid_1's rmse: 3.79363
[700]	training's rmse: 3.73381	valid_1's rmse: 3.79365
[800]	training's rmse: 3.72825	valid_1's rmse: 3.79364
[900]	training's rmse: 3.72297	valid_1's rmse: 3.79378
Early stopping, best iteration is:
[581]	training's rmse: 3.741	valid_1's rmse: 3.79358
Training until validation scores don't improve for 400 rounds.
[100]	training's rmse: 3.80269	valid_1's rmse: 3.80616
[200]	training's rmse: 3.78011	valid_1's rmse: 3.78984
[300]	training's rmse: 3.76683	valid_1's rmse: 3.78344
[400]	training's rmse: 3.75758	valid_1's rmse: 3.7811
[500]	training's rmse: 3.74998	valid_1's rmse: 3.78028
[600]	training's rm

3.783163030815591

In [8]:
# one model 3.644954699945824
# last 3.6441864880195727
df_train['outliers'] = 0
df_train.loc[df_train['target'] < -30, 'outliers'] = 1
obj=list(df_train.select_dtypes(include=['object']).columns)[2:]
from sklearn.preprocessing import LabelEncoder
lb=LabelEncoder()
for o in obj:
    df_train[o]=lb.fit_transform(df_train[o]) 
df_train_columns = [c for c in df_train.columns if c not in ['card_id', 'first_active_month','target','outliers','Unnamed: 0','Unnamed: 0_x']]
param = {'objective': 'regression_l2', 
            'boosting_type': 'gbdt', 
            'n_jobs': 4, 'max_depth': 7, 
            'n_estimators': 20000, 
            'subsample_freq': 2, 
            'subsample_for_bin': 200000, 
            'min_data_per_group': 100, 
            'max_cat_to_onehot': 4, 
            'cat_l2': 10.0, 
            'cat_smooth': 10.0, 
            'max_cat_threshold': 32, 
            'metric_freq': 10, 
            'verbosity': -1, 
            'metric': 'rmse', 
            'colsample_bytree': 0.5, 
            'learning_rate': 0.0061033234451294376, 
            'min_child_samples': 80, 
            'min_child_weight': 100.0, 
            'min_split_gain': 1e-06, 
            'num_leaves': 47, 
            'reg_alpha': 10.0, 
            'reg_lambda': 10.0, 
            'subsample': 0.9}
from catboost import CatBoostClassifier, Pool
rskf=StratifiedKFold(11,shuffle=True,random_state=4590)
val_pr=np.zeros(len(df_train))
feature_importance_df = pd.DataFrame()
#test_pr=np.zeros(len(df_test))
for train_index,val_index in rskf.split(df_train,df_train['outliers'].values):
    train_data=lgb.Dataset(df_train[df_train_columns].loc[train_index],label=df_train['target'].loc[train_index])
    val_data=lgb.Dataset(df_train[df_train_columns].loc[val_index],label=df_train['target'].loc[val_index])
    num_round = 10000
    
    pool=Pool(df_train[df_train_columns].loc[train_index],df_train['outliers'].loc[train_index])
    val_pool=Pool(df_train[df_train_columns].loc[val_index],df_train['outliers'].loc[val_index])
    model = CatBoostClassifier(iterations=1000,learning_rate=0.01,verbose=0, loss_function='CrossEntropy',eval_metric='AUC')
    model.fit(pool,eval_set=val_pool,use_best_model=True,verbose_eval=False)
    cat_pr=model.predict(df_train[df_train_columns].loc[val_index],prediction_type="Probability")[:,1]
    
    model=lgb.train(param,train_data,num_round,valid_sets=[train_data,val_data],verbose_eval=100,early_stopping_rounds=400)
    lgb_pr=model.predict(df_train[df_train_columns].loc[val_index],num_iteration=model.best_iteration)
    prediction=pd.DataFrame({'cat_pr':cat_pr,'lgb_pr':lgb_pr,'target':df_train['target'].loc[val_index]})
    thresholds = []
    for thresh in np.arange(0.01, 1, 0.01):
        thresh = np.round(thresh, 2)
        pr=np.where(prediction['cat_pr']>thresh,-33.22,prediction['lgb_pr'])
        res = np.sqrt(mean_squared_error(pr,prediction['target'].values))
        thresholds.append([thresh, res])
    thresholds.sort(key=lambda x: x[1], reverse=False)
    best_thresh = thresholds[0][0]
    
    val_pr[val_index]=np.where(prediction['cat_pr']>best_thresh,-33.22,prediction['lgb_pr'])
    
np.sqrt(mean_squared_error(val_pr,df_train['target']))

Training until validation scores don't improve for 400 rounds.
[100]	training's rmse: 3.71247	valid_1's rmse: 3.74918
[200]	training's rmse: 3.64187	valid_1's rmse: 3.6996
[300]	training's rmse: 3.59826	valid_1's rmse: 3.6771
[400]	training's rmse: 3.56789	valid_1's rmse: 3.6653
[500]	training's rmse: 3.54509	valid_1's rmse: 3.65721
[600]	training's rmse: 3.52739	valid_1's rmse: 3.65263
[700]	training's rmse: 3.51212	valid_1's rmse: 3.64931
[800]	training's rmse: 3.49854	valid_1's rmse: 3.6471
[900]	training's rmse: 3.48566	valid_1's rmse: 3.64532
[1000]	training's rmse: 3.47426	valid_1's rmse: 3.64405
[1100]	training's rmse: 3.46305	valid_1's rmse: 3.64288
[1200]	training's rmse: 3.45268	valid_1's rmse: 3.64253
[1300]	training's rmse: 3.44319	valid_1's rmse: 3.64186
[1400]	training's rmse: 3.43309	valid_1's rmse: 3.64121
[1500]	training's rmse: 3.42311	valid_1's rmse: 3.64068
[1600]	training's rmse: 3.41422	valid_1's rmse: 3.64057
[1700]	training's rmse: 3.40548	valid_1's rmse: 3.6401

[2700]	training's rmse: 3.32628	valid_1's rmse: 3.59808
[2800]	training's rmse: 3.31803	valid_1's rmse: 3.5981
Early stopping, best iteration is:
[2417]	training's rmse: 3.34991	valid_1's rmse: 3.59705
Training until validation scores don't improve for 400 rounds.
[100]	training's rmse: 3.70869	valid_1's rmse: 3.77408
[200]	training's rmse: 3.63713	valid_1's rmse: 3.73756
[300]	training's rmse: 3.59327	valid_1's rmse: 3.72126
[400]	training's rmse: 3.56254	valid_1's rmse: 3.71282
[500]	training's rmse: 3.53935	valid_1's rmse: 3.70718
[600]	training's rmse: 3.52077	valid_1's rmse: 3.70371
[700]	training's rmse: 3.50521	valid_1's rmse: 3.70121
[800]	training's rmse: 3.49152	valid_1's rmse: 3.6996
[900]	training's rmse: 3.47936	valid_1's rmse: 3.69843
[1000]	training's rmse: 3.46809	valid_1's rmse: 3.69755
[1100]	training's rmse: 3.45717	valid_1's rmse: 3.6972
[1200]	training's rmse: 3.44641	valid_1's rmse: 3.69665
[1300]	training's rmse: 3.43625	valid_1's rmse: 3.69627
[1400]	training's 

[1100]	training's rmse: 3.45571	valid_1's rmse: 3.67902
[1200]	training's rmse: 3.44535	valid_1's rmse: 3.67924
Early stopping, best iteration is:
[870]	training's rmse: 3.4829	valid_1's rmse: 3.67816


3.643498400318252

In [5]:
#best score with catboost feature 3.643498400318252
#df_train['outliers'] = 0
#df_train.loc[df_train['target'] < -30, 'outliers'] = 1
best_pr=val_pr.copy()
obj=list(df_train.select_dtypes(include=['object']).columns)[2:]
from sklearn.preprocessing import LabelEncoder
lb=LabelEncoder()
for o in obj:
    df_train[o]=lb.fit_transform(df_train[o]) 
df_train_columns = [c for c in df_train.columns if c not in ['card_id', 'first_active_month','target','outliers','Unnamed: 0']]
param = {'objective': 'regression_l2', 
    'metric': 'rmse',
            'boosting_type': 'gbdt', 
            'n_jobs': 4, 'max_depth': 18, 
            'n_estimators': 6100, 
            'subsample_freq': 2, 
            'min_data_per_group': 100, 
            'max_cat_to_onehot': 4, 
            'cat_l2': 10.0, 
            'cat_smooth': 10.0, 
            'max_cat_threshold': 32, 
            'metric_freq': 10, 
            'verbosity': -1, 
            'colsample_bytree': 0.5, 
            'learning_rate': 0.0061033234451294376, 
            'min_child_samples': 20, 
            'min_child_weight': 9.0, 
            'min_split_gain': 1e-06, 
            'num_leaves': 36, 
            'reg_alpha': 40.0, 
            'reg_lambda': 13.3, 
            'subsample': 0.9}
from catboost import CatBoostClassifier, Pool
rskf=StratifiedKFold(11,shuffle=True,random_state=4590)
val_pr=np.zeros(len(df_train))
feature_importance_df = pd.DataFrame()
#test_pr=np.zeros(len(df_test))
for train_index,val_index in rskf.split(df_train,df_train['outliers'].values):
    num_round = 10000
    train_data=lgb.Dataset(df_train[df_train_columns].loc[train_index],label=df_train['outliers'].loc[train_index])
    val_data=lgb.Dataset(df_train[df_train_columns].loc[val_index],label=df_train['outliers'].loc[val_index])
    params={
        'num_leaves': 31,
         'n_jobs': 4,
         'min_data_in_leaf': 30, 
         'objective':'binary',
         'max_depth': 6,
         'learning_rate': 0.01,
         "boosting": "rf",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'binary_logloss',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "random_state": 2333
    }
    model=lgb.train(params,train_data,num_round,valid_sets=[train_data,val_data],verbose_eval=100,early_stopping_rounds=200)
    cat_pr=model.predict(df_train[df_train_columns].loc[val_index],num_iteration=model.best_iteration)
    
    train_data=lgb.Dataset(df_train[df_train_columns].loc[train_index],label=df_train['target'].loc[train_index])
    val_data=lgb.Dataset(df_train[df_train_columns].loc[val_index],label=df_train['target'].loc[val_index])
    model=lgb.train(param,train_data,num_round,valid_sets=[train_data,val_data],verbose_eval=100,early_stopping_rounds=400)
    lgb_pr=model.predict(df_train[df_train_columns].loc[val_index],num_iteration=model.best_iteration)
    prediction=pd.DataFrame({'cat_pr':cat_pr,'lgb_pr':lgb_pr,'target':df_train['target'].loc[val_index]})
    thresholds = []
    for thresh in np.arange(0.01, 1, 0.01):
        thresh = np.round(thresh, 2)
        pr=np.where(prediction['cat_pr']>thresh,-33.22,prediction['lgb_pr'])
        res = np.sqrt(mean_squared_error(pr,prediction['target'].values))
        thresholds.append([thresh, res])
    thresholds.sort(key=lambda x: x[1], reverse=False)
    best_thresh = thresholds[0][0]
    
    val_pr[val_index]=np.where(prediction['cat_pr']>best_thresh,-33.22,prediction['lgb_pr'])
    
np.sqrt(mean_squared_error(val_pr,df_train['target']))

Training until validation scores don't improve for 200 rounds.
[100]	training's binary_logloss: 0.0442728	valid_1's binary_logloss: 0.0449508
[200]	training's binary_logloss: 0.0442359	valid_1's binary_logloss: 0.0449403
Early stopping, best iteration is:
[57]	training's binary_logloss: 0.0442299	valid_1's binary_logloss: 0.0448892
Training until validation scores don't improve for 400 rounds.
[100]	training's rmse: 3.71265	valid_1's rmse: 3.74979
[200]	training's rmse: 3.64397	valid_1's rmse: 3.70293
[300]	training's rmse: 3.59972	valid_1's rmse: 3.68022
[400]	training's rmse: 3.5689	valid_1's rmse: 3.66885
[500]	training's rmse: 3.54513	valid_1's rmse: 3.66168
[600]	training's rmse: 3.52562	valid_1's rmse: 3.65669
[700]	training's rmse: 3.50936	valid_1's rmse: 3.65363
[800]	training's rmse: 3.49437	valid_1's rmse: 3.65141
[900]	training's rmse: 3.48028	valid_1's rmse: 3.64934
[1000]	training's rmse: 3.46741	valid_1's rmse: 3.64762
[1100]	training's rmse: 3.45501	valid_1's rmse: 3.646

Training until validation scores don't improve for 200 rounds.
[100]	training's binary_logloss: 0.0434539	valid_1's binary_logloss: 0.0473659
[200]	training's binary_logloss: 0.0434244	valid_1's binary_logloss: 0.0473453
[300]	training's binary_logloss: 0.0434177	valid_1's binary_logloss: 0.0473318
[400]	training's binary_logloss: 0.0433987	valid_1's binary_logloss: 0.0473091
[500]	training's binary_logloss: 0.0434052	valid_1's binary_logloss: 0.0473238
Early stopping, best iteration is:
[392]	training's binary_logloss: 0.0433992	valid_1's binary_logloss: 0.0473049
Training until validation scores don't improve for 400 rounds.
[100]	training's rmse: 3.71504	valid_1's rmse: 3.73092
[200]	training's rmse: 3.64335	valid_1's rmse: 3.6914
[300]	training's rmse: 3.59728	valid_1's rmse: 3.67346
[400]	training's rmse: 3.56458	valid_1's rmse: 3.66518
[500]	training's rmse: 3.53994	valid_1's rmse: 3.65998
[600]	training's rmse: 3.52098	valid_1's rmse: 3.65657
[700]	training's rmse: 3.50427	valid

3.6472082581786145

In [25]:
#df_train['hist_purchase_amount_sum']
#df_train['hist_purchase_date_uptonow']
#df_train['hist_card_id_size']
best_pr_val

array([0., 0., 0., ..., 0., 0., 0.])

In [24]:
#best score with catboost feature 3.643498400318252
#df_train['outliers'] = 0
#df_train.loc[df_train['target'] < -30, 'outliers'] = 1
best_pr=val_pr.copy()
obj=list(df_train.select_dtypes(include=['object']).columns)[2:]
from sklearn.preprocessing import LabelEncoder
lb=LabelEncoder()
for o in obj:
    df_train[o]=lb.fit_transform(df_train[o]) 
df_train_columns = [c for c in df_train.columns if c not in ['card_id', 'first_active_month','target','outliers','Unnamed: 0']]
param = {'objective': 'regression_l2', 
    'metric': 'rmse',
            'boosting_type': 'gbdt', 
            'n_jobs': 4, 'max_depth': 18, 
            'n_estimators': 6100, 
            'subsample_freq': 2, 
            'min_data_per_group': 100, 
            'max_cat_to_onehot': 4, 
            'cat_l2': 10.0, 
            'cat_smooth': 10.0, 
            'max_cat_threshold': 32, 
            'metric_freq': 10, 
            'verbosity': -1, 
            'colsample_bytree': 0.5, 
            'learning_rate': 0.0061033234451294376, 
            'min_child_samples': 20, 
            'min_child_weight': 9.0, 
            'min_split_gain': 1e-06, 
            'num_leaves': 36, 
            'reg_alpha': 40.0, 
            'reg_lambda': 13.3, 
            'subsample': 0.9}
from catboost import CatBoostClassifier, Pool
rskf=StratifiedKFold(5,shuffle=True,random_state=4590)
val_pr=np.zeros(len(df_train))
feature_importance_df = pd.DataFrame()
#test_pr=np.zeros(len(df_test))
for train_index,val_index in rskf.split(df_train,df_train['outliers'].values):
    num_round = 10000
    train_data=lgb.Dataset(df_train[df_train_columns].loc[train_index],label=df_train['outliers'].loc[train_index])
    val_data=lgb.Dataset(df_train[df_train_columns].loc[val_index],label=df_train['outliers'].loc[val_index])
    best_pr_val=best_pr[val_index]
    params={'n_estimators': 6100, 
             'n_jobs': 4,
        'num_leaves': 31,
         'min_data_in_leaf': 30, 
         'objective':'binary',
         'max_depth': 12,
         'learning_rate': 0.01,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'binary_logloss',
         "lambda_l1": 30,
         "verbosity": -1,
         "random_state": 2333
    }
    model=lgb.train(params,train_data,num_round,valid_sets=[train_data,val_data],verbose_eval=100,early_stopping_rounds=400)
    cat_pr=model.predict(df_train[df_train_columns].loc[val_index],num_iteration=model.best_iteration)
    
    train_data=lgb.Dataset(df_train[df_train_columns].loc[train_index][df_train['outliers']==0],label=df_train['target'].loc[train_index][df_train['outliers']==0])
    val_data=lgb.Dataset(df_train[df_train_columns].loc[val_index][df_train['outliers']==0],label=df_train['target'].loc[val_index][df_train['outliers']==0])
    model=lgb.train(param,train_data,num_round,valid_sets=[train_data,val_data],verbose_eval=100,early_stopping_rounds=400)
    lgb_pr=model.predict(df_train[df_train_columns].loc[val_index],num_iteration=model.best_iteration)
    prediction=pd.DataFrame({'cat_pr':cat_pr,'lgb_pr':lgb_pr,'target':df_train['target'].loc[val_index]})
    thresholds = []
    for thresh in np.arange(0.01, 1, 0.01):
        thresh = np.round(thresh, 2)
        pr=np.where(prediction['cat_pr']>thresh,best_pr_val,prediction['lgb_pr'])
        res = np.sqrt(mean_squared_error(pr,prediction['target'].values))
        thresholds.append([thresh, res])
    thresholds.sort(key=lambda x: x[1], reverse=False)
    best_thresh = thresholds[0][0]
    
    val_pr[val_index]=np.where(prediction['cat_pr']>best_thresh,best_pr_val,prediction['lgb_pr'])
    
np.sqrt(mean_squared_error(val_pr,df_train['target']))

Training until validation scores don't improve for 400 rounds.
[100]	training's binary_logloss: 0.045606	valid_1's binary_logloss: 0.0470904
[200]	training's binary_logloss: 0.0426354	valid_1's binary_logloss: 0.0449755
[300]	training's binary_logloss: 0.0410832	valid_1's binary_logloss: 0.0442041
[400]	training's binary_logloss: 0.0399629	valid_1's binary_logloss: 0.0438532
[500]	training's binary_logloss: 0.0390839	valid_1's binary_logloss: 0.0436931
[600]	training's binary_logloss: 0.0383157	valid_1's binary_logloss: 0.0436038
[700]	training's binary_logloss: 0.0376569	valid_1's binary_logloss: 0.0435639
[800]	training's binary_logloss: 0.0370513	valid_1's binary_logloss: 0.0435445
[900]	training's binary_logloss: 0.0365494	valid_1's binary_logloss: 0.0435432
[1000]	training's binary_logloss: 0.0361064	valid_1's binary_logloss: 0.0435355
[1100]	training's binary_logloss: 0.0356762	valid_1's binary_logloss: 0.0435396
[1200]	training's binary_logloss: 0.0352959	valid_1's binary_loglos

[3600]	training's rmse: 1.46594	valid_1's rmse: 1.54864
[3700]	training's rmse: 1.4639	valid_1's rmse: 1.54862
[3800]	training's rmse: 1.46192	valid_1's rmse: 1.5486
[3900]	training's rmse: 1.45991	valid_1's rmse: 1.54856
[4000]	training's rmse: 1.45795	valid_1's rmse: 1.54856
[4100]	training's rmse: 1.45597	valid_1's rmse: 1.54853
[4200]	training's rmse: 1.45398	valid_1's rmse: 1.54849
[4300]	training's rmse: 1.45207	valid_1's rmse: 1.54851
[4400]	training's rmse: 1.45009	valid_1's rmse: 1.54847
[4500]	training's rmse: 1.44815	valid_1's rmse: 1.54844
[4600]	training's rmse: 1.44617	valid_1's rmse: 1.54844
[4700]	training's rmse: 1.44421	valid_1's rmse: 1.54845
[4800]	training's rmse: 1.44232	valid_1's rmse: 1.54844
[4900]	training's rmse: 1.44042	valid_1's rmse: 1.54843
[5000]	training's rmse: 1.43852	valid_1's rmse: 1.54839
[5100]	training's rmse: 1.43662	valid_1's rmse: 1.54839
[5200]	training's rmse: 1.43479	valid_1's rmse: 1.54834
[5300]	training's rmse: 1.43286	valid_1's rmse: 1.

[2700]	training's rmse: 1.48517	valid_1's rmse: 1.54921
[2800]	training's rmse: 1.48297	valid_1's rmse: 1.5491
[2900]	training's rmse: 1.48081	valid_1's rmse: 1.54898
[3000]	training's rmse: 1.47862	valid_1's rmse: 1.54888
[3100]	training's rmse: 1.4765	valid_1's rmse: 1.54874
[3200]	training's rmse: 1.47443	valid_1's rmse: 1.54863
[3300]	training's rmse: 1.47237	valid_1's rmse: 1.54859
[3400]	training's rmse: 1.47036	valid_1's rmse: 1.54854
[3500]	training's rmse: 1.46829	valid_1's rmse: 1.54853
[3600]	training's rmse: 1.46628	valid_1's rmse: 1.54848
[3700]	training's rmse: 1.46426	valid_1's rmse: 1.54844
[3800]	training's rmse: 1.46226	valid_1's rmse: 1.54841
[3900]	training's rmse: 1.46022	valid_1's rmse: 1.54837
[4000]	training's rmse: 1.45822	valid_1's rmse: 1.54834
[4100]	training's rmse: 1.45631	valid_1's rmse: 1.54832
[4200]	training's rmse: 1.45436	valid_1's rmse: 1.54836
[4300]	training's rmse: 1.4524	valid_1's rmse: 1.54834
[4400]	training's rmse: 1.45047	valid_1's rmse: 1.5

3.8000297111508057

In [12]:
#from boruta import BorutaPy
obj=list(df_train.select_dtypes(include=['object']).columns)[2:]
from sklearn.preprocessing import LabelEncoder
lb=LabelEncoder()
for o in obj:
    df_train[o]=lb.fit_transform(df_train[o]) 
df_train_columns = [c for c in df_train.columns if c not in ['card_id', 'first_active_month','target','outliers','Unnamed: 0']]
df_train=df_train.replace([np.inf, -np.inf], np.nan)
df_train=df_train.fillna(0)
from lightgbm import LGBMRegressor
lgbmclf = LGBMRegressor(boosting_type='gbdt', objective='regression', num_iteration=1000, num_leaves=31,
                        max_depth=7, learning_rate=0.01, feature_fraction= 0.9,
                       bagging_freq= 1, bagging_fraction= 0.9,bagging_seed= 11, metric= 'rmse', lambda_l1=10)
borutaselector = BorutaPy(lgbmclf, n_estimators='auto', verbose=2) 
borutaselector.fit(df_train[df_train_columns].values, df_train['target'].values)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	288
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	288
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	288
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	288
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	288
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	288
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	288
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	33
Tentative: 	31
Rejected: 	224
Iteration: 	9 / 100
Confirmed: 	33
Tentative: 	31
Rejected: 	224
Iteration: 	10 / 100
Confirmed: 	33
Tentative: 	31
Rejected: 	224
Iteration: 	11 / 100
Confirmed: 	33
Tentative: 	31
Rejected: 	224
Iteration: 	12 / 100
Confirmed: 	34
Tentative: 	30
Rejected: 	224
Iteration: 	13 / 100
Confirmed: 	34
Tentative: 	30
Rejected: 	224
Iteration: 	14 / 100
Confirmed: 	34
Tentative: 	30
Rejected: 	224
Iteration: 	15 / 100
Confirmed: 	34
Tentative: 	29
Rejected: 	225
Iteration: 	16 / 100
Confirmed: 	

KeyboardInterrupt: 

In [7]:

from __future__ import print_function, division
import numpy as np
import scipy as sp
from sklearn.utils import check_random_state, check_X_y
from sklearn.base import TransformerMixin, BaseEstimator


class BorutaPy(BaseEstimator, TransformerMixin):
    """
    Improved Python implementation of the Boruta R package.

    The improvements of this implementation include:
    - Faster run times:
        Thanks to scikit-learn's fast implementation of the ensemble methods.
    - Scikit-learn like interface:
        Use BorutaPy just like any other scikit learner: fit, fit_transform and
        transform are all implemented in a similar fashion.
    - Modularity:
        Any ensemble method could be used: random forest, extra trees
        classifier, even gradient boosted trees.
    - Two step correction:
        The original Boruta code corrects for multiple testing in an overly
        conservative way. In this implementation, the Benjamini Hochberg FDR is
        used to correct in each iteration across active features. This means
        only those features are included in the correction which are still in
        the selection process. Following this, each that passed goes through a
        regular Bonferroni correction to check for the repeated testing over
        the iterations.
    - Percentile:
        Instead of using the max values of the shadow features the user can
        specify which percentile to use. This gives a finer control over this
        crucial parameter. For more info, please read about the perc parameter.
    - Automatic tree number:
        Setting the n_estimator to 'auto' will calculate the number of trees
        in each itartion based on the number of features under investigation.
        This way more trees are used when the training data has many feautres
        and less when most of the features have been rejected.
    - Ranking of features:
        After fitting BorutaPy it provides the user with ranking of features.
        Confirmed ones are 1, Tentatives are 2, and the rejected are ranked
        starting from 3, based on their feautre importance history through
        the iterations.

    We highly recommend using pruned trees with a depth between 3-7.

    For more, see the docs of these functions, and the examples below.

    Original code and method by: Miron B Kursa, https://m2.icm.edu.pl/boruta/

    Boruta is an all relevant feature selection method, while most other are
    minimal optimal; this means it tries to find all features carrying
    information usable for prediction, rather than finding a possibly compact
    subset of features on which some classifier has a minimal error.

    Why bother with all relevant feature selection?
    When you try to understand the phenomenon that made your data, you should
    care about all factors that contribute to it, not just the bluntest signs
    of it in context of your methodology (yes, minimal optimal set of features
    by definition depends on your classifier choice).

    Parameters
    ----------

    estimator : object
        A supervised learning estimator, with a 'fit' method that returns the
        feature_importances_ attribute. Important features must correspond to
        high absolute values in the feature_importances_.

    n_estimators : int or string, default = 1000
        If int sets the number of estimators in the chosen ensemble method.
        If 'auto' this is determined automatically based on the size of the
        dataset. The other parameters of the used estimators need to be set
        with initialisation.

    perc : int, default = 100
        Instead of the max we use the percentile defined by the user, to pick
        our threshold for comparison between shadow and real features. The max
        tend to be too stringent. This provides a finer control over this. The
        lower perc is the more false positives will be picked as relevant but
        also the less relevant features will be left out. The usual trade-off.
        The default is essentially the vanilla Boruta corresponding to the max.

    alpha : float, default = 0.05
        Level at which the corrected p-values will get rejected in both
        correction steps.

    two_step : Boolean, default = True
        If you want to use the original implementation of Boruta with Bonferroni
        correction only set this to False.

    max_iter : int, default = 100
        The number of maximum iterations to perform.

    random_state : int, RandomState instance or None; default=None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    verbose : int, default=0
        Controls verbosity of output:
        - 0: no output
        - 1: displays iteration number
        - 2: which features have been selected already

    Attributes
    ----------

    n_features_ : int
        The number of selected features.

    support_ : array of shape [n_features]

        The mask of selected features - only confirmed ones are True.

    support_weak_ : array of shape [n_features]

        The mask of selected tentative features, which haven't gained enough
        support during the max_iter number of iterations..

    ranking_ : array of shape [n_features]

        The feature ranking, such that ``ranking_[i]`` corresponds to the
        ranking position of the i-th feature. Selected (i.e., estimated
        best) features are assigned rank 1 and tentative features are assigned
        rank 2.

    Examples
    --------
    
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from boruta import BorutaPy
    
    # load X and y
    # NOTE BorutaPy accepts numpy arrays only, hence the .values attribute
    X = pd.read_csv('examples/test_X.csv', index_col=0).values
    y = pd.read_csv('examples/test_y.csv', header=None, index_col=0).values
    y = y.ravel()
    
    # define random forest classifier, with utilising all cores and
    # sampling in proportion to y labels
    rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
    
    # define Boruta feature selection method
    feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1)
    
    # find all relevant features - 5 features should be selected
    feat_selector.fit(X, y)
    
    # check selected features - first 5 features are selected
    feat_selector.support_
    
    # check ranking of features
    feat_selector.ranking_
    
    # call transform() on X to filter it down to selected features
    X_filtered = feat_selector.transform(X)

    References
    ----------

    [1] Kursa M., Rudnicki W., "Feature Selection with the Boruta Package"
        Journal of Statistical Software, Vol. 36, Issue 11, Sep 2010
    """

    def __init__(self, estimator, n_estimators=1000, perc=100, alpha=0.05,
                 two_step=True, max_iter=100, random_state=None, verbose=0):
        self.estimator = estimator
        self.n_estimators = n_estimators
        self.perc = perc
        self.alpha = alpha
        self.two_step = two_step
        self.max_iter = max_iter
        self.random_state = random_state
        self.verbose = verbose

    def fit(self, X, y):
        """
        Fits the Boruta feature selection with the provided estimator.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The training input samples.

        y : array-like, shape = [n_samples]
            The target values.
        """

        return self._fit(X, y)

    def transform(self, X, weak=False):
        """
        Reduces the input X to the features selected by Boruta.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The training input samples.

        weak: boolean, default = False
            If set to true, the tentative features are also used to reduce X.

        Returns
        -------
        X : array-like, shape = [n_samples, n_features_]
            The input matrix X's columns are reduced to the features which were
            selected by Boruta.
        """

        return self._transform(X, weak)

    def fit_transform(self, X, y, weak=False):
        """
        Fits Boruta, then reduces the input X to the selected features.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The training input samples.

        y : array-like, shape = [n_samples]
            The target values.

        weak: boolean, default = False
            If set to true, the tentative features are also used to reduce X.

        Returns
        -------
        X : array-like, shape = [n_samples, n_features_]
            The input matrix X's columns are reduced to the features which were
            selected by Boruta.
        """

        self._fit(X, y)
        return self._transform(X, weak)

    def _fit(self, X, y):
        # check input params
        self._check_params(X, y)
        self.random_state = check_random_state(self.random_state)
        # setup variables for Boruta
        n_sample, n_feat = X.shape
        _iter = 1
        # holds the decision about each feature:
        # 0  - default state = tentative in original code
        # 1  - accepted in original code
        # -1 - rejected in original code
        dec_reg = np.zeros(n_feat, dtype=np.int)
        # counts how many times a given feature was more important than
        # the best of the shadow features
        hit_reg = np.zeros(n_feat, dtype=np.int)
        # these record the history of the iterations
        imp_history = np.zeros(n_feat, dtype=np.float)
        sha_max_history = []

        # set n_estimators
        if self.n_estimators != 'auto':
            self.estimator.set_params(n_estimators=self.n_estimators)

        # main feature selection loop
        while np.any(dec_reg == 0) and _iter < self.max_iter:
            # find optimal number of trees and depth
            if self.n_estimators == 'auto':
                # number of features that aren't rejected
                not_rejected = np.where(dec_reg >= 0)[0].shape[0]
                n_tree = self._get_tree_num(not_rejected)
                self.estimator.set_params(n_estimators=n_tree)

            # make sure we start with a new tree in each iteration
            self.estimator.set_params(random_state=self.random_state.get_state()[2])

            # add shadow attributes, shuffle them and train estimator, get imps
            cur_imp = self._add_shadows_get_imps(X, y, dec_reg)

            # get the threshold of shadow importances we will use for rejection
            imp_sha_max = np.percentile(cur_imp[1], self.perc)

            # record importance history
            sha_max_history.append(imp_sha_max)
            imp_history = np.vstack((imp_history, cur_imp[0]))

            # register which feature is more imp than the max of shadows
            hit_reg = self._assign_hits(hit_reg, cur_imp, imp_sha_max)

            # based on hit_reg we check if a feature is doing better than
            # expected by chance
            dec_reg = self._do_tests(dec_reg, hit_reg, _iter)

            # print out confirmed features
            if self.verbose > 0 and _iter < self.max_iter:
                self._print_results(dec_reg, _iter, 0)
            if _iter < self.max_iter:
                _iter += 1

        # we automatically apply R package's rough fix for tentative ones
        confirmed = np.where(dec_reg == 1)[0]
        tentative = np.where(dec_reg == 0)[0]
        # ignore the first row of zeros
        tentative_median = np.median(imp_history[1:, tentative], axis=0)
        # which tentative to keep
        tentative_confirmed = np.where(tentative_median
                                       > np.median(sha_max_history))[0]
        tentative = tentative[tentative_confirmed]

        # basic result variables
        self.n_features_ = confirmed.shape[0]
        self.support_ = np.zeros(n_feat, dtype=np.bool)
        self.support_[confirmed] = 1
        self.support_weak_ = np.zeros(n_feat, dtype=np.bool)
        self.support_weak_[tentative] = 1

        # ranking, confirmed variables are rank 1
        self.ranking_ = np.ones(n_feat, dtype=np.int)
        # tentative variables are rank 2
        self.ranking_[tentative] = 2
        # selected = confirmed and tentative
        selected = np.hstack((confirmed, tentative))
        # all rejected features are sorted by importance history
        not_selected = np.setdiff1d(np.arange(n_feat), selected)
        # large importance values should rank higher = lower ranks -> *(-1)
        imp_history_rejected = imp_history[1:, not_selected] * -1

        # update rank for not_selected features
        if not_selected.shape[0] > 0:
                # calculate ranks in each iteration, then median of ranks across feats
                iter_ranks = self._nanrankdata(imp_history_rejected, axis=1)
                rank_medians = np.nanmedian(iter_ranks, axis=0)
                ranks = self._nanrankdata(rank_medians, axis=0)

                # set smallest rank to 3 if there are tentative feats
                if tentative.shape[0] > 0:
                    ranks = ranks - np.min(ranks) + 3
                else:
                    # and 2 otherwise
                    ranks = ranks - np.min(ranks) + 2
                self.ranking_[not_selected] = ranks
        else:
            # all are selected, thus we set feature supports to True
            self.support_ = np.ones(n_feat, dtype=np.bool)

        # notify user
        if self.verbose > 0:
            self._print_results(dec_reg, _iter, 1)
        return self

    def _transform(self, X, weak=False):
        # sanity check
        try:
            self.ranking_
        except AttributeError:
            raise ValueError('You need to call the fit(X, y) method first.')

        if weak:
            X = X[:, self.support_ + self.support_weak_]
        else:
            X = X[:, self.support_]
        return X

    def _get_tree_num(self, n_feat):
        depth = self.estimator.get_params()['max_depth']
        if depth == None:
            depth = 10
        # how many times a feature should be considered on average
        f_repr = 100
        # n_feat * 2 because the training matrix is extended with n shadow features
        multi = ((n_feat * 2) / (np.sqrt(n_feat * 2) * depth))
        n_estimators = int(multi * f_repr)
        return n_estimators

    def _get_imp(self, X, y):
        try:
            self.estimator.fit(X, y)
        except Exception as e:
            raise ValueError('Please check your X and y variable. The provided'
                             'estimator cannot be fitted to your data.\n' + str(e))
        try:
            imp = self.estimator.feature_importances_
        except Exception:
            raise ValueError('Only methods with feature_importance_ attribute '
                             'are currently supported in BorutaPy.')
        return imp

    def _get_shuffle(self, seq):
        self.random_state.shuffle(seq)
        return seq

    def _add_shadows_get_imps(self, X, y, dec_reg):
        # find features that are tentative still
        x_cur_ind = np.where(dec_reg >= 0)[0]
        x_cur = np.copy(X[:, x_cur_ind])
        x_cur_w = x_cur.shape[1]
        # deep copy the matrix for the shadow matrix
        x_sha = np.copy(x_cur)
        # make sure there's at least 5 columns in the shadow matrix for
        while (x_sha.shape[1] < 5):
            x_sha = np.hstack((x_sha, x_sha))
        # shuffle xSha
        x_sha = np.apply_along_axis(self._get_shuffle, 0, x_sha)
        # get importance of the merged matrix
        imp = self._get_imp(np.hstack((x_cur, x_sha)), y)
        # separate importances of real and shadow features
        imp_sha = imp[x_cur_w:]
        imp_real = np.zeros(X.shape[1])
        imp_real[:] = np.nan
        imp_real[x_cur_ind] = imp[:x_cur_w]
        return imp_real, imp_sha

    def _assign_hits(self, hit_reg, cur_imp, imp_sha_max):
        # register hits for features that did better than the best of shadows
        cur_imp_no_nan = cur_imp[0]
        cur_imp_no_nan[np.isnan(cur_imp_no_nan)] = 0
        hits = np.where(cur_imp_no_nan > imp_sha_max)[0]
        hit_reg[hits] += 1
        return hit_reg

    def _do_tests(self, dec_reg, hit_reg, _iter):
        active_features = np.where(dec_reg >= 0)[0]
        hits = hit_reg[active_features]
        # get uncorrected p values based on hit_reg
        to_accept_ps = sp.stats.binom.sf(hits - 1, _iter, .5).flatten()
        to_reject_ps = sp.stats.binom.cdf(hits, _iter, .5).flatten()

        if self.two_step:
            # two step multicor process
            # first we correct for testing several features in each round using FDR
            to_accept = self._fdrcorrection(to_accept_ps, alpha=self.alpha)[0]
            to_reject = self._fdrcorrection(to_reject_ps, alpha=self.alpha)[0]

            # second we correct for testing the same feature over and over again
            # using bonferroni
            to_accept2 = to_accept_ps <= self.alpha / float(_iter)
            to_reject2 = to_reject_ps <= self.alpha / float(_iter)

            # combine the two multi corrections, and get indexes
            to_accept *= to_accept2
            to_reject *= to_reject2
        else:
            # as in th original Boruta, we simply do bonferroni correction
            # with the total n_feat in each iteration
            to_accept = to_accept_ps <= self.alpha / float(len(dec_reg))
            to_reject = to_reject_ps <= self.alpha / float(len(dec_reg))

        # find features which are 0 and have been rejected or accepted
        to_accept = np.where((dec_reg[active_features] == 0) * to_accept)[0]
        to_reject = np.where((dec_reg[active_features] == 0) * to_reject)[0]

        # updating dec_reg
        dec_reg[active_features[to_accept]] = 1
        dec_reg[active_features[to_reject]] = -1
        return dec_reg

    def _fdrcorrection(self, pvals, alpha=0.05):
        """
        Benjamini/Hochberg p-value correction for false discovery rate, from
        statsmodels package. Included here for decoupling dependency on statsmodels.

        Parameters
        ----------
        pvals : array_like
            set of p-values of the individual tests.
        alpha : float
            error rate

        Returns
        -------
        rejected : array, bool
            True if a hypothesis is rejected, False if not
        pvalue-corrected : array
            pvalues adjusted for multiple hypothesis testing to limit FDR
        """
        pvals = np.asarray(pvals)
        pvals_sortind = np.argsort(pvals)
        pvals_sorted = np.take(pvals, pvals_sortind)
        nobs = len(pvals_sorted)
        ecdffactor = np.arange(1, nobs + 1) / float(nobs)

        reject = pvals_sorted <= ecdffactor * alpha
        if reject.any():
            rejectmax = max(np.nonzero(reject)[0])
            reject[:rejectmax] = True

        pvals_corrected_raw = pvals_sorted / ecdffactor
        pvals_corrected = np.minimum.accumulate(pvals_corrected_raw[::-1])[::-1]
        pvals_corrected[pvals_corrected > 1] = 1
        # reorder p-values and rejection mask to original order of pvals
        pvals_corrected_ = np.empty_like(pvals_corrected)
        pvals_corrected_[pvals_sortind] = pvals_corrected
        reject_ = np.empty_like(reject)
        reject_[pvals_sortind] = reject
        return reject_, pvals_corrected_

    def _nanrankdata(self, X, axis=1):
        """
        Replaces bottleneck's nanrankdata with scipy and numpy alternative.
        """
        ranks = sp.stats.mstats.rankdata(X, axis=axis)
        ranks[np.isnan(X)] = np.nan
        return ranks

    def _check_params(self, X, y):
        """
        Check hyperparameters as well as X and y before proceeding with fit.
        """
        # check X and y are consistent len, X is Array and y is column
        X, y = check_X_y(X, y)
        if self.perc <= 0 or self.perc > 100:
            raise ValueError('The percentile should be between 0 and 100.')

        if self.alpha <= 0 or self.alpha > 1:
            raise ValueError('Alpha should be between 0 and 1.')

    def _print_results(self, dec_reg, _iter, flag):
        n_iter = str(_iter) + ' / ' + str(self.max_iter)
        n_confirmed = np.where(dec_reg == 1)[0].shape[0]
        n_rejected = np.where(dec_reg == -1)[0].shape[0]
        cols = ['Iteration: ', 'Confirmed: ', 'Tentative: ', 'Rejected: ']

        # still in feature selection
        if flag == 0:
            n_tentative = np.where(dec_reg == 0)[0].shape[0]
            content = map(str, [n_iter, n_confirmed, n_tentative, n_rejected])
            if self.verbose == 1:
                output = cols[0] + n_iter
            elif self.verbose > 1:
                output = '\n'.join([x[0] + '\t' + x[1] for x in zip(cols, content)])

        # Boruta finished running and tentatives have been filtered
        else:
            n_tentative = np.sum(self.support_weak_)
            content = map(str, [n_iter, n_confirmed, n_tentative, n_rejected])
            result = '\n'.join([x[0] + '\t' + x[1] for x in zip(cols, content)])
            output = "\n\nBorutaPy finished running.\n\n" + result
        print(output)


In [8]:
df_train['outliers'] = 0
df_train.loc[df_train['target'] < -30, 'outliers'] = 1
obj=list(df_train.select_dtypes(include=['object']).columns)[2:]
from sklearn.preprocessing import LabelEncoder
lb=LabelEncoder()
for o in obj:
    df_train[o]=lb.fit_transform(df_train[o]) 
df_train_columns = [c for c in df_train.columns if c not in ['card_id', 'first_active_month','target','outliers','Unnamed: 0','Unnamed: 0_x']]
def optim_lgb(min_child,n_estimator,max_depth,num_leaves,lambda_l1,lambda_l2,min_split_gain,min_child_weight):
    def run_lgb(train_data,val_data,val_x,min_child,n_estimator,max_depth,num_leaves,lambda_l1,lambda_l2,min_split_gain,min_child_weight):
        num_round = 10000
        params = {
                    "objective" : "regression_l2",
                    "metric" : "rmse", 
                    "boosting": "gbdt",
                    'n_jobs': 6,
                   'learning_rate': 0.008,
                    'max_cat_to_onehot': 4, 
                     'min_child_samples': int(min_child),
                    'cat_smooth': 10.0,
                    'n_estimators': int(n_estimator),
                     'max_depth':int(round(max_depth)),
                     'num_leaves':int(num_leaves),
                     'lambda_l1':max(lambda_l1, 0),
                     'lambda_l2':max(lambda_l2, 0),
                     'min_split_gain':min_split_gain,
                     'min_child_weight':min_child_weight
                }
        model=lgb.train(params,train_data,num_round,valid_sets=[train_data,val_data],verbose_eval=False,early_stopping_rounds=400)

        pred_val = model.predict(val_x, num_iteration=model.best_iteration)
    #    return pred_val
        return  pred_val
    rskf=StratifiedKFold(5,shuffle=True,random_state=315)
    val_pr=np.zeros(len(df_train))
    for train_index,val_index in rskf.split(df_train,df_train['outliers'].values):
        train_data=lgb.Dataset(df_train[df_train_columns].loc[train_index],label=df_train['target'].loc[train_index])
        val_data=lgb.Dataset(df_train[df_train_columns].loc[val_index],label=df_train['target'].loc[val_index])
        val_pr[val_index]=run_lgb(train_data,val_data,df_train[df_train_columns].loc[val_index],min_child,n_estimator,max_depth,num_leaves,lambda_l1,lambda_l2,min_split_gain,min_child_weight)
    return 1-np.sqrt(mean_squared_error(val_pr,df_train['target']))
    
from bayes_opt import BayesianOptimization
optimizer = BayesianOptimization(optim_lgb,{'min_child':(5,200),'n_estimator':(50,7000),'max_depth':(-1,20),'num_leaves':(10,50),'lambda_l1':(0,200),'lambda_l2':(0,200),
'min_split_gain':(0.001, 0.1),'min_child_weight':(2,50)}
    
)
optimizer.maximize(init_points=3,
    n_iter=30)

|   iter    |  target   | lambda_l1 | lambda_l2 | max_depth | min_child | min_ch... | min_sp... | n_esti... | num_le... |
-------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-2.781   [0m | [0m 181.9   [0m | [0m 102.6   [0m | [0m 4.355   [0m | [0m 132.0   [0m | [0m 15.41   [0m | [0m 0.07727 [0m | [0m 4.23e+03[0m | [0m 26.78   [0m |
| [0m 2       [0m | [0m-2.782   [0m | [0m 199.1   [0m | [0m 52.12   [0m | [0m 7.318   [0m | [0m 58.1    [0m | [0m 27.92   [0m | [0m 0.04823 [0m | [0m 1.254e+0[0m | [0m 49.07   [0m |
| [0m 3       [0m | [0m-2.781   [0m | [0m 149.2   [0m | [0m 27.45   [0m | [0m 5.759   [0m | [0m 14.91   [0m | [0m 36.73   [0m | [0m 0.05114 [0m | [0m 4.55e+03[0m | [0m 44.62   [0m |
| [0m 4       [0m | [0m-2.783   [0m | [0m 3.834   [0m | [0m 18.0    [0m | [0m 19.15   [0m | [0m 195.5   [0m | [0m 18.29   [0m | [0m 0.0

In [None]:
from catboost import CatBoostRegressor, Pool
rskf=StratifiedKFold(5,shuffle=True,random_state=4590)
val_pr=np.zeros(len(df_train))
feature_importance_df = pd.DataFrame()
#test_pr=np.zeros(len(df_test))
for train_index,val_index in rskf.split(df_train,df_train['outliers'].values):
    pool=Pool(df_train[df_train_columns].loc[train_index],df_train['target'].loc[train_index])
    val_pool=Pool(df_train[df_train_columns].loc[val_index],df_train['target'].loc[val_index])
    num_round = 10000
    model = CatBoostRegressor(iterations=1000,learning_rate=0.01,max_depth=14, loss_function='RMSE',early_stopping_rounds=300)
    model.fit(pool,eval_set=val_pool,use_best_model=True,verbose_eval=True)
    
    val_pr[val_index]=model.predict(df_train[df_train_columns].loc[val_index])
np.sqrt(mean_squared_error(val_pr,df_train['target']))

0:	learn: 3.8659009	test: 3.8797576	best: 3.8797576 (0)	total: 3.02s	remaining: 50m 22s
1:	learn: 3.8640229	test: 3.8781945	best: 3.8781945 (1)	total: 6.12s	remaining: 50m 54s
2:	learn: 3.8621282	test: 3.8766670	best: 3.8766670 (2)	total: 9.11s	remaining: 50m 25s
3:	learn: 3.8602801	test: 3.8751953	best: 3.8751953 (3)	total: 11.6s	remaining: 48m 18s
4:	learn: 3.8585214	test: 3.8737509	best: 3.8737509 (4)	total: 14.4s	remaining: 47m 48s
5:	learn: 3.8567039	test: 3.8723242	best: 3.8723242 (5)	total: 17.7s	remaining: 48m 54s
6:	learn: 3.8548921	test: 3.8709073	best: 3.8709073 (6)	total: 21.1s	remaining: 49m 46s
7:	learn: 3.8532044	test: 3.8695107	best: 3.8695107 (7)	total: 23.6s	remaining: 48m 50s
8:	learn: 3.8516083	test: 3.8680850	best: 3.8680850 (8)	total: 26.5s	remaining: 48m 40s
9:	learn: 3.8499003	test: 3.8667534	best: 3.8667534 (9)	total: 29.7s	remaining: 48m 58s
10:	learn: 3.8481348	test: 3.8654440	best: 3.8654440 (10)	total: 33s	remaining: 49m 28s
11:	learn: 3.8465372	test: 3.864

91:	learn: 3.7639024	test: 3.8109117	best: 3.8109117 (91)	total: 4m 26s	remaining: 43m 55s
92:	learn: 3.7631093	test: 3.8106243	best: 3.8106243 (92)	total: 4m 29s	remaining: 43m 47s
93:	learn: 3.7623956	test: 3.8103903	best: 3.8103903 (93)	total: 4m 32s	remaining: 43m 45s
94:	learn: 3.7616474	test: 3.8100588	best: 3.8100588 (94)	total: 4m 35s	remaining: 43m 48s
95:	learn: 3.7610226	test: 3.8097469	best: 3.8097469 (95)	total: 4m 38s	remaining: 43m 40s
96:	learn: 3.7603253	test: 3.8095000	best: 3.8095000 (96)	total: 4m 40s	remaining: 43m 35s
97:	learn: 3.7597873	test: 3.8092605	best: 3.8092605 (97)	total: 4m 43s	remaining: 43m 27s
98:	learn: 3.7590375	test: 3.8090170	best: 3.8090170 (98)	total: 4m 45s	remaining: 43m 20s
99:	learn: 3.7585347	test: 3.8087155	best: 3.8087155 (99)	total: 4m 48s	remaining: 43m 12s
100:	learn: 3.7578420	test: 3.8083986	best: 3.8083986 (100)	total: 4m 51s	remaining: 43m 17s
101:	learn: 3.7572971	test: 3.8081076	best: 3.8081076 (101)	total: 4m 55s	remaining: 43m

180:	learn: 3.7187373	test: 3.7983086	best: 3.7983086 (180)	total: 8m 47s	remaining: 39m 47s
181:	learn: 3.7182584	test: 3.7983000	best: 3.7983000 (181)	total: 8m 50s	remaining: 39m 43s
182:	learn: 3.7177564	test: 3.7982647	best: 3.7982647 (182)	total: 8m 53s	remaining: 39m 40s
183:	learn: 3.7173367	test: 3.7982335	best: 3.7982335 (183)	total: 8m 56s	remaining: 39m 38s
184:	learn: 3.7168999	test: 3.7981963	best: 3.7981963 (184)	total: 8m 59s	remaining: 39m 35s
185:	learn: 3.7164324	test: 3.7982236	best: 3.7981963 (184)	total: 9m 1s	remaining: 39m 29s
186:	learn: 3.7159333	test: 3.7981561	best: 3.7981561 (186)	total: 9m 3s	remaining: 39m 25s
187:	learn: 3.7156745	test: 3.7980789	best: 3.7980789 (187)	total: 9m 6s	remaining: 39m 20s
188:	learn: 3.7152308	test: 3.7980433	best: 3.7980433 (188)	total: 9m 9s	remaining: 39m 16s
189:	learn: 3.7147610	test: 3.7980341	best: 3.7980341 (189)	total: 9m 11s	remaining: 39m 11s
190:	learn: 3.7142246	test: 3.7979831	best: 3.7979831 (190)	total: 9m 14s	

268:	learn: 3.6849368	test: 3.7969617	best: 3.7968818 (259)	total: 12m 44s	remaining: 34m 38s
269:	learn: 3.6844927	test: 3.7969367	best: 3.7968818 (259)	total: 12m 47s	remaining: 34m 34s
270:	learn: 3.6842522	test: 3.7969321	best: 3.7968818 (259)	total: 12m 49s	remaining: 34m 30s
271:	learn: 3.6839278	test: 3.7969319	best: 3.7968818 (259)	total: 12m 52s	remaining: 34m 26s
272:	learn: 3.6834607	test: 3.7969239	best: 3.7968818 (259)	total: 12m 54s	remaining: 34m 22s
273:	learn: 3.6830732	test: 3.7969283	best: 3.7968818 (259)	total: 12m 56s	remaining: 34m 18s
274:	learn: 3.6826005	test: 3.7969761	best: 3.7968818 (259)	total: 12m 59s	remaining: 34m 15s
275:	learn: 3.6818560	test: 3.7969637	best: 3.7968818 (259)	total: 13m 2s	remaining: 34m 12s
276:	learn: 3.6814687	test: 3.7969936	best: 3.7968818 (259)	total: 13m 4s	remaining: 34m 8s
277:	learn: 3.6810916	test: 3.7969663	best: 3.7968818 (259)	total: 13m 7s	remaining: 34m 4s
278:	learn: 3.6809183	test: 3.7969905	best: 3.7968818 (259)	total

356:	learn: 3.6520480	test: 3.7976651	best: 3.7968818 (259)	total: 16m 32s	remaining: 29m 47s
357:	learn: 3.6519018	test: 3.7976375	best: 3.7968818 (259)	total: 16m 35s	remaining: 29m 44s
358:	learn: 3.6516275	test: 3.7976519	best: 3.7968818 (259)	total: 16m 37s	remaining: 29m 41s
359:	learn: 3.6514410	test: 3.7976682	best: 3.7968818 (259)	total: 16m 40s	remaining: 29m 38s
360:	learn: 3.6510910	test: 3.7976304	best: 3.7968818 (259)	total: 16m 42s	remaining: 29m 34s
361:	learn: 3.6506622	test: 3.7976219	best: 3.7968818 (259)	total: 16m 45s	remaining: 29m 31s
362:	learn: 3.6503036	test: 3.7977040	best: 3.7968818 (259)	total: 16m 47s	remaining: 29m 28s
363:	learn: 3.6500584	test: 3.7977158	best: 3.7968818 (259)	total: 16m 50s	remaining: 29m 24s
364:	learn: 3.6498771	test: 3.7977073	best: 3.7968818 (259)	total: 16m 52s	remaining: 29m 21s
365:	learn: 3.6494810	test: 3.7977415	best: 3.7968818 (259)	total: 16m 55s	remaining: 29m 18s
366:	learn: 3.6490925	test: 3.7977614	best: 3.7968818 (259)	

444:	learn: 3.6231768	test: 3.7987757	best: 3.7968818 (259)	total: 20m 29s	remaining: 25m 32s
445:	learn: 3.6225879	test: 3.7987873	best: 3.7968818 (259)	total: 20m 31s	remaining: 25m 30s
446:	learn: 3.6221082	test: 3.7988173	best: 3.7968818 (259)	total: 20m 34s	remaining: 25m 27s
447:	learn: 3.6219190	test: 3.7988479	best: 3.7968818 (259)	total: 20m 37s	remaining: 25m 24s
448:	learn: 3.6214442	test: 3.7989078	best: 3.7968818 (259)	total: 20m 39s	remaining: 25m 21s
449:	learn: 3.6210104	test: 3.7989112	best: 3.7968818 (259)	total: 20m 42s	remaining: 25m 18s
450:	learn: 3.6207397	test: 3.7989044	best: 3.7968818 (259)	total: 20m 44s	remaining: 25m 15s
451:	learn: 3.6204061	test: 3.7988854	best: 3.7968818 (259)	total: 20m 47s	remaining: 25m 11s
452:	learn: 3.6201428	test: 3.7989149	best: 3.7968818 (259)	total: 20m 49s	remaining: 25m 8s
453:	learn: 3.6198669	test: 3.7989241	best: 3.7968818 (259)	total: 20m 52s	remaining: 25m 6s
454:	learn: 3.6196342	test: 3.7989169	best: 3.7968818 (259)	to

532:	learn: 3.5936042	test: 3.8000032	best: 3.7968818 (259)	total: 24m 29s	remaining: 21m 27s
533:	learn: 3.5933537	test: 3.8000171	best: 3.7968818 (259)	total: 24m 32s	remaining: 21m 24s
534:	learn: 3.5932008	test: 3.8000334	best: 3.7968818 (259)	total: 24m 34s	remaining: 21m 21s
535:	learn: 3.5930076	test: 3.7999873	best: 3.7968818 (259)	total: 24m 36s	remaining: 21m 18s
536:	learn: 3.5927575	test: 3.7999865	best: 3.7968818 (259)	total: 24m 39s	remaining: 21m 15s
537:	learn: 3.5925474	test: 3.7999477	best: 3.7968818 (259)	total: 24m 41s	remaining: 21m 12s
538:	learn: 3.5922262	test: 3.7999764	best: 3.7968818 (259)	total: 24m 44s	remaining: 21m 9s
539:	learn: 3.5918873	test: 3.8000191	best: 3.7968818 (259)	total: 24m 46s	remaining: 21m 6s
540:	learn: 3.5916441	test: 3.8000244	best: 3.7968818 (259)	total: 24m 49s	remaining: 21m 3s
541:	learn: 3.5911563	test: 3.8000756	best: 3.7968818 (259)	total: 24m 51s	remaining: 21m
542:	learn: 3.5910167	test: 3.8000749	best: 3.7968818 (259)	total: 

61:	learn: 3.7904866	test: 3.8079669	best: 3.8079669 (61)	total: 2m 31s	remaining: 38m 10s
62:	learn: 3.7896632	test: 3.8074834	best: 3.8074834 (62)	total: 2m 33s	remaining: 38m 7s
63:	learn: 3.7889301	test: 3.8069891	best: 3.8069891 (63)	total: 2m 36s	remaining: 38m 4s
64:	learn: 3.7882888	test: 3.8064430	best: 3.8064430 (64)	total: 2m 38s	remaining: 38m 1s
65:	learn: 3.7871736	test: 3.8058795	best: 3.8058795 (65)	total: 2m 41s	remaining: 37m 58s
66:	learn: 3.7864992	test: 3.8053900	best: 3.8053900 (66)	total: 2m 43s	remaining: 37m 55s
67:	learn: 3.7857135	test: 3.8049400	best: 3.8049400 (67)	total: 2m 45s	remaining: 37m 52s
68:	learn: 3.7849447	test: 3.8044646	best: 3.8044646 (68)	total: 2m 48s	remaining: 37m 49s
69:	learn: 3.7839804	test: 3.8040461	best: 3.8040461 (69)	total: 2m 50s	remaining: 37m 46s
70:	learn: 3.7832795	test: 3.8035915	best: 3.8035915 (70)	total: 2m 53s	remaining: 37m 43s
71:	learn: 3.7825048	test: 3.8031293	best: 3.8031293 (71)	total: 2m 55s	remaining: 37m 40s
72

151:	learn: 3.7380152	test: 3.7846325	best: 3.7846325 (151)	total: 6m 29s	remaining: 36m 11s
152:	learn: 3.7376300	test: 3.7845661	best: 3.7845661 (152)	total: 6m 31s	remaining: 36m 8s
153:	learn: 3.7369636	test: 3.7845515	best: 3.7845515 (153)	total: 6m 34s	remaining: 36m 5s
154:	learn: 3.7365584	test: 3.7843943	best: 3.7843943 (154)	total: 6m 36s	remaining: 36m 3s
155:	learn: 3.7359303	test: 3.7842942	best: 3.7842942 (155)	total: 6m 39s	remaining: 36m
156:	learn: 3.7356617	test: 3.7841788	best: 3.7841788 (156)	total: 6m 41s	remaining: 35m 57s
157:	learn: 3.7353700	test: 3.7840663	best: 3.7840663 (157)	total: 6m 44s	remaining: 35m 54s
158:	learn: 3.7349877	test: 3.7839911	best: 3.7839911 (158)	total: 6m 46s	remaining: 35m 51s
159:	learn: 3.7346952	test: 3.7838400	best: 3.7838400 (159)	total: 6m 49s	remaining: 35m 48s
160:	learn: 3.7342598	test: 3.7837470	best: 3.7837470 (160)	total: 6m 51s	remaining: 35m 45s
161:	learn: 3.7339608	test: 3.7836466	best: 3.7836466 (161)	total: 6m 54s	rem

240:	learn: 3.7016046	test: 3.7801041	best: 3.7801041 (239)	total: 10m 7s	remaining: 31m 52s
241:	learn: 3.7010418	test: 3.7800637	best: 3.7800637 (241)	total: 10m 9s	remaining: 31m 49s
242:	learn: 3.7005951	test: 3.7800327	best: 3.7800327 (242)	total: 10m 12s	remaining: 31m 47s
243:	learn: 3.7001826	test: 3.7800311	best: 3.7800311 (243)	total: 10m 14s	remaining: 31m 44s
244:	learn: 3.6995812	test: 3.7800172	best: 3.7800172 (244)	total: 10m 17s	remaining: 31m 43s
245:	learn: 3.6990635	test: 3.7799992	best: 3.7799992 (245)	total: 10m 20s	remaining: 31m 41s
246:	learn: 3.6985970	test: 3.7799938	best: 3.7799938 (246)	total: 10m 22s	remaining: 31m 39s
247:	learn: 3.6982987	test: 3.7800247	best: 3.7799938 (246)	total: 10m 25s	remaining: 31m 36s
248:	learn: 3.6978234	test: 3.7800497	best: 3.7799938 (246)	total: 10m 27s	remaining: 31m 33s
249:	learn: 3.6975687	test: 3.7800735	best: 3.7799938 (246)	total: 10m 30s	remaining: 31m 30s
250:	learn: 3.6970147	test: 3.7800316	best: 3.7799938 (246)	to

328:	learn: 3.6692362	test: 3.7794813	best: 3.7794237 (324)	total: 13m 51s	remaining: 28m 15s
329:	learn: 3.6688892	test: 3.7794683	best: 3.7794237 (324)	total: 13m 53s	remaining: 28m 13s
330:	learn: 3.6682680	test: 3.7794642	best: 3.7794237 (324)	total: 13m 56s	remaining: 28m 10s
331:	learn: 3.6679718	test: 3.7794383	best: 3.7794237 (324)	total: 13m 58s	remaining: 28m 7s
332:	learn: 3.6676865	test: 3.7794663	best: 3.7794237 (324)	total: 14m 1s	remaining: 28m 5s
333:	learn: 3.6674390	test: 3.7794284	best: 3.7794237 (324)	total: 14m 3s	remaining: 28m 2s
334:	learn: 3.6671051	test: 3.7794254	best: 3.7794237 (324)	total: 14m 6s	remaining: 27m 59s
335:	learn: 3.6665957	test: 3.7794580	best: 3.7794237 (324)	total: 14m 8s	remaining: 27m 56s
336:	learn: 3.6662554	test: 3.7795222	best: 3.7794237 (324)	total: 14m 10s	remaining: 27m 53s
337:	learn: 3.6660799	test: 3.7795162	best: 3.7794237 (324)	total: 14m 13s	remaining: 27m 51s
338:	learn: 3.6654954	test: 3.7795341	best: 3.7794237 (324)	total: 

416:	learn: 3.6392075	test: 3.7800721	best: 3.7794237 (324)	total: 17m 28s	remaining: 24m 26s
417:	learn: 3.6387486	test: 3.7800291	best: 3.7794237 (324)	total: 17m 31s	remaining: 24m 23s
418:	learn: 3.6384956	test: 3.7800552	best: 3.7794237 (324)	total: 17m 33s	remaining: 24m 21s
419:	learn: 3.6382459	test: 3.7800920	best: 3.7794237 (324)	total: 17m 36s	remaining: 24m 18s
420:	learn: 3.6379367	test: 3.7801297	best: 3.7794237 (324)	total: 17m 38s	remaining: 24m 16s
421:	learn: 3.6373564	test: 3.7801155	best: 3.7794237 (324)	total: 17m 41s	remaining: 24m 13s
422:	learn: 3.6370280	test: 3.7801023	best: 3.7794237 (324)	total: 17m 43s	remaining: 24m 10s
423:	learn: 3.6365878	test: 3.7801363	best: 3.7794237 (324)	total: 17m 46s	remaining: 24m 8s
424:	learn: 3.6362048	test: 3.7801813	best: 3.7794237 (324)	total: 17m 48s	remaining: 24m 5s
425:	learn: 3.6357914	test: 3.7802268	best: 3.7794237 (324)	total: 17m 50s	remaining: 24m 3s
426:	learn: 3.6354792	test: 3.7802305	best: 3.7794237 (324)	tot

504:	learn: 3.6092126	test: 3.7811389	best: 3.7794237 (324)	total: 21m 8s	remaining: 20m 43s
505:	learn: 3.6088237	test: 3.7811278	best: 3.7794237 (324)	total: 21m 11s	remaining: 20m 41s
506:	learn: 3.6084186	test: 3.7811161	best: 3.7794237 (324)	total: 21m 13s	remaining: 20m 38s
507:	learn: 3.6080090	test: 3.7811324	best: 3.7794237 (324)	total: 21m 16s	remaining: 20m 36s
508:	learn: 3.6076505	test: 3.7812063	best: 3.7794237 (324)	total: 21m 18s	remaining: 20m 33s
509:	learn: 3.6072054	test: 3.7812487	best: 3.7794237 (324)	total: 21m 21s	remaining: 20m 30s
510:	learn: 3.6068310	test: 3.7812859	best: 3.7794237 (324)	total: 21m 23s	remaining: 20m 28s
511:	learn: 3.6065106	test: 3.7812922	best: 3.7794237 (324)	total: 21m 26s	remaining: 20m 25s
512:	learn: 3.6063069	test: 3.7812712	best: 3.7794237 (324)	total: 21m 28s	remaining: 20m 23s
513:	learn: 3.6060938	test: 3.7812584	best: 3.7794237 (324)	total: 21m 31s	remaining: 20m 20s
514:	learn: 3.6059152	test: 3.7812314	best: 3.7794237 (324)	t

592:	learn: 3.5800141	test: 3.7826313	best: 3.7794237 (324)	total: 24m 53s	remaining: 17m 4s
593:	learn: 3.5796501	test: 3.7826628	best: 3.7794237 (324)	total: 24m 57s	remaining: 17m 3s
594:	learn: 3.5793311	test: 3.7826968	best: 3.7794237 (324)	total: 24m 59s	remaining: 17m
595:	learn: 3.5790540	test: 3.7827169	best: 3.7794237 (324)	total: 25m 2s	remaining: 16m 58s
596:	learn: 3.5787754	test: 3.7827175	best: 3.7794237 (324)	total: 25m 6s	remaining: 16m 56s
597:	learn: 3.5783724	test: 3.7827155	best: 3.7794237 (324)	total: 25m 9s	remaining: 16m 54s
598:	learn: 3.5780834	test: 3.7827522	best: 3.7794237 (324)	total: 25m 12s	remaining: 16m 52s
599:	learn: 3.5777213	test: 3.7827738	best: 3.7794237 (324)	total: 25m 14s	remaining: 16m 49s
600:	learn: 3.5775020	test: 3.7827609	best: 3.7794237 (324)	total: 25m 17s	remaining: 16m 47s
601:	learn: 3.5772685	test: 3.7828029	best: 3.7794237 (324)	total: 25m 19s	remaining: 16m 44s
602:	learn: 3.5769548	test: 3.7827970	best: 3.7794237 (324)	total: 25

56:	learn: 3.7932501	test: 3.8190341	best: 3.8190341 (56)	total: 2m 17s	remaining: 37m 53s
57:	learn: 3.7922266	test: 3.8184363	best: 3.8184363 (57)	total: 2m 19s	remaining: 37m 50s
58:	learn: 3.7914919	test: 3.8178063	best: 3.8178063 (58)	total: 2m 22s	remaining: 37m 47s
59:	learn: 3.7906619	test: 3.8172803	best: 3.8172803 (59)	total: 2m 24s	remaining: 37m 45s
60:	learn: 3.7899449	test: 3.8167451	best: 3.8167451 (60)	total: 2m 27s	remaining: 37m 43s
61:	learn: 3.7891523	test: 3.8162972	best: 3.8162972 (61)	total: 2m 29s	remaining: 37m 40s
62:	learn: 3.7881757	test: 3.8157701	best: 3.8157701 (62)	total: 2m 31s	remaining: 37m 38s
63:	learn: 3.7872558	test: 3.8152210	best: 3.8152210 (63)	total: 2m 34s	remaining: 37m 35s
64:	learn: 3.7862676	test: 3.8146552	best: 3.8146552 (64)	total: 2m 36s	remaining: 37m 33s
65:	learn: 3.7853633	test: 3.8141955	best: 3.8141955 (65)	total: 2m 39s	remaining: 37m 30s
66:	learn: 3.7845803	test: 3.8136727	best: 3.8136727 (66)	total: 2m 41s	remaining: 37m 28s

146:	learn: 3.7366548	test: 3.7934635	best: 3.7934635 (146)	total: 5m 55s	remaining: 34m 20s
147:	learn: 3.7361585	test: 3.7933753	best: 3.7933753 (147)	total: 5m 57s	remaining: 34m 20s
148:	learn: 3.7358359	test: 3.7932525	best: 3.7932525 (148)	total: 6m	remaining: 34m 21s
149:	learn: 3.7354596	test: 3.7931261	best: 3.7931261 (149)	total: 6m 3s	remaining: 34m 20s
150:	learn: 3.7350813	test: 3.7930050	best: 3.7930050 (150)	total: 6m 6s	remaining: 34m 20s
151:	learn: 3.7347365	test: 3.7928834	best: 3.7928834 (151)	total: 6m 9s	remaining: 34m 21s
152:	learn: 3.7344254	test: 3.7927795	best: 3.7927795 (152)	total: 6m 12s	remaining: 34m 20s
153:	learn: 3.7338839	test: 3.7926961	best: 3.7926961 (153)	total: 6m 14s	remaining: 34m 19s
154:	learn: 3.7332971	test: 3.7925849	best: 3.7925849 (154)	total: 6m 17s	remaining: 34m 19s
155:	learn: 3.7327876	test: 3.7924635	best: 3.7924635 (155)	total: 6m 20s	remaining: 34m 18s
156:	learn: 3.7321151	test: 3.7923810	best: 3.7923810 (156)	total: 6m 23s	rem

235:	learn: 3.7009148	test: 3.7882715	best: 3.7882715 (235)	total: 10m 10s	remaining: 32m 55s
236:	learn: 3.7007323	test: 3.7882210	best: 3.7882210 (236)	total: 10m 12s	remaining: 32m 53s
237:	learn: 3.7003909	test: 3.7882215	best: 3.7882210 (236)	total: 10m 15s	remaining: 32m 51s
238:	learn: 3.7000827	test: 3.7881571	best: 3.7881571 (238)	total: 10m 18s	remaining: 32m 49s
239:	learn: 3.6997045	test: 3.7881228	best: 3.7881228 (239)	total: 10m 21s	remaining: 32m 47s
240:	learn: 3.6992858	test: 3.7880705	best: 3.7880705 (240)	total: 10m 23s	remaining: 32m 45s
241:	learn: 3.6987620	test: 3.7880409	best: 3.7880409 (241)	total: 10m 26s	remaining: 32m 43s
242:	learn: 3.6980217	test: 3.7881240	best: 3.7880409 (241)	total: 10m 29s	remaining: 32m 41s
243:	learn: 3.6975061	test: 3.7881228	best: 3.7880409 (241)	total: 10m 32s	remaining: 32m 38s
244:	learn: 3.6971670	test: 3.7881202	best: 3.7880409 (241)	total: 10m 34s	remaining: 32m 36s
245:	learn: 3.6966804	test: 3.7881580	best: 3.7880409 (241)	

323:	learn: 3.6692299	test: 3.7876383	best: 3.7875796 (283)	total: 14m 17s	remaining: 29m 48s
324:	learn: 3.6688349	test: 3.7876262	best: 3.7875796 (283)	total: 14m 19s	remaining: 29m 45s
325:	learn: 3.6684621	test: 3.7876208	best: 3.7875796 (283)	total: 14m 22s	remaining: 29m 42s
326:	learn: 3.6682244	test: 3.7876383	best: 3.7875796 (283)	total: 14m 25s	remaining: 29m 41s
327:	learn: 3.6679869	test: 3.7876062	best: 3.7875796 (283)	total: 14m 28s	remaining: 29m 39s
328:	learn: 3.6677087	test: 3.7875917	best: 3.7875796 (283)	total: 14m 31s	remaining: 29m 36s
329:	learn: 3.6671371	test: 3.7876191	best: 3.7875796 (283)	total: 14m 33s	remaining: 29m 33s
330:	learn: 3.6669072	test: 3.7876137	best: 3.7875796 (283)	total: 14m 36s	remaining: 29m 31s
331:	learn: 3.6663938	test: 3.7876163	best: 3.7875796 (283)	total: 14m 39s	remaining: 29m 28s
332:	learn: 3.6659093	test: 3.7875883	best: 3.7875796 (283)	total: 14m 41s	remaining: 29m 26s
333:	learn: 3.6655174	test: 3.7876383	best: 3.7875796 (283)	

411:	learn: 3.6396925	test: 3.7880796	best: 3.7875796 (283)	total: 18m 12s	remaining: 25m 59s
412:	learn: 3.6395537	test: 3.7880815	best: 3.7875796 (283)	total: 18m 15s	remaining: 25m 56s
413:	learn: 3.6393057	test: 3.7881226	best: 3.7875796 (283)	total: 18m 18s	remaining: 25m 54s
414:	learn: 3.6389647	test: 3.7880822	best: 3.7875796 (283)	total: 18m 20s	remaining: 25m 51s
415:	learn: 3.6384853	test: 3.7881300	best: 3.7875796 (283)	total: 18m 23s	remaining: 25m 48s
416:	learn: 3.6380284	test: 3.7881282	best: 3.7875796 (283)	total: 18m 26s	remaining: 25m 46s
417:	learn: 3.6376933	test: 3.7881606	best: 3.7875796 (283)	total: 18m 28s	remaining: 25m 44s
418:	learn: 3.6373971	test: 3.7881692	best: 3.7875796 (283)	total: 18m 31s	remaining: 25m 41s
419:	learn: 3.6372046	test: 3.7881842	best: 3.7875796 (283)	total: 18m 34s	remaining: 25m 38s
420:	learn: 3.6370522	test: 3.7881811	best: 3.7875796 (283)	total: 18m 37s	remaining: 25m 36s
421:	learn: 3.6368521	test: 3.7881948	best: 3.7875796 (283)	

499:	learn: 3.6097844	test: 3.7890997	best: 3.7875796 (283)	total: 22m 24s	remaining: 22m 24s
500:	learn: 3.6095720	test: 3.7890857	best: 3.7875796 (283)	total: 22m 27s	remaining: 22m 22s
501:	learn: 3.6091542	test: 3.7890505	best: 3.7875796 (283)	total: 22m 30s	remaining: 22m 19s
502:	learn: 3.6088789	test: 3.7891043	best: 3.7875796 (283)	total: 22m 33s	remaining: 22m 17s
503:	learn: 3.6086468	test: 3.7891024	best: 3.7875796 (283)	total: 22m 36s	remaining: 22m 15s
504:	learn: 3.6084637	test: 3.7890717	best: 3.7875796 (283)	total: 22m 40s	remaining: 22m 13s
505:	learn: 3.6082667	test: 3.7890661	best: 3.7875796 (283)	total: 22m 43s	remaining: 22m 11s
506:	learn: 3.6080953	test: 3.7890581	best: 3.7875796 (283)	total: 22m 46s	remaining: 22m 8s
507:	learn: 3.6077423	test: 3.7890751	best: 3.7875796 (283)	total: 22m 48s	remaining: 22m 5s
508:	learn: 3.6071395	test: 3.7890663	best: 3.7875796 (283)	total: 22m 51s	remaining: 22m 2s
509:	learn: 3.6067336	test: 3.7890671	best: 3.7875796 (283)	tot

In [14]:
path="D:\Python\Elo"
df_hist = pd.read_csv(os.path.join(path,'historical_transactions.csv'))
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df
df_train=reduce_mem_usage(df_hist)

Mem. usage decreased to 1749.11 Mb (43.7% reduction)


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
df=df_train
groupby='card_id'
target='category_3'
df_bag = pd.DataFrame(df[[groupby, target]])
df_bag[target] = df_bag[target].astype(str)
df_bag[target].fillna('NAN', inplace=True)
df_bag = df_bag.groupby(groupby, as_index=False)[target].agg({'list':(lambda x: list(x))}).reset_index()
doc_list = list(df_bag['list'].values)
w2v = Word2Vec(doc_list, size=30, window=3, min_count=1, workers=32)

ModuleNotFoundError: No module named 'gensim'

In [21]:
df_bag

Unnamed: 0,index,card_id,list
0,0,C_ID_00007093c1,"[B, B, B, C, B, B, B, B, B, C, B, B, B, B, B, ..."
1,1,C_ID_0001238066,"[nan, C, B, C, B, C, B, C, B, B, B, B, B, B, B..."
2,2,C_ID_0001506ef0,"[A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, ..."
3,3,C_ID_0001793786,"[A, A, A, A, A, A, A, B, A, A, A, A, A, A, A, ..."
4,4,C_ID_000183fdda,"[B, B, C, C, B, C, B, B, B, B, B, C, C, C, B, ..."
5,5,C_ID_00024e244b,"[A, B, A, A, A, A, A, B, A, A, B, B, A, A, A, ..."
6,6,C_ID_0002709b5a,"[B, B, B, B, B, B, B, B, B, B, B, B, B, B, B, ..."
7,7,C_ID_00027503e2,"[A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, ..."
8,8,C_ID_000298032a,"[A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, ..."
9,9,C_ID_0002ba3c2e,"[A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, ..."
