In [1]:
import pandas as pd
import numpy as np
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
import xgboost
import lightgbm
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, SparsePCA, MiniBatchSparsePCA, KernelPCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss, accuracy_score
import copy
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.neural_network import MLPClassifier

kfold = KFold(5, shuffle=True, random_state=999)

In [2]:
def get_train_df(fold_num):
    train_df = pd.read_csv('Data/pca_projected_506_nc_from_resnet_train.csv')
    #train_df.reset_index(drop=True, inplace=True)
    train_df.head(5)
    train_y = np.array(train_df['is_iceberg']).reshape((train_df.shape[0],))
    
    train_X = pd.read_csv('Data/fine_tune_resnet_extract_features_fold_{}_40_avg_train.csv'.format(fold_num))
    
    print("Train data X: {}, y: {}".format(train_X.shape, train_y.shape))
    return train_X, train_y

In [3]:
def get_test_df(fold):
    test_df = pd.read_csv('Data/fine_tune_resnet_extract_features_fold_{}_40_avg_test.csv'.format(fold))
    test_df.sort_index(inplace=True)

    return test_df

In [4]:
def get_feature_num(pca, th):
    counts = pd.Series(pca.explained_variance_ >= th).value_counts(sort=False)
    return counts.values[1]

In [5]:
def get_transformed_df(pca, df):
    new_data = np.array(pca.transform(df))
    columns = ['f_{}'.format(i) for i in range(new_data.shape[1])]
    df = pd.DataFrame(data=new_data,    # values
                  index=df.index,
                  columns=columns)  # 1st row as the column names
    return df

In [6]:
def get_log_loss(fn, df, y):
    pca = PCA(whiten=False, svd_solver='full', n_components=fn)
    pca.fit(df)
    
    X = get_transformed_df(pca, df)
    
    clf = lightgbm.LGBMClassifier(n_jobs=4, objective='binary', random_state=0, n_estimators=100, boosting_type='dart')
    log_loss = -1*cross_val_score(clf, X=X, y=y, scoring='neg_log_loss', cv=kfold)
    avg_log_loss = sum(log_loss) / float(len(log_loss))
    
    return avg_log_loss

In [7]:
def feature_selection(fold_num):
    train_X, train_y = get_train_df(fold_num)
    pca = PCA(whiten=False, svd_solver='full')
    pca.fit(train_X)
    
    th_list = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8]
    f_num = [get_feature_num(pca, th) for th in th_list]
    cv_scores = [get_log_loss(fn, train_X, train_y) for fn in f_num]
    
    index_min = np.argmin(cv_scores)
    print("# of features by light gbm: {}, cv score: {:.3f}".format(f_num[index_min], cv_scores[index_min]))
    
    pca = PCA(whiten=False, svd_solver='full', n_components=f_num[index_min])
    pca.fit(train_X)
    train_X = get_transformed_df(pca, train_X)
    
    print("New train data X: {}, y: {}".format(train_X.shape, train_y.shape))
    return train_X, train_y, pca

In [8]:
class base_tuner():
    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.default_params = {}
        
    def fit_and_update_params(self, params, update=True):
        clf = self.get_clf()

        gs = GridSearchCV(clf, params, scoring='neg_log_loss', cv=kfold, )
        gs.fit(self.X, self.y)
        
        cv_df = pd.DataFrame().from_dict(gs.cv_results_)
        cv_df = cv_df[['mean_test_score', 'std_test_score', 'params', 'rank_test_score']]
        cv_df = cv_df.sort_values(by=['rank_test_score', 'std_test_score']).reset_index(drop=True)
        best_params = cv_df.loc[0, 'params']
        
        if update is True:
            self.default_params.update(best_params)
        
        print('Selected hyper-params:', best_params)
        print('==============================> cv score: {:.4f}'.format(cv_df.loc[0, 'mean_test_score']))
        return best_params
    
    def tune(self):
        pass
    
    def get_clf(self):
        return None

In [9]:
class lgbm_tuner(base_tuner):
    def __init__(self, X, y):
        super(lgbm_tuner, self).__init__(X, y)
        self.default_params = {
            'n_jobs': 4,
            'objective': 'binary',
            'random_state': 0,
            'boosting_type': 'dart'
        }
    
    def tune_est_num_and_lr(self):
        params = {
            'n_estimators': [100, 200, 400, 800],
            'learning_rate': [0.1, 0.05, 0.01, 0.005]
        }

        self.fit_and_update_params(params)
   
    def tune_leaves_num_and_gamma(self):
        params = {
            'num_leaves': [2, 3, 7, 15, 31, 63],
            'min_split_gain': [.0, .1, .2]
        }
        self.fit_and_update_params(params)
     
    def tune_sampling(self):
        params = {
            'subsample': [1., .8, .6, .4, .2],
            'colsample_bytree': [1., .8, .6, .4, .2]
        }
        best_parmas = self.fit_and_update_params(params, update=False)
        
        next_params = {}
        for k,v in best_parmas.items():
            if v == 1.:
                next_params[k] = [1., .95, .9, .85]
            else:
                next_params[k] = [v+.15, v+.1, v+.05, v, v-.05, v-.1, v-.15]
 
        self.fit_and_update_params(next_params)
        
    def tune_regularization(self):
        params = {
            'reg_alpha': [1., .8, .6, .4, .2, .0],
            'reg_lambda': [1., .8, .6, .4, .2, .0]
        }
        
        best_parmas = self.fit_and_update_params(params, update=False)
        
        next_params = {}
        for k,v in best_parmas.items():
            if v == 1.:
                next_params[k] = [1., .95, .9, .85]
            elif v == 0.:
                next_params[k] = [.0, .05, .1, .15]
            else:
                next_params[k] = [v+.15, v+.1, v+.05, v, v-.05, v-.1, v-.15]
 
        self.fit_and_update_params(next_params)
    
    def tune(self):
        print('lgb tuner start tuning')
        self.tune_est_num_and_lr()
        self.tune_leaves_num_and_gamma()
        self.tune_sampling()
        self.tune_regularization()
        
        return self.get_clf()
    
    def get_clf(self):
        return lightgbm.LGBMClassifier(**self.default_params)

In [10]:
class xgb_tuner(base_tuner):
    def __init__(self, X, y):
        super(xgb_tuner, self).__init__(X, y)
        self.X = X
        self.y = y
        self.default_params = {
            'n_jobs': 4,
            'objective': 'binary:logistic',
            'seed': 0,
            'eval_metric': 'logloss'
        }
    
    def tune_booster(self):
        params = {
            'booster': ['dart', 'gbtree']
        }
        self.fit_and_update_params(params)
        
    def tune_est_num_and_lr(self):
        params = {
            'n_estimators': [100, 200, 400, 800],
            'learning_rate': [0.1, 0.05, 0.01, 0.005]
        }

        self.fit_and_update_params(params)
   
    def tune_max_depth(self):
        params = {
            'max_depth': [1, 3, 5, 7, 9]
        }
        self.fit_and_update_params(params)
        
    def tune_child_w_and_gamma(self):
        params = {
            'min_child_weight': [1, 2, 4, 6, 8, 10],
            'gamma': [0, 0.1, 0.2]
        }
        self.fit_and_update_params(params)
     
    def tune_sampling(self):
        params = {
            'subsample': [1., .8, .6, .4, .2],
            'colsample_bytree': [1., .8, .6, .4, .2]
        }
        best_parmas = self.fit_and_update_params(params, update=False)
        
        next_params = {}
        for k,v in best_parmas.items():
            if v == 1.:
                next_params[k] = [1., .95, .9, .85]
            else:
                next_params[k] = [v+.15, v+.1, v+.05, v, v-.05, v-.1, v-.15]
 
        self.fit_and_update_params(next_params)
        
    def tune_regularization(self):
        params = {
            'reg_alpha': [1., .8, .6, .4, .2, .0],
            'reg_lambda': [1., .8, .6, .4, .2, .0]
        }
        
        best_parmas = self.fit_and_update_params(params, update=False)
        
        next_params = {}
        for k,v in best_parmas.items():
            if v == 1.:
                next_params[k] = [1., .95, .9, .85]
            elif v == 0.:
                next_params[k] = [.0, .05, .1, .15]
            else:
                next_params[k] = [v+.15, v+.1, v+.05, v, v-.05, v-.1, v-.15]
 
        self.fit_and_update_params(next_params)
    
    def tune(self):
        print('xgb tuner start tuning')
        self.tune_booster()
        self.tune_est_num_and_lr()
        self.tune_max_depth()
        self.tune_child_w_and_gamma()
        self.tune_sampling()
        self.tune_regularization()
        
        return self.get_clf()
    
    def get_clf(self):
        return xgboost.XGBClassifier(**self.default_params)

In [11]:
class lr_tuner(base_tuner):
    def __init__(self, X, y):
        super(lr_tuner, self).__init__(X, y)
        self.X = X
        self.y = y
        self.default_params = {
            'penalty': 'l1',
            'max_iter': 10000
        }

    def tune(self):
        print('logistic regression tuner start tuning')
        params = {
            'solver': ['liblinear', 'saga']
        }
        
        self.fit_and_update_params(params)
        
        return self.get_clf()
    
    def get_clf(self):
        return LogisticRegression(**self.default_params)

In [12]:
class mlp_tuner(base_tuner):
    def __init__(self, X, y):
        super(mlp_tuner, self).__init__(X, y)
        self.X = X
        self.y = y
        self.default_params = {
            'learning_rate': 'adaptive',
            'learning_rate_init': 0.005,
            'max_iter': 2000,
            'random_state':0
        }

    def tune(self):
        print('mlp tuner start tuning')
        params = {
            'solver':['lbfgs', 'sgd', 'adam'],
            'hidden_layer_sizes': [(100,), (150,), (100, 100,)],
        }
        
        self.fit_and_update_params(params)
        
        params = {
            'alpha': [10., 5., 2., 1., .8, .5, .2, .1],
        }
        
        self.fit_and_update_params(params)
        
        return self.get_clf()
    
    def get_clf(self):
        return MLPClassifier(**self.default_params)

In [13]:
class adb_tuner(base_tuner):
    def __init__(self, X, y):
        super(adb_tuner, self).__init__(X, y)
        self.X = X
        self.y = y
        self.default_params = {
            'algorithm': 'SAMME.R',
            'random_state':0
        }

    def tune(self):
        print('adaboost tuner start tuning')
        params = {
            'n_estimators': [100, 200, 400, 800],
            'learning_rate': [0.1, 0.05, 0.01, 0.005]
        }
        
        self.fit_and_update_params(params)

        return self.get_clf()
    
    def get_clf(self):
        return AdaBoostClassifier(**self.default_params)

In [14]:
class bg_tuner(base_tuner):
    def __init__(self, X, y):
        super(bg_tuner, self).__init__(X, y)
        self.X = X
        self.y = y
        self.default_params = {
            'random_state':0,
            'bootstrap_features':True
        }
        
    def tune_est_num(self):
        params = {
            'n_estimators': [100, 200, 400, 800]
        }

        self.fit_and_update_params(params)
     
    def tune_sampling(self):
        params = {
            'max_samples': [1., .8, .6, .4, .2],
            'max_features': [1., .8, .6, .4, .2]
        }
        best_parmas = self.fit_and_update_params(params, update=False)
        
        next_params = {}
        for k,v in best_parmas.items():
            if v == 1.:
                next_params[k] = [1., .95, .9, .85]
            else:
                next_params[k] = [v+.15, v+.1, v+.05, v, v-.05, v-.1, v-.15]
 
        self.fit_and_update_params(next_params)
    
    def tune(self):
        print('bagging tuner start tuning')
        self.tune_est_num()
        self.tune_sampling()

        return self.get_clf()
    
    def get_clf(self):
        return BaggingClassifier(**self.default_params)

In [15]:
class gb_tuner(base_tuner):
    def __init__(self, X, y):
        super(gb_tuner, self).__init__(X, y)
        self.X = X
        self.y = y
        self.default_params = {
            'random_state':0
        }
        
    def tune_loss_criterion(self):
        params = {
            'loss': ['deviance', 'exponential'],
            'criterion': ['friedman_mse', 'mse']
        }

        self.fit_and_update_params(params)
        
    def tune_est_num_and_lr(self):
        params = {
            'n_estimators': [100, 200, 400, 800],
            'learning_rate': [0.1, 0.05, 0.01, 0.005]
        }

        self.fit_and_update_params(params)
   
    def tune_max_depth(self):
        params = {
            'max_depth': [1, 3, 5, 7, 9]
        }
        self.fit_and_update_params(params)
        
    def tune_child(self):
        params = {
            'min_samples_split': [2, 3, 7, 15,31],
            'min_impurity_decrease': [0, 0.1, 0.2]
        }
        self.fit_and_update_params(params)
     
    def tune_sampling(self):
        params = {
            'subsample': [1., .8, .6, .4, .2],
            'max_features': [1., .8, .6, .4, .2]
        }
        best_parmas = self.fit_and_update_params(params, update=False)
        
        next_params = {}
        for k,v in best_parmas.items():
            if v == 1.:
                next_params[k] = [1., .95, .9, .85]
            else:
                next_params[k] = [v+.15, v+.1, v+.05, v, v-.05, v-.1, v-.15]
 
        self.fit_and_update_params(next_params)
      
    def tune(self):
        print('gradient boosting tuner start tuning')
        self.tune_loss_criterion()
        self.tune_est_num_and_lr()
        self.tune_max_depth()
        self.tune_child()
        self.tune_sampling()
        
        return self.get_clf()
    
    def get_clf(self):
        return GradientBoostingClassifier(**self.default_params)

In [16]:
class rf_tuner(base_tuner):
    def __init__(self, X, y):
        super(rf_tuner, self).__init__(X, y)
        self.X = X
        self.y = y
        self.default_params = {
            'random_state':0,
            'n_jobs': 4
        }
        
    def tune_loss_criterion(self):
        params = {
            'class_weight': [None, 'balanced'],
            'criterion': ['gini', 'entropy']
        }

        self.fit_and_update_params(params)
        
    def tune_est_num(self):
        params = {
            'n_estimators': [100, 200, 400, 800]
        }

        self.fit_and_update_params(params)
   
    def tune_max_depth(self):
        params = {
            'max_depth': [1, 3, 5, 7, 9]
        }
        self.fit_and_update_params(params)
        
    def tune_child(self):
        params = {
            'min_samples_split': [2, 3, 7, 15,31],
            'min_impurity_decrease': [0, 0.1, 0.2]
        }
        self.fit_and_update_params(params)
     
    def tune_sampling(self):
        params = {
            'max_features': [1., .8, .6, .4, .2]
        }
        best_parmas = self.fit_and_update_params(params, update=False)
        
        next_params = {}
        for k,v in best_parmas.items():
            if v == 1.:
                next_params[k] = [1., .95, .9, .85]
            else:
                next_params[k] = [v+.15, v+.1, v+.05, v, v-.05, v-.1, v-.15]
 
        self.fit_and_update_params(next_params)
      
    def tune(self):
        print('random forest tuner start tuning')
        self.tune_loss_criterion()
        self.tune_est_num()
        self.tune_max_depth()
        self.tune_child()
        self.tune_sampling()
        
        return self.get_clf()
    
    def get_clf(self):
        return RandomForestClassifier(**self.default_params)

In [17]:
class et_tuner(rf_tuner):
    def __init__(self, X, y):
        super(et_tuner, self).__init__(X, y)

    def tune(self):
        print('extra tree tuner start tuning')
        self.tune_loss_criterion()
        self.tune_est_num()
        self.tune_max_depth()
        self.tune_child()
        self.tune_sampling()
        
        return self.get_clf()
    
    def get_clf(self):
        return ExtraTreesClassifier(**self.default_params)

In [18]:
from stacking_models_api import StackingAveragedModels

In [19]:
train_df = pd.read_json('Data/train.json')
test_df = pd.read_json('Data/test.json')
print("Test data shape: {}".format(test_df.shape))
test_ids = test_df['id']

train_inc_angles = pd.to_numeric(train_df.inc_angle, errors='coerce')
train_inc_angles.fillna(value=0, inplace=True)
test_inc_angles = pd.to_numeric(test_df.inc_angle, errors='coerce')

for i in range(5):
    fold = i + 1
    
    print("=================================================")
    print("Processing fold ", fold)
    print("=================================================")
    
    train_X, train_y, pca = feature_selection(fold)
    test_X = get_transformed_df(pca, get_test_df(fold))
    train_X['inc_angle'] = train_inc_angles
    test_X['inc_angle'] = test_inc_angles
    
    tuners = [#lr_tuner(train_X, train_y),
              #mlp_tuner(train_X, train_y),
              #adb_tuner(train_X, train_y),
              #bg_tuner(train_X, train_y),
              #gb_tuner(train_X, train_y),
              #rf_tuner(train_X, train_y),
              #et_tuner(train_X, train_y),
              lgbm_tuner(train_X, train_y), 
              xgb_tuner(train_X, train_y)]
    clfs = []
    
    for tuner in tuners:
        tuner.tune()
        clfs.append(tuner.get_clf())
    
    for i, clf in enumerate(clfs):
        if isinstance(tuners[i], lgbm_tuner) or isinstance(tuners[i], xgb_tuner):
            if isinstance(tuners[i], lgbm_tuner):
                name = 'lgb'
            else:
                name = 'xgb'
            
            clf.fit(train_X, train_y)
            predictions = clf.predict_proba(test_X)[:,1]
            print("Fold {}, {} tuner predictions\n".format(fold, name), predictions)
        
            submission = pd.DataFrame()
            submission['id'] = test_ids
            submission['is_iceberg'] = predictions
            submission.to_csv('Submissions/submission_{}_auto_fine_tune_inc_angle_fold_{}.csv'.format(name, fold), 
                              float_format="%.15f", index=False)
     
    
    print("\n*** Start to train meta model for stacking averaged model ***\n")
    
    sl_base_models_dict = {}
    for i, clf in enumerate(clfs):
        sl_base_models_dict['clf_'+str(i+1)] = clf
        
    semi_sl_base_models_dict = {
        'knn_8': KNeighborsClassifier(n_neighbors=8),
        'knn_16': KNeighborsClassifier(n_neighbors=16),
        'knn_32': KNeighborsClassifier(n_neighbors=32),
        'knn_64': KNeighborsClassifier(n_neighbors=64)
    }

    usl_base_models_dict = {
        'kmean_2': KMeans(n_clusters=2),
        'kmean_3': KMeans(n_clusters=3),
        'kmean_4': KMeans(n_clusters=4)
    }
    
    sam = StackingAveragedModels(sl_base_models_dict=sl_base_models_dict, 
                             semi_sl_base_models_dict=semi_sl_base_models_dict,
                             usl_base_models_dict=usl_base_models_dict,
                             meta_model=LogisticRegression(),
                             target_col='is_iceberg',
                             eval_func=log_loss,
                             is_classification=True,
                             random_state=999)
    
    sam.fit(train_X, train_y)
    
    # get meta dataframe to train
    meta_df = sam.get_meta_train_dataframe(get_dummies=True, pca_enabled=False)
    # shuffle meta data frame
    meta_df = meta_df.sample(frac=1, random_state=0).reset_index(drop=True)
    print('feature correlations')
    print(meta_df.corr())
    
    features = meta_df.columns.tolist()
    features.remove('is_iceberg')
    meta_X = meta_df[features]
    meta_y = np.array(meta_df['is_iceberg']).reshape((meta_df.shape[0],))
    print("meta size: ", meta_X.shape, meta_X.shape)
    
    # fine tune for meta model
    meta_tuners = [lgbm_tuner(meta_X, meta_y), xgb_tuner(meta_X, meta_y)]
    meta_clfs = [] 
    for tuner in meta_tuners:
        tuner.tune()
        meta_clfs.append(tuner.get_clf())
        
    for i, clf in enumerate(meta_clfs):
        if isinstance(meta_tuners[i], lgbm_tuner) or isinstance(meta_tuners[i], xgb_tuner):
            if isinstance(meta_tuners[i], lgbm_tuner):
                name = 'lgb'
            else:
                name = 'xgb'
            
            sam.reset_meta_model(clf)
            predictions = sam.predict_proba(test_X)[:,1]
            print("Fold {}, meta model ({} tuner) predictions\n".format(fold, name), predictions)
        
            submission = pd.DataFrame()
            submission['id'] = test_ids
            submission['is_iceberg'] = predictions
            submission.to_csv('Submissions/submission_{}_sam_1227_auto_fine_tune_inc_angle_fold_{}.csv'.format(name, fold), 
                              float_format="%.15f", index=False)
            

Test data shape: (8424, 4)
Processing fold  1
Train data X: (1604, 18433), y: (1604,)
# of features by light gbm: 304, cv score: 0.170
New train data X: (1604, 304), y: (1604,)
lgb tuner start tuning




Selected hyper-params: {'learning_rate': 0.1, 'n_estimators': 200}




Selected hyper-params: {'min_split_gain': 0.0, 'num_leaves': 7}




Selected hyper-params: {'colsample_bytree': 1.0, 'subsample': 0.8}




Selected hyper-params: {'colsample_bytree': 1.0, 'subsample': 0.8}




Selected hyper-params: {'reg_alpha': 0.0, 'reg_lambda': 0.0}




Selected hyper-params: {'reg_alpha': 0.0, 'reg_lambda': 0.0}
xgb tuner start tuning




Selected hyper-params: {'booster': 'dart'}




Selected hyper-params: {'learning_rate': 0.05, 'n_estimators': 100}




Selected hyper-params: {'max_depth': 3}




Selected hyper-params: {'gamma': 0, 'min_child_weight': 1}




Selected hyper-params: {'colsample_bytree': 1.0, 'subsample': 1.0}




Selected hyper-params: {'colsample_bytree': 0.85, 'subsample': 1.0}




Selected hyper-params: {'reg_alpha': 0.2, 'reg_lambda': 0.0}




Selected hyper-params: {'reg_alpha': 0.1, 'reg_lambda': 0.0}
Fold 1, lgb tuner predictions
 [ 0.01289761  0.4454856   0.02046206 ...,  0.02546557  0.98192176
  0.02038061]
Fold 1, xgb tuner predictions
 [ 0.01436938  0.90136641  0.03557604 ...,  0.04811456  0.9792645
  0.02044903]

*** Start to train meta model for stacking averaged model ***


 clf_1
score= 0.132727287713
score= 0.173499626014
score= 0.135703716747
score= 0.161046420034
score= 0.15000138812
Avg score =  0.150595687726

 clf_2
score= 0.131005553446
score= 0.165206306556
score= 0.127815051734
score= 0.152057692841
score= 0.141813590488
Avg score =  0.143579639013

 knn_8
score= 0.402119061498
score= 0.642746466091
score= 0.404159809234
score= 0.620900490137
score= 0.39856250718
Avg score =  0.493697666828

 knn_16
score= 0.315035362684
score= 0.55346191367
score= 0.317309742266
score= 0.423555378023
score= 0.305976435333
Avg score =  0.383067766395

 knn_32
score= 0.223778414289
score= 0.547296861938
score= 0.2249690556



Selected hyper-params: {'learning_rate': 0.05, 'n_estimators': 400}




Selected hyper-params: {'min_split_gain': 0.0, 'num_leaves': 3}




Selected hyper-params: {'colsample_bytree': 0.8, 'subsample': 0.4}




Selected hyper-params: {'colsample_bytree': 0.8, 'subsample': 0.55}




Selected hyper-params: {'reg_alpha': 0.0, 'reg_lambda': 0.0}




Selected hyper-params: {'reg_alpha': 0.0, 'reg_lambda': 0.0}
xgb tuner start tuning




Selected hyper-params: {'booster': 'dart'}




Selected hyper-params: {'learning_rate': 0.05, 'n_estimators': 100}




Selected hyper-params: {'max_depth': 3}




Selected hyper-params: {'gamma': 0.2, 'min_child_weight': 8}




Selected hyper-params: {'colsample_bytree': 0.4, 'subsample': 1.0}




Selected hyper-params: {'colsample_bytree': 0.25, 'subsample': 1.0}




Selected hyper-params: {'reg_alpha': 0.0, 'reg_lambda': 0.0}




Selected hyper-params: {'reg_alpha': 0.0, 'reg_lambda': 0.0}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  out_of_fold_predictions[self.target_col] = self.out_of_fold_predictions[self.target_col]


meta model's training set score=  0.128329987997 

Fold 1, meta model (lgb tuner) predictions
 [ 0.02194929  0.80059983  0.01601229 ...,  0.01550367  0.96656002
  0.0141076 ]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  out_of_fold_predictions[self.target_col] = self.out_of_fold_predictions[self.target_col]


meta model's training set score=  0.12102658579 

Fold 1, meta model (xgb tuner) predictions
 [ 0.01406887  0.79236448  0.01129174 ...,  0.01114321  0.97587776
  0.01086472]
Processing fold  2
Train data X: (1604, 18433), y: (1604,)
# of features by light gbm: 7, cv score: 0.130
New train data X: (1604, 7), y: (1604,)
lgb tuner start tuning




Selected hyper-params: {'learning_rate': 0.05, 'n_estimators': 400}




Selected hyper-params: {'min_split_gain': 0.2, 'num_leaves': 7}




Selected hyper-params: {'colsample_bytree': 1.0, 'subsample': 0.8}




Selected hyper-params: {'colsample_bytree': 1.0, 'subsample': 0.75}




Selected hyper-params: {'reg_alpha': 0.0, 'reg_lambda': 0.2}




Selected hyper-params: {'reg_alpha': 0.1, 'reg_lambda': 0.1}
xgb tuner start tuning




Selected hyper-params: {'booster': 'dart'}




Selected hyper-params: {'learning_rate': 0.05, 'n_estimators': 100}




Selected hyper-params: {'max_depth': 3}




Selected hyper-params: {'gamma': 0.2, 'min_child_weight': 2}




Selected hyper-params: {'colsample_bytree': 1.0, 'subsample': 0.8}




Selected hyper-params: {'colsample_bytree': 1.0, 'subsample': 0.9}




Selected hyper-params: {'reg_alpha': 0.0, 'reg_lambda': 0.6}




Selected hyper-params: {'reg_alpha': 0.1, 'reg_lambda': 0.44999999999999996}
Fold 2, lgb tuner predictions
 [ 0.01300206  0.83950151  0.01588896 ...,  0.0135112   0.99197093
  0.00864697]
Fold 2, xgb tuner predictions
 [ 0.00719951  0.91538179  0.00891804 ...,  0.01374675  0.99384999
  0.00599172]

*** Start to train meta model for stacking averaged model ***


 clf_1
score= 0.1049068866
score= 0.141192265802
score= 0.0932533393879
score= 0.161361615408
score= 0.112755078154
Avg score =  0.122693837071

 clf_2
score= 0.106241395195
score= 0.140021055417
score= 0.0919927379517
score= 0.153179451717
score= 0.103950606391
Avg score =  0.119077049335

 knn_8
score= 0.187907404413
score= 0.609668564007
score= 0.180569188065
score= 0.721167932479
score= 0.590756086906
Avg score =  0.458013835174

 knn_16
score= 0.180831345678
score= 0.609222423908
score= 0.183401160697
score= 0.421499006578
score= 0.0869883684386
Avg score =  0.29638846106

 knn_32
score= 0.0815035535616
score= 0.50869923415



Selected hyper-params: {'learning_rate': 0.1, 'n_estimators': 200}




Selected hyper-params: {'min_split_gain': 0.1, 'num_leaves': 3}




Selected hyper-params: {'colsample_bytree': 0.2, 'subsample': 0.8}




Selected hyper-params: {'colsample_bytree': 0.2, 'subsample': 0.7000000000000001}




Selected hyper-params: {'reg_alpha': 0.0, 'reg_lambda': 0.0}




Selected hyper-params: {'reg_alpha': 0.0, 'reg_lambda': 0.0}
xgb tuner start tuning




Selected hyper-params: {'booster': 'dart'}




Selected hyper-params: {'learning_rate': 0.05, 'n_estimators': 100}




Selected hyper-params: {'max_depth': 1}




Selected hyper-params: {'gamma': 0, 'min_child_weight': 1}




Selected hyper-params: {'colsample_bytree': 0.4, 'subsample': 0.8}




Selected hyper-params: {'colsample_bytree': 0.30000000000000004, 'subsample': 0.8500000000000001}




Selected hyper-params: {'reg_alpha': 0.0, 'reg_lambda': 0.0}




Selected hyper-params: {'reg_alpha': 0.0, 'reg_lambda': 0.0}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  out_of_fold_predictions[self.target_col] = self.out_of_fold_predictions[self.target_col]


meta model's training set score=  0.104714727888 

Fold 2, meta model (lgb tuner) predictions
 [ 0.01530456  0.90469185  0.02019722 ...,  0.0133465   0.98858909
  0.02019722]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  out_of_fold_predictions[self.target_col] = self.out_of_fold_predictions[self.target_col]


meta model's training set score=  0.108926066708 

Fold 2, meta model (xgb tuner) predictions
 [ 0.01685197  0.92007136  0.01572918 ...,  0.01572918  0.98467594
  0.01572918]
Processing fold  3
Train data X: (1604, 18433), y: (1604,)
# of features by light gbm: 17, cv score: 0.144
New train data X: (1604, 17), y: (1604,)
lgb tuner start tuning




Selected hyper-params: {'learning_rate': 0.1, 'n_estimators': 200}




Selected hyper-params: {'min_split_gain': 0.0, 'num_leaves': 31}




Selected hyper-params: {'colsample_bytree': 0.8, 'subsample': 1.0}




Selected hyper-params: {'colsample_bytree': 0.7000000000000001, 'subsample': 0.85}




Selected hyper-params: {'reg_alpha': 0.0, 'reg_lambda': 0.0}




Selected hyper-params: {'reg_alpha': 0.0, 'reg_lambda': 0.0}
xgb tuner start tuning




Selected hyper-params: {'booster': 'dart'}




Selected hyper-params: {'learning_rate': 0.01, 'n_estimators': 800}




Selected hyper-params: {'max_depth': 3}




Selected hyper-params: {'gamma': 0, 'min_child_weight': 1}




Selected hyper-params: {'colsample_bytree': 0.8, 'subsample': 1.0}




Selected hyper-params: {'colsample_bytree': 0.7000000000000001, 'subsample': 0.9}




Selected hyper-params: {'reg_alpha': 0.0, 'reg_lambda': 0.6}




Selected hyper-params: {'reg_alpha': 0.0, 'reg_lambda': 0.6}
Fold 3, lgb tuner predictions
 [ 0.01012069  0.11903729  0.01447627 ...,  0.00850419  0.99416329
  0.01732496]
Fold 3, xgb tuner predictions
 [ 0.00292449  0.17305978  0.00617557 ...,  0.00433929  0.99524236
  0.01353846]

*** Start to train meta model for stacking averaged model ***


 clf_1
score= 0.105949616112
score= 0.145342866487
score= 0.1106268426
score= 0.148289457022
score= 0.140808659954
Avg score =  0.130203488435

 clf_2
score= 0.101202779982
score= 0.138817652643
score= 0.106615688677
score= 0.142155537297
score= 0.134878403255
Avg score =  0.124734012371

 knn_8
score= 0.304474913471
score= 0.731814675165
score= 0.200278054453
score= 0.62279832674
score= 0.405040689884
Avg score =  0.452881331943

 knn_16
score= 0.200935043441
score= 0.52728610107
score= 0.105570267956
score= 0.431965853846
score= 0.30634667045
Avg score =  0.314420787353

 knn_32
score= 0.2016760606
score= 0.426482658843
score= 0.105411320726




Selected hyper-params: {'learning_rate': 0.05, 'n_estimators': 400}




Selected hyper-params: {'min_split_gain': 0.0, 'num_leaves': 3}




Selected hyper-params: {'colsample_bytree': 0.4, 'subsample': 0.8}




Selected hyper-params: {'colsample_bytree': 0.35000000000000003, 'subsample': 0.75}




Selected hyper-params: {'reg_alpha': 0.0, 'reg_lambda': 0.0}




Selected hyper-params: {'reg_alpha': 0.0, 'reg_lambda': 0.0}
xgb tuner start tuning




Selected hyper-params: {'booster': 'dart'}




Selected hyper-params: {'learning_rate': 0.05, 'n_estimators': 100}




Selected hyper-params: {'max_depth': 1}




Selected hyper-params: {'gamma': 0, 'min_child_weight': 8}




Selected hyper-params: {'colsample_bytree': 0.8, 'subsample': 0.8}




Selected hyper-params: {'colsample_bytree': 0.8500000000000001, 'subsample': 0.9}




Selected hyper-params: {'reg_alpha': 0.0, 'reg_lambda': 0.0}




Selected hyper-params: {'reg_alpha': 0.0, 'reg_lambda': 0.0}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  out_of_fold_predictions[self.target_col] = self.out_of_fold_predictions[self.target_col]


meta model's training set score=  0.116669735933 

Fold 3, meta model (lgb tuner) predictions
 [ 0.01890311  0.47774513  0.01018805 ...,  0.00984447  0.98001751
  0.01059172]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  out_of_fold_predictions[self.target_col] = self.out_of_fold_predictions[self.target_col]


meta model's training set score=  0.118721616328 

Fold 3, meta model (xgb tuner) predictions
 [ 0.01548181  0.42074013  0.01297698 ...,  0.01297698  0.98193538
  0.01297698]
Processing fold  4
Train data X: (1604, 18433), y: (1604,)
# of features by light gbm: 47, cv score: 0.172
New train data X: (1604, 47), y: (1604,)
lgb tuner start tuning




Selected hyper-params: {'learning_rate': 0.1, 'n_estimators': 200}




Selected hyper-params: {'min_split_gain': 0.0, 'num_leaves': 3}




Selected hyper-params: {'colsample_bytree': 1.0, 'subsample': 1.0}




Selected hyper-params: {'colsample_bytree': 1.0, 'subsample': 1.0}




Selected hyper-params: {'reg_alpha': 0.0, 'reg_lambda': 0.0}




Selected hyper-params: {'reg_alpha': 0.0, 'reg_lambda': 0.0}
xgb tuner start tuning




Selected hyper-params: {'booster': 'dart'}




Selected hyper-params: {'learning_rate': 0.05, 'n_estimators': 100}




Selected hyper-params: {'max_depth': 3}




Selected hyper-params: {'gamma': 0.2, 'min_child_weight': 2}




Selected hyper-params: {'colsample_bytree': 1.0, 'subsample': 0.8}




Selected hyper-params: {'colsample_bytree': 1.0, 'subsample': 0.8}




Selected hyper-params: {'reg_alpha': 0.0, 'reg_lambda': 0.2}




Selected hyper-params: {'reg_alpha': 0.0, 'reg_lambda': 0.15000000000000002}
Fold 4, lgb tuner predictions
 [ 0.00739637  0.53155851  0.01145734 ...,  0.01370683  0.98548571
  0.00678605]
Fold 4, xgb tuner predictions
 [ 0.00663773  0.62242204  0.01043712 ...,  0.00683701  0.98986685
  0.00976061]

*** Start to train meta model for stacking averaged model ***


 clf_1
score= 0.126139674888
score= 0.164255570303
score= 0.169593996242
score= 0.166895261081
score= 0.161990074114
Avg score =  0.157774915325

 clf_2
score= 0.112401003674
score= 0.157150325698
score= 0.170186357557
score= 0.161550317634
score= 0.154957752579
Avg score =  0.151249151428

 knn_8
score= 0.207941797786
score= 0.651280441111
score= 0.753404759593
score= 0.652725056334
score= 0.641267128996
Avg score =  0.581323836764

 knn_16
score= 0.209423473296
score= 0.449828735079
score= 0.351710935377
score= 0.243339279931
score= 0.33274958187
Avg score =  0.31741040111

 knn_32
score= 0.207137707316
score= 0.444444421385
s



Selected hyper-params: {'learning_rate': 0.05, 'n_estimators': 400}




Selected hyper-params: {'min_split_gain': 0.0, 'num_leaves': 3}




Selected hyper-params: {'colsample_bytree': 1.0, 'subsample': 0.6}




Selected hyper-params: {'colsample_bytree': 1.0, 'subsample': 0.6}




Selected hyper-params: {'reg_alpha': 0.0, 'reg_lambda': 0.0}




Selected hyper-params: {'reg_alpha': 0.0, 'reg_lambda': 0.0}
xgb tuner start tuning




Selected hyper-params: {'booster': 'dart'}




Selected hyper-params: {'learning_rate': 0.05, 'n_estimators': 100}




Selected hyper-params: {'max_depth': 3}




Selected hyper-params: {'gamma': 0, 'min_child_weight': 8}




Selected hyper-params: {'colsample_bytree': 1.0, 'subsample': 1.0}




Selected hyper-params: {'colsample_bytree': 1.0, 'subsample': 1.0}




Selected hyper-params: {'reg_alpha': 0.4, 'reg_lambda': 0.0}




Selected hyper-params: {'reg_alpha': 0.30000000000000004, 'reg_lambda': 0.1}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  out_of_fold_predictions[self.target_col] = self.out_of_fold_predictions[self.target_col]


meta model's training set score=  0.135903574465 

Fold 4, meta model (lgb tuner) predictions
 [ 0.01301811  0.47451457  0.01047445 ...,  0.01072666  0.98423191
  0.01047445]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  out_of_fold_predictions[self.target_col] = self.out_of_fold_predictions[self.target_col]


meta model's training set score=  0.125108632898 

Fold 4, meta model (xgb tuner) predictions
 [ 0.00758652  0.4013387   0.00758652 ...,  0.00758652  0.99031025
  0.00758652]
Processing fold  5
Train data X: (1604, 18433), y: (1604,)
# of features by light gbm: 36, cv score: 0.155
New train data X: (1604, 36), y: (1604,)
lgb tuner start tuning




Selected hyper-params: {'learning_rate': 0.05, 'n_estimators': 800}




Selected hyper-params: {'min_split_gain': 0.2, 'num_leaves': 7}




Selected hyper-params: {'colsample_bytree': 0.8, 'subsample': 0.8}




Selected hyper-params: {'colsample_bytree': 0.9, 'subsample': 0.8}




Selected hyper-params: {'reg_alpha': 0.2, 'reg_lambda': 0.0}




Selected hyper-params: {'reg_alpha': 0.2, 'reg_lambda': 0.0}
xgb tuner start tuning




Selected hyper-params: {'booster': 'dart'}




Selected hyper-params: {'learning_rate': 0.1, 'n_estimators': 100}




Selected hyper-params: {'max_depth': 3}




Selected hyper-params: {'gamma': 0.2, 'min_child_weight': 8}




Selected hyper-params: {'colsample_bytree': 1.0, 'subsample': 1.0}




Selected hyper-params: {'colsample_bytree': 1.0, 'subsample': 1.0}




Selected hyper-params: {'reg_alpha': 0.0, 'reg_lambda': 0.4}




Selected hyper-params: {'reg_alpha': 0.0, 'reg_lambda': 0.4}
Fold 5, lgb tuner predictions
 [ 0.0086074   0.54377852  0.00889747 ...,  0.00471593  0.9828988   0.0047685 ]
Fold 5, xgb tuner predictions
 [ 0.00970136  0.68070149  0.00477085 ...,  0.00499983  0.9806003
  0.00389884]

*** Start to train meta model for stacking averaged model ***


 clf_1
score= 0.0957572275237
score= 0.156696475645
score= 0.162062540643
score= 0.136960063724
score= 0.113559699312
Avg score =  0.133007201369

 clf_2
score= 0.0907441958911
score= 0.160397130823
score= 0.163775481135
score= 0.14478055526
score= 0.112826546195
Avg score =  0.134504781861

 knn_8
score= 0.287199505241
score= 0.641407002708
score= 0.631397456404
score= 0.617604146639
score= 0.282899819298
Avg score =  0.492101586058

 knn_16
score= 0.288395260674
score= 0.535205895806
score= 0.435606008489
score= 0.525582767416
score= 0.287894578866
Avg score =  0.41453690225

 knn_32
score= 0.192749713675
score= 0.533147493126
score= 0.23738551



Selected hyper-params: {'learning_rate': 0.05, 'n_estimators': 400}




Selected hyper-params: {'min_split_gain': 0.0, 'num_leaves': 3}




Selected hyper-params: {'colsample_bytree': 1.0, 'subsample': 1.0}




Selected hyper-params: {'colsample_bytree': 1.0, 'subsample': 0.9}




Selected hyper-params: {'reg_alpha': 0.0, 'reg_lambda': 0.0}




Selected hyper-params: {'reg_alpha': 0.0, 'reg_lambda': 0.0}
xgb tuner start tuning




Selected hyper-params: {'booster': 'dart'}




Selected hyper-params: {'learning_rate': 0.05, 'n_estimators': 100}




Selected hyper-params: {'max_depth': 1}




Selected hyper-params: {'gamma': 0, 'min_child_weight': 10}




Selected hyper-params: {'colsample_bytree': 1.0, 'subsample': 1.0}




Selected hyper-params: {'colsample_bytree': 0.9, 'subsample': 1.0}




Selected hyper-params: {'reg_alpha': 0.2, 'reg_lambda': 0.0}




Selected hyper-params: {'reg_alpha': 0.05000000000000002, 'reg_lambda': 0.1}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  out_of_fold_predictions[self.target_col] = self.out_of_fold_predictions[self.target_col]


meta model's training set score=  0.115026390678 

Fold 5, meta model (lgb tuner) predictions
 [ 0.00907363  0.59763671  0.01055113 ...,  0.00907363  0.97371395
  0.00805075]
meta model's training set score=  0.1184641355 

Fold 5, meta model (xgb tuner) predictions
 [ 0.01329209  0.51314974  0.01329209 ...,  0.01329209  0.97339547
  0.01329209]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  out_of_fold_predictions[self.target_col] = self.out_of_fold_predictions[self.target_col]
