In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import gc
import os
import pickle
import warnings

warnings.filterwarnings('ignore')


In [2]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import log_loss, accuracy_score, f1_score
from sklearn.decomposition import PCA

from pystacknet.pystacknet import StackNetClassifier



In [3]:
import xgboost as xgb
import lightgbm as lgb
from bayes_opt import BayesianOptimization


In [4]:
f = open('./new_cols.bin', 'rb')
cols = pickle.load(f)
f.close()

In [5]:
tr = pd.read_csv('./data/train.csv')
te = pd.read_csv('./data/test.csv')

sub = pd.read_csv('./data/sample_submission.csv', index_col=0)

In [6]:
# target_lbe = LabelEncoder().fit(target)

# t = target_lbe.transform(target)

column_number = {}
for i, column in enumerate(sub.columns):
    column_number[column] = i
    
def to_number(x, dic):
    return dic[x]

tr['type_num'] = tr['type'].apply(lambda x: to_number(x, column_number))



In [7]:
target = tr['type_num']
t = target.copy()

train_X = tr.drop(['id', 'type', 'type_num', 'fiberID'], axis=1)
test_X = te.drop(['id','fiberID'], axis=1)



In [None]:
train_X['index'] = np.ones(len(train_X))
test_X['index'] = np.ones(len(test_X))*2

merge = pd.concat([train_X, test_X], ignore_index=True)

k = train_X.columns[1:-1]

In [None]:
merge = pd.concat([merge, pd.get_dummies(merge['fiberID'], prefix='fiberID')], axis=1)
merge = merge.drop('fiberID', axis=1)


In [None]:
train_X = merge[merge['index'] == 1]
train_X = train_X.drop('index', axis=1)
test_X = merge[merge['index'] == 2]
test_X = test_X.drop('index', axis=1)
test_X.index = range(len(test_X))

In [None]:
tr_X = train_X[k]
te_X = test_X[k]

tr_X = (tr_X - np.mean(tr_X))/np.std(tr_X)
te_X = (te_X - np.mean(te_X))/np.std(te_X)

train_X[k] = tr_X
test_X[k] = te_X

te_X = test_X.copy()
tr_X = train_X.copy()

In [None]:
tr_X = tr_X[cols]
te_X = te_X[cols]

In [8]:
tr_X = (train_X - np.mean(train_X))/np.std(train_X)
te_X = (test_X - np.mean(test_X))/np.std(test_X)

In [9]:
train_X, test_X, train_y, test_y = train_test_split(tr_X, t, test_size=0.1, random_state=42, shuffle=True)



In [None]:
def LGB_bayesian(
    #learning_rate,
    num_leaves, 
    bagging_fraction,
    feature_fraction,
    min_child_weight, 
    min_data_in_leaf,
    max_depth,
    reg_alpha,
    reg_lambda
     ):
    # LightGBM expects next three parameters need to be integer. 
    num_leaves = int(num_leaves)
    min_data_in_leaf = int(min_data_in_leaf)
    max_depth = int(max_depth)

    assert type(num_leaves) == int
    assert type(min_data_in_leaf) == int
    assert type(max_depth) == int
    

    params = {
              'num_leaves': num_leaves, 
              'min_data_in_leaf': min_data_in_leaf,
              'min_child_weight': min_child_weight,
              'bagging_fraction' : bagging_fraction,
              'feature_fraction' : feature_fraction,
#               'learning_rate' : 0.03,
              'max_depth': max_depth,
              'reg_alpha': reg_alpha,
              'reg_lambda': reg_lambda,
              'objective': 'binary',
              'save_binary': True,
              'seed': 12,
              'feature_fraction_seed': 12,
              'bagging_seed': 12,
              'drop_seed': 12,
              'data_random_seed': 12,
              'boosting': 'gbdt', ## some get better result using 'dart'
              'verbose': 1,
              'is_unbalance': False,
              'boost_from_average': True,
              'metric':'multi_logloss'}    
    
    ## set clf options
    clf = lgb.LGBMClassifier(**params).fit(train_X, train_y, early_stopping_rounds=100,eval_set=[(test_X, test_y)], eval_metric='multi_logloss', verbose=0)
    
    score = accuracy_score(test_y, clf.predict(test_X))

    return score


In [None]:
bounds_LGB = {
    'num_leaves': (300, 1000), 
    'min_data_in_leaf': (0, 150),
    'bagging_fraction' : (0.3, 0.9),
    'feature_fraction' : (0.3, 0.9),
#     'learning_rate': (0.01, 0.3),
    'min_child_weight': (0.001, 3),   
    'reg_alpha': (0.1, 3), 
    'reg_lambda': (0.1, 3),
    'max_depth':(10, 30),
}

In [None]:
optimizer = BayesianOptimization(LGB_bayesian, bounds_LGB, random_state=42)


In [None]:
init_points = 10
n_iter = 20

optimizer.maximize(init_points=init_points, n_iter=n_iter)


In [None]:
param_lgb = {
        'min_data_in_leaf': int(optimizer.max['params']['min_data_in_leaf']), 
        'num_leaves': int(optimizer.max['params']['num_leaves']), 
        #'learning_rate': LGB_BO.max['params']['learning_rate'],
        'min_child_weight': optimizer.max['params']['min_child_weight'],
        'bagging_fraction': optimizer.max['params']['bagging_fraction'], 
        'feature_fraction': optimizer.max['params']['feature_fraction'],
        'reg_lambda': optimizer.max['params']['reg_lambda'],
        'reg_alpha': optimizer.max['params']['reg_alpha'],
        'max_depth': int(optimizer.max['params']['max_depth']), 
        'objective': 'binary',
        'save_binary': True,
        'seed': 12,
        'feature_fraction_seed': 12,
        'bagging_seed': 12,
        'drop_seed': 12,
        'data_random_seed': 12,
        'boosting_type': 'gbdt',  # also consider 'dart'
        'verbose': 1,
        'is_unbalance': False,
        'boost_from_average': True,
        'metric':'multi_logloss'
    }

params = param_lgb.copy()

In [None]:
# f = open('best_params_sh.bin', 'wb')
# pickle.dump(params, f)
# f.close()

In [10]:
f = open('best_params_sh.bin', 'rb')
params = pickle.load(f)
f.close()

In [None]:
te_X

In [27]:
lgb_clf = lgb.LGBMClassifier(**params, early_stoppong_rounds = 100)
lgb_clf.fit(tr_X, t)

LGBMClassifier(bagging_fraction=0.8133797481154059, bagging_seed=12,
               boost_from_average=True, boosting_type='gbdt', class_weight=None,
               colsample_bytree=1.0, data_random_seed=12, drop_seed=12,
               early_stoppong_rounds=100, feature_fraction=0.42744912354487063,
               feature_fraction_seed=12, importance_type='split',
               is_unbalance=False, learning_rate=0.1, max_depth=29,
               metric='multi_logloss', min_child_samples=20,
               min_child_weight=2.312439152145239, min_data_in_leaf=2,
               min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=302,
               objective='binary', random_state=None,
               reg_alpha=1.9121592091690662, reg_lambda=0.4835878331894956,
               save_binary=True, seed=12, silent=True, ...)

In [None]:
y_pred = lgb_clf.predict_proba(te_X)
submission = pd.DataFrame(data=y_pred, columns=sub.columns, index=sub.index)
submission.to_csv('./sub/lgb7.csv', index=True)


In [None]:
'############################'

In [11]:
# parmas for xgboost
params_fx = {'min_data_in_leaf': params['min_data_in_leaf'],
             'num_leaves': params['num_leaves'],
             'min_child_weight': params['min_child_weight'],
             'bagging_fraction': params['bagging_fraction'],
             'feature_fraction': params['feature_fraction'],
             'reg_lambda': params['reg_lambda'],
             'reg_alpha': params['reg_alpha'],
             'max_depth': params['max_depth']
}

In [21]:
lgb_clf = lgb.LGBMClassifier(**params, early_stoppong_rounds = 100)

xgb_clf = xgb.XGBClassifier(
            **params_fx,
            n_estimators=300,
            tree_method = 'hist',
            booster = 'gbtree',
#             eval_metric = 'mlogloss',
#             objective = 'multi:softprob',
#             num_class = 19,
            early_stoppong_rounds = 100
            
    )

rf = RandomForestClassifier(n_estimators=200,
                               max_depth=13,
                               min_samples_split=5,
                               min_samples_leaf=5,
                               min_impurity_decrease = 0.001,
                               max_features=None,
                               oob_score=True,
                               random_state=42,)

pca = PCA(12)

In [22]:
models = [[lgb_clf, rf, pca], 
          [rf]]

In [23]:
model = StackNetClassifier(models, 
                           metric="logloss", 
                           folds=3,
                           restacking=False,
                           use_retraining=True,
                           use_proba=True, # To use predict_proba after training
                           random_state=42,
                           n_jobs=-1, 
                           verbose=1)

model.fit(tr_X, t)

Input Dimensionality 20 at Level 0 
3 models included in Level 0 
Fold 1/4 , model 0 , logloss===0.393031 
Fold 1/4 , model 1 , logloss===0.623155 
Fold 2/4 , model 0 , logloss===0.385407 
Fold 2/4 , model 1 , logloss===0.628474 
Fold 3/4 , model 0 , logloss===0.382767 
Fold 3/4 , model 1 , logloss===0.615601 
Fold 4/4 , model 0 , logloss===0.388745 
Fold 4/4 , model 1 , logloss===0.625972 
Level 0, model 0 , logloss===0.387487 
Level 0, model 1 , logloss===0.623300 
Output dimensionality of level 0 is 50 
 level 0 lasted 2090.965102 seconds 
Input Dimensionality 50 at Level 1 
1 models included in Level 1 
Fold 1/4 , model 0 , logloss===0.474300 
Fold 2/4 , model 0 , logloss===0.472525 
Fold 3/4 , model 0 , logloss===0.465748 
Fold 4/4 , model 0 , logloss===0.474813 
Level 1, model 0 , logloss===0.471846 
Output dimensionality of level 1 is 19 
 level 1 lasted 2879.732177 seconds 
 fit() lasted 4970.715796 seconds 


In [24]:
y_pred = model.predict_proba(te_X)
submission = pd.DataFrame(data=y_pred, columns=sub.columns, index=sub.index)
submission.to_csv('./sub/stk3.csv', index=True)


1 estimators included in Level 0 
1 estimators included in Level 1 


In [None]:
# kk = pd.DataFrame(lgb_clf.predict_proba(te_X), columns=target_lbe.classes_)
# sub[sub.columns[1:]] = kk[sub.columns[1:]]

In [None]:
# sub.to_csv('./sub/stk1.csv', index=False)

In [28]:
lgb_imp = lgb_clf.feature_importances_

lgb_imp_idx = []
cols = tr_X.columns
for i, imp in enumerate(lgb_imp):
    if imp > 0:
        lgb_imp_idx.append(i)
        
new_cols_imp = cols[lgb_imp_idx]
new_cols_imp = list(new_cols_imp)

In [29]:
print(len(tr_X.columns), len(new_cols_imp))

20 20


In [None]:
f = open('./new_cols.bin', 'wb')
pickle.dump(new_cols_imp, f)
f.close()