In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import gc
import os
import pickle
import warnings

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')


In [2]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import log_loss, accuracy_score, f1_score
from sklearn.decomposition import PCA

from pystacknet.pystacknet import StackNetClassifier



In [3]:
import xgboost as xgb
import lightgbm as lgb
from bayes_opt import BayesianOptimization


In [4]:
# Submission template: its first CSV column becomes the index, and its
# remaining columns are reused further down as the class labels.
sub = pd.read_csv('./data/sample_submission.csv', index_col=0)

# Raw training and test tables.
tr = pd.read_csv('./data/train.csv')
te = pd.read_csv('./data/test.csv')

In [5]:
# target_lbe = LabelEncoder().fit(target)

# t = target_lbe.transform(target)

# Map each submission column name to its positional index.
column_number = {column: i for i, column in enumerate(sub.columns)}
    
def to_number(x, dic):
    """Return the value stored under key ``x`` in the mapping ``dic``.

    For a plain ``dict`` a missing key raises ``KeyError``.
    """
    number = dic[x]
    return number

# Encode each training row's string label as its submission-column index.
tr['type_num'] = tr['type'].apply(to_number, args=(column_number,))



In [6]:
# Integer-encoded labels; `t` is an independent copy used for fitting later.
target = tr['type_num']
t = target.copy()

# Feature frames: identifier and label columns are removed.
train_X = tr.drop(columns=['id', 'type', 'type_num'])
test_X = te.drop(columns=['id'])



In [7]:
# Tag every row with its origin (1.0 = train, 2.0 = test) so the two frames
# can be separated again after the joint transformation below.
train_X['index'] = np.full(len(train_X), 1.0)
test_X['index'] = np.full(len(test_X), 2.0)

merge = pd.concat([train_X, test_X], ignore_index=True)

# Every column except the first one and the trailing 'index' marker.
k = train_X.columns[1:-1]

In [8]:
# Replace the 'fiberID' column by its one-hot encoding, appended at the end.
fiber_dummies = pd.get_dummies(merge['fiberID'], prefix='fiberID')
merge = pd.concat([merge.drop(columns='fiberID'), fiber_dummies], axis=1)


In [9]:
# Separate the merged frame again using the origin marker, then discard it.
is_train_row = merge['index'] == 1
is_test_row = merge['index'] == 2

train_X = merge.loc[is_train_row].drop(columns='index')
# Test rows are renumbered from 0 after the split.
test_X = merge.loc[is_test_row].drop(columns='index').reset_index(drop=True)

In [10]:
# Standardise the columns listed in `k`.
#
# The statistics are computed on the training rows only and then applied to
# BOTH frames, so a given raw value maps to the same scaled value in train and
# test. (Previously the test rows were scaled with their own mean/std.)
feature_mean = train_X[k].mean(axis=0)
# ddof=0 keeps the population standard deviation that np.std uses.
feature_std = train_X[k].std(axis=0, ddof=0)
# A constant column has zero spread; divide by 1 instead of producing NaN/inf.
feature_std = feature_std.replace(0, 1)

# Work on copies so the assignment never targets a slice of `merge`.
tr_X = train_X.copy()
te_X = test_X.copy()
tr_X[k] = (train_X[k] - feature_mean) / feature_std
te_X[k] = (test_X[k] - feature_mean) / feature_std

# Keep train_X / test_X in sync with the scaled frames, as before.
train_X = tr_X.copy()
test_X = te_X.copy()

In [None]:
# Hold out 10 % of the labelled rows for the hyper-parameter search.
# Note: this rebinds train_X / test_X to the two parts of tr_X.
train_X, test_X, train_y, test_y = train_test_split(
    tr_X,
    t,
    test_size=0.1,
    shuffle=True,
    random_state=42,
)



In [None]:
def LGB_bayesian(
    # learning_rate is intentionally not part of the search
    num_leaves,
    bagging_fraction,
    feature_fraction,
    min_child_weight,
    min_data_in_leaf,
    max_depth,
    reg_alpha,
    reg_lambda,
):
    """Objective for the Bayesian search: fit LightGBM once, return accuracy.

    The model is fitted on the global ``train_X`` / ``train_y`` with early
    stopping (100 rounds) monitored on the global ``test_X`` / ``test_y``.

    Parameters
    ----------
    num_leaves, min_data_in_leaf, max_depth
        Proposed values; truncated with ``int()`` before use.
    bagging_fraction, feature_fraction, min_child_weight, reg_alpha, reg_lambda
        Proposed values; forwarded unchanged.

    Returns
    -------
    Accuracy of the fitted classifier on ``test_X`` / ``test_y``.
    """
    # Searched parameters; the three structural ones are truncated to integers.
    tuned = {
        'num_leaves': int(num_leaves),
        'min_data_in_leaf': int(min_data_in_leaf),
        'min_child_weight': min_child_weight,
        'bagging_fraction': bagging_fraction,
        'feature_fraction': feature_fraction,
        # 'learning_rate': 0.03,
        'max_depth': int(max_depth),
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
    }

    # Settings held constant across all trials.
    # NOTE(review): 'binary' looks inconsistent with the 'multi_logloss'
    # metric used here -- confirm which objective is actually in effect.
    # NOTE(review): no bagging_freq is set -- confirm that bagging_fraction
    # has any effect in this configuration.
    fixed = {
        'objective': 'binary',
        'save_binary': True,
        'seed': 12,
        'feature_fraction_seed': 12,
        'bagging_seed': 12,
        'drop_seed': 12,
        'data_random_seed': 12,
        'boosting': 'gbdt',  # some get better results using 'dart'
        'verbose': 1,
        'is_unbalance': False,
        'boost_from_average': True,
        'metric': 'multi_logloss',
    }
    params = {**tuned, **fixed}

    clf = lgb.LGBMClassifier(**params)
    clf.fit(
        train_X,
        train_y,
        eval_set=[(test_X, test_y)],
        eval_metric='multi_logloss',
        early_stopping_rounds=100,
        verbose=0,
    )

    predictions = clf.predict(test_X)
    return accuracy_score(test_y, predictions)


In [None]:
# Search interval (low, high) for each argument of LGB_bayesian.
# learning_rate is not searched.
bounds_LGB = dict(
    num_leaves=(300, 1000),
    min_data_in_leaf=(0, 150),
    bagging_fraction=(0.3, 0.9),
    feature_fraction=(0.3, 0.9),
    # learning_rate=(0.01, 0.3),
    min_child_weight=(0.001, 3),
    reg_alpha=(0.1, 3),
    reg_lambda=(0.1, 3),
    max_depth=(10, 30),
)

In [None]:
# Bayesian optimiser over LGB_bayesian within bounds_LGB, seeded for repeatability.
optimizer = BayesianOptimization(
    f=LGB_bayesian,
    pbounds=bounds_LGB,
    random_state=42,
)


In [None]:
# Search budget handed to the optimiser: 10 initial points, then 20 iterations.
init_points, n_iter = 10, 20

optimizer.maximize(init_points=init_points, n_iter=n_iter)


In [None]:
# Best parameter set found by the search.
best_found = optimizer.max['params']

# Final LightGBM configuration: the tuned values (integer-valued ones are
# truncated with int()) followed by the same fixed settings used in the search.
# NOTE(review): 'binary' looks inconsistent with the 'multi_logloss' metric --
# confirm which objective is actually in effect.
param_lgb = {
    'min_data_in_leaf': int(best_found['min_data_in_leaf']),
    'num_leaves': int(best_found['num_leaves']),
    # 'learning_rate': best_found['learning_rate'],
    'min_child_weight': best_found['min_child_weight'],
    'bagging_fraction': best_found['bagging_fraction'],
    'feature_fraction': best_found['feature_fraction'],
    'reg_lambda': best_found['reg_lambda'],
    'reg_alpha': best_found['reg_alpha'],
    'max_depth': int(best_found['max_depth']),
    'objective': 'binary',
    'save_binary': True,
    'seed': 12,
    'feature_fraction_seed': 12,
    'bagging_seed': 12,
    'drop_seed': 12,
    'data_random_seed': 12,
    'boosting_type': 'gbdt',  # also consider 'dart'
    'verbose': 1,
    'is_unbalance': False,
    'boost_from_average': True,
    'metric': 'multi_logloss',
}

# Shallow copy used by the cells below.
params = dict(param_lgb)

In [None]:
# Persist the tuned parameter dict. The context manager closes the file even
# if pickling raises.
with open('best_params2.bin', 'wb') as f:
    pickle.dump(params, f)

In [11]:
# Reload the tuned parameter dict written by the cell above. The context
# manager closes the file even if unpickling raises.
# NOTE: pickle.load executes arbitrary code from the file -- only load files
# this notebook produced itself.
with open('best_params2.bin', 'rb') as f:
    params = pickle.load(f)

In [None]:
# Display the prepared test feature frame as the cell output.
te_X

In [None]:
# Final LightGBM model, fitted on every labelled row with the tuned parameters.
# The misspelled `early_stoppong_rounds` keyword was removed: it does not match
# any LightGBM parameter name, and this fit receives no validation set that
# early stopping could monitor.
lgb_clf = lgb.LGBMClassifier(**params)
lgb_clf.fit(tr_X, t)

In [None]:
# Class probabilities for the test rows, laid out like the submission template
# (same index, same column labels), then written with the index included.
y_pred = lgb_clf.predict_proba(te_X)
submission = pd.DataFrame(y_pred, index=sub.index, columns=sub.columns)
submission.to_csv('./sub/lgb5.csv', index=True)


In [None]:
# Recompute and display the predicted class probabilities for the test rows.
lgb_clf.predict_proba(te_X)

In [None]:
# kk = pd.DataFrame(lgb_clf.predict_proba(te_X), columns=target_lbe.classes_)
# `sub` was read with index_col=0, so every remaining column is a class column.
# Fill all of them (slicing with [1:] would leave one class column out and make
# the widths disagree with predict_proba's output).
sub[sub.columns] = lgb_clf.predict_proba(te_X)

In [None]:
# The row identifiers live in the index of `sub` (it was read with
# index_col=0), so the index must be written out with the probabilities.
sub.to_csv('./sub/lgb3.csv', index=True)

In [None]:
# Display the submission frame as the cell output.
sub

In [None]:
# Visual separator cell: the cells above use LightGBM alone, the cells below
# set up additional models and stacking.
'############################'

In [None]:
# Params for XGBoost, translated from the tuned LightGBM values.
# LightGBM-only names are mapped to their XGBoost counterparts so they are not
# passed through as unrecognised keywords:
#   num_leaves       -> max_leaves
#   bagging_fraction -> subsample
#   feature_fraction -> colsample_bytree
# min_data_in_leaf has no direct XGBoost counterpart and is left out.
params_fx = {
    'max_leaves': params['num_leaves'],
    'min_child_weight': params['min_child_weight'],
    'subsample': params['bagging_fraction'],
    'colsample_bytree': params['feature_fraction'],
    'reg_lambda': params['reg_lambda'],
    'reg_alpha': params['reg_alpha'],
    'max_depth': params['max_depth'],
}

In [None]:
# lgb_clf = lgb.LGBMClassifier(**params, early_stoppong_rounds = 100)

# XGBoost classifier configured from params_fx (histogram tree method, DART
# booster). The misspelled `early_stoppong_rounds` keyword was removed: it is
# not an XGBoost parameter name, and no visible cell fits this model with a
# validation set that early stopping could monitor.
xgb_clf = xgb.XGBClassifier(
            **params_fx,
#             n_estimators=500,
            tree_method = 'hist',
            booster = 'dart',
#             eval_metric = 'mlogloss',
#             objective = 'multi:softprob',
#             num_class = 19,
    )

# Random forest: 150 trees, depth capped at 9, sqrt(n_features) candidates per
# split, fixed seed.
rf = RandomForestClassifier(
    n_estimators=150,
    max_depth=9,
    max_features='sqrt',
    random_state=42,
)

# PCA keeping 50 components.
pca = PCA(n_components=50)

In [None]:
# Nested list handed to StackNetClassifier below; presumably one inner list
# per stacking level (first: lgb_clf, rf, pca; second: rf) -- confirm against
# the pystacknet documentation.
models = [[lgb_clf, rf, pca], 
          [rf]]

In [None]:
# StackNet settings, collected in one place.
stacknet_options = dict(
    metric="logloss",
    folds=2,
    restacking=False,
    use_retraining=True,
    use_proba=True,  # To use predict_proba after training
    random_state=12,
    n_jobs=-1,
    verbose=1,
)

# Build the stacked model from `models` and fit it on all labelled rows.
model = StackNetClassifier(models, **stacknet_options)
model.fit(tr_X, t)

In [None]:
# Stacked-model class probabilities for the test rows, laid out like the
# submission template and written with the index included.
y_pred = model.predict_proba(te_X)
submission = pd.DataFrame(y_pred, index=sub.index, columns=sub.columns)
submission.to_csv('./sub/stk1.csv', index=True)


In [None]:
# kk = pd.DataFrame(lgb_clf.predict_proba(te_X), columns=target_lbe.classes_)
# sub[sub.columns[1:]] = kk[sub.columns[1:]]

In [None]:
# sub.to_csv('./sub/stk1.csv', index=False)

In [None]:
# Keep the names of the columns whose LightGBM importance is strictly positive.
lgb_imp = lgb_clf.feature_importances_
cols = tr_X.columns

# Positions of the features with non-zero importance.
lgb_imp_idx = [position for position, importance in enumerate(lgb_imp) if importance > 0]

new_cols_imp = list(cols[lgb_imp_idx])

In [None]:
# Number of columns before vs. after the importance filter.
print(len(tr_X.columns), len(new_cols_imp))

In [None]:
# Persist the list of retained column names. The context manager closes the
# file even if pickling raises.
with open('./new_cols.bin', 'wb') as f:
    pickle.dump(new_cols_imp, f)