In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
import keras
import keras.backend as K
from keras import layers, models, optimizers
from keras.regularizers import L1L2
from keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow.keras.losses import Huber

import gc
import os
import pickle
import warnings

# os.environ['KMP_DUPLICATE_LIB_OK']='True'
warnings.filterwarnings('ignore')
# tf.config.experimental.set_visible_devices([], 'GPU')


In [2]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier, StackingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import log_loss, accuracy_score, f1_score
from sklearn.decomposition import PCA

from pystacknet.pystacknet import StackNetClassifier



In [3]:
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from bayes_opt import BayesianOptimization


In [4]:
# Load the training and test tables from the local ./data directory.
tr = pd.read_csv('./data/train.csv')
te = pd.read_csv('./data/test.csv')

# Sample submission, indexed by its first column; its index and column labels
# are reused later when the prediction files are written.
sub = pd.read_csv('./data/sample_submission.csv', index_col=0)

In [5]:
# target_lbe = LabelEncoder().fit(target)

# t = target_lbe.transform(target)

# Map every submission column label to its positional index (0, 1, 2, ...),
# so class names can be encoded in the same order as the submission columns.
column_number = {label: position for position, label in enumerate(sub.columns)}
    
def to_number(x, dic):
    """Look up ``x`` in the mapping ``dic`` and return the stored value.

    A ``KeyError`` from the subscript lookup propagates when ``x`` is not a key.
    """
    number = dic[x]
    return number

tr['type_num'] = tr['type'].apply(lambda x: to_number(x, column_number))



In [6]:
# Integer-encoded labels; `t` is an independent copy used as the fitting target.
target = tr['type_num']
t = target.copy()

# Feature frames: drop the identifier columns from both tables, plus the two
# label columns that only exist in the training table.
id_cols = ['id', 'fiberID']
label_cols = ['type', 'type_num']
train_X = tr.drop(columns=id_cols + label_cols)
test_X = te.drop(columns=id_cols)



In [7]:
# Standardise every feature column to zero mean / unit variance.
#
# The statistics come from the training frame only and are reused for the test
# frame, so both sets go through the *same* transformation (previously the test
# set was scaled with its own mean/std, putting it on a different scale from
# the data the models are fitted on).
#
# Per-column statistics are requested explicitly instead of via
# np.mean(df) / np.std(df), whose result depends on how the installed pandas
# handles axis=None. ddof=0 keeps np.std's population standard deviation.
# Assumes test_X has the same columns as train_X; a column missing from
# train_X would come out as NaN after alignment.
feature_mean = train_X.mean(axis=0)
feature_std = train_X.std(axis=0, ddof=0)

tr_X = (train_X - feature_mean) / feature_std
te_X = (test_X - feature_mean) / feature_std

In [8]:
# f = open('best_params_sh.bin', 'wb')
# pickle.dump(params, f)
# f.close()

In [9]:
# Load the tuned hyper-parameter dict that is unpacked into LGBMClassifier below.
# The context manager closes the file even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('best_params_sh.bin', 'rb') as f:
    params = pickle.load(f)

In [None]:
params

In [None]:
# LightGBM classifier built from the tuned hyper-parameters.
# The former `early_stoppong_rounds=100` keyword was a misspelling and therefore
# never enabled early stopping. It is removed rather than re-spelled: early
# stopping needs a validation set passed to fit(), and none is supplied here.
lgb_clf = lgb.LGBMClassifier(**params)

# Fit on the standardised training features; the next cell calls predict_proba
# on this estimator, which requires a fitted model.
lgb_clf.fit(tr_X, t)

In [None]:
# Class probabilities for the test features, written out with the sample
# submission's index and column labels.
# Assumes the predict_proba column order matches sub.columns (the type_num encoding).
y_pred = lgb_clf.predict_proba(te_X)
submission = pd.DataFrame(y_pred, index=sub.index, columns=sub.columns)
submission.to_csv('./sub/lgb7.csv', index=True)


In [None]:
'############################'

In [11]:
# Params for XGBoost: the subset of tuned parameters forwarded to XGBClassifier.
# NOTE(review): several keys (min_data_in_leaf, num_leaves, bagging_fraction,
# feature_fraction) are LightGBM-style names - confirm XGBoost honours them.
fx_keys = (
    'min_data_in_leaf',
    'num_leaves',
    'min_child_weight',
    'bagging_fraction',
    'feature_fraction',
    'reg_lambda',
    'reg_alpha',
    'max_depth',
)
params_fx = {key: params[key] for key in fx_keys}

In [12]:
# --- Estimators used by the stacking cells ---------------------------------

# LightGBM with the tuned hyper-parameters.
# The former `early_stoppong_rounds=100` keyword was misspelled and so never
# enabled early stopping. It is removed instead of re-spelled because early
# stopping requires a validation set at fit time, and the estimators here are
# fitted with plain fit(X, y) calls.
lgb_clf = lgb.LGBMClassifier(**params)

# XGBoost multi-class classifier (19 classes, histogram tree method).
# The same misspelled `early_stoppong_rounds=50` keyword is removed for the
# same reason as above.
xgb_clf = xgb.XGBClassifier(
    **params_fx,
    # n_estimators=300,
    tree_method='hist',
    booster='gbtree',
    eval_metric='mlogloss',
    objective='multi:softprob',
    num_class=19,
)

# CatBoost on GPU; deep trees with a small learning rate.
cat_clf = cb.CatBoostClassifier(
    max_depth=13,
    learning_rate=0.01,
    early_stopping_rounds=50,
    task_type='GPU',
)

# Shallow random forest with out-of-bag scoring enabled.
rf = RandomForestClassifier(
    n_estimators=150,
    max_depth=9,
    oob_score=True,
    random_state=42,
)

# Deeper, more regularised random forest that considers every feature at each
# split (max_features=None).
rf1 = RandomForestClassifier(
    n_estimators=200,
    max_depth=13,
    min_samples_split=5,
    min_samples_leaf=5,
    min_impurity_decrease=0.001,
    max_features=None,
    oob_score=True,
    random_state=42,
)

# 12-component PCA.
pca = PCA(12)

In [16]:
# Two-level stack layout: three level-0 estimators and one level-1 estimator.
# NOTE(review): this rebinds `models`, the name the import cell bound to
# `keras.models`.
level_0 = [lgb_clf, cat_clf, rf1]
level_1 = [rf]
models = [level_0, level_1]

In [17]:
# StackNet settings: log-loss metric, 3 folds, fixed seed, all available cores.
stacknet_options = dict(
    metric="logloss",
    folds=3,
    restacking=False,
    use_retraining=True,
    use_proba=True,  # needed so predict_proba can be used after training
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
model = StackNetClassifier(models, **stacknet_options)

# Fit the whole stack on the standardised training features.
model.fit(tr_X, t)

Input Dimensionality 20 at Level 0 
3 models included in Level 0 
Fold 1/3 , model 0 , logloss===0.394186 
Fold 1/3 , model 1 , logloss===0.459611 
Fold 1/3 , model 2 , logloss===0.621144 
Fold 2/3 , model 0 , logloss===0.387173 
Fold 2/3 , model 1 , logloss===0.456001 
Fold 2/3 , model 2 , logloss===0.623546 
Fold 3/3 , model 0 , logloss===0.389766 
Fold 3/3 , model 1 , logloss===0.457847 
Fold 3/3 , model 2 , logloss===0.621661 
Level 0, model 0 , logloss===0.390375 
Level 0, model 1 , logloss===0.457820 
Level 0, model 2 , logloss===0.622117 
Output dimensionality of level 0 is 57 
 level 0 lasted 1930.188873 seconds 
Input Dimensionality 57 at Level 1 
1 models included in Level 1 
Fold 1/3 , model 0 , logloss===0.486617 
Fold 2/3 , model 0 , logloss===0.479906 
Fold 3/3 , model 0 , logloss===0.482420 
Level 1, model 0 , logloss===0.482981 
Output dimensionality of level 1 is 19 
 level 1 lasted 342.676234 seconds 
 fit() lasted 2272.882619 seconds 


In [18]:
# Stack probabilities for the test features, saved with the sample
# submission's index and column labels.
y_pred = model.predict_proba(te_X)
submission = pd.DataFrame(y_pred, index=sub.index, columns=sub.columns)
submission.to_csv('./sub/sk_fin1.csv', index=True)


1 estimators included in Level 0 
1 estimators included in Level 1 


In [None]:
'##########'

In [19]:
# Same stack layout as before, with the PCA object added to level 0.
# NOTE(review): this rebinds `models`, the name the import cell bound to
# `keras.models`.
level_0 = [lgb_clf, cat_clf, rf1, pca]
level_1 = [rf]
models = [level_0, level_1]

In [20]:
# Rebuild the StackNet with the current `models` layout.
# Settings: log-loss metric, 3 folds, fixed seed, all available cores.
stacknet_options = dict(
    metric="logloss",
    folds=3,
    restacking=False,
    use_retraining=True,
    use_proba=True,  # needed so predict_proba can be used after training
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
model = StackNetClassifier(models, **stacknet_options)

# Fit the whole stack on the standardised training features.
model.fit(tr_X, t)

Input Dimensionality 20 at Level 0 
4 models included in Level 0 
Fold 1/3 , model 0 , logloss===0.394186 
Fold 1/3 , model 1 , logloss===0.459567 
Fold 1/3 , model 2 , logloss===0.621144 
Fold 2/3 , model 0 , logloss===0.387173 
Fold 2/3 , model 1 , logloss===0.455983 
Fold 2/3 , model 2 , logloss===0.623546 
Fold 3/3 , model 0 , logloss===0.389766 
Fold 3/3 , model 1 , logloss===0.457946 
Fold 3/3 , model 2 , logloss===0.621661 
Level 0, model 0 , logloss===0.390375 
Level 0, model 1 , logloss===0.457832 
Level 0, model 2 , logloss===0.622117 
Output dimensionality of level 0 is 69 
 level 0 lasted 1953.524622 seconds 
Input Dimensionality 69 at Level 1 
1 models included in Level 1 
Fold 1/3 , model 0 , logloss===0.519896 
Fold 2/3 , model 0 , logloss===0.513859 
Fold 3/3 , model 0 , logloss===0.509813 
Level 1, model 0 , logloss===0.514523 
Output dimensionality of level 1 is 19 
 level 1 lasted 417.278841 seconds 
 fit() lasted 2370.833526 seconds 


In [21]:
# Stack probabilities for the test features, saved with the sample
# submission's index and column labels.
y_pred = model.predict_proba(te_X)
submission = pd.DataFrame(y_pred, index=sub.index, columns=sub.columns)
submission.to_csv('./sub/sk_fin2.csv', index=True)


1 estimators included in Level 0 
1 estimators included in Level 1 


In [None]:
'#######################'

In [23]:
res_X = model.predict_proba(tr_X)

1 estimators included in Level 0 
1 estimators included in Level 1 


In [35]:
rte_X = model.predict_proba(te_X)

1 estimators included in Level 0 
1 estimators included in Level 1 


In [37]:
# One-off dump of the training-set stack probabilities (kept disabled).
# The earlier version of this snippet wrote res_X to './rte_X.bin', which does
# not match the file read below; the path here is aligned with the load.
# with open('./res_X.bin', 'wb') as f:
#     pickle.dump(res_X, f)

# Read the cached object back. The context manager closes the file even if
# unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('./res_X.bin', 'rb') as f:
    r = pickle.load(f)

In [26]:
# Reset Keras' global state before (re)building the network.
K.clear_session()

# Small MLP whose input width equals the number of columns in res_X.
inputs = layers.Input(shape=(res_X.shape[1], ))

# Two L2-regularised hidden layers (128 and 64 units), each followed by a
# LeakyReLU with slope 0.3. The public `layers.LeakyReLU` is used instead of the
# private `layers.advanced_activations` module path, which newer Keras releases
# no longer expose.
x = layers.Dense(128, kernel_initializer='he_normal', kernel_regularizer=L1L2(l2=0.0001))(inputs)
x = layers.LeakyReLU(0.3)(x)

x = layers.Dense(64, kernel_initializer='he_normal', kernel_regularizer=L1L2(l2=0.0001))(x)
x = layers.LeakyReLU(0.3)(x)

# 19-way softmax output.
x = layers.Dense(19, activation='softmax')(x)

# Reference keras.models explicitly: the bare name `models` is rebound to the
# list of stacking estimators by earlier cells, so `models.Model` would fail
# when the notebook is run top to bottom.
m = keras.models.Model(inputs, x)

# Integer class labels are used as targets, hence the sparse cross-entropy loss.
m.compile(optimizer='adam',
          loss='sparse_categorical_crossentropy',
          metrics=['acc'])

In [27]:
es = EarlyStopping(patience=50, restore_best_weights=True)
def schedule(epoch):
    """Return the learning rate for ``epoch`` from a three-step decay.

    Epochs below 80 use 0.0005, epochs below 160 use 0.0001, and every later
    epoch uses 0.00001.
    """
    steps = ((80, 0.0005), (160, 0.0001))
    for upper_bound, rate in steps:
        if epoch < upper_bound:
            return rate
    return 0.00001
lrs = LearningRateScheduler(schedule)

In [28]:
# Train the MLP on the stack probabilities: up to 1000 epochs, 15% of the rows
# held out for validation, batches of 16384 rows, with the early-stopping and
# learning-rate callbacks attached.
fit_options = dict(
    epochs=1000,
    validation_split=0.15,
    batch_size=16 * 1024,
    callbacks=[es, lrs],
)
m.fit(res_X, t, **fit_options)

Train on 169992 samples, validate on 29999 samples
Epoch 1/1000


InternalError:  Blas GEMM launch failed : a.shape=(16384, 19), b.shape=(19, 128), m=16384, n=128, k=19
	 [[node dense_1/MatMul (defined at c:\users\young\anaconda3\envs\study\lib\site-packages\tensorflow_core\python\framework\ops.py:1751) ]] [Op:__inference_keras_scratch_graph_1074]

Function call stack:
keras_scratch_graph


In [None]:
'##############'

In [None]:
# kk = pd.DataFrame(lgb_clf.predict_proba(te_X), columns=target_lbe.classes_)
# sub[sub.columns[1:]] = kk[sub.columns[1:]]

In [None]:
# sub.to_csv('./sub/stk1.csv', index=False)

In [None]:
# Keep only the feature columns to which LightGBM assigned a positive importance.
lgb_imp = lgb_clf.feature_importances_
cols = tr_X.columns

# Positions of the features whose importance is greater than zero.
lgb_imp_idx = [position for position, importance in enumerate(lgb_imp) if importance > 0]

# Column labels at those positions, as a plain list.
new_cols_imp = list(cols[lgb_imp_idx])

In [None]:
print(len(tr_X.columns), len(new_cols_imp))

In [None]:
# Persist the selected column names. The context manager flushes and closes the
# file even if pickling raises.
with open('./new_cols.bin', 'wb') as f:
    pickle.dump(new_cols_imp, f)

In [None]:
from catboost import CatBoostClassifier

clf = CatBoostClassifier(max_depth=13, learning_rate= 0.01 ,early_stopping_rounds=50, task_type='GPU')


In [None]:
clf.fit(tr_X, t)

In [None]:
# CatBoost class probabilities for the test features, saved with the sample
# submission's index and column labels.
y_pred = clf.predict_proba(te_X)
submission = pd.DataFrame(y_pred, index=sub.index, columns=sub.columns)
submission.to_csv('./sub/cat1.csv', index=True)


In [None]:
# CatBoost settings, partly derived from the tuned `params` dict.
# Options left disabled: iterations=200, custom_loss='TotalF1',
# min_data_in_leaf=params['min_data_in_leaf'].
# NOTE(review): bagging_temperature is filled from bagging_fraction and
# max_leaves from num_leaves - confirm these CatBoost options mean the same thing.
params_cat = {
    'loss_function': 'MultiClass',
    'eval_metric': 'AUC',
    'random_seed': 42,
    'depth': min(16, params['max_depth']),  # tree depth capped at 16
    'max_leaves': params['num_leaves'],
    'l2_leaf_reg': params['reg_lambda'],
    'bagging_temperature': params['bagging_fraction'],
}

In [None]:
# Params for XGBoost: subset of the tuned parameters, this time without
# min_child_weight (left disabled).
# NOTE(review): several keys (min_data_in_leaf, num_leaves, bagging_fraction,
# feature_fraction) are LightGBM-style names - confirm XGBoost honours them.
fx_keys = (
    'min_data_in_leaf',
    'num_leaves',
    'bagging_fraction',
    'feature_fraction',
    'reg_lambda',
    'reg_alpha',
    'max_depth',
)
params_fx = {key: params[key] for key in fx_keys}

In [None]:
tr_X