In [None]:
import pandas as pd
import numpy as np

import gc
import os
import pickle
import seaborn as sns

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import log_loss, accuracy_score, f1_score
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from pystacknet.pystacknet import StackNetClassifier



In [None]:
# Read the labelled training rows and the unlabelled test rows.
tr, te = (pd.read_csv(csv_path) for csv_path in ('./data/train.csv', './data/test.csv'))

# Sample submission, indexed by its first column. Its columns are used further
# down to number the classes and to label the prediction output.
sub = pd.read_csv('./data/sample_submission.csv', index_col=0)

In [None]:
# Class labels are numbered by their position among the sample-submission
# columns (instead of using a LabelEncoder), which keeps the integer codes in
# the same order as the submission columns.
column_number = dict(zip(sub.columns, range(len(sub.columns))))


def to_number(x, dic):
    """Return the value stored under key `x` in the mapping `dic`."""
    return dic[x]


# Adds the integer-coded label as a new column on the training frame.
tr['type_num'] = tr['type'].apply(to_number, args=(column_number,))

target = tr['type_num']
t = target.copy()

# Feature frames: the identifier and both label columns are removed.
train_X = tr.drop(columns=['id', 'type', 'type_num'])
test_X = te.drop(columns=['id'])



In [None]:
# Tag every row with its origin (1.0 = train, 2.0 = test) so both frames can be
# stacked into one table and separated again afterwards.
train_X['index'] = np.full(len(train_X), 1.0)
test_X['index'] = np.full(len(test_X), 2.0)

merge = pd.concat([train_X, test_X], ignore_index=True)

# Columns that get standardised later: everything except the first column and
# the trailing 'index' marker.
# NOTE(review): this is positional - it presumably relies on 'fiberID' being
# the first feature column; confirm against the CSV layout.
k = train_X.columns[1:-1]

In [None]:
# One-hot encode 'fiberID' on the stacked train+test table so both parts end up
# with the same dummy columns; the raw column is replaced by its indicators.
fiber_dummies = pd.get_dummies(merge['fiberID'], prefix='fiberID')
merge = pd.concat([merge.drop(columns='fiberID'), fiber_dummies], axis=1)


In [None]:
# Undo the stacking: select rows by their origin marker and drop the marker.
from_train = merge['index'] == 1
from_test = merge['index'] == 2

train_X = merge.loc[from_train].drop(columns='index')
# Test rows sit after the training rows in `merge`, so renumber them from 0.
test_X = merge.loc[from_test].drop(columns='index').reset_index(drop=True)

In [None]:
# Standardise the columns in `k` to zero mean / unit variance.
#
# The statistics are computed on the training rows only and then applied to
# BOTH frames. Scaling the test set with its own mean/std would put the two
# matrices on different scales, so a model fitted on `train_X` would see
# shifted inputs at prediction time.
#
# `axis=0` is spelled out because `np.mean(DataFrame)` / `np.std(DataFrame)`
# forward `axis=None` to pandas, and recent pandas versions reduce over the
# whole frame (a single scalar) for that case. `ddof=0` keeps the population
# standard deviation that `np.std` used.
feature_mean = train_X[k].mean(axis=0)
feature_std = train_X[k].std(axis=0, ddof=0)

# A constant column has std 0; divide by 1 instead so it becomes all zeros
# rather than NaN/inf.
feature_std = feature_std.replace(0, 1)

tr_X = (train_X[k] - feature_mean) / feature_std
te_X = (test_X[k] - feature_mean) / feature_std

train_X[k] = tr_X
test_X[k] = te_X

In [None]:
# Seed NumPy's global RNG before any estimator is built.
np.random.seed(42)

# --- Single-model baselines -------------------------------------------------
lr = LogisticRegression(max_iter=500)
knn = KNeighborsClassifier(n_neighbors=100)
svc = SVC(kernel='rbf', probability=True)
tree = DecisionTreeClassifier(max_depth=19)
extree = ExtraTreeClassifier(max_depth=19)

# --- Tree ensembles ---------------------------------------------------------
rf_params = {
    'n_estimators': 1200,
    'max_depth': 13,
    'min_samples_split': 5,
    'min_samples_leaf': 5,
    'min_impurity_decrease': 0.001,
    'max_features': None,
    'oob_score': True,
    'random_state': 42,
}
rf = RandomForestClassifier(**rf_params)

# Many small boosting steps; `n_iter_no_change` allows training to stop before
# all rounds are used. The loss is left at its default.
gbr_params = {
    'n_estimators': 6000,
    'learning_rate': 0.005,
    'max_depth': 9,
    'max_features': 'sqrt',
    'min_samples_leaf': 15,
    'min_samples_split': 10,
    'n_iter_no_change': 100,
    'random_state': 42,
}
gbr = GradientBoostingClassifier(**gbr_params)

# --- Meta-ensembles ---------------------------------------------------------
# Soft voting over the forest, the boosted trees, logistic regression and kNN.
vclf = VotingClassifier(
    estimators=[('rf', rf), ('gbr', gbr), ('lr', lr), ('knn', knn)],
    voting='soft',
    n_jobs=-1,
)

# Stacking over the forest, logistic regression and the extra tree, with the
# same forest configuration as the final estimator.
stk_clf = StackingClassifier(
    estimators=[('rf', rf), ('lr', lr), ('extree', extree)],
    final_estimator=rf,
    n_jobs=-1,
)

# Estimators to be compared with cross-validation.
models = [lr, extree, rf, gbr, vclf, stk_clf]

In [None]:
%%time
# Smaller forest used inside the StackNet.
# NOTE: this rebinds `rf`; the estimator list and ensembles built in the
# earlier cell keep their reference to the original, larger forest.
rf = RandomForestClassifier(n_estimators=150,
                            max_depth=9,
                            max_features='sqrt',
                            random_state=42)

pca = PCA(100)

# StackNet level layout: first level holds the PCA and the forest, second level
# holds the forest alone.
# This is deliberately NOT called `models`: that name holds the flat list of
# estimators the cross-validation cell iterates over, and overwriting it with
# this nested list makes `cross_val_score` receive plain Python lists when the
# notebook is run top to bottom.
stacknet_levels = [[pca, rf],
                   [rf]]

model = StackNetClassifier(stacknet_levels,
                           metric="logloss",
                           folds=2,
                           restacking=False,
                           use_retraining=True,
                           use_proba=True,  # To use predict_proba after training
                           random_state=42,
                           n_jobs=-1,
                           verbose=1)

model.fit(train_X, t)

In [None]:
# Class probabilities for the test rows, labelled with the sample submission's
# columns and index (the targets were numbered in that same column order).
y_pred = model.predict_proba(test_X)
submission = pd.DataFrame(data=y_pred, columns=sub.columns, index=sub.index)

# Create the output directory first: `to_csv` does not create missing parent
# directories, and failing here would throw away the result of the long fit.
submission_path = './sub/stk_sample3.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=True)


In [None]:
# Print the mean 4-fold cross-validated negative log-loss of every estimator
# in `models`.
for estimator in models:
    fold_scores = cross_val_score(estimator, train_X, t, cv=4, scoring='neg_log_loss')
    print(fold_scores.mean())