In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import gc
import os
import pickle
import warnings

warnings.filterwarnings('ignore')


In [2]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import log_loss, accuracy_score, f1_score
from sklearn.decomposition import PCA

from pystacknet.pystacknet import StackNetClassifier



In [3]:
import xgboost as xgb
import lightgbm as lgb
from bayes_opt import BayesianOptimization


In [4]:
# Read the three input tables from ./data (paths are relative to the
# notebook's working directory).
tr, te = pd.read_csv('./data/train.csv'), pd.read_csv('./data/test.csv')
sub = pd.read_csv('./data/sample_submission.csv')

In [5]:
# Label column of the training table.
target = tr['type']

# Feature frames: everything except the row identifier (and, for the
# training table, the label).
train_X = tr.drop(columns=['id', 'type'])
test_X = te.drop(columns=['id'])


In [6]:
# Keep the fitted encoder around: target_lbe.inverse_transform(t) maps the
# integer codes back to the original labels.
target_lbe = LabelEncoder()
t = target_lbe.fit_transform(target)

In [7]:
# Tag each row with its origin (1.0 = train, 2.0 = test) so the two frames
# can be separated again after being processed together.
train_X['index'] = np.full(len(train_X), 1.0)
test_X['index'] = np.full(len(test_X), 2.0)

# Stack train on top of test with a fresh 0..n-1 row index.
merge = pd.concat([train_X, test_X], ignore_index=True)

# Every column except the first one and the trailing 'index' marker.
k = train_X.columns[1:-1]

In [8]:
# One-hot encode fiberID on the combined frame so train and test end up with
# the same set of indicator columns; the raw column is dropped.
fiber_dummies = pd.get_dummies(merge['fiberID'], prefix='fiberID')
merge = pd.concat([merge.drop(columns='fiberID'), fiber_dummies], axis=1)


In [9]:
# Undo the concatenation using the origin marker (1 = train, 2 = test) and
# discard the marker column afterwards.
train_X = merge.loc[merge['index'].eq(1)].drop(columns='index')

# Test rows sit after the train rows in `merge`, so renumber them from 0.
test_X = (
    merge.loc[merge['index'].eq(2)]
    .drop(columns='index')
    .reset_index(drop=True)
)

In [10]:
# Standardise the columns selected in `k` to zero mean / unit variance.
#
# The statistics are computed on the TRAINING rows only and then applied to
# both frames. Scaling the test rows with their own mean/std would map the
# same raw value to different scaled values in train and test, so a model
# fitted on one would see shifted inputs on the other.
#
# `.mean(axis=0)` / `.std(axis=0, ddof=0)` are spelled out explicitly:
# np.mean(DataFrame) / np.std(DataFrame) go through an axis=None dispatch
# whose result (per-column Series vs. a single scalar) depends on the pandas
# version. ddof=0 matches what np.std computes.
feat_mean = train_X[k].mean(axis=0)
feat_std = train_X[k].std(axis=0, ddof=0)
# A constant column has std 0; divide by 1 instead so it becomes all zeros
# rather than NaN/inf.
feat_std = feat_std.replace(0, 1)

tr_X = (train_X[k] - feat_mean) / feat_std
te_X = (test_X[k] - feat_mean) / feat_std

# Write the scaled values back into the full feature frames.
train_X[k] = tr_X
test_X[k] = te_X

# From here on tr_X / te_X are independent copies of the complete frames
# (scaled `k` columns plus every other column).
te_X = test_X.copy()
tr_X = train_X.copy()

In [20]:
# Project both feature matrices onto the first 20 principal components.
#
# The PCA is fitted ONCE, on the training matrix, and the same fitted
# projection is applied to the test matrix. Fitting a second PCA on the test
# set would produce components in a different basis (different directions,
# order and sign), so column i of te_pca would not mean the same thing as
# column i of tr_pca and a model trained on tr_pca could not score te_pca.
#
# random_state is pinned because PCA's automatic solver selection may pick
# the randomized SVD solver.
pca = PCA(20, random_state=42)
tr_pca = pd.DataFrame(pca.fit_transform(tr_X))
te_pca = pd.DataFrame(pca.transform(te_X))

In [21]:
# Candidate classifiers and the two ensembles built from them.
#
# Every estimator that draws random numbers gets an explicit random_state.
# Seeding the global NumPy generator alone is not enough: the ensembles below
# run with n_jobs=-1, and estimators cloned into worker processes do not
# share this process's generator state.
SEED = 42
np.random.seed(SEED)

lr = LogisticRegression(max_iter=500)
knn = KNeighborsClassifier(n_neighbors=100)
# probability=True makes SVC run an internal cross-validation to calibrate
# probabilities, which is where its random_state is used.
svc = SVC(kernel='rbf', probability=True, random_state=SEED)
tree = DecisionTreeClassifier(max_depth=19, random_state=SEED)
extree = ExtraTreeClassifier(max_depth=19, random_state=SEED)

rf = RandomForestClassifier(n_estimators=200,
                            max_depth=13,
                            min_samples_split=5,
                            min_samples_leaf=5,
                            min_impurity_decrease=0.001,
                            max_features=None,
                            oob_score=True,
                            random_state=SEED)

gbr = GradientBoostingClassifier(n_estimators=1000,
                                 learning_rate=0.01,
                                 max_depth=9,
                                 max_features='sqrt',
                                 min_samples_leaf=15,
                                 min_samples_split=10,
                                 # stop early once 100 consecutive rounds
                                 # bring no improvement
                                 n_iter_no_change=100,
                                 random_state=SEED)

# Soft voting: averages the predicted class probabilities of the members.
vclf = VotingClassifier(
    estimators=[
        ('rf', rf),
        ('gbr', gbr),
        ('lr', lr),
        ('knn', knn),
    ],
    n_jobs=-1,
    voting='soft',
)

# Stacking: rf is not a base learner here; it is the final estimator that is
# trained on the base learners' outputs.
stk_clf = StackingClassifier(
    estimators=[
        ('gbr', gbr),
        ('lr', lr),
        ('extree', extree),
        ('knn', knn),
    ],
    n_jobs=-1,
    final_estimator=rf,
)

# Models evaluated by the cross-validation loop.
models = [lr, extree, rf, gbr, vclf, stk_clf]

In [None]:
# 4-fold cross-validated negative log-loss on the PCA features, one line per
# model in `models` order (values closer to 0 are better).
for m in models:
    fold_scores = cross_val_score(m, tr_pca, t, scoring='neg_log_loss', cv=4)
    print(fold_scores.mean())

-1.3218680872916009
-2.012617744788759
