In [None]:
import pandas as pd
import numpy as np

import gc
import os



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score


In [None]:
# Read the three CSV inputs in one pass: the training rows (which include the
# 'type' label), the test rows, and the submission template that is filled
# in at the end of the notebook.
tr, te, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/train.csv',
        './data/test.csv',
        './data/sample_submission.csv',
    )
)

# Independent copy of the training frame as loaded.
trr = tr.copy()

In [None]:
# Label column to be predicted.
target = tr['type']

# Feature matrices: the training frame loses the row id, the label and
# fiberID; the test frame loses the row id and fiberID.
train_X = tr.drop(columns=['id', 'type', 'fiberID'])
test_X = te.drop(columns=['id', 'fiberID'])

In [None]:
# Encode the class labels as integers. The fitted encoder is kept around so
# the original class names (target_lbe.classes_) remain available.
target_lbe = LabelEncoder()
target_lbe.fit(target)

t = target_lbe.transform(target)
# To map the integers back to class names: target_lbe.inverse_transform(t)

In [None]:
# Standardise every feature column to zero mean / unit variance.
#
# The statistics are computed once, from the training frame only, and the
# same values are applied to the test frame. Scaling the test frame with
# its own mean/std would put train and test in different feature spaces,
# so a model fitted on one would be evaluated on differently scaled inputs.
#
# DataFrame.mean / DataFrame.std are called with an explicit axis=0 because
# np.mean(DataFrame) forwards axis=None, which recent pandas versions reduce
# over both axes (a single scalar instead of one value per column).
# ddof=0 keeps the population standard deviation that np.std uses.
feature_mean = train_X.mean(axis=0)
feature_std = train_X.std(axis=0, ddof=0)

# A constant column has std == 0; divide by 1 instead so it becomes all
# zeros rather than NaN/inf.
feature_std = feature_std.replace(0, 1.0)

train_X = (train_X - feature_mean) / feature_std
test_X = (test_X - feature_mean) / feature_std

In [None]:
# Candidate models.
#
# Reproducibility: the global NumPy seed is still set, but every estimator
# that draws random numbers also receives an explicit random_state. The
# ensembles below run with n_jobs=-1, and an estimator left at
# random_state=None inside a worker process is not governed by the seed set
# here, so relying on np.random.seed alone gives run-to-run differences.
SEED = 42
np.random.seed(SEED)

# --- Single estimators -----------------------------------------------------
lr = LogisticRegression(max_iter=2000)
knn = KNeighborsClassifier(n_neighbors=100)
# probability=True is required for predict_proba / log-loss scoring.
svc = SVC(kernel='rbf', probability=True, random_state=SEED)
tree = DecisionTreeClassifier(max_depth=19, random_state=SEED)
extree = ExtraTreeClassifier(max_depth=19, random_state=SEED)

# --- Tree ensembles --------------------------------------------------------
rf = RandomForestClassifier(n_estimators=200,
                            max_depth=13,
                            min_samples_split=5,
                            min_samples_leaf=5,
                            min_impurity_decrease=0.001,
                            max_features=None,
                            oob_score=True,
                            random_state=SEED)

gbr = GradientBoostingClassifier(n_estimators=1000,
                                 learning_rate=0.01,
                                 max_depth=9,
                                 max_features='sqrt',
                                 min_samples_leaf=15,
                                 min_samples_split=10,
                                 # stop early once 100 rounds bring no improvement
                                 n_iter_no_change=100,
                                 random_state=SEED)

# --- Meta-ensembles --------------------------------------------------------
# Soft voting averages the members' predicted probabilities.
# (extree is deliberately left out of the vote.)
vclf = VotingClassifier(
    estimators=[
        ('rf', rf),
        ('gbr', gbr),
        ('lr', lr),
        ('knn', knn),
    ],
    n_jobs=-1,
    voting='soft',
)

# Stacking: the random forest acts as the final estimator on top of the base
# learners, so it is not listed among the base learners themselves.
stk_clf = StackingClassifier(
    estimators=[
        ('gbr', gbr),
        ('lr', lr),
        ('extree', extree),
        ('knn', knn),
    ],
    n_jobs=-1,
    final_estimator=rf,
)

# Models that get cross-validated; the stacking ensemble comes last.
models = [lr, extree, rf, gbr, vclf, stk_clf]

In [None]:
# 4-fold cross-validation of every candidate model. One number is printed
# per model: the mean negative log-loss over the folds (closer to 0 is better).
# `m` stays bound to the last entry of `models` once the loop has finished.
for m in models:
    fold_scores = cross_val_score(m, train_X, t, scoring='neg_log_loss', cv=4)
    print(fold_scores.mean())

In [None]:
# Fill the submission template with per-class probabilities.
#
# cross_val_score only fits clones of each estimator, so the objects in
# `models` are still unfitted at this point and must be trained on the full
# training set before predicting. The submission has one column per class,
# which calls for predict_proba(); predict() would return a single label
# per row and cannot populate those columns.
final_model = models[-1]  # last model in the list (the stacking ensemble)
final_model.fit(train_X, t)

# predict_proba columns follow final_model.classes_ (the integer-encoded
# labels); translate them back to the original class names.
kk = pd.DataFrame(
    final_model.predict_proba(test_X),
    columns=target_lbe.inverse_transform(final_model.classes_),
)

# Every column after the first is a class column of the template; pick the
# same columns, in the same order, from the predictions. Assigning the raw
# array avoids any dependence on the two frames sharing an index.
sub[sub.columns[1:]] = kk[sub.columns[1:]].to_numpy()

In [None]:
# Write the submission file. to_csv does not create missing directories,
# so make sure the output folder exists first (no-op when it already does).
os.makedirs('./sub', exist_ok=True)
sub.to_csv('./sub/rf1.csv', index=False)