In [1]:
import pandas as pd
import numpy as np

import gc
import os



In [6]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Single-tree estimators used as base learners further down (`tree`, `extree`).
# A trailing comma is only legal in a parenthesised import list, so none is used here.
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier, StackingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import log_loss, accuracy_score, f1_score
from sklearn.decomposition import PCA

from pystacknet.pystacknet import StackNetClassifier





In [3]:
# Training and test tables, read from the local ./data directory.
tr, te = (pd.read_csv(csv_path) for csv_path in ('./data/train.csv', './data/test.csv'))

# Sample submission; its first column is used as the row index, so
# `sub.columns` holds only the remaining columns.
sub = pd.read_csv('./data/sample_submission.csv', index_col=0)

In [4]:
# Label -> integer id lookup. The id of a label is the position of the
# identically named column in the sample submission (`sub.columns`).
column_number = {column: i for i, column in enumerate(sub.columns)}
    
def to_number(x, dic):
    """Return the value stored under key ``x`` in the mapping ``dic``.

    Parameters
    ----------
    x : hashable
        Key to look up.
    dic : mapping
        Lookup table, indexed with ``dic[x]``.

    Returns
    -------
    The value ``dic[x]``.

    Raises
    ------
    Whatever ``dic[x]`` raises for an unknown key (``KeyError`` for a plain dict).
    """
    number = dic[x]
    return number

# Encode every `type` label as its integer id via to_number/column_number
# and store the result as a new `type_num` column on `tr`.
tr['type_num'] = tr['type'].apply(to_number, args=(column_number,))



In [5]:
# Labels: the integer class ids. `t` is a separate copy that the modelling
# cells below pass to the estimators.
target = tr['type_num']
t = target.copy()

# Features: drop the `id` / `fiberID` columns and, for the training table,
# both label columns.
train_X = tr.drop(columns=['id', 'type', 'type_num', 'fiberID'])
test_X = te.drop(columns=['id', 'fiberID'])




In [None]:
# Standardise every feature column to zero mean / unit variance.
#
# The statistics come from the training set only and are reused for the test
# set, so both frames go through the same transformation. Scaling the test set
# with its own mean/std would shift its features relative to what the models
# were fitted on.
#
# axis=0 is passed explicitly so the reduction is always per column, whatever
# np.mean / np.std would do when handed a DataFrame. ddof=0 keeps the
# population standard deviation that np.std uses by default.
# Assumes test_X has the same feature columns as train_X (subtraction aligns
# on column labels) - TODO confirm.
feature_mean = train_X.mean(axis=0)
feature_std = train_X.std(axis=0, ddof=0)

train_X = (train_X - feature_mean) / feature_std
test_X = (test_X - feature_mean) / feature_std

In [7]:
# Global NumPy seed, kept for any code that still draws from np.random.
np.random.seed(42)

# Individual candidate estimators.
# Every estimator that draws random numbers gets an explicit random_state=42,
# matching `rf` and `gbr` below. The global seed above is not a reliable seed
# for estimators that are cloned and fitted in parallel workers (n_jobs=-1 is
# used by the ensembles and the StackNet later on).
lr = LogisticRegression(max_iter=2000)
knn = KNeighborsClassifier(n_neighbors=100)
svc = SVC(kernel='rbf', probability=True, random_state=42)
tree = DecisionTreeClassifier(max_depth=13, random_state=42)
extree = ExtraTreeClassifier(max_depth=13, random_state=42)

# Random forest with a fixed seed. It is reused below as a base learner of
# both ensembles and as the final estimator of the stacking classifier.
rf = RandomForestClassifier(
    n_estimators=200,
    max_features=None,
    max_depth=13,
    min_samples_leaf=5,
    min_samples_split=5,
    min_impurity_decrease=0.001,
    oob_score=True,
    random_state=42,
)

# Gradient boosting: many estimators with a small learning rate and a fixed
# seed. The loss is left at the library default.
gbr = GradientBoostingClassifier(
    learning_rate=0.01,
    n_estimators=1000,
    n_iter_no_change=100,
    max_depth=9,
    max_features='sqrt',
    min_samples_split=10,
    min_samples_leaf=15,
    random_state=42,
)

# Both ensembles are built from the same three tree-based learners. Each one
# receives its own copy of the list, as in the original two literals.
base_estimators = [('rf', rf), ('extree', extree), ('tree', tree)]

# Soft-voting ensemble over the base learners.
vclf = VotingClassifier(
    estimators=list(base_estimators),
    voting='soft',
    n_jobs=-1,
)

# Stacked ensemble over the same base learners, with `rf` as final estimator.
stk_clf = StackingClassifier(
    estimators=list(base_estimators),
    final_estimator=rf,
    n_jobs=-1,
)

# PCA keeping 12 components; it is placed in the first StackNet level later on.
pca = PCA(n_components=12)

# Candidates scored by the cross-validation loop that follows.
# `svc`, `knn`, `tree` and `pca` are defined above but not part of this list.
models = [
    lr,
    extree,
    rf,
    gbr,
    vclf,
    stk_clf,
]

In [None]:
# Print the mean 4-fold cross-validated negative log loss of every candidate
# in `models`, one value per line, in list order.
for estimator in models:
    fold_scores = cross_val_score(estimator, train_X, t, scoring='neg_log_loss', cv=4)
    print(fold_scores.mean())

In [8]:
# Level layout handed to StackNetClassifier: one inner list per level.
# NOTE: this rebinds `models`, which previously held the flat list of
# cross-validation candidates.
models = [
    [rf, extree, tree, pca],  # first level: three classifiers plus PCA
    [rf],                     # second level: a single random forest
]

In [10]:
# Build the StackNet from the level layout in `models` and fit it on the
# training features / integer labels. The fit is slow (the recorded run
# reports roughly 4174 seconds).
stacknet_options = dict(
    metric="logloss",
    folds=3,
    random_state=42,
    restacking=False,
    use_retraining=True,
    # Original author's note: "To use predict_proba after training".
    # NOTE(review): confirm against the pystacknet documentation.
    use_proba=True,
    n_jobs=-1,
    verbose=1,
)
model = StackNetClassifier(models, **stacknet_options)

model.fit(train_X, t)

Input Dimensionality 20 at Level 0 
4 models included in Level 0 
Fold 1/3 , model 0 , logloss===0.620507 
Fold 1/3 , model 1 , logloss===1.832116 
Fold 1/3 , model 2 , logloss===1.702189 
Fold 2/3 , model 0 , logloss===0.621523 
Fold 2/3 , model 1 , logloss===1.629268 
Fold 2/3 , model 2 , logloss===1.609244 
Fold 3/3 , model 0 , logloss===0.622132 
Fold 3/3 , model 1 , logloss===1.603872 
Fold 3/3 , model 2 , logloss===1.711076 
Level 0, model 0 , logloss===0.621387 
Level 0, model 1 , logloss===1.688418 
Level 0, model 2 , logloss===1.674170 
Output dimensionality of level 0 is 69 
 level 0 lasted 2008.674325 seconds 
Input Dimensionality 69 at Level 1 
1 models included in Level 1 
Fold 1/3 , model 0 , logloss===0.596662 
Fold 2/3 , model 0 , logloss===0.574865 
Fold 3/3 , model 0 , logloss===0.586158 
Level 1, model 0 , logloss===0.585895 
Output dimensionality of level 1 is 19 
 level 1 lasted 2165.562702 seconds 
 fit() lasted 4174.253491 seconds 


In [12]:
# Predict class probabilities for the test features and shape them like the
# sample submission (same columns, same row index).
# Assumes the rows of test_X are in the same order as `sub.index` and that the
# probability columns follow the order of `sub.columns` - TODO confirm.
y_pred = model.predict_proba(test_X)
submission = pd.DataFrame(data=y_pred, columns=sub.columns, index=sub.index)

# Create the output directory first so that a missing ./sub does not make the
# write fail after the long training run.
os.makedirs('./sub', exist_ok=True)
submission.to_csv('./sub/sample1.csv', index=True)


1 estimators included in Level 0 
1 estimators included in Level 1 


In [None]:
# Write `sub` to ./sub/rf1.csv.
# `sub` was loaded with index_col=0, so its first column lives in the index.
# The index is therefore written out (index=True, consistent with the other
# submission file); index=False would drop that column from the CSV.
# NOTE(review): no visible cell modifies `sub`, so as written this saves the
# sample-submission values, not model predictions - confirm what was intended.
os.makedirs('./sub', exist_ok=True)
sub.to_csv('./sub/rf1.csv', index=True)