In [9]:
import os

import pandas as pd
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier, StackingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import log_loss, accuracy_score, f1_score
from sklearn.decomposition import PCA

from pystacknet.pystacknet import StackNetClassifier



In [10]:
# Read the train and test tables from the local data folder.
tr, te = (pd.read_csv(path) for path in ('./data/train.csv', './data/test.csv'))

# Sample submission, with its first column used as the row index; its
# columns and index are reused later to lay out the prediction file.
sub = pd.read_csv('./data/sample_submission.csv', index_col=0)

In [11]:
# target_lbe = LabelEncoder().fit(target)

# t = target_lbe.transform(target)

# Map every sample-submission column name to its positional index, so that
# class code k corresponds to the k-th submission column.
column_number = {name: position for position, name in enumerate(sub.columns)}
    
def to_number(x, dic):
    """Look up ``x`` in ``dic`` and return the value stored for it.

    Parameters
    ----------
    x : hashable
        Key to translate (used below for the class label of one row).
    dic : mapping
        Lookup table from keys to numbers.

    Returns
    -------
    The value ``dic[x]``.

    Raises
    ------
    KeyError
        If ``x`` is not a key of ``dic``.
    """
    number = dic[x]
    return number

# Encode each row's 'type' label as the index of the matching submission
# column; a label missing from column_number raises KeyError.
tr['type_num'] = tr['type'].map(lambda label: column_number[label])



In [12]:
# Integer class codes used as the training labels, plus an independent copy.
target = tr['type_num']
t = tr['type_num'].copy()

# Feature matrices: remove both label columns and the 'id' / 'fiberID'
# columns from train, and 'id' / 'fiberID' from test.
# NOTE(review): presumably 'id' and 'fiberID' are identifiers rather than
# predictive features — confirm.
train_X = tr.drop(columns=['id', 'type', 'type_num', 'fiberID'])
test_X = te.drop(columns=['id', 'fiberID'])


In [13]:
# Level-0 base learner: one extremely randomized tree.
# Seeded (like rf below) so that repeated runs build the same tree; without a
# random_state its random splits change on every run.
# NOTE(review): this is sklearn.tree.ExtraTreeClassifier (a single tree), and
# the logloss recorded for this model is far worse than the forest's — confirm
# that the ensemble ExtraTreesClassifier was not the intended estimator.
extree = ExtraTreeClassifier(max_depth=13, random_state=42)
rf = RandomForestClassifier(n_estimators=200,
                            max_depth=9,
                            max_features='sqrt',
                            random_state=42)


# NOTE(review): pca is not referenced by any cell visible here — confirm it
# is still needed.
pca = PCA(12)

# StackNet architecture: one inner list of estimators per level.
models = [
    [rf, extree],  # level 0
    [rf]           # level 1
]

In [15]:
# Keyword settings for the stacking ensemble, gathered in one place.
stacknet_options = dict(
    metric="logloss",
    folds=4,
    restacking=False,
    use_retraining=True,
    use_proba=True,  # To use predict_proba after training
    random_state=12,
    n_jobs=-1,
    verbose=1,
)
clf = StackNetClassifier(models, **stacknet_options)

# Fit the stack on the training features and the integer class codes.
clf.fit(train_X, target)

Input Dimensionality 20 at Level 0 
2 models included in Level 0 
Fold 1/4 , model 0 , logloss===0.498884 
Fold 1/4 , model 1 , logloss===1.416549 
Fold 2/4 , model 0 , logloss===0.495275 
Fold 2/4 , model 1 , logloss===2.200374 
Fold 3/4 , model 0 , logloss===0.499327 
Fold 3/4 , model 1 , logloss===1.709932 
Fold 4/4 , model 0 , logloss===0.498121 
Fold 4/4 , model 1 , logloss===2.064159 
Level 0, model 0 , logloss===0.497902 
Level 0, model 1 , logloss===1.847753 
Output dimensionality of level 0 is 38 
 level 0 lasted 555.795955 seconds 
Input Dimensionality 38 at Level 1 
1 models included in Level 1 
Fold 1/4 , model 0 , logloss===0.507772 
Fold 2/4 , model 0 , logloss===0.518050 
Fold 3/4 , model 0 , logloss===0.541809 
Fold 4/4 , model 0 , logloss===0.511977 
Level 1, model 0 , logloss===0.519902 
Output dimensionality of level 1 is 19 
 level 1 lasted 321.565938 seconds 
 fit() lasted 877.378392 seconds 


In [17]:
# Predict class probabilities for the test features with the fitted StackNet.
# NOTE(review): presumably one column per class, ordered by the integer codes
# in 'type_num' — confirm against pystacknet before relying on column order.
y_pred = clf.predict_proba(test_X)

1 estimators included in Level 0 
1 estimators included in Level 1 


In [18]:
# Wrap the predicted probabilities in the sample-submission layout: columns
# named like sub.columns and rows indexed like sub.index.
# NOTE(review): assumes y_pred's columns follow the same class order as
# sub.columns and its rows the same order as sub.index — confirm.
submission = pd.DataFrame(data=y_pred, columns=sub.columns, index=sub.index)

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists; otherwise the write fails only after the long fit.
os.makedirs('./sub', exist_ok=True)
submission.to_csv('./sub/sample2.csv', index=True)
