In [27]:
import pickle
import time

import lightgbm
import pandas as pd
from mlxtend.classifier import StackingClassifier
from mlxtend.feature_selection import ColumnSelector
from scipy.sparse import csr_matrix
# NOTE(review): sklearn.externals.joblib is deprecated in newer scikit-learn
# releases; switch to `import joblib` when the environment is upgraded.
from sklearn.externals import joblib
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from vecstack import stacking
from vecstack import StackingTransformer

import myslack

#### - train.csv

In [2]:
%%time
train = pickle.load(open("./data/my_train.pkl", "rb"))
print("test data shape: "+str(train.shape))

test data shape: (95674, 5166)
CPU times: user 188 ms, sys: 483 ms, total: 671 ms
Wall time: 716 ms


In [3]:
# Display the shape of the loaded training data as the cell output.
train.shape

(95674, 5166)

#### - test.csv

In [4]:
%%time
test = pickle.load(open("./data/my_test.pkl", "rb"))
print("test data shape: "+str(test.shape))

test data shape: (95674, 5165)
CPU times: user 204 ms, sys: 598 ms, total: 802 ms
Wall time: 929 ms


In [5]:
# Display the shape of the loaded test data as the cell output.
test.shape

(95674, 5165)

---

# Stacking (Ensemble)

# mlxtend

In [17]:
# Load the two previously saved estimators that the stacking cell below combines:
# the random forest is used as the base classifier and the LightGBM model as the
# meta-classifier.
# NOTE(review): joblib files are pickle-based; only load files from a trusted source.
best_RandomForest_model = joblib.load('./RandomForest/RandomForest_model.pkl') 
best_lgbm_model = joblib.load('./LightGBM/Best_lightgbm_est300_dep25.pkl')

### Train_X, Train_y

In [7]:
# Split the training data into the label column (y) and everything else (X).
y = train['TripType']
X = train.drop(columns=['TripType'])

In [8]:
# Convert the training features to a SciPy CSR sparse matrix; %time reports the cost.
%time csr_X = csr_matrix(X); csr_X

CPU times: user 9.8 s, sys: 1.59 s, total: 11.4 s
Wall time: 11.6 s


In [9]:
# Convert the test data to a SciPy CSR sparse matrix; %time reports the cost.
# NOTE(review): every column of `test` is used as a feature here — this assumes its
# columns line up one-to-one with the columns of X; confirm.
%time csr_test = csr_matrix(test); csr_test

CPU times: user 9.58 s, sys: 1.71 s, total: 11.3 s
Wall time: 11.4 s


In [34]:
# Stack a single base classifier (the loaded random forest) under the loaded
# LightGBM model, which acts as the meta-classifier.
# NOTE(review): use_probas is left at its default; per the mlxtend docs the default
# feeds predicted class labels (not probabilities) to the meta-classifier — confirm
# this is intended, since predict_proba output is what gets submitted later.
sclf = StackingClassifier(classifiers=[best_RandomForest_model], 
                          meta_classifier=best_lgbm_model,
                          verbose=2)

In [35]:
# Fit the stacking classifier on the full training matrix and labels (no rows are
# held out at this point); %time reports the cost.
%time sclf_model = sclf.fit(csr_X, y)

Fitting 1 classifiers...
Fitting classifier1: randomforestclassifier (1/1)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=80, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=6,
            oob_score=False, random_state=0, verbose=0, warm_start=False)
CPU times: user 17min 34s, sys: 55.3 s, total: 18min 29s
Wall time: 3min 52s


In [36]:
# Send a completion message through the project's myslack helper.
myslack.send_slack('best_RandomForest_model Stacking finish!!')

<Response [200]>


### Confusion Matrix (training data)

In [37]:
# Confusion matrix of the fitted model on csr_X / y — the same data it was fitted on,
# so this describes training fit, not generalization.
confusion_matrix(y, sclf_model.predict(csr_X))

  if diff:


array([[3596,    0,    0, ...,    0,    0,   46],
       [   0,  318,   26, ...,    0,    0,    0],
       [   0,    0, 4593, ...,    0,    0,    0],
       ...,
       [   0,    0,    0, ...,  872,    0,    0],
       [   0,    0,    0, ...,    0, 1187,    0],
       [ 250,    2,   24, ...,    0,    0, 7938]])

### Accuracy Score (training data)

In [38]:
# Accuracy of the fitted model on csr_X / y — the same data it was fitted on, so this
# is a training-set score. The prediction over csr_X is recomputed here.
accuracy_score(y, sclf_model.predict(csr_X))

  if diff:


0.9818236929573343

### Train Test Split

In [30]:
# Hold out 10% of the training matrix with a fixed seed.
# NOTE(review): sclf_model was fitted on all of csr_X / y, and this split is taken
# from that same data, so X_test / y_test are rows the model has already seen.
# Metrics computed on them are optimistic; to get a real hold-out estimate the split
# would have to happen before fitting, with the model fitted on X_train only.
X_train, X_test, y_train, y_test = train_test_split(csr_X, y, test_size=0.1, random_state=0)

# Row counts of each piece, shown as the cell output.
X_train.shape[0], X_test.shape[0], len(y_train), len(y_test)

(86106, 9568, 86106, 9568)

In [39]:
%%time
# Predict class labels for the held-out rows (the %%time magic must stay on the
# first line of the cell).
y_pred = sclf_model.predict(X_test)

CPU times: user 14.8 s, sys: 215 ms, total: 15 s
Wall time: 2.83 s


  if diff:


In [40]:
# Per-class precision / recall / f1 for the held-out predictions. print() is used
# because classification_report returns a preformatted multi-line string.
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          3       0.92      0.99      0.95       405
          4       1.00      0.94      0.97        35
          5       0.98      1.00      0.99       455
          6       1.00      1.00      1.00       143
          7       1.00      0.98      0.99       631
          8       0.97      1.00      0.98      1184
          9       0.91      0.99      0.95       909
         12       1.00      1.00      1.00        23
         15       1.00      0.98      0.99        86
         18       1.00      0.91      0.96        47
         19       1.00      0.81      0.89        31
         20       1.00      0.94      0.97        71
         21       1.00      0.95      0.97        60
         22       1.00      0.93      0.97        90
         23       1.00      0.89      0.94        19
         24       1.00      0.98      0.99       247
         25       1.00      0.98      0.99       382
         26       1.00      0.91      0.95   

## 모델 저장

In [41]:
# Persist the fitted stacking model, then reload it from disk so the prediction
# cells below run against exactly what was saved.
# joblib is already imported in the notebook's import cell, so the duplicate
# mid-notebook import that used to sit here has been dropped.
joblib.dump(sclf_model, './Stacking/RF_lgbm_sclf_model_2.pkl')
my_sclf_model = joblib.load('./Stacking/RF_lgbm_sclf_model_2.pkl')

---

# Predict

In [42]:
# Class-probability predictions for the test matrix, using the model reloaded from
# disk; %time reports the cost.
%time result = my_sclf_model.predict_proba(csr_test)

CPU times: user 3min 26s, sys: 6.26 s, total: 3min 32s
Wall time: 42.5 s


### Make Submission CSV file

In [43]:
# Build the submission frame: one probability column per class, indexed by VisitNumber.
# Requires pandas as `pd` (imported in the notebook's import cell).
samsub = pd.read_csv('./data/sample_submission.csv')
# Skip the first sample-submission column; the remaining ones name the class columns.
subform_df_columns = samsub.columns[1:]
# NOTE(review): this assumes predict_proba's column order matches the
# sample-submission column order — confirm against the fitted model's classes_.
result_df = pd.DataFrame(result, columns=subform_df_columns)
# Attach VisitNumber positionally and make it the index, without in-place mutation
# so the cell builds a fresh frame each time it runs.
subform_df = (
    pd.concat([test.reset_index()['VisitNumber'], result_df], axis=1)
    .set_index('VisitNumber')
)
subform_df.tail()

Unnamed: 0_level_0,TripType_3,TripType_4,TripType_5,TripType_6,TripType_7,TripType_8,TripType_9,TripType_12,TripType_14,TripType_15,...,TripType_36,TripType_37,TripType_38,TripType_39,TripType_40,TripType_41,TripType_42,TripType_43,TripType_44,TripType_999
VisitNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
191338,1.996101e-09,3.478235e-10,2.218397e-10,3.478789e-10,1.904548e-09,1.905029e-09,3.648428e-09,1.046096e-09,4.410683e-10,1.245966e-09,...,4.253862e-10,3.840913e-10,1.90409e-09,1.0,2.217195e-10,1.461902e-09,1.345084e-09,2.007288e-10,2.007288e-10,4.234061e-09
191339,1.410461e-08,2.457749e-09,1.567537e-09,2.458141e-09,2.109e-08,2.103141e-08,7.372441e-08,7.3918e-09,3.116624e-09,8.804098e-09,...,4.549132e-09,1.323542e-09,3.155806e-09,1.560245e-09,1.566689e-09,3.44067e-09,3.590005e-09,1.418366e-09,1.418366e-09,1.658811e-07
191340,2.922299e-08,0.0001596738,1.25033e-08,0.0004791635,0.00750743,0.9681338,0.001757021,2.166631e-08,1.188427e-08,0.0008784932,...,0.004153027,0.002316086,0.0007986257,0.00063889,7.982629e-05,2.166403e-08,0.0002395633,1.131346e-08,1.131346e-08,0.00519128
191341,1.411134e-08,2.458922e-09,1.568285e-09,2.459313e-09,2.110005e-08,2.104144e-08,2.579239e-08,7.395328e-09,3.118112e-09,8.808299e-09,...,1.554691e-08,1.324173e-09,3.157311e-09,1.560989e-09,1.567436e-09,3.442311e-09,3.591717e-09,1.419043e-09,1.419043e-09,0.0005078661
191348,1.812389e-09,3.158113e-10,2.014225e-10,3.158616e-10,2.709983e-09,2.702455e-09,9.473303e-09,9.498178e-10,4.004742e-10,1.131293e-09,...,5.845459e-10,1.700701e-10,4.055089e-10,2.004854e-10,2.013134e-10,4.421129e-10,4.613017e-10,1.822546e-10,1.822546e-10,2.131508e-08


In [44]:
# Write the submission frame to CSV (to_csv writes the index by default, so the
# VisitNumber index is included).
subform_df.to_csv('./Stacking/RF_lgbm_sclf_model_2.csv')

In [45]:
# Send a completion message through the project's myslack helper.
myslack.send_slack('Making RF_lgbm_sclf_model CSV finish!!')

<Response [200]>
