In [23]:
import pickle
import time

import lightgbm
import pandas as pd
from mlxtend.classifier import StackingClassifier
from mlxtend.feature_selection import ColumnSelector
from scipy.sparse import csr_matrix
# NOTE: sklearn.externals.joblib was removed in scikit-learn 0.23; on newer
# versions install joblib and use `import joblib` instead.
from sklearn.externals import joblib
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from vecstack import stacking
from vecstack import StackingTransformer

import myslack

#### - train.csv

In [2]:
%%time
train = pickle.load(open("./data/my_train.pkl", "rb"))
print("test data shape: "+str(train.shape))

test data shape: (95674, 5166)
CPU times: user 168 ms, sys: 463 ms, total: 631 ms
Wall time: 636 ms


In [3]:
# Echo the shape tuple of the loaded training frame.
train.shape

(95674, 5166)

#### - test.csv

In [4]:
%%time
test = pickle.load(open("./data/my_test.pkl", "rb"))
print("test data shape: "+str(test.shape))

test data shape: (95674, 5165)
CPU times: user 159 ms, sys: 450 ms, total: 608 ms
Wall time: 607 ms


In [5]:
# Echo the shape tuple of the loaded test frame.
test.shape

(95674, 5165)

---

# Stacking (Ensemble)

# mlxtend

In [6]:
# Load the saved SVC and LightGBM model files via joblib, in that order.
svc_model, lgbm_model = (
    joblib.load(model_path)
    for model_path in (
        './SVM/model_SVC_1e2.pkl',
        './LightGBM/Best_lightgbm_est300_dep25.pkl',
    )
)

### Train_X, Train_y

In [7]:
# Separate the TripType label column from the remaining feature columns.
y = train['TripType']
X = train.drop(columns=['TripType'])

In [8]:
# Convert the feature frame to a SciPy CSR sparse matrix and time the conversion.
# NOTE(review): whether the trailing `csr_X` expression is echoed depends on how
# the installed IPython's %time handles multi-statement input — confirm.
%time csr_X = csr_matrix(X); csr_X

CPU times: user 10.3 s, sys: 1.41 s, total: 11.7 s
Wall time: 11.8 s


In [9]:
# Convert the test frame to a SciPy CSR sparse matrix and time the conversion.
# NOTE(review): whether the trailing `csr_test` expression is echoed depends on how
# the installed IPython's %time handles multi-statement input — confirm.
%time csr_test = csr_matrix(test); csr_test

CPU times: user 10.1 s, sys: 1.37 s, total: 11.5 s
Wall time: 11.5 s


In [10]:
# Stacking ensemble: the loaded SVC is the only first-level classifier and the
# loaded LightGBM model is passed as the meta-classifier.
sclf = StackingClassifier(
    classifiers=[svc_model],
    meta_classifier=lgbm_model,
    use_probas=True,
    verbose=2,
)

In [13]:
# Fit the stacking classifier on the full sparse training matrix and keep the
# value returned by fit() as sclf_model.
# The recorded run of this cell took about 1h 47min wall time.
%time sclf_model = sclf.fit(csr_X, y)

Fitting 1 classifiers...
Fitting classifier1: svc (1/1)
SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
CPU times: user 1h 47min 21s, sys: 1min 55s, total: 1h 49min 17s
Wall time: 1h 46min 51s


In [14]:
# Send a Slack message once the long-running fit above has finished.
myslack.send_slack('StackingClassifier modeling finish!!')

<Response [200]>


## 모델 저장

In [17]:
# Persist the fitted stacking model, then load it back from the same path into
# my_sclf_model. joblib is already imported in the notebook's import cell, so the
# duplicate mid-notebook import is dropped; the path is defined once so the dump
# and load targets cannot drift apart.
sclf_model_path = './Stacking/svc_lgbm_sclf_model.pkl'

joblib.dump(sclf_model, sclf_model_path)
my_sclf_model = joblib.load(sclf_model_path)

### Confusion Matrix

In [None]:
# Confusion matrix of the model's predictions on csr_X against y.
# NOTE(review): csr_X / y are the same data passed to sclf.fit, so this measures
# training-set fit, not generalization.
confusion_matrix(y, sclf_model.predict(csr_X))

### Accuracy Score

In [None]:
# Accuracy of the model's predictions on csr_X against y.
# NOTE(review): csr_X / y are the same data passed to sclf.fit, so this measures
# training-set fit, not generalization.
accuracy_score(y, sclf_model.predict(csr_X))

### Train Test Split

In [None]:
# Hold out 10% of the rows with a fixed seed.
# NOTE(review): sclf_model was fit on all of csr_X before this split, so X_test is
# a subset of data the model has already seen; metrics computed on it are not a
# true hold-out estimate. Fit on X_train only if an honest estimate is wanted.
X_train, X_test, y_train, y_test = train_test_split(csr_X, y, test_size=0.1, random_state=0)

# Row counts of each piece of the split.
X_train.shape[0], X_test.shape[0], len(y_train), len(y_test)

In [None]:
%%time
# Predict labels for the held-out split with the fitted stacking model.
y_pred = sclf_model.predict(X_test)

In [None]:
# Print the classification report comparing y_pred with y_test.
print(classification_report(y_test, y_pred))

### Prediction

In [18]:
# Class-probability predictions for the sparse test matrix.
# The recorded run of this cell took about 16 min wall time.
%time result = sclf_model.predict_proba(csr_test)

CPU times: user 18min 48s, sys: 5.28 s, total: 18min 54s
Wall time: 16min 18s


### Make Submission CSV file

In [19]:
# Build the submission frame: one row per VisitNumber, one probability column
# per class, with column names taken from the sample submission file.
samsub = pd.read_csv('./data/sample_submission.csv')

# Every sample-submission column except the first is used as a probability column name.
subform_df_columns = samsub.columns[1:]

# NOTE(review): names are assigned positionally, so this assumes the sample
# submission's column order matches the column order of predict_proba's output —
# confirm against the fitted model's class order.
result_df = pd.DataFrame(result, columns=subform_df_columns)

# Attach VisitNumber and make it the index without mutating a frame in place,
# so re-running the cell always rebuilds subform_df from its inputs.
subform_df = (
    pd.concat([test.reset_index()['VisitNumber'], result_df], axis=1)
    .set_index('VisitNumber')
)
subform_df.tail()

Unnamed: 0_level_0,TripType_3,TripType_4,TripType_5,TripType_6,TripType_7,TripType_8,TripType_9,TripType_12,TripType_14,TripType_15,...,TripType_36,TripType_37,TripType_38,TripType_39,TripType_40,TripType_41,TripType_42,TripType_43,TripType_44,TripType_999
VisitNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
191338,2.912214e-07,5.054454e-08,5.9e-05,5.927065e-06,0.000365,6.7e-05,0.000171,2.808156e-06,2.316411e-07,2.4e-05,...,0.000592,6.2e-05,0.000205,0.15275,2.071621e-07,1.280613e-06,3e-05,2.5e-05,4.123472e-07,5.1e-05
191339,2.352744e-08,3.61668e-08,1e-05,4.428118e-07,3.7e-05,2.1e-05,3.1e-05,2.751383e-07,1.391439e-08,1.1e-05,...,2.9e-05,2.2e-05,6.7e-05,0.00034,9.110215e-09,7.197013e-05,0.846822,0.000385,9.514141e-08,4.9e-05
191340,3.91558e-08,4.450883e-08,1.3e-05,4.567384e-06,0.0002,0.996317,0.001441,1.252326e-07,3.875052e-09,3.6e-05,...,3.8e-05,2e-06,2.9e-05,0.000113,1.122064e-08,1.046503e-07,0.000114,1e-06,2.115078e-08,0.001223
191341,4.255341e-09,8.667127e-09,1.4e-05,2.05866e-07,2.4e-05,4.4e-05,2.3e-05,2.74895e-08,1.670041e-09,1e-06,...,1.8e-05,2e-06,8e-06,0.000334,4.481649e-09,5.089306e-05,0.000413,4e-06,8.595698e-07,5.1e-05
191348,9.560205e-09,1.891953e-08,1.4e-05,9.747886e-07,0.0001,2.6e-05,3.9e-05,2.544224e-07,2.372316e-09,7e-06,...,1.1e-05,2e-06,6e-06,6.9e-05,3.971395e-09,1.79264e-05,0.000324,1e-06,6.861288e-08,0.000351


In [20]:
# Write the submission frame to CSV.
subform_df.to_csv('./Stacking/svc_lgbm_sclf_model.csv')

In [21]:
# Send a Slack message once the submission CSV has been written.
myslack.send_slack('Making svc_lgbm_sclf_model CSV finish!!')

<Response [200]>


---

# vecstack

In [6]:
# Load the saved SVC and LightGBM model files via joblib, in that order.
svc_model, lgbm_model = (
    joblib.load(model_path)
    for model_path in (
        './SVM/model_SVC_1e2.pkl',
        './LightGBM/Best_lightgbm_est300_dep25.pkl',
    )
)

### Train_X, Train_y

In [7]:
# Separate the TripType label column from the remaining feature columns.
y = train['TripType']
X = train.drop(columns=['TripType'])

In [8]:
# Convert the feature frame to a SciPy CSR sparse matrix and time the conversion.
%time csr_X = csr_matrix(X)

CPU times: user 10.3 s, sys: 1.47 s, total: 11.8 s
Wall time: 12 s


In [9]:
# Convert the test frame to a SciPy CSR sparse matrix and time the conversion.
%time csr_test = csr_matrix(test)

CPU times: user 10.1 s, sys: 1.33 s, total: 11.4 s
Wall time: 11.4 s


In [10]:
# Hold out 10% of the rows. The fixed seed makes the split reproducible across
# kernel restarts and matches the random_state=0 used by the split in the
# mlxtend section of this notebook.
X_train, X_test, y_train, y_test = train_test_split(csr_X, y, test_size=0.1, random_state=0)

In [None]:
%%time

models = [svc_model, lgbm_model]

# Get your stacked features in a single line
S_train, S_test = \
stacking(models, X_train, y_train, X_test, n_folds=3, regression=False, metric=accuracy_score, verbose=2)

In [None]:
# Send a Slack message once the vecstack stacking() call above has finished.
# NOTE(review): the message text is identical to the one sent after the mlxtend
# StackingClassifier fit, so the two notifications cannot be told apart — consider
# a vecstack-specific message.
myslack.send_slack('StackingClassifier modeling finish!!')