In [1]:
import numpy as np
import pandas as pd
pd.set_option('max_rows',38)
pd.set_option('max_columns',25)
import time
import pickle
# from tqdm import tqdm
import myslack_incomming
# model
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
# import xgboost
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import validation_curve
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import classification_report

# Data Check

#### - train.csv

In [2]:
%%time
train = pickle.load(open("./data/final_train_Ver3.pkl", "rb"))
print("test data shape: "+str(train.shape))

test data shape: (95674, 5494)
CPU times: user 210 ms, sys: 677 ms, total: 887 ms
Wall time: 908 ms


#### - test.csv

In [3]:
%%time
test = pickle.load(open("./data/final_test_Ver3.pkl", "rb"))
print("test data shape: "+str(test.shape))

test data shape: (95674, 5493)
CPU times: user 258 ms, sys: 805 ms, total: 1.06 s
Wall time: 2.16 s


#### - sample_submission.csv

In [4]:
samplesub = pd.read_csv("./data/sample_submission.csv")
print("sample_submission data shape: "+str(samplesub.shape))

sample_submission data shape: (95674, 39)


---

# Modeling 7 - SVM

In [5]:
# in test, not in train
not_in_train = [i for i in test.columns if i not in train.columns[1:]]

In [6]:
# in train, not in test
not_in_test = [i for i in train.columns[1:] if i not in test.columns]

In [7]:
len(not_in_train)

328

In [8]:
len(not_in_test)

328

In [9]:
train.drop(columns=not_in_test,inplace=True)

In [10]:
train.shape

(95674, 5166)

In [11]:
test.drop(columns=not_in_train,inplace=True)

In [12]:
test.shape

(95674, 5165)

### Train_X, Train_y

In [13]:
X = train.drop(columns='TripType')
y = train['TripType']

In [14]:
X = csr_matrix(X); X

<95674x5165 sparse matrix of type '<class 'numpy.float64'>'
	with 2214766 stored elements in Compressed Sparse Row format>

In [15]:
csr_test = csr_matrix(test); csr_test

<95674x5165 sparse matrix of type '<class 'numpy.float64'>'
	with 2221183 stored elements in Compressed Sparse Row format>

### SVM

In [30]:
%%time

C = 1e2

rbfsvc_model = SVC(kernel="rbf", C=C, probability=True).fit(X, y)

myslack_incomming.send_slack('SVC 1e2 modeling finish!')

<Response [200]>
CPU times: user 1h 21min 12s, sys: 21.8 s, total: 1h 21min 34s
Wall time: 1h 22min 3s


### Confusion Matrix

In [31]:
%time confusion_matrix(y, rbfsvc_model.predict(X))

CPU times: user 14min 1s, sys: 707 ms, total: 14min 2s
Wall time: 14min 2s


array([[3581,    0,    0, ...,    1,    0,   26],
       [   1,  176,  139, ...,    0,    0,    0],
       [   4,    9, 4272, ...,    1,    0,    0],
       ...,
       [   0,    0,   29, ...,  426,    2,    0],
       [   0,    0,    1, ...,   19, 1054,    0],
       [ 317,    4,   73, ...,    0,    0, 7122]])

In [32]:
myslack_incomming.send_slack('SVC confusion_matrix finish!')

<Response [200]>


### Train Test Split

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

X_train.shape[0], X_test.shape[0], len(y_train), len(y_test)

(86106, 9568, 86106, 9568)

In [34]:
%time y_pred = rbfsvc_model.predict(X_test)

CPU times: user 1min 25s, sys: 115 ms, total: 1min 25s
Wall time: 1min 25s


In [35]:
myslack_incomming.send_slack('SVC 1e2 predict finish!')

<Response [200]>


In [36]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          3       0.86      0.97      0.91       363
          4       0.79      0.53      0.63        36
          5       0.77      0.93      0.84       452
          6       0.90      0.84      0.87       112
          7       0.82      0.82      0.82       566
          8       0.86      0.86      0.86      1255
          9       0.70      0.90      0.79       955
         12       0.94      0.62      0.74        26
         15       0.85      0.67      0.75       106
         18       0.71      0.61      0.66        64
         19       0.83      0.45      0.58        42
         20       0.79      0.81      0.80        70
         21       0.79      0.80      0.80        71
         22       0.75      0.49      0.59        88
         23       0.67      0.20      0.31        10
         24       0.87      0.74      0.80       280
         25       0.86      0.84      0.85       360
         26       0.64      0.62      0.63   

### prediction

In [37]:
%%time
result = rbfsvc_model.predict_proba(csr_test)
result

CPU times: user 14min 28s, sys: 1.52 s, total: 14min 29s
Wall time: 14min 30s


### Make Submission CSV file

In [38]:
subform_df_columns = samplesub.columns[1:]
result_df = pd.DataFrame(result)
result_df.columns = subform_df_columns
subform_df = pd.concat([test.reset_index()['VisitNumber'],result_df],axis=1)
subform_df.set_index('VisitNumber',inplace=True)
subform_df.tail()

Unnamed: 0_level_0,TripType_3,TripType_4,TripType_5,TripType_6,TripType_7,TripType_8,TripType_9,TripType_12,TripType_14,TripType_15,TripType_18,TripType_19,...,TripType_34,TripType_35,TripType_36,TripType_37,TripType_38,TripType_39,TripType_40,TripType_41,TripType_42,TripType_43,TripType_44,TripType_999
VisitNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
191338,0.000962,0.001394,0.008246,0.004807,0.023484,0.001085,0.001131,0.010528,0.00118,0.013596,0.003167,0.002032,...,0.003672,0.231646,0.010441,0.015506,0.03362,0.307638,0.005598,0.004377,0.008324,0.007503,0.006831,0.007505
191339,0.000493,0.000734,0.00321,0.002127,0.0065,0.000967,0.00087,0.006535,0.000564,0.009225,0.014376,0.002037,...,0.001663,0.01311,0.00592,0.004029,0.009256,0.018581,0.002515,0.034428,0.321135,0.041018,0.009937,0.006544
191340,0.001316,0.001636,0.003826,0.00565,0.017444,0.82759,0.036143,0.001398,0.00011,0.010322,0.001978,0.000545,...,0.001316,0.018483,0.005017,0.00187,0.002277,0.00271,0.00045,0.000444,0.003649,0.000148,0.000303,0.029907
191341,7e-06,1.3e-05,0.000216,4.2e-05,0.000137,3e-05,1.3e-05,0.001181,2.6e-05,0.001078,6.7e-05,1.7e-05,...,0.000101,0.000139,0.000211,0.000176,0.000245,0.003434,0.001124,0.003315,0.003796,0.001094,0.005899,7.2e-05
191348,5e-05,4.7e-05,2.1e-05,3.4e-05,1.6e-05,2e-06,1.7e-05,7.2e-05,6.1e-05,3.3e-05,3.2e-05,2e-05,...,4.1e-05,1e-05,3.4e-05,2.7e-05,2.3e-05,9.6e-05,4e-05,0.003987,0.008178,0.000587,0.003578,0.000224


In [39]:
subform_df.to_csv('./SVM/result_SVC_1e2.csv')

## 모델 저장

In [40]:
from sklearn.externals import joblib

joblib.dump(rbfsvc_model, './SVM/model_SVC_1e2.pkl' )
svc = joblib.load('./SVM/model_SVC_1e2.pkl' )

## le6

### Confusion Matrix

In [17]:
%time confusion_matrix(y, rbfsvc_model.predict(X))

CPU times: user 14min 10s, sys: 2.05 s, total: 14min 12s
Wall time: 1h 17min 21s


array([[3623,    0,    0, ...,    0,    0,   19],
       [   0,  346,    0, ...,    0,    0,    0],
       [   0,    0, 4593, ...,    0,    0,    0],
       ...,
       [   0,    0,    0, ...,  872,    0,    0],
       [   0,    0,    0, ...,    0, 1187,    0],
       [ 280,    2,    5, ...,    0,    0, 8089]])

In [18]:
myslack_incomming.send_slack('SVC confusion_matrix finish!')

<Response [200]>


### Train Test Split

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

X_train.shape[0], X_test.shape[0], len(y_train), len(y_test)

(86106, 9568, 86106, 9568)

In [21]:
%time y_pred = rbfsvc_model.predict(X_test)

CPU times: user 1min 26s, sys: 391 ms, total: 1min 26s
Wall time: 1min 27s


In [22]:
myslack_incomming.send_slack('SVC 1e2 predict finish!')

<Response [200]>


In [27]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          3       0.90      0.99      0.95       347
          4       1.00      1.00      1.00        25
          5       1.00      1.00      1.00       458
          6       1.00      1.00      1.00       132
          7       1.00      1.00      1.00       617
          8       0.99      1.00      0.99      1247
          9       0.97      0.99      0.98       907
         12       1.00      1.00      1.00        31
         15       0.99      1.00      1.00       105
         18       1.00      1.00      1.00        51
         19       1.00      1.00      1.00        37
         20       0.99      0.97      0.98        76
         21       1.00      1.00      1.00        53
         22       0.98      0.95      0.96        93
         23       0.94      0.94      0.94        18
         24       1.00      0.99      0.99       272
         25       1.00      1.00      1.00       347
         26       1.00      0.96      0.98   

### prediction

In [24]:
%%time
result = rbfsvc_model.predict_proba(csr_test)
result

CPU times: user 16min 59s, sys: 3.59 s, total: 17min 3s
Wall time: 17min 21s


### Make Submission CSV file

In [25]:
subform_df_columns = samplesub.columns[1:]
result_df = pd.DataFrame(result)
result_df.columns = subform_df_columns
subform_df = pd.concat([test.reset_index()['VisitNumber'],result_df],axis=1)
subform_df.set_index('VisitNumber',inplace=True)
subform_df.tail()

Unnamed: 0_level_0,TripType_3,TripType_4,TripType_5,TripType_6,TripType_7,TripType_8,TripType_9,TripType_12,TripType_14,TripType_15,TripType_18,TripType_19,...,TripType_34,TripType_35,TripType_36,TripType_37,TripType_38,TripType_39,TripType_40,TripType_41,TripType_42,TripType_43,TripType_44,TripType_999
VisitNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
191338,0.001088,0.001465,0.01454,0.006039,0.018617,0.001138,0.001298,0.00839,0.001227,0.011282,0.003329,0.002071,...,0.003489,0.098075,0.008542,0.013117,0.022594,0.517258,0.00558,0.003473,0.00577,0.007517,0.005077,0.006377
191339,0.000907,0.001105,0.00456,0.00245,0.006041,0.001604,0.001549,0.006033,0.000859,0.00558,0.015923,0.003135,...,0.002096,0.012667,0.004781,0.005802,0.012522,0.013235,0.004267,0.027352,0.48738,0.017046,0.0135,0.005677
191340,0.004437,0.003773,0.008617,0.010314,0.035386,0.654351,0.043544,0.005015,0.000495,0.011146,0.009238,0.001618,...,0.003057,0.070153,0.013591,0.003991,0.006672,0.011513,0.003335,0.00188,0.009653,0.000961,0.001242,0.028577
191341,1.7e-05,3.3e-05,0.000364,9.2e-05,0.000298,5.8e-05,0.000217,0.001616,5.2e-05,0.001872,9.5e-05,3.7e-05,...,0.00015,0.000268,0.000396,0.000294,0.000456,0.004471,0.001984,0.003939,0.0056,0.001714,0.010705,0.000234
191348,5e-06,1.6e-05,1.2e-05,3.3e-05,1.4e-05,4e-06,1e-05,2.9e-05,6.2e-05,1.8e-05,1.8e-05,1.1e-05,...,0.000182,1e-05,2.7e-05,1.8e-05,1.9e-05,4.5e-05,3.1e-05,0.001587,0.00233,0.000237,0.001942,8.5e-05


In [26]:
subform_df.to_csv('./SVM/result_SVC_1e2.csv')

## 모델 저장

In [29]:
from sklearn.externals import joblib

joblib.dump(rbfsvc_model, './SVM/model_SVC_1e2.pkl' )
svc = joblib.load('./SVM/model_SVC_1e2.pkl' )