In [76]:
import pickle
import time
import tqdm
import myslack

from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score
from sklearn.ensemble import ExtraTreesClassifier

#### - train.csv

In [4]:
%%time
train = pickle.load(open("./data/my_train.pkl", "rb"))
print("test data shape: "+str(train.shape))

test data shape: (95674, 5166)
CPU times: user 218 ms, sys: 639 ms, total: 857 ms
Wall time: 887 ms


In [5]:
train.shape

(95674, 5166)

#### - test.csv

In [6]:
# %%time
test = pickle.load(open("./data/my_test.pkl", "rb"))
print("test data shape: "+str(test.shape))

test data shape: (95674, 5165)


In [7]:
test.shape

(95674, 5165)

---

# ExtraTreesClassifier

### Train_X, Train_y

In [8]:
X = train.drop(columns='TripType')
y = train['TripType']

In [63]:
estimators=300
depth=60

In [64]:
%time extree_model = ExtraTreesClassifier(n_estimators = estimators, max_depth= depth).fit(X,y)

CPU times: user 11min 8s, sys: 7.06 s, total: 11min 15s
Wall time: 11min 17s


In [65]:
myslack.send_slack('finish')

<Response [200]>


### Confusion Matrix

In [66]:
confusion_matrix(y, extree_model.predict(X))

array([[3597,    0,    0, ...,    0,    0,   22],
       [   0,   50,    5, ...,    0,    0,    3],
       [   1,    0, 2731, ...,    0,    0,    6],
       ...,
       [   0,    0,    0, ...,  579,    0,    0],
       [   0,    0,    0, ...,    0, 1147,    0],
       [ 276,    0,    1, ...,    0,    0, 7325]])

### Accuracy Score

In [67]:
%time print('accuracy_score '+str(accuracy_score(y, extree_model.predict(X))))

accuracy_score 0.8022555762276062
CPU times: user 17.8 s, sys: 3.46 s, total: 21.3 s
Wall time: 21.5 s


### Train Test Split

In [68]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

len(X_train), len(X_test), len(y_train), len(y_test)

(86106, 9568, 86106, 9568)

In [69]:
y_pred = extree_model.predict(X_test)

In [70]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          3       0.90      0.99      0.94       357
          4       1.00      0.14      0.25        28
          5       1.00      0.59      0.74       472
          6       1.00      0.53      0.69       139
          7       0.98      0.81      0.88       544
          8       0.45      1.00      0.62      1186
          9       0.70      0.74      0.72       911
         12       1.00      0.71      0.83        21
         15       1.00      0.58      0.74        91
         18       1.00      0.49      0.66        63
         19       1.00      0.27      0.42        41
         20       1.00      0.53      0.69        51
         21       1.00      0.48      0.65        84
         22       1.00      0.43      0.61       106
         23       1.00      0.19      0.32        16
         24       0.99      0.63      0.77       235
         25       0.99      0.78      0.88       371
         26       1.00      0.42      0.59   

### Prediction

In [71]:
%time result = extree_model.predict_proba(test)

CPU times: user 18.8 s, sys: 4.69 s, total: 23.4 s
Wall time: 24 s


In [72]:
result.shape

(95674, 38)

In [73]:
result

array([[0.01418589, 0.00292786, 0.03803827, ..., 0.00866913, 0.00524126,
        0.07321052],
       [0.01364851, 0.00350445, 0.04530083, ..., 0.02728213, 0.00253141,
        0.17495134],
       [0.02489958, 0.00434514, 0.05741996, ..., 0.00596614, 0.00312831,
        0.20765996],
       ...,
       [0.02428597, 0.00461999, 0.05814477, ..., 0.00629677, 0.0034628 ,
        0.11416539],
       [0.00602574, 0.00149156, 0.02045307, ..., 0.01111685, 0.03612186,
        0.03075966],
       [0.01381585, 0.00303012, 0.03984058, ..., 0.01459433, 0.0097195 ,
        0.0753907 ]])

### Make Submission CSV file

In [74]:
samsub = pd.read_csv('./data/sample_submission.csv')
subform_df_columns = samsub.columns[1:]
result_df = pd.DataFrame(result)
result_df.columns = subform_df_columns
subform_df = pd.concat([test.reset_index()['VisitNumber'],result_df],axis=1)
subform_df.set_index('VisitNumber',inplace=True)
subform_df.tail()

Unnamed: 0_level_0,TripType_3,TripType_4,TripType_5,TripType_6,TripType_7,TripType_8,TripType_9,TripType_12,TripType_14,TripType_15,...,TripType_36,TripType_37,TripType_38,TripType_39,TripType_40,TripType_41,TripType_42,TripType_43,TripType_44,TripType_999
VisitNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
191338,0.00762,0.002364,0.028204,0.014012,0.043342,0.079782,0.0446,0.002915,7.9e-05,0.012528,...,0.028603,0.0219,0.029101,0.227683,0.034785,0.006219,0.019306,0.012933,0.014166,0.035717
191339,0.016563,0.003308,0.054423,0.01091,0.047777,0.11799,0.099623,0.002666,4.2e-05,0.018392,...,0.045598,0.014861,0.020292,0.088726,0.004872,0.007768,0.0318,0.018138,0.009137,0.065815
191340,0.024286,0.00462,0.058145,0.017798,0.056816,0.226232,0.152125,0.002102,2.6e-05,0.009688,...,0.024113,0.010681,0.01593,0.052168,0.004241,0.00375,0.012675,0.006297,0.003463,0.114165
191341,0.006026,0.001492,0.020453,0.005021,0.026976,0.055652,0.039603,0.002734,4.9e-05,0.007525,...,0.037173,0.034348,0.021937,0.205958,0.019969,0.010109,0.038932,0.011117,0.036122,0.03076
191348,0.013816,0.00303,0.039841,0.009669,0.041392,0.097251,0.098666,0.002463,3.5e-05,0.009131,...,0.029673,0.01817,0.018382,0.078295,0.004718,0.0219,0.03067,0.014594,0.009719,0.075391


In [75]:
subform_df.to_csv('./ExtraTree/EXT_est{}_dep{}.csv'.format(estimators,depth))