In [50]:
import pickle

import numpy as np
import pandas as pd
# Use the fully-qualified option names: the bare 'max_rows' / 'max_columns'
# patterns are matched as regexes and become ambiguous (OptionError) on pandas
# versions that also define 'styler.render.max_rows' / 'styler.render.max_columns'.
pd.set_option('display.max_rows',38)
pd.set_option('display.max_columns',25)
import myslack

# Plotting Decision tree
from sklearn import tree
from IPython.display import Image as PImage
from subprocess import check_call
from PIL import Image, ImageDraw, ImageFont

# model
from scipy.sparse import csr_matrix
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
import xgboost
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import validation_curve
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import classification_report

# Data Check

#### - train.csv

In [2]:
%%time
train = pickle.load(open("./data/final_train_Ver3.pkl", "rb"))
print("test data shape: "+str(train.shape))

test data shape: (95674, 5494)
CPU times: user 203 ms, sys: 641 ms, total: 844 ms
Wall time: 865 ms


#### - test.csv

In [3]:
%%time
test = pickle.load(open("./data/final_test_Ver3.pkl", "rb"))
print("test data shape: "+str(test.shape))

test data shape: (95674, 5493)
CPU times: user 219 ms, sys: 672 ms, total: 890 ms
Wall time: 1.78 s


#### - sample_submission.csv

In [5]:
# Read the sample submission file and report its (rows, columns) shape.
samplesub = pd.read_csv("./data/sample_submission.csv")
samplesub_shape = str(samplesub.shape)
print("sample_submission data shape: " + samplesub_shape)

sample_submission data shape: (95674, 39)


---

# Modeling 4 - Random Forest

In [6]:
# Columns present in test but absent from train.
# train.columns[1:] skips train's first column (presumably the TripType label — confirm).
train_feature_cols = train.columns[1:]
not_in_train = [col for col in test.columns if col not in train_feature_cols]

In [7]:
# Columns present in train (first column excluded) but absent from test.
test_cols = test.columns
not_in_test = [col for col in train.columns[1:] if col not in test_cols]

In [8]:
# Number of columns that exist in test but not in train.
len(not_in_train)

328

In [9]:
# Number of columns that exist in train but not in test.
len(not_in_test)

328

In [10]:
# Remove the train-only columns so train keeps only columns that test also has.
train = train.drop(columns=not_in_test)

In [11]:
# Shape of train after the train-only columns were dropped.
train.shape

(95674, 5166)

In [12]:
# Remove the test-only columns so test keeps only columns that train also has.
test = test.drop(columns=not_in_train)

In [13]:
# Shape of test after the test-only columns were dropped.
test.shape

(95674, 5165)

---

### Train_X, Train_y

In [14]:
# Separate the TripType label column from the remaining feature columns.
y = train['TripType']
X = train.drop(columns=['TripType'])

In [28]:
# Convert X to a SciPy CSR sparse matrix; the bare name displays its summary.
X = csr_matrix(X)
X

<95674x5165 sparse matrix of type '<class 'numpy.float64'>'
	with 2214766 stored elements in Compressed Sparse Row format>

In [29]:
# The sparse matrix carries no column names, so the model consumes test features
# purely by position. Make sure test's columns are in the same order as the
# training features (train minus the TripType label) before converting; a column
# missing from test surfaces as a KeyError instead of silently shifting features.
feature_order = train.columns.drop('TripType')
# Only re-index when the order actually differs, to avoid copying the wide frame.
aligned_test = test if test.columns.equals(feature_order) else test[feature_order]
csr_test = csr_matrix(aligned_test); csr_test

<95674x5165 sparse matrix of type '<class 'numpy.float64'>'
	with 2221183 stored elements in Compressed Sparse Row format>

### Random Forest

In [40]:
%%time
estimator=500
depth=100
RandomForest_model = RandomForestClassifier(n_estimators=estimator, max_depth=depth, n_jobs=6).fit(X,y)

CPU times: user 41min 4s, sys: 23.6 s, total: 41min 28s
Wall time: 7min 11s


### Confusion Matrix

In [41]:
# Confusion matrix of the model's predictions on X against y — the same data it was
# fit on, so this shows training fit rather than held-out performance.
confusion_matrix(y, RandomForest_model.predict(X))

array([[3598,    0,    0, ...,    0,    0,   45],
       [   0,  334,   12, ...,    0,    0,    0],
       [   0,    0, 4593, ...,    0,    0,    0],
       ...,
       [   0,    0,    0, ...,  872,    0,    0],
       [   0,    0,    0, ...,    0, 1187,    0],
       [ 251,    2,   14, ...,    0,    0, 8029]])

### Train Test Split

In [42]:
# Hold out 10% of the rows; random_state fixes the shuffle so the same rows land
# in the hold-out set on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=True, random_state=42)

# Row counts of each part, as a quick sanity check on the split.
X_train.shape[0], X_test.shape[0], len(y_train), len(y_test)

(86106, 9568, 86106, 9568)

In [43]:
%%time
y_pred = RandomForest_model.predict(X_test)

CPU times: user 8.95 s, sys: 12.1 s, total: 21 s
Wall time: 6.79 s


In [44]:
# Per-class precision / recall / F1 of y_pred against the y_test labels.
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          3       0.94      0.99      0.96       366
          4       1.00      0.98      0.99        41
          5       0.99      1.00      0.99       498
          6       1.00      1.00      1.00       132
          7       1.00      0.99      1.00       564
          8       0.99      1.00      0.99      1152
          9       0.95      0.99      0.97       925
         12       1.00      1.00      1.00        30
         15       0.99      0.97      0.98       102
         18       1.00      0.93      0.96        56
         19       1.00      0.93      0.97        30
         20       1.00      1.00      1.00        54
         21       1.00      0.97      0.99        70
         22       0.99      0.95      0.97        87
         23       0.91      1.00      0.95        10
         24       1.00      0.97      0.98       283
         25       1.00      1.00      1.00       377
         26       1.00      0.95      0.98   

### Prediction

In [45]:
# Per-class probabilities for every row of the test matrix; the bare name on the
# last line displays the resulting array.
result = RandomForest_model.predict_proba(csr_test)
result

array([[0.00000000e+00, 6.20346772e-03, 4.56512166e-03, ...,
        7.33359909e-04, 1.05043367e-04, 6.79774161e-03],
       [3.96904494e-04, 4.38287445e-03, 3.04839919e-02, ...,
        1.38061449e-02, 2.47072080e-04, 6.17544335e-02],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 9.97987013e-01],
       ...,
       [9.50232069e-04, 6.96379746e-04, 1.02033476e-02, ...,
        2.49532127e-07, 6.86787474e-06, 1.96230506e-02],
       [0.00000000e+00, 0.00000000e+00, 2.00000000e-03, ...,
        6.08510638e-03, 4.00000000e-02, 2.00000000e-03],
       [8.28250946e-05, 7.99300350e-05, 2.72573144e-03, ...,
        1.66802665e-02, 8.13212127e-03, 2.27294007e-03]])

### Make Submission CSV file

In [46]:
# Build the submission frame: one probability column per sample-submission class
# column, indexed by VisitNumber.
# NOTE(review): columns are assigned by position — assumes predict_proba's class order
# matches the sample-submission column order; confirm against RandomForest_model.classes_.
subform_df_columns = samplesub.columns[1:]  # skip the first (id) column of the sample file
result_df = pd.DataFrame(result)
result_df.columns = subform_df_columns
visit_numbers = test.reset_index()['VisitNumber']
subform_df = pd.concat([visit_numbers, result_df], axis=1).set_index('VisitNumber')
subform_df.tail()

Unnamed: 0_level_0,TripType_3,TripType_4,TripType_5,TripType_6,TripType_7,TripType_8,TripType_9,TripType_12,TripType_14,TripType_15,TripType_18,TripType_19,...,TripType_34,TripType_35,TripType_36,TripType_37,TripType_38,TripType_39,TripType_40,TripType_41,TripType_42,TripType_43,TripType_44,TripType_999
VisitNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
191338,3e-05,1.1e-05,0.014323,0.00257,0.023277,0.002126,6e-05,6.4e-05,0.0,0.010435,0.000408,0.002063,...,0.008669,0.200972,0.014062,0.009101,0.028515,0.302008,0.004034,0.002069,0.004275,0.004251309,0.012113,8.4e-05
191339,0.000217,0.002275,0.040664,0.003609,0.030562,0.003238,0.003562,0.000562,3.7e-05,0.029843,0.019399,0.000701,...,0.008065,0.058293,0.047647,0.008773,0.02036,0.078244,4.3e-05,0.011322,0.070377,0.02153074,0.000193,0.001455
191340,0.00095,0.000696,0.010203,0.005977,0.015479,0.81283,0.107135,0.00036,0.0,0.002075,0.001602,0.00091,...,0.000736,0.000415,0.000439,6.1e-05,4.7e-05,5.5e-05,3e-06,3e-06,0.001038,2.495321e-07,7e-06,0.019623
191341,0.0,0.0,0.002,0.002,0.008149,0.0,0.002,0.0,0.0,0.004128,2.1e-05,0.0,...,0.002,0.006,0.02,0.016043,0.006064,0.216574,0.008,0.008,0.062043,0.006085106,0.04,0.002
191348,8.3e-05,8e-05,0.002726,0.002342,0.006791,9.5e-05,0.002429,0.002181,2.7e-05,0.007193,0.005571,0.00019,...,0.008493,0.013247,0.007212,0.003985,0.005187,0.031725,4e-06,0.046506,0.051061,0.01668027,0.008132,0.002273


In [47]:
# Write the submission to a file named after the forest's hyper-parameters.
csv_path = './RandomForest/RFT_est{}_dep{}.csv'.format(estimator, depth)
subform_df.to_csv(csv_path)

In [51]:
# Notify via Slack once the CSV is written. The .format(...) call must sit outside
# the string literal; inside it, the message is sent with raw "{}" placeholders.
myslack.send_slack('Making RFT_est{}_dep{} CSV finish!'.format(estimator,depth))

<Response [200]>


---