In [18]:
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier

import warnings
# Install a blanket "ignore" filter: no category, message or module is given,
# so every warning raised after this point in the session is suppressed,
# including any emitted by the libraries imported above.
warnings.filterwarnings('ignore')

In [7]:
# Read the feature matrix (key 'x') and the labels (key 'y') from the processed
# training archive. np.load on an .npz returns an archive object backed by an
# open file; using it as a context manager closes that file as soon as both
# arrays have been read out.
with np.load('../data/processed/train_data.npz') as npzfile:
    X = npzfile['x']
    y = npzfile['y']

#### Random Forest

In [39]:
# Hold out 30% of the samples, fit a 200-tree random forest on the remaining
# 70%, and print accuracy / precision / recall / F1 for the held-out part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
clf = RandomForestClassifier(n_estimators=200, n_jobs=8, random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(
    "Acc: {:.3f} P: {:.3f} R: {:.3f} F1: {:.3f}".format(
        # Metrics are evaluated in the same order as the placeholders above.
        *[metric(y_test, y_pred) for metric in (accuracy_score, precision_score, recall_score, f1_score)]
    )
)


Acc: 0.841 P: 0.823 R: 0.738 F1: 0.778


In [40]:
# Accuracy of the fitted forest on the same split it was trained on, for
# comparison with the held-out accuracy printed by the previous cell.
accuracy_score(y_train, clf.predict(X_train))

1.0

#### XGBoost

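[https://www.kaggle.com/stuarthallows/using-xgboost-with-scikit-learn](https://www.kaggle.com/stuarthallows/using-xgboost-with-scikit-learn)  
[https://xgboost.readthedocs.io/en/latest/build.html](https://xgboost.readthedocs.io/en/latest/build.html)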

In [41]:
# Fit an XGBoost classifier with its default hyper-parameters on the current
# training split and print the same four metrics for the held-out split.
model = XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(
    "Acc: {:.3f} P: {:.3f} R: {:.3f} F1: {:.3f}".format(
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred),
    )
)

Acc: 0.831 P: 0.795 R: 0.745 F1: 0.769


In [42]:
# Accuracy of the fitted XGBoost model on the split it was trained on.
accuracy_score(y_train, model.predict(X_train))

0.8863686902379364

#### KFold

In [27]:
# 5-fold cross-validation of the random forest over the full dataset, using
# shuffled folds with a fixed seed.
#
# cross_val_score fits a clone of `clf` on each fold and scores it with the
# estimator's own .score(), so the per-fold numbers match a manual
# fit/score loop while leaving `clf` and the notebook-level X_train / X_test /
# y_train / y_test from the hold-out split untouched.
kf = KFold(n_splits=5, random_state=42, shuffle=True)

# Kept as a plain list of floats, one accuracy per fold.
scores = cross_val_score(clf, X, y, cv=kf).tolist()

In [28]:
# Average accuracy across the folds collected in `scores`.
print(np.asarray(scores).mean())

0.8414007782101167


In [30]:
# Show the individual per-fold accuracies behind the mean.
scores

[0.846692607003891,
 0.8311284046692607,
 0.8342412451361868,
 0.8373540856031129,
 0.8575875486381322]

In [20]:
# Mean accuracy from scikit-learn's built-in cross-validation with cv=5.
# NOTE(review): an integer cv builds its folds without shuffling, unlike the
# KFold(shuffle=True) splitter used in the KFold section, so the two means are
# not directly comparable; a large gap may point to ordering in the data — confirm.
cross_val_score(clf, X, y, cv=5).mean()

0.6891847815155011

#### Test on Sydney

In [62]:
# Open the Sydney train and test archives (train first, then test); the arrays
# are read out of them in the following cell.
train_npzfile, test_npzfile = (
    np.load(archive_path)
    for archive_path in (
        '../data/processed/train_data_sydney.npz',
        '../data/processed/test_data_sydney.npz',
    )
)

In [63]:
# Pull features ('x') and labels ('y') out of each Sydney archive. These
# rebind the X_train / y_train / X_test / y_test names used earlier.
X_train = train_npzfile['x']
y_train = train_npzfile['y']
X_test = test_npzfile['x']
y_test = test_npzfile['y']

In [64]:
# Fit a fresh 200-tree random forest on the Sydney training set and evaluate it
# on the Sydney test set.
clf = RandomForestClassifier(n_estimators=200, n_jobs=12, random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Accuracy, precision, recall and F1, computed in that order.
acc, p, r, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print("Predict:", (acc, p, r, f1))

Predict: (0.7100737100737101, 0.7709677419354839, 0.4578544061302682, 0.5745192307692307)


In [69]:
# Load the Sydney train/test archives a second time into separate `_2` names so
# the result can be compared against the first load. Both archives are opened
# as context managers so their underlying files are closed once the arrays
# have been read out.
with np.load('../data/processed/train_data_sydney.npz') as train_npzfile_2, \
        np.load('../data/processed/test_data_sydney.npz') as test_npzfile_2:
    X_train_2, y_train_2 = train_npzfile_2['x'], train_npzfile_2['y']
    X_test_2, y_test_2 = test_npzfile_2['x'], test_npzfile_2['y']

In [70]:
# Check that the two loads of the Sydney test features are identical.
# The displayed difference is elided to the array's corners, so zeros there do
# not prove equality on their own; the assertion compares every element and
# raises AssertionError on any mismatch.
np.testing.assert_array_equal(X_test, X_test_2)
X_test - X_test_2

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])