In [1]:
import pandas as pd
import numpy as np
from utils.ModelingUtils import make_dummies, make_ml_target
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, auc
from sklearn.preprocessing import label_binarize

In [4]:
data = pd.read_parquet('../data/prepared_data.parquet')

### make ml target

In [5]:
data = make_ml_target(data)
data = make_dummies(data, 'near_city_station_name')

In [6]:
y = data['ML_TARGET'].values
# y = label_binarize(y, classes=[0,1,2,3])
# n_classes = y.shape[1]

X = data.iloc[:,9:].drop('ML_TARGET', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [None]:
model = CatBoostClassifier(max_depth=10, loss_function='MultiClass', random_seed=123)
model.fit(X_train, y_train)

Learning rate set to 0.112008
0:	learn: 1.1740528	total: 3.38s	remaining: 56m 13s
1:	learn: 1.0305080	total: 6.46s	remaining: 53m 46s
2:	learn: 0.9265809	total: 9.67s	remaining: 53m 35s
3:	learn: 0.8471693	total: 12.8s	remaining: 53m 13s
4:	learn: 0.7836545	total: 16.3s	remaining: 54m 2s
5:	learn: 0.7322920	total: 19.1s	remaining: 52m 37s
6:	learn: 0.6896603	total: 22.3s	remaining: 52m 48s
7:	learn: 0.6542049	total: 25.7s	remaining: 53m 1s
8:	learn: 0.6246839	total: 29.2s	remaining: 53m 39s
9:	learn: 0.5998032	total: 32.7s	remaining: 53m 56s
10:	learn: 0.5790403	total: 35.9s	remaining: 53m 49s
11:	learn: 0.5606539	total: 39.8s	remaining: 54m 37s
12:	learn: 0.5449211	total: 43.1s	remaining: 54m 33s
13:	learn: 0.5315532	total: 46s	remaining: 53m 59s
14:	learn: 0.5194860	total: 49.3s	remaining: 53m 59s
15:	learn: 0.5097106	total: 52.2s	remaining: 53m 29s
16:	learn: 0.5008574	total: 54.3s	remaining: 52m 18s
17:	learn: 0.4928348	total: 56.4s	remaining: 51m 16s
18:	learn: 0.4861271	total: 58

In [38]:
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)

In [43]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.99      0.93    193326
           1       0.57      0.10      0.17     26277
           2       0.54      0.11      0.19      6840
           3       0.68      0.05      0.10      1199

    accuracy                           0.86    227642
   macro avg       0.66      0.32      0.35    227642
weighted avg       0.82      0.86      0.81    227642


In [45]:
roc_auc_score(y_test, y_prob, multi_class='ovr', average='macro')

0.8807862587345204

In [42]:
fpr = dict()
tpr = dict()
roc_auc = dict()
n_classes = len(model.classes_)

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test, y_prob[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_prob.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# Plot ROC curve for each class
plt.figure(figsize=(8, 6))
lw = 2
for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], lw=lw,
             label='ROC curve of class {0} (area = {1:0.2f})'
             ''.format(i, roc_auc[i]))

# Plot micro-average ROC curve
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)

plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Multiclass Classification')
plt.legend(loc="lower right")
plt.show()

ValueError: multiclass format is not supported

In [41]:
y_test

array([0, 0, 1, ..., 0, 0, 0])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
model = RandomForestClassifier(random_state=123)

param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

cv = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')
cv.fit(X_train, y_train)