In [1]:
import pandas as pd
import numpy as np
from utils.ModelingUtils import make_ml_target, MODEL_FEATURES
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import KNNImputer
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, auc
from sklearn.preprocessing import label_binarize, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFECV

In [2]:
data = pd.read_parquet('../data/prepared_data_with_weather.parquet')
data = make_ml_target(data)
data = data[MODEL_FEATURES + ['ML_TARGET']]

In [4]:
y = data['ML_TARGET'].values
X = data.drop('ML_TARGET', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

numeric_features = X_train.select_dtypes(include=['number']).columns
categorical_features = X_train.select_dtypes(exclude=['number']).columns

In [5]:
numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer()),
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

pipe = Pipeline([
    ('preprocessor', preprocessor),
])

In [6]:
pipe.fit(X_train)

In [None]:
X_train_processed = pipe.transform(X_train)
X_test_processed = pipe.transform(X_test)

In [None]:
estimator = RandomForestClassifier(random_state=123)

# Create RFECV object
rfecv = RFECV(estimator=estimator, step=1, cv=5, scoring='accuracy')

# Fit RFECV to training data
rfecv.fit(X_train_processed, y_train)

In [7]:
# model = CatBoostClassifier(max_depth=10, loss_function='MultiClass', random_seed=123)
# model.fit(X_train, y_train)

In [38]:
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)

In [43]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.99      0.93    193326
           1       0.57      0.10      0.17     26277
           2       0.54      0.11      0.19      6840
           3       0.68      0.05      0.10      1199

    accuracy                           0.86    227642
   macro avg       0.66      0.32      0.35    227642
weighted avg       0.82      0.86      0.81    227642


In [45]:
roc_auc_score(y_test, y_prob, multi_class='ovr', average='macro')

0.8807862587345204

In [42]:
fpr = dict()
tpr = dict()
roc_auc = dict()
n_classes = len(model.classes_)

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test, y_prob[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_prob.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# Plot ROC curve for each class
plt.figure(figsize=(8, 6))
lw = 2
for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], lw=lw,
             label='ROC curve of class {0} (area = {1:0.2f})'
             ''.format(i, roc_auc[i]))

# Plot micro-average ROC curve
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)

plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Multiclass Classification')
plt.legend(loc="lower right")
plt.show()

ValueError: multiclass format is not supported

In [41]:
y_test

array([0, 0, 1, ..., 0, 0, 0])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
model = RandomForestClassifier(random_state=123)

param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

cv = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')
cv.fit(X_train, y_train)