In [71]:
import pandas as pd
import numpy as np
from utils.ModelingUtils import *
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, auc, make_scorer, accuracy_score, precision_score, f1_score
import joblib

# Notebook-wide display settings: never truncate columns, cap printed rows at 100.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

In [14]:
# Rebuild the label vector from the raw modelling table.
# make_ml_target_classification and MODEL_FEATURES come from utils.ModelingUtils.
data = pd.read_parquet('../data/final_data_to_modeling1105.parquet')
data = make_ml_target_classification(data)[MODEL_FEATURES]

y = data['ML_TARGET'].values
X = data.drop('ML_TARGET', axis=1)

# Only the label halves of the split are kept; the feature halves are discarded
# because the features are read from the preprocessed parquet files below.
# NOTE(review): assumes those files were produced with the same split
# (test_size=0.2, random_state=123) and the same row order — TODO confirm.
_, _, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

processed_df_train = pd.read_parquet('../data/preprocessed_for_modeling/train_data_1205.parquet')
processed_df_test = pd.read_parquet('../data/preprocessed_for_modeling/test_data_1205.parquet')

# Labels and features come from two independent sources, so fail fast here if
# their row counts disagree instead of deep inside the hyper-parameter search.
assert len(processed_df_train) == len(y_train), (
    f"train rows mismatch: {len(processed_df_train)} feature rows vs {len(y_train)} labels"
)
assert len(processed_df_test) == len(y_test), (
    f"test rows mismatch: {len(processed_df_test)} feature rows vs {len(y_test)} labels"
)

In [15]:
# Feature matrices: the preprocessed frames minus their stored target column.
X_train = processed_df_train.drop(columns=['ML_TARGET'])
X_test = processed_df_test.drop(columns=['ML_TARGET'])

In [40]:
# Candidate hyper-parameter values sampled by the randomized search.
param_grid = dict(
    # boosting schedule / tree shape
    learning_rate=[0.01, 0.05, 0.1, 0.3],
    max_depth=[3, 5, 7, 9],
    min_child_weight=[1, 3, 5, 7],
    # row / column subsampling fractions
    subsample=[0.5, 0.7, 0.9, 1.0],
    colsample_bytree=[0.5, 0.7, 0.9, 1.0],
    n_estimators=[100, 200, 300, 400],
    # L2 / L1 regularisation strengths
    reg_lambda=[0, 1e-5, 1e-2, 0.1],
    reg_alpha=[0, 1e-5, 1e-2, 0.1],
)

In [72]:
# Macro-averaged scorers; only f1_scorer is passed to the search in this cell.
f1_scorer = make_scorer(f1_score, average='macro')
precision_scorer = make_scorer(precision_score, average='macro')

# Base 4-class estimator handed to the search.
model = XGBClassifier(
    objective='multi:softmax',
    num_class=4,
    random_state=123,
)

# Randomized search: 50 sampled candidates x 2 folds, scored by macro-F1.
cv = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=50,
    scoring=f1_scorer,
    cv=2,
    random_state=123,
    verbose=3,
)
cv.fit(X_train, y_train)

Fitting 2 folds for each of 50 candidates, totalling 100 fits
[CV 1/2] END colsample_bytree=1.0, learning_rate=0.01, max_depth=9, min_child_weight=3, n_estimators=400, reg_alpha=0.1, reg_lambda=0.1, subsample=0.9;, score=0.305 total time= 2.1min
[CV 2/2] END colsample_bytree=1.0, learning_rate=0.01, max_depth=9, min_child_weight=3, n_estimators=400, reg_alpha=0.1, reg_lambda=0.1, subsample=0.9;, score=0.300 total time= 2.1min
[CV 1/2] END colsample_bytree=0.5, learning_rate=0.3, max_depth=9, min_child_weight=3, n_estimators=200, reg_alpha=0.01, reg_lambda=0.1, subsample=0.7;, score=0.435 total time=  55.4s
[CV 2/2] END colsample_bytree=0.5, learning_rate=0.3, max_depth=9, min_child_weight=3, n_estimators=200, reg_alpha=0.01, reg_lambda=0.1, subsample=0.7;, score=0.426 total time=  49.8s
[CV 1/2] END colsample_bytree=0.7, learning_rate=0.1, max_depth=9, min_child_weight=3, n_estimators=200, reg_alpha=0.1, reg_lambda=0.1, subsample=0.9;, score=0.376 total time=  52.0s
[CV 2/2] END colsam

In [74]:
# Winning hyper-parameters and their mean cross-validated score.
# The search was scored with the macro-F1 scorer, so best_score_ is a macro-F1,
# not an accuracy.
best_params = cv.best_params_
best_score = cv.best_score_

print("Best Parameters:", best_params)
print("Best CV macro-F1:", best_score)

# Evaluate the best model on the test set
best_model = cv.best_estimator_
y_pred = best_model.predict(X_test)

# precision_score defaults to average='binary', which raises ValueError on a
# multiclass target. Report real accuracy (matching the variable name and the
# printed label) plus macro-F1 so the test score is comparable with the CV score.
test_accuracy = accuracy_score(y_test, y_pred)
test_f1_macro = f1_score(y_test, y_pred, average='macro')
print("Test Accuracy:", test_accuracy)
print("Test macro-F1:", test_f1_macro)

In [75]:
# Display the best estimator selected by the search (cell output is its repr).
best_model

In [76]:
# Per-class precision / recall / F1 of the tuned model's test-set predictions.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.98      0.94    168214
           1       0.67      0.36      0.47     22011
           2       0.62      0.28      0.39      5142
           3       0.76      0.26      0.38      1042

    accuracy                           0.89    196409
   macro avg       0.74      0.47      0.55    196409
weighted avg       0.87      0.89      0.87    196409



In [77]:
# Feature importances of the tuned model, most important first.
(
    pd.DataFrame(
        dict(
            names=X_train.columns,
            importance=best_model.feature_importances_,
        )
    )
    .sort_values(by='importance', ascending=False)
)

Unnamed: 0,names,importance
5,cumsum_distances,0.023765
36,yearday_sin,0.021692
75,conditions_Snow,0.018933
77,conditions_Snow_Partially_cloudy,0.017651
78,conditions_Snow_Rain,0.016608
34,weekofyear_sin,0.016423
71,conditions_Rain,0.016063
81,conditions_Snow_Rain_Partially_cloudy,0.015878
76,conditions_Snow_Overcast,0.015543
74,conditions_Rain_Partially_cloudy,0.01538


In [78]:
# Persist the tuned estimator to disk with joblib.
joblib.dump(value=best_model, filename='models/multiclassifier0506_max_f1')

['models/multiclassifier0506_max_f1']

In [79]:
# Read the persisted estimator back from disk.
# joblib files are pickle-based: only load files from a trusted source.
xx = joblib.load(filename='models/multiclassifier0506_max_f1')

In [70]:
# Display the estimator reloaded from disk (cell output is its repr).
xx

In [38]:
# NOTE(review): hidden-state hazard. This cell's execution count (In [38]) is
# lower than that of the cell that rebinds `model` (In [72]), so the `model`
# used here was defined by a cell that is not in the notebook as it now stands.
# In this notebook `model` is only ever handed to RandomizedSearchCV, which
# fits clones and leaves the passed estimator unfitted, so on a fresh
# Restart & Run All this call would presumably fail. Confirm which estimator
# was intended (e.g. an untuned baseline) and fit it explicitly before predicting.
pred3 = model.predict(X_test)

In [39]:
# Per-class precision / recall / F1 for the `pred3` predictions on the test set.
print(classification_report(y_test, pred3))

              precision    recall  f1-score   support

           0       0.87      0.99      0.93    168214
           1       0.56      0.08      0.14     22011
           2       0.51      0.10      0.16      5142
           3       0.66      0.09      0.16      1042

    accuracy                           0.86    196409
   macro avg       0.65      0.32      0.35    196409
weighted avg       0.83      0.86      0.82    196409

