<a href="https://colab.research.google.com/github/wiroel/my-repo/blob/main/project_hyperparameter_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
from google.colab import drive
import joblib

drive.mount('/content/drive', force_remount=True)

features = joblib.load('/content/drive/MyDrive/features.pkl')
X_train_resampled = joblib.load('/content/drive/MyDrive/X_train_resampled.pkl')
y_train_resampled = joblib.load('/content/drive/MyDrive/y_train_resampled.pkl')
X_test = joblib.load('/content/drive/MyDrive/X_test.pkl')
y_test = joblib.load('/content/drive/MyDrive/y_test.pkl')

Mounted at /content/drive


In [14]:
X_train_resampled


Unnamed: 0,Track_encoded,Starting Grid,Weather,Team_encoded,Driver_encoded
0,0,1.000000,1,7,70
1,0,6.000000,1,17,20
2,0,2.000000,1,7,100
3,0,10.000000,1,22,77
4,0,4.000000,1,12,37
...,...,...,...,...,...
15217,20,1.554513,0,7,101
15218,33,2.000000,1,18,58
15219,0,1.046058,0,7,55
15220,30,2.789354,0,0,29


In [2]:
features

['Track_encoded', 'Starting Grid', 'Weather', 'Team_encoded', 'Driver_encoded']

# Hyperparameter tuning on random forest


In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

rf = RandomForestClassifier(random_state=42)

param_grid ={
    'n_estimators': [100, 200],
    'max_depth': [5, 10, 20],
    'min_samples_split': [2,5,10],
    'min_samples_leaf': [1,2,4],
    'class_weight': ['balanced']

}

grid_search = GridSearchCV(
    estimator=rf,
    param_grid = param_grid,
    cv=5,
    n_jobs=1,
    verbose=2,
    scoring = 'f1'
)

grid_search.fit(X_train_resampled, y_train_resampled)

best_rf = grid_search.best_estimator_
print('Best params: ', grid_search.best_params_)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
[CV] END class_weight=balanced, max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END class_weight=balanced, max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.5s
[CV] END class_weight=balanced, max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.5s
[CV] END class_weight=balanced, max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.5s
[CV] END class_weight=balanced, max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END class_weight=balanced, max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.1s
[CV] END class_weight=balanced, max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.1s
[CV] END class_weight=balanced, max_depth=5, min_samples_leaf=1, min_samples_

so i have to replace the above parameters with the best params that i got after tuning

# # adjusting decision threshold

In [16]:


final_rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=1,
    class_weight='balanced',
    random_state=42
)


final_rf.fit(X_train_resampled, y_train_resampled)


# this is for identifying the threshold for y_prob as the default is 0.5, this gives perffect threshold

from sklearn.metrics import precision_recall_curve

y_probs= final_rf.predict_proba(X_test)[:, 1]

precision, recall, thresholds = precision_recall_curve(y_test, y_probs)


f1_score = 2 * (precision * recall) / (precision+recall)
best_threshold = thresholds[f1_score.argmax()]

print(f"Best threshold: {best_threshold}")



#y_pred = final_rf.predict(X_test)

y_pred = (y_probs >= best_threshold).astype(int)

from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Best threshold: 0.4483838065529241
              precision    recall  f1-score   support

           0       0.98      0.96      0.97      1254
           1       0.41      0.56      0.47        66

    accuracy                           0.94      1320
   macro avg       0.69      0.76      0.72      1320
weighted avg       0.95      0.94      0.94      1320

[[1200   54]
 [  29   37]]


# hyperparameter tuning for xgboost

In [17]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

In [18]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3,5,7,10],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'scale_pos_weight':[1,5,10,20]
}

xgb = XGBClassifier(eval_metric='logloss')

random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid,
    n_iter=50,
    cv=5,
    random_state=42,
    scoring='f1', verbose=2, n_jobs=1
)

random_search.fit(X_train_resampled.to_numpy(), y_train_resampled)

print('Best parameters: ', random_search.best_params_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END colsample_bytree=0.6, gamma=0.1, learning_rate=0.1, max_depth=10, n_estimators=300, scale_pos_weight=10, subsample=1.0; total time=   3.1s
[CV] END colsample_bytree=0.6, gamma=0.1, learning_rate=0.1, max_depth=10, n_estimators=300, scale_pos_weight=10, subsample=1.0; total time=   0.4s
[CV] END colsample_bytree=0.6, gamma=0.1, learning_rate=0.1, max_depth=10, n_estimators=300, scale_pos_weight=10, subsample=1.0; total time=   0.3s
[CV] END colsample_bytree=0.6, gamma=0.1, learning_rate=0.1, max_depth=10, n_estimators=300, scale_pos_weight=10, subsample=1.0; total time=   0.3s
[CV] END colsample_bytree=0.6, gamma=0.1, learning_rate=0.1, max_depth=10, n_estimators=300, scale_pos_weight=10, subsample=1.0; total time=   0.3s
[CV] END colsample_bytree=1.0, gamma=0, learning_rate=0.2, max_depth=3, n_estimators=300, scale_pos_weight=5, subsample=0.8; total time=   0.2s
[CV] END colsample_bytree=1.0, gamma=0, learning_rate=

In [19]:
final_xg = XGBClassifier(
    subsample=0.8,
    scale_pos_weight=1,
    n_estimators=100,
    max_depth=7, learning_rate=0.3,
    gamma=0.2, colsample_bytree=0.8
)

final_xg.fit(X_train_resampled.to_numpy(), y_train_resampled)


##again for identifying threshold for y_prob


from sklearn.metrics import precision_recall_curve

y_probs = final_xg.predict_proba(X_test.to_numpy())[:, 1]

precision, recall, threshold = precision_recall_curve(y_test, y_probs)

f1_score = 2 * (precision*recall)/(precision+recall)

best_threshold = threshold[f1_score.argmax()]

print(f"Best threshold: {best_threshold}")




#y_pred = final_xg.predict(X_test.to_numpy())

y_pred = (y_probs >= best_threshold).astype(int)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Best threshold: 0.6570426225662231
              precision    recall  f1-score   support

           0       0.97      0.95      0.96      1254
           1       0.34      0.48      0.40        66

    accuracy                           0.93      1320
   macro avg       0.66      0.72      0.68      1320
weighted avg       0.94      0.93      0.93      1320

[[1192   62]
 [  34   32]]


# joblib

In [20]:
import joblib

joblib.dump(final_rf, '/content/drive/MyDrive/final_rf.pkl')
joblib.dump(final_xg, '/content/drive/MyDrive/final_xg.pkl')



['/content/drive/MyDrive/final_xg.pkl']