# Best Parameters Finder

This notebook loads the data that will be used for the random forest model and uses Optuna to search for the hyperparameters that maximize the combined accuracy, precision, recall, and F1 scores. It then saves the best values for completeProject to use.

In [1]:
# Imports
# Packages for numerics + dataframes
import pandas as pd
import numpy as np

# Packages for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Packages for date conversions for calculating trip durations
from datetime import datetime
from datetime import date
from datetime import timedelta

# Packages for OLS, MLR, confusion matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics # For confusion matrix
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error

#machine learning packages 
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score,\
f1_score, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestClassifier

%store -r data
df_tree=data

In [2]:
pip install optuna

Note: you may need to restart the kernel to use updated packages.


In [3]:
import optuna
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

# Separate the predictor columns from the target column ('generous')
X, y = df_tree.drop('generous', axis=1), df_tree['generous']

# Hold out 20% of the rows as a validation set; the fixed seed keeps the split repeatable
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Modify the objective function to include multiple metrics and validation
def objective(trial):
    """Score one random forest configuration proposed by an Optuna trial.

    The trial suggests the forest's hyperparameters, a RandomForestClassifier
    is fitted on (X_train, y_train), and its predictions on X_valid are scored
    against y_valid.

    Parameters
    ----------
    trial : optuna trial object
        Supplies the hyperparameter values through its ``suggest_*`` methods.

    Returns
    -------
    The sum of accuracy, macro precision, macro recall and macro F1 on the
    validation set (the value the study optimizes).
    """
    # Search space; the suggest calls run in the order the dict entries are written
    hyperparams = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1500, step=100),
        'max_depth': trial.suggest_int('max_depth', 3, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
    }

    # Fixed random_state so the same hyperparameters always yield the same forest
    forest = RandomForestClassifier(random_state=42, **hyperparams)
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_valid)

    # Start from accuracy, then add the macro-averaged precision, recall and F1
    combined_score = accuracy_score(y_valid, predictions)
    for macro_metric in (precision_score, recall_score, f1_score):
        combined_score += macro_metric(y_valid, predictions, average='macro', zero_division=0)

    return combined_score

# Create the study that maximizes the combined score returned by `objective`.
# - The sampler is seeded with the same fixed seed used for the data split and the
#   model, so the hyperparameter suggestions come from a fixed random stream.
#   Because trials run in parallel (n_jobs=4), their completion order can still
#   vary between runs, so results are not guaranteed to be bit-for-bit identical.
# - NOTE: `objective` never calls trial.report() / trial.should_prune(), so the
#   MedianPruner has nothing to act on and no trial is actually stopped early.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(),
)

# Run 300 trials, up to 4 at a time
study.optimize(objective, n_trials=300, n_jobs=4)

[I 2023-12-08 23:37:51,121] A new study created in memory with name: no-name-a7f3c72e-963f-4e93-97fd-abae0d046214
[I 2023-12-08 23:37:52,787] Trial 3 finished with value: 1.850560656111753 and parameters: {'n_estimators': 300, 'max_depth': 22, 'min_samples_split': 15, 'min_samples_leaf': 9, 'max_features': 'log2', 'criterion': 'gini'}. Best is trial 3 with value: 1.850560656111753.
[I 2023-12-08 23:37:57,479] Trial 4 finished with value: 1.850560656111753 and parameters: {'n_estimators': 1500, 'max_depth': 11, 'min_samples_split': 10, 'min_samples_leaf': 9, 'max_features': 'log2', 'criterion': 'entropy'}. Best is trial 3 with value: 1.850560656111753.
[I 2023-12-08 23:38:54,129] Trial 5 finished with value: 2.253374100206091 and parameters: {'n_estimators': 100, 'max_depth': 5, 'min_samples_split': 7, 'min_samples_leaf': 2, 'max_features': None, 'criterion': 'entropy'}. Best is trial 5 with value: 2.253374100206091.
[I 2023-12-08 23:39:46,008] Trial 2 finished with value: 2.26290743382

In [15]:
# Fetch the best trial
best_trial = study.best_trial

# Print the best trial's values and parameters
print(f"Best Value: {best_trial.value}")
print("Best Parameters:")
for key, value in best_trial.params.items():
    print(f"{key}: {value}")

# Convert dict_items to a list of tuples
best_parms = list(best_trial.params.items())

# Store the best_parms variable
%store best_parms

Best Value: 2.4710319374918495
Best Parameters:
n_estimators: 100
max_depth: 28
min_samples_split: 15
min_samples_leaf: 6
max_features: None
criterion: entropy
Stored 'best_parms' (list)
