In [4]:
# Importing the necessary libraries
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
!pip install optuna
import optuna
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

# Load the healthexp dataset from seaborn
# This dataset contains health expenditure data
healthexp = sns.load_dataset('healthexp')

# Display the first 100 rows to understand the dataset structure
healthexp.head(100)

# Convert categorical columns to dummy/one-hot encoded variables
healthexp = pd.get_dummies(healthexp)

# Split the dataset into features (X) and target (y)
# 'Life_Expectancy' is our target variable
X = healthexp.drop(['Life_Expectancy'], axis=1)
y = healthexp['Life_Expectancy']

# Split the dataset into training and testing sets
# 80% for training, 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=19)

# Initialize and train a Random Forest Regressor
# Random Forest is a robust machine learning algorithm for regression
rfr = RandomForestRegressor(random_state=13)
rfr.fit(X_train, y_train)

# Predict on the test dataset
y_pred = rfr.predict(X_test)

# Evaluate the model performance
# Using metrics: Mean Absolute Error (MAE), Mean Squared Error (MSE), and R-squared (R2)
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("R2:", r2_score(y_test, y_pred))

# Define the objective function for Optuna
# This function tunes hyperparameters for the Random Forest Regressor
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 10, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 32)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 32)
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf
    )
    score = cross_val_score(model, X, y, n_jobs=-1, cv=5, scoring='neg_mean_squared_error').mean()
    return score

# Create an Optuna study
# The study optimizes the hyperparameters of Random Forest Regressor
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler(seed=42))

# Run the optimization process for 100 trials
study.optimize(objective, n_trials=100)

# Retrieve the best hyperparameters and their score
best_params = study.best_params
best_score = study.best_value
print(f"Best Hyperparameters: {best_params}")
print(f"Best Accuracy: {best_score:.3f}")

# Visualize the Optuna results
# These visualizations help understand the optimization process and parameter importance
optuna.visualization.plot_optimization_history(study)
optuna.visualization.plot_parallel_coordinate(study)
optuna.visualization.plot_slice(study, params=['n_estimators', 'max_depth', 'min_samples_leaf', 'min_samples_split'])
optuna.visualization.plot_param_importances(study)

# Extract the best hyperparameters
best_n_estimators = best_params['n_estimators']
best_max_depth = best_params['max_depth']
best_min_samples_split = best_params['min_samples_split']
best_min_samples_leaf = best_params['min_samples_leaf']

# Build and train a new Random Forest model using the best hyperparameters
best_model = RandomForestRegressor(
    n_estimators=best_n_estimators,
    max_depth=best_max_depth,
    min_samples_split=best_min_samples_split,
    min_samples_leaf=best_min_samples_leaf
)
best_model.fit(X_train, y_train)

# Predict with the tuned model
y_pred = best_model.predict(X_test)

# Evaluate the tuned model
print("Tuned MAE:", mean_absolute_error(y_test, y_pred))
print("Tuned MSE:", mean_squared_error(y_test, y_pred))
print("Tuned R2:", r2_score(y_test, y_pred))




[I 2025-01-18 08:21:34,084] A new study created in memory with name: no-name-c20fdb6e-a30f-45fc-8046-458be23ff83a


MAE: 0.2591818181818008
MSE: 0.10222285454545255
R2: 0.9910447583779421


[I 2025-01-18 08:21:38,787] Trial 0 finished with value: -4.502017696233201 and parameters: {'n_estimators': 437, 'max_depth': 48, 'min_samples_split': 24, 'min_samples_leaf': 20}. Best is trial 0 with value: -4.502017696233201.
[I 2025-01-18 08:21:40,888] Trial 1 finished with value: -5.17020291237105 and parameters: {'n_estimators': 240, 'max_depth': 16, 'min_samples_split': 3, 'min_samples_leaf': 28}. Best is trial 0 with value: -4.502017696233201.
[I 2025-01-18 08:21:46,473] Trial 2 finished with value: -5.6033026031825175 and parameters: {'n_estimators': 641, 'max_depth': 39, 'min_samples_split': 2, 'min_samples_leaf': 32}. Best is trial 0 with value: -4.502017696233201.
[I 2025-01-18 08:21:51,794] Trial 3 finished with value: -2.997774315886926 and parameters: {'n_estimators': 850, 'max_depth': 18, 'min_samples_split': 7, 'min_samples_leaf': 6}. Best is trial 3 with value: -2.997774315886926.
[I 2025-01-18 08:21:54,093] Trial 4 finished with value: -3.8149133428323845 and paramet

Best Hyperparameters: {'n_estimators': 358, 'max_depth': 34, 'min_samples_split': 2, 'min_samples_leaf': 2}
Best Accuracy: -1.864
Tuned MAE: 0.30958946668138504
Tuned MSE: 0.13820062801388486
Tuned R2: 0.9878929225593658
