In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the dataset and separate the predictors from the regression target.
df = pd.read_csv('insurance.csv')
X = df.drop(columns='charges')
y = df['charges']

# Partition the predictor columns by dtype so that every column lands in
# exactly one group.  Matching only the literal dtype names int64/float64/object
# would leave columns of any other dtype (e.g. int32, bool, category, string)
# in neither list, and the ColumnTransformer built from these lists would then
# drop them without warning (its default is remainder='drop').
numeric_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(exclude='number').columns

# Preprocessing: standardize the numeric columns and one-hot encode the
# categorical ones, each transformer being applied to its own column group.
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = (
    'cat',
    OneHotEncoder(handle_unknown='ignore'),
    categorical_features,
)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Full model: the preprocessing stage followed by a seeded random forest.
forest = RandomForestRegressor(random_state=42)
rf = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', forest)])

# Candidate values for each random-forest hyperparameter explored by the search.
n_estimators_options = [50, 100, 150]
max_depth_options = [3, 5, 7, 10]
min_samples_split_options = [2, 5, 10]
min_samples_leaf_options = [1, 2, 4]

# Keys use the '<step>__<parameter>' form so they address the pipeline's
# 'regressor' step.
param_dist = {
    'regressor__n_estimators': n_estimators_options,
    'regressor__max_depth': max_depth_options,
    'regressor__min_samples_split': min_samples_split_options,
    'regressor__min_samples_leaf': min_samples_leaf_options,
}

# Hold out 30% of the rows for the final evaluation (seeded split).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Randomized hyperparameter search: 20 sampled candidates, 5-fold
# cross-validation on the training portion, candidates ranked by R^2.
search_options = dict(
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    scoring='r2',
    random_state=42,
)
rf_tuned = RandomizedSearchCV(rf, **search_options)
rf_tuned.fit(X_train, y_train)

# Report MAE, RMSE and R^2 of the tuned model on the held-out test rows.
y_pred_tuned = rf_tuned.predict(X_test)
print(f"Tuned Random Forest: MAE = {mean_absolute_error(y_test, y_pred_tuned):.2f}, RMSE = {np.sqrt(mean_squared_error(y_test, y_pred_tuned)):.2f}, R2 = {r2_score(y_test, y_pred_tuned):.2f}")

# Show the hyperparameter combination the search selected.
print("Best parameters:", rf_tuned.best_params_)

Tuned Random Forest: MAE = 2538.26, RMSE = 4359.07, R2 = 0.87
Best parameters: {'regressor__n_estimators': 50, 'regressor__min_samples_split': 10, 'regressor__min_samples_leaf': 2, 'regressor__max_depth': 5}
