# Task 5: Hyperparameter Tuning

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Location of the dataset and name of the column to predict
# (point DATA_PATH at your own processed file if it differs)
DATA_PATH = 'Global_Health_Statistics.csv'
TARGET_COLUMN = 'Mortality Rate (%)'

df = pd.read_csv(DATA_PATH)

# The target is the mortality-rate column; every remaining column is a feature
y = df[TARGET_COLUMN]
X = df.drop(columns=[TARGET_COLUMN])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Partition the feature columns by dtype: int64/float64 columns are treated as
# numeric, object columns as categorical.
# NOTE(review): a column of any other dtype ends up in neither list and is
# therefore not routed to a transformer below - confirm the data has none.
numeric_features = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(X.select_dtypes(include=['object']).columns)

# Numeric columns: standardise
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical columns: one-hot encode; handle_unknown='ignore' means categories
# not seen during fit do not raise at transform time
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its own transformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Base estimator to tune (swap in XGBRegressor if needed); seeded for reproducibility
model = RandomForestRegressor(random_state=42)

# Chain preprocessing and the regressor into a single estimator so the search
# below tunes and cross-validates them together
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# Hyperparameter grid. Keys carry the 'model__' prefix so that each value is
# forwarded to the 'model' step of the pipeline.
param_grid = {
    'model__n_estimators': [100, 200, 300],  # Number of trees in the forest
    'model__max_depth': [None, 10, 20, 30],   # Maximum depth of each tree (None = unlimited)
    'model__min_samples_split': [2, 5, 10],   # Minimum number of samples required to split a node
    'model__min_samples_leaf': [1, 2, 4],     # Minimum number of samples required at each leaf node
    # Number of features to consider at each split. 1.0 means "all features";
    # it replaces 'auto', which meant the same thing for RandomForestRegressor
    # but was deprecated in scikit-learn 1.1 and removed in 1.3, so current
    # releases reject it and every candidate using it would fail to fit.
    'model__max_features': [1.0, 'sqrt']
}

# Exhaustive search over param_grid with 5-fold cross-validation on the
# training data. Candidates are ranked by negated mean squared error (note:
# MSE, not RMSE), so a higher score is better. n_jobs=-1 uses all CPU cores.
# The grid has 3 * 4 * 3 * 3 * 2 = 216 combinations, i.e. 1080 cross-validation
# fits, so expect this cell to run for a long time.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

# Pull the winning parameter setting and the corresponding estimator out of the search
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Score the winner on the held-out test rows
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # report the error in the same units as the target
r2 = r2_score(y_test, y_pred)

# Report (one item per line)
print(
    "Best Hyperparameters:",
    best_params,
    "\nTest Set Performance:",
    f"RMSE: {rmse}",
    f"R² Score: {r2}",
    sep="\n",
)

# TODO: compare with the previous models from Task 4.
# Add your previous model results here for comparison
# Example:
# print("Previous Model (Random Forest): RMSE = 0.10, R² = 0.88")

I ran the above code for 95 minutes but did not get any output.

# Optimized Version

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Location of the dataset and name of the column to predict
# (point DATA_PATH at your own processed file if it differs)
DATA_PATH = 'Global_Health_Statistics.csv'
TARGET_COLUMN = 'Mortality Rate (%)'

# Load the file and keep a seeded random 50% of the rows to shorten the search
# (optional - drop the .sample(...) call to use the full dataset)
df = pd.read_csv(DATA_PATH).sample(frac=0.5, random_state=42)

# The target is the mortality-rate column; every remaining column is a feature
y = df[TARGET_COLUMN]
X = df.drop(columns=[TARGET_COLUMN])

# Hold out 20% of the sampled rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Partition the feature columns by dtype: int64/float64 columns are treated as
# numeric, object columns as categorical.
# NOTE(review): a column of any other dtype ends up in neither list and is
# therefore not routed to a transformer below - confirm the data has none.
numeric_features = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(X.select_dtypes(include=['object']).columns)

# Numeric columns: standardise
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical columns: one-hot encode; handle_unknown='ignore' means categories
# not seen during fit do not raise at transform time
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its own transformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Base estimator to tune: a seeded random forest that trains its trees on all CPU cores
model = RandomForestRegressor(random_state=42, n_jobs=-1)

# Chain preprocessing and the regressor into a single estimator so the search
# below tunes and cross-validates them together
pipeline = Pipeline([('preprocessor', preprocessor), ('model', model)])

# Candidate values to sample from. Keys carry the 'model__' prefix so that each
# value is forwarded to the 'model' step of the pipeline.
param_dist = dict(
    model__n_estimators=[100, 200, 300],      # number of trees in the forest
    model__max_depth=[None, 10, 20, 30],      # maximum depth of each tree (None = unlimited)
    model__min_samples_split=[2, 5, 10],      # minimum samples required to split a node
    model__min_samples_leaf=[1, 2, 4],        # minimum samples required at a leaf node
    model__max_features=['sqrt', 'log2'],     # number of features considered at each split
)

# Randomised search: instead of trying every combination, sample 10 parameter
# settings from param_dist (seeded) and evaluate each with 3-fold
# cross-validation on the training data - 30 cross-validation fits in total.
# Candidates are ranked by negated mean squared error (note: MSE, not RMSE),
# so a higher score is better. n_jobs=-1 uses all CPU cores.
random_search = RandomizedSearchCV(
    pipeline,
    param_dist,
    n_iter=10,  # lower this for a faster run
    scoring='neg_mean_squared_error',
    cv=3,  # fewer folds than the usual 5 to save time
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

# Pull the winning parameter setting and the corresponding estimator out of the search
best_params = random_search.best_params_
best_model = random_search.best_estimator_

# Score the winner on the held-out test rows
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # report the error in the same units as the target
r2 = r2_score(y_test, y_pred)

# Report (one item per line)
print(
    "Best Hyperparameters:",
    best_params,
    "\nTest Set Performance:",
    f"RMSE: {rmse}",
    f"R² Score: {r2}",
    sep="\n",
)

# TODO: compare with the previous models from Task 4.
# Add your previous model results here for comparison
# Example:
# print("Previous Model (Random Forest): RMSE = 0.10, R² = 0.88")