# Task 8: Comparing Holdout Validation, Cross-Validation, and K-Fold Validation (20 Marks)

In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import (
    KFold,
    LeaveOneOut,
    cross_val_predict,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load the processed data
df = pd.read_csv('Global_Health_Statistics.csv')  # Replace with your processed data file

# Separate the regression target (y) from the predictor columns (X)
y = df['Mortality Rate (%)']
X = df.drop(columns=['Mortality Rate (%)'])

# Group predictor column names by dtype; each group gets its own preprocessing branch
numeric_features = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(X.select_dtypes(include=['object']).columns)

# Numeric branch: standardize each column
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical branch: one-hot encode, tolerating categories not seen during fit
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each column group through its branch
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Regressor; the fixed random_state keeps repeated runs reproducible
model = RandomForestRegressor(random_state=42, n_jobs=-1)

# Chain preprocessing and model so both are refit together on every training split
pipeline = Pipeline([('preprocessor', preprocessor), ('model', model)])

# Function to evaluate model performance
def evaluate_model(model, X, y):
    """Score a fitted estimator on the supplied data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature data passed to ``model.predict``.
        y: True target values to compare the predictions against.

    Returns:
        A ``(rmse, r2)`` tuple: the square root of the mean squared error
        and the R² score of the predictions.
    """
    predictions = model.predict(X)
    mse = mean_squared_error(y, predictions)
    return np.sqrt(mse), r2_score(y, predictions)

# 1. Holdout Validation (80-20 split)
# One random split: the model is fit on 80% of the rows and scored on the other 20%.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)
holdout_rmse, holdout_r2 = evaluate_model(pipeline, X_test, y_test)

# 2. K-Fold Cross-Validation (k=5)
# Both metrics are collected in a single cross_validate pass so the pipeline is
# fit k times in total instead of k times per metric.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
kfold_results = cross_validate(
    pipeline, X, y, cv=kfold, scoring=('neg_mean_squared_error', 'r2')
)
# The scorer reports negated MSE (higher is better); flip the sign to get per-fold MSE.
kfold_scores_rmse = -kfold_results['test_neg_mean_squared_error']
kfold_rmse = np.sqrt(kfold_scores_rmse.mean())
kfold_r2 = kfold_results['test_r2'].mean()

# 3. Leave-One-Out Cross-Validation (LOOCV)
# R² is undefined on a one-sample test fold (scikit-learn returns NaN), so it
# cannot be averaged per fold. Instead, collect the out-of-fold prediction for
# every row in one pass and score the pooled predictions.
# NOTE: this still requires one pipeline fit per row of X.
loocv = LeaveOneOut()
loocv_pred = cross_val_predict(pipeline, X, y, cv=loocv)
# Each fold holds out one row, so the per-fold MSE is that row's squared error.
loocv_scores_rmse = (np.asarray(y) - loocv_pred) ** 2
loocv_rmse = np.sqrt(loocv_scores_rmse.mean())
loocv_r2 = r2_score(y, loocv_pred)

# Print results
print("Holdout Validation:")
print(f"RMSE: {holdout_rmse}, R²: {holdout_r2}")

print("\nK-Fold Cross-Validation (k=5):")
print(f"RMSE: {kfold_rmse}, R²: {kfold_r2}")

print("\nLeave-One-Out Cross-Validation (LOOCV):")
print(f"RMSE: {loocv_rmse}, R²: {loocv_r2}")

KeyboardInterrupt: 


<h2>Holdout Validation:</h2>

The dataset is split into training (80%) and test (20%) sets.

The model is trained on the training set and evaluated on the test set.

<h2>K-Fold Cross-Validation:</h2>

The dataset is divided into k=5 folds.

The model is trained and evaluated k times, with each fold used as the test set once.

The average RMSE and R² are calculated.

<h2>Leave-One-Out Cross-Validation (LOOCV):</h2>

Each data point is used as a test set once, and the model is trained on the remaining data.

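The squared errors of all held-out points are pooled to compute the RMSE. R² is not defined for a single-sample test set, so it must be computed from the pooled out-of-fold predictions rather than averaged per fold.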

<h2>Performance Metrics:</h2>

RMSE: The square root of the mean squared prediction error, expressed in the same units as the target; lower is better.

R²: Measures the proportion of variance explained by the model.

Bias-Variance Tradeoff
<h2>Holdout Validation:</h2>

Advantages: Simple and fast.

Disadvantages: The performance estimate has high variance because it depends on a single train-test split.

Bias-Variance: Moderate bias, high variance.

<h2>K-Fold Cross-Validation:</h2>

Advantages: Reduces variance by averaging performance across multiple folds.

Disadvantages: Computationally expensive.

Bias-Variance: Low bias, moderate variance.

<h2>Leave-One-Out Cross-Validation (LOOCV):</h2>

Advantages: Uses nearly all of the data (n − 1 of n samples) for training in each iteration, reducing bias.

Disadvantages: Extremely computationally expensive.

Bias-Variance: Very low bias, very high variance.

Final Recommendation
Best Method: K-Fold Cross-Validation (k=5).

It provides a good balance between bias and variance.

It is computationally feasible and provides reliable performance estimates.

Holdout Validation can be used for quick testing, but it is less reliable.

LOOCV is too computationally expensive for large datasets and provides little additional benefit over K-Fold.