# Task 4: Model Selection & Training

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the processed data
df = pd.read_csv('Global_Health_Statistics.csv')  # Replace with your processed data file

# Define features (X) and target (y)
# 'Mortality Rate (%)' is used as the target variable for regression
X = df.drop(columns=['Mortality Rate (%)'])
y = df['Mortality Rate (%)']

# Split data into training (80%) and test (20%) sets; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Partition the feature columns into numeric and non-numeric groups.
# Selecting 'number' and its complement (instead of hard-coding int64/float64/object)
# guarantees every column lands in exactly one group; ColumnTransformer drops any
# column that is not listed in a transformer (remainder='drop' is its default),
# so a bool/int32/category column would otherwise vanish silently.
numeric_features = X.select_dtypes(include='number').columns.tolist()
categorical_features = X.select_dtypes(exclude='number').columns.tolist()

# Preprocessing for numeric features: standardize to zero mean / unit variance
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Preprocessing for categorical features: one-hot encode; categories not seen
# during fit are encoded as all zeros instead of raising at transform time
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Candidate regressors, keyed by the display name used in the results table
regression_models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
    'Neural Network': MLPRegressor(random_state=42, max_iter=1000)
}

# Candidate classifiers, keyed by the display name used in the results table
classification_models = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
    'Neural Network': MLPClassifier(random_state=42, max_iter=1000)
}

# Function to evaluate regression models
def evaluate_regression(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` behind the module-level ``preprocessor`` and score it.

    Args:
        model: Unfitted regressor to place after the preprocessing step.
        X_train, y_train: Training features and target.
        X_test, y_test: Hold-out features and target.

    Returns:
        Tuple ``(rmse, r2, cv_rmse)``: RMSE and R² on the hold-out split, and
        the square root of the mean 5-fold cross-validated MSE on the
        training split.
    """
    estimator = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ])

    # Hold-out evaluation: fit on the training split, predict the test split.
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)

    holdout_mse = mean_squared_error(y_test, predictions)
    holdout_rmse = np.sqrt(holdout_mse)
    holdout_r2 = r2_score(y_test, predictions)

    # The scorer reports negated MSE per fold, so flip the sign of the mean
    # before taking the root.
    fold_neg_mse = cross_val_score(
        estimator,
        X_train,
        y_train,
        cv=5,
        scoring='neg_mean_squared_error',
    )
    crossval_rmse = np.sqrt(-fold_neg_mse.mean())

    return holdout_rmse, holdout_r2, crossval_rmse

# Function to evaluate classification models
def evaluate_classification(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` behind the module-level ``preprocessor`` and score it.

    Args:
        model: Unfitted classifier to place after the preprocessing step.
        X_train, y_train: Training features and class labels.
        X_test, y_test: Hold-out features and class labels.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, cv_accuracy)``. Precision,
        recall and F1 are support-weighted averages over classes on the
        hold-out split; ``cv_accuracy`` is the mean 5-fold cross-validated
        accuracy on the training split.
    """
    # Create a pipeline with preprocessor and model
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

    # Train the model
    pipeline.fit(X_train, y_train)

    # Predict on test set
    y_pred = pipeline.predict(X_test)

    # Calculate metrics. zero_division=0 scores a class with no predicted (or
    # no true) samples as 0 without emitting UndefinedMetricWarning.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    # Cross-validation
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')
    cv_accuracy = cv_scores.mean()

    return accuracy, precision, recall, f1, cv_accuracy

# Evaluate regression models
regression_results = {}
for name, model in regression_models.items():
    rmse, r2, cv_rmse = evaluate_regression(model, X_train, y_train, X_test, y_test)
    regression_results[name] = {'RMSE': rmse, 'R²': r2, 'CV RMSE': cv_rmse}

# Display regression results immediately so they are not lost if a later
# step fails.
print("Regression Results:")
print(pd.DataFrame(regression_results).T)

# Evaluate classification models (if applicable)
# Classifiers raise ValueError when fitted on a continuous target, so only run
# them when the target actually holds discrete class labels.
from sklearn.utils.multiclass import type_of_target

classification_results = {}
target_type = type_of_target(y_train)
if target_type in ('binary', 'multiclass'):
    for name, model in classification_models.items():
        accuracy, precision, recall, f1, cv_accuracy = evaluate_classification(model, X_train, y_train, X_test, y_test)
        classification_results[name] = {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1-Score': f1, 'CV Accuracy': cv_accuracy}

print("\nClassification Results:")
if classification_results:
    print(pd.DataFrame(classification_results).T)
else:
    print(f"Skipped: target type is '{target_type}', which is not a classification target.")

# Optimized Version

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the dataset
df = pd.read_csv('Global_Health_Statistics.csv')  # Ensure this file is correctly formatted

# Define features (X) and target (y)
X = df.drop(columns=['Mortality Rate (%)'])
y = df['Mortality Rate (%)']

# Split data into training (80%) and test (20%) sets; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Partition the feature columns into numeric and non-numeric groups.
# Selecting 'number' and its complement (instead of hard-coding int64/float64/object)
# guarantees every column lands in exactly one group; ColumnTransformer drops any
# column that is not listed in a transformer (remainder='drop' is its default),
# so a bool/int32/category column would otherwise vanish silently.
numeric_features = X.select_dtypes(include='number').columns.tolist()
categorical_features = X.select_dtypes(exclude='number').columns.tolist()

# Preprocessing for numeric features: standardize to zero mean / unit variance
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Optimized categorical processing using Ordinal Encoding: one integer column
# per feature instead of one column per category. Categories not seen during
# fit are encoded as -1.
# NOTE(review): the integer codes carry an arbitrary order that Linear
# Regression and the MLP will treat as magnitude; tree-based models are less
# sensitive to this. Confirm the accuracy trade-off is acceptable.
categorical_transformer = Pipeline(steps=[
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Define regression models with optimizations
regression_models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_jobs=-1, random_state=42),  # Parallel processing
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42, n_jobs=-1),  # Parallel processing
    'Neural Network': MLPRegressor(random_state=42, max_iter=500, early_stopping=True)  # Faster training
}

# Define classification models with optimizations
classification_models = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_jobs=-1, random_state=42),  # Parallel processing
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=100, random_state=42, n_jobs=-1),  # Parallel processing
    'Neural Network': MLPClassifier(random_state=42, max_iter=500, early_stopping=True)  # Faster training
}

# Reduce cross-validation folds for faster execution
cv_folds = 3

# Function to evaluate regression models
def evaluate_regression(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` behind the module-level ``preprocessor`` and score it.

    Args:
        model: Unfitted regressor to place after the preprocessing step.
        X_train, y_train: Training features and target.
        X_test, y_test: Hold-out features and target.

    Returns:
        Tuple ``(rmse, r2, cv_rmse)``: RMSE and R² on the hold-out split, and
        the square root of the mean cross-validated MSE on the training split
        (``cv_folds`` folds, run with ``n_jobs=-1``).
    """
    estimator = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ])

    # Hold-out evaluation: fit on the training split, predict the test split.
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)

    holdout_mse = mean_squared_error(y_test, predictions)
    holdout_rmse = np.sqrt(holdout_mse)
    holdout_r2 = r2_score(y_test, predictions)

    # The scorer reports negated MSE per fold, so flip the sign of the mean
    # before taking the root.
    fold_neg_mse = cross_val_score(
        estimator,
        X_train,
        y_train,
        cv=cv_folds,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    crossval_rmse = np.sqrt(-fold_neg_mse.mean())

    return holdout_rmse, holdout_r2, crossval_rmse

# Function to evaluate classification models
def evaluate_classification(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` behind the module-level ``preprocessor`` and score it.

    Args:
        model: Unfitted classifier to place after the preprocessing step.
        X_train, y_train: Training features and class labels.
        X_test, y_test: Hold-out features and class labels.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, cv_accuracy)``. Precision,
        recall and F1 are support-weighted averages on the hold-out split with
        undefined per-class values scored as 0; ``cv_accuracy`` is the mean
        cross-validated accuracy on the training split (``cv_folds`` folds,
        run with ``n_jobs=-1``).
    """
    estimator = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ])

    # Hold-out evaluation: fit on the training split, predict the test split.
    estimator.fit(X_train, y_train)
    predicted = estimator.predict(X_test)

    holdout_accuracy = accuracy_score(y_test, predicted)

    # The three weighted metrics share the same arguments, so compute them in
    # one pass over the scoring functions.
    holdout_precision, holdout_recall, holdout_f1 = [
        metric(y_test, predicted, average='weighted', zero_division=0)
        for metric in (precision_score, recall_score, f1_score)
    ]

    fold_accuracy = cross_val_score(
        estimator,
        X_train,
        y_train,
        cv=cv_folds,
        scoring='accuracy',
        n_jobs=-1,
    )
    mean_cv_accuracy = fold_accuracy.mean()

    return holdout_accuracy, holdout_precision, holdout_recall, holdout_f1, mean_cv_accuracy

# Evaluate regression models
regression_results = {}
for name, model in regression_models.items():
    rmse, r2, cv_rmse = evaluate_regression(model, X_train, y_train, X_test, y_test)
    regression_results[name] = {'RMSE': rmse, 'R²': r2, 'CV RMSE': cv_rmse}

# Display regression results immediately so they are not lost if a later
# step fails.
print("\nRegression Results:")
print(pd.DataFrame(regression_results).T)

# Evaluate classification models (if applicable)
# Classifiers raise ValueError when fitted on a continuous target, so only run
# them when the target actually holds discrete class labels.
from sklearn.utils.multiclass import type_of_target

classification_results = {}
target_type = type_of_target(y_train)
if target_type in ('binary', 'multiclass'):
    for name, model in classification_models.items():
        accuracy, precision, recall, f1, cv_accuracy = evaluate_classification(model, X_train, y_train, X_test, y_test)
        classification_results[name] = {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1-Score': f1, 'CV Accuracy': cv_accuracy}

print("\nClassification Results:")
if classification_results:
    print(pd.DataFrame(classification_results).T)
else:
    print(f"Skipped: target type is '{target_type}', which is not a classification target.")
