# **Training, testing and evaluation of linear and nonlinear models for GM**
@author: Ruijia & Zaylea

### **Linear models**


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Set random seed
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Load and preprocess data
df = pd.read_csv('a2009s_with_age_sex.csv')

# Remove rows with missing or zero Age
initial_rows = df.shape[0]
df = df[df['Age'].notna()]
df = df[df['Age'] != 0]
print(f"Removed {initial_rows - df.shape[0]} rows with Age = 0 or NaN")

# Split features and target
X = df.drop(columns=['CCID', 'Age'])
y = df['Age'].values

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split (80/20)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=RANDOM_SEED
)

# Define models and hyperparameter grids
param_grids = {
    "RidgeRegression": {'alpha': [0.01, 0.1, 1, 10, 100]},
    "LassoRegression": {'alpha': [0.01, 0.1, 1, 10, 100]},
    "ElasticNet": {'alpha': [0.1, 1, 10, 100], 'l1_ratio': [0.1, 0.5, 0.9]}

}

linear_models = {
    "RidgeRegression": Ridge(random_state=RANDOM_SEED, max_iter=100000),
    "LassoRegression": Lasso(random_state=RANDOM_SEED, max_iter=100000),
    "ElasticNet": ElasticNet(random_state=RANDOM_SEED, max_iter=100000)
}

# Cross-validation for best hyperparameters + final model training
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
results = {}

for model_name, model in linear_models.items():
    print(f"Model: {model_name}")
    fold_best_params = []
    fold_best_scores = []

    for fold_idx, (train_idx, val_idx) in enumerate(kf.split(X_train)):
        X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
        y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]

        grid_search = GridSearchCV(model, param_grids[model_name], cv=5)
        grid_search.fit(X_train_fold, y_train_fold)

        fold_best_params.append(grid_search.best_params_)
        fold_best_scores.append(grid_search.best_score_)

        print(f"Fold {fold_idx+1} Best Params: {grid_search.best_params_}, CV Score: {grid_search.best_score_:.4f}")

    # Select best parameters based on highest score
    best_index = np.argmax(fold_best_scores)
    selected_best_param = fold_best_params[best_index]
    print(f"Selected best parameters for final training: {selected_best_param}")

    # Train final model on full training set with selected parameters
    final_model = model.set_params(**selected_best_param)
    final_model.fit(X_train, y_train)
    y_pred = final_model.predict(X_test)

    # Evaluate on test set
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mean_age_gap = np.mean(np.abs(y_test - y_pred))

    results[model_name] = {
        'MSE': mse,
        'MAE': mae,
        'R²': r2,
        'Mean_Age_Gap': mean_age_gap
    }

# Print final evaluation results
print("\nModel Performance (5-Fold Cross-Validation):")
for model_name, metrics in results.items():
    print(f"{model_name} - MSE: {metrics['MSE']:.4f}, MAE: {metrics['MAE']:.4f}, R²: {metrics['R²']:.4f}, Mean Age Gap: {metrics['Mean_Age_Gap']:.4f}")

Removed 17 rows with Age = 0 or NaN
Model: RidgeRegression
Fold 1 Best Params: {'alpha': 100}, CV Score: 0.7196
Fold 2 Best Params: {'alpha': 100}, CV Score: 0.6912
Fold 3 Best Params: {'alpha': 100}, CV Score: 0.7591
Fold 4 Best Params: {'alpha': 100}, CV Score: 0.7058
Fold 5 Best Params: {'alpha': 100}, CV Score: 0.7430
Selected best parameters for final training: {'alpha': 100}
Model: LassoRegression
Fold 1 Best Params: {'alpha': 0.1}, CV Score: 0.6698
Fold 2 Best Params: {'alpha': 1}, CV Score: 0.6669
Fold 3 Best Params: {'alpha': 1}, CV Score: 0.7191
Fold 4 Best Params: {'alpha': 1}, CV Score: 0.6820
Fold 5 Best Params: {'alpha': 1}, CV Score: 0.6963
Selected best parameters for final training: {'alpha': 1}
Model: ElasticNet
Fold 1 Best Params: {'alpha': 1, 'l1_ratio': 0.1}, CV Score: 0.7397
Fold 2 Best Params: {'alpha': 1, 'l1_ratio': 0.1}, CV Score: 0.7339
Fold 3 Best Params: {'alpha': 1, 'l1_ratio': 0.1}, CV Score: 0.7807
Fold 4 Best Params: {'alpha': 1, 'l1_ratio': 0.1}, CV Sc

### **Nonlinear models**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import pearsonr, spearmanr

In [None]:
# Set random seed
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Load and preprocess data
df = pd.read_csv('a2009s_with_age_sex.csv')

# Remove rows with missing or zero Age
initial_rows = df.shape[0]
df = df[df['Age'].notna()]
df = df[df['Age'] != 0]
print(f"Removed {initial_rows - df.shape[0]} rows with Age = 0 or NaN")

# Select features (including Sex if categorical)
X = df.drop(columns=['CCID', 'Age'])
y = df['Age'].values

# Encode sex if necessary
if "Sex" in X.columns and (X['Sex'].dtype == object or X['Sex'].nunique() <= 3):
    le = LabelEncoder()
    X['Sex'] = le.fit_transform(X['Sex'])

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split (80/20)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=RANDOM_SEED
)

# Define models and hyperparameter grids
param_grids = {
    "RandomForest": {'n_estimators': [50, 100, 150],
                     'max_depth': [10, 20, 30]},
    "XGBoost": {'n_estimators': [50, 100, 150],
                'learning_rate': [0.05, 0.1, 0.2],
                'max_depth': [3, 5, 7]}
}

nonlinear_models = {
    "RandomForest": RandomForestRegressor(random_state=RANDOM_SEED),
    "XGBoost": XGBRegressor(random_state=RANDOM_SEED, verbosity=0)
}

# Cross-validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
results = {}

for model_name, model in nonlinear_models.items():
    print(f"Model: {model_name}")
    fold_best_params = []
    fold_best_scores = []

    for fold_idx, (train_idx, val_idx) in enumerate(kf.split(X_train)):
        X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
        y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]

        search = RandomizedSearchCV(
            model,
            param_distributions=param_grids[model_name],
            n_iter=5,
            cv=3,
            scoring='neg_mean_absolute_error',
            n_jobs=-1,
            random_state=RANDOM_SEED
        )
        search.fit(X_train_fold, y_train_fold)

        fold_best_params.append(search.best_params_)
        fold_best_scores.append(search.best_score_)

        print(f"Fold {fold_idx+1} Best Params: {search.best_params_}, "
              f"CV Score: {search.best_score_:.4f}")

    # Select best parameters based on highest CV score
    best_index = np.argmax(fold_best_scores)
    selected_best_param = fold_best_params[best_index]
    print(f"Selected best parameters for final training: {selected_best_param}")

    # Train final model on full training set with selected parameters
    final_model = model.set_params(**selected_best_param)
    final_model.fit(X_train, y_train)
    y_pred = final_model.predict(X_test)

    # Evaluate on test set
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mean_age_gap = np.mean(np.abs(y_test - y_pred))
    pcc, _ = pearsonr(y_test, y_pred)
    spcc, _ = spearmanr(y_test, y_pred)

    results[model_name] = {
        'MSE': mse,
        'MAE': mae,
        'R²': r2,
        'Mean_Age_Gap': mean_age_gap,
        'PCC': pcc,
        'SPCC': spcc
    }

# Print final evaluation results
print("\nModel Performance (5-Fold Cross-Validation):")
for model_name, metrics in results.items():
    print(f"{model_name} - MSE: {metrics['MSE']:.4f}, "
          f"MAE: {metrics['MAE']:.4f}, "
          f"R²: {metrics['R²']:.4f}, "
          f"Mean Age Gap: {metrics['Mean_Age_Gap']:.4f}, "
          f"PCC: {metrics['PCC']:.4f}, "
          f"SPCC: {metrics['SPCC']:.4f}")

Removed 17 rows with Age = 0 or NaN
Model: RandomForest
Fold 1 Best Params: {'n_estimators': 150, 'max_depth': 20}, CV Score: -9.3576
Fold 2 Best Params: {'n_estimators': 150, 'max_depth': 20}, CV Score: -9.3380
Fold 3 Best Params: {'n_estimators': 100, 'max_depth': 30}, CV Score: -9.2160
Fold 4 Best Params: {'n_estimators': 150, 'max_depth': 20}, CV Score: -9.1335
Fold 5 Best Params: {'n_estimators': 150, 'max_depth': 20}, CV Score: -8.9494
Selected best parameters for final training: {'n_estimators': 150, 'max_depth': 20}
Model: XGBoost
Fold 1 Best Params: {'n_estimators': 50, 'max_depth': 3, 'learning_rate': 0.1}, CV Score: -8.7096
Fold 2 Best Params: {'n_estimators': 50, 'max_depth': 3, 'learning_rate': 0.1}, CV Score: -8.5732
Fold 3 Best Params: {'n_estimators': 50, 'max_depth': 3, 'learning_rate': 0.1}, CV Score: -8.5439
Fold 4 Best Params: {'n_estimators': 50, 'max_depth': 3, 'learning_rate': 0.1}, CV Score: -8.3635
Fold 5 Best Params: {'n_estimators': 50, 'max_depth': 3, 'learn