# Cross validation

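Nested k-fold cross-validation as an alternative assessment method: an outer k-fold loop estimates the test error, while an inner k-fold grid search tunes the hyperparameters on each outer training split.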


In [1]:
# Imports and data loading
import pandas as pd
import constants
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
import numpy as np


# Load the preprocessed dataset (path is relative to the working directory).
# Its columns are later split into features and targets at
# constants.FEATURES_NUM.
adg_data = pd.read_csv('variables/processed.csv')


In [None]:
# Column layout: the first constants.FEATURES_NUM columns are used as the
# feature matrix, all remaining columns are used as targets.
_n_features = constants.FEATURES_NUM
X, y = adg_data.iloc[:, :_n_features], adg_data.iloc[:, _n_features:]

# Outer cross-validation splitter: shuffled 4-fold with a fixed seed so the
# folds are reproducible between runs.
outer_cv = KFold(random_state=42, shuffle=True, n_splits=4)

# Short labels of the evaluated setups. They are matched by position with the
# entries of `pipelines` and `param_grids`.
models = [
    "LR",
    "Lasso",
    "Ridge",
    "KNN",
    "RF",
    "RF_all",
]

# Build the candidate pipelines (one per label in `models`, same order).
# Every pipeline standardises the features first. All but the last one then
# keep only the features chosen by a Lasso-based SelectFromModel before the
# final estimator is fitted.
_final_estimators = [
    LinearRegression(),
    Lasso(random_state=12),
    Ridge(random_state=12),
    KNeighborsRegressor(),
    RandomForestRegressor(random_state=12),
]

pipelines = [
    Pipeline([
        ('scaler', StandardScaler()),
        ('feature_select', SelectFromModel(Lasso(random_state=12))),
        ('model', estimator),
    ])
    for estimator in _final_estimators
]

# Last setup: random forest on all scaled features (no feature selection).
pipelines.append(
    Pipeline([
        ('scaler', StandardScaler()),
        ('model', RandomForestRegressor(random_state=12)),
    ])
)

# Hyperparameter grids searched by the inner cross-validation.
# One entry per pipeline, in the same order as `models` / `pipelines`;
# keys use the `<step>__<parameter>` convention of sklearn pipelines.
param_grids = [
    # LR: plain linear regression, nothing to tune
    {},
    # Lasso: regularization strength
    {
        'model__alpha': [0.01, 0.1, 1.0, 10.0],
    },
    # Ridge: regularization strength
    {
        'model__alpha': [0.1, 1.0, 10.0],
    },
    # KNN: neighbourhood size and distance metric.
    # The metric values must be distance-metric identifiers understood by
    # KNeighborsRegressor ('jaccard', 'hamming', ...), not the names of
    # sklearn.metrics scoring functions ('jaccard_score', 'hamming_loss', ...),
    # which make every fit fail.
    # NOTE(review): 'jaccard' treats inputs as boolean (non-zero == True);
    # confirm it is meaningful on standardised features.
    {
        'model__n_neighbors': [3, 5, 7, 9],
        'model__metric': ['jaccard', 'hamming', 'manhattan', 'cosine'],
    },
    # RF (after feature selection)
    {
        'model__max_depth': [5, 10, 20, 80, 90, 100],
        'model__max_features': [1, 2, 3],
        'model__n_estimators': [50, 80, 100],
    },
    # RF_all (all features): same grid as RF
    {
        'model__max_depth': [5, 10, 20, 80, 90, 100],
        'model__max_features': [1, 2, 3],
        'model__n_estimators': [50, 80, 100],
    },
]

# Result tables: one row per target column, one column per evaluated setup.
# mae_df holds the mean outer-fold MAE, std_df the mean outer-fold standard
# deviation of the residuals.
mae_df = pd.DataFrame(index=y.columns, columns=models)
std_df = pd.DataFrame(index=y.columns, columns=models)

# Store results
# NOTE(review): never populated in this cell; kept in case other cells use it.
outer_results = []

# `models`, `pipelines` and `param_grids` are matched by position, so a length
# mismatch would silently pair the wrong grid with a pipeline.
if not (len(models) == len(pipelines) == len(param_grids)):
    raise ValueError(
        "models, pipelines and param_grids must have the same length, got "
        f"{len(models)}, {len(pipelines)} and {len(param_grids)}"
    )

# Inner CV splitter used for hyperparameter tuning. It is loop-invariant:
# with a fixed integer seed every split() call yields the same partition.
inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)

# Iterate over every target column instead of a hard-coded count, so the
# cell keeps working when the number of targets changes.
for i, target in enumerate(y.columns):
    y_i = y.iloc[:, i]  # positional access also copes with duplicate labels
    tmp_maes = []
    tmp_stds = []
    for model, pipeline, param_grid in zip(models, pipelines, param_grids):
        print(f"Model: {model}, Target: {target}")
        tmp_maes_folds = []
        tmp_stds_folds = []
        # Outer CV loop: each fold is held out once for assessment
        for train_idx, test_idx in outer_cv.split(X, y_i):
            # Split data into training and test sets for this fold
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y_i.iloc[train_idx], y_i.iloc[test_idx]

            # Inner CV: hyperparameter tuning on the outer training split only
            # (GridSearchCV clones the pipeline, so reusing it is safe).
            grid_search = GridSearchCV(
                pipeline,
                param_grid,
                cv=inner_cv,
                scoring='neg_mean_absolute_error',
            )
            grid_search.fit(X_train, y_train)

            # Evaluate the refitted best configuration on the outer test set
            best_model = grid_search.best_estimator_
            print(f"Best Model: {best_model}")
            y_pred = best_model.predict(X_test)
            mae = mean_absolute_error(y_test, y_pred)
            std = np.std(y_test - y_pred)  # spread of the residuals

            tmp_maes_folds.append(mae)
            tmp_stds_folds.append(std)

            # Print results for this outer fold
            print(f"Fold MAE: {mae:.4f}, Best Params: {grid_search.best_params_}")

        # Aggregate the outer folds for this (target, setup) pair
        mean_mae = np.mean(tmp_maes_folds)
        tmp_maes.append(mean_mae)
        tmp_stds.append(np.mean(tmp_stds_folds))
        print(f"Mean MAE across folds: {mean_mae:.4f}")

    # One full row (all setups) per target
    mae_df.iloc[i] = tmp_maes
    std_df.iloc[i] = tmp_stds
    




In [27]:
import os

startcol = 0
output_path = "test_results_cv.xlsx"

# Append to an existing workbook, otherwise create a new one. Append mode
# alone fails when the file is missing or is an empty (0-byte) placeholder,
# and raises on a re-run once the sheets already exist. No manual file
# creation is needed beforehand.
if os.path.isfile(output_path) and os.path.getsize(output_path) > 0:
    writer_kwargs = {"mode": "a", "if_sheet_exists": "replace"}
else:
    writer_kwargs = {"mode": "w"}

with pd.ExcelWriter(output_path, engine='openpyxl', **writer_kwargs) as writer:
    # Write each DataFrame to a separate sheet.
    # NOTE(review): index=False drops the row labels (the target column
    # names); confirm that the unlabeled layout is intended.
    mae_df.to_excel(writer, sheet_name='MAE', startcol=startcol, index=False)
    std_df.to_excel(writer, sheet_name='STD', startcol=startcol, index=False)
