# Cross validation

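Nested k-fold cross-validation as an alternative model assessment method: an outer k-fold loop estimates the test error, while an inner k-fold grid search tunes the hyperparameters on each outer training split.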


In [1]:
# Imports and data loading
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import constants


# Load the dataset from variables/processed.csv. The next cell treats the first
# constants.FEATURES_NUM columns as input features and the remaining columns
# as prediction targets.
adg_data = pd.read_csv('variables/processed.csv')


In [None]:
# Column layout of adg_data: the leading FEATURES_NUM columns are the inputs,
# everything to the right of them is a prediction target.
n_feature_cols = constants.FEATURES_NUM
X = adg_data.iloc[:, :n_feature_cols]
y = adg_data.iloc[:, n_feature_cols:]

# Outer cross-validation splitter (shuffled, fixed seed for reproducibility)
outer_cv = KFold(n_splits=4, shuffle=True, random_state=42)

# Short labels of the model setups, in the same order as the pipelines
models = [
    "LR",
    "Lasso",
    "Ridge",
    "KNN",
    "RF",
    "RF_all",
]

# Build one pipeline per model setup. Every pipeline standardises the inputs;
# all except the first (LR) and the last (RF_all) additionally keep only the
# features selected by a Lasso-based SelectFromModel step.
def _build_pipeline(estimator, with_feature_selection=True):
    """Return scaler -> (optional Lasso feature selection) -> estimator."""
    steps = [('scaler', StandardScaler())]
    if with_feature_selection:
        # A fresh selector is created per pipeline so no state is shared.
        steps.append(('feature_select', SelectFromModel(Lasso(random_state=12))))
    steps.append(('model', estimator))
    return Pipeline(steps)


pipelines = [
    _build_pipeline(LinearRegression(), with_feature_selection=False),
    _build_pipeline(Lasso(random_state=12)),
    _build_pipeline(Ridge(random_state=12)),
    _build_pipeline(KNeighborsRegressor()),
    _build_pipeline(RandomForestRegressor(random_state=12)),
    _build_pipeline(RandomForestRegressor(random_state=12), with_feature_selection=False),
]

# Hyperparameter grids, one per pipeline and in the same order as `pipelines`
# (LR, Lasso, Ridge, KNN, RF, RF_all). Keys use the `<step>__<param>` syntax
# so that GridSearchCV routes them to the 'model' step of each pipeline.
param_grids = [
    # LR: nothing to tune
    {},
    # Lasso regularization strength
    {
        'model__alpha': [0.01, 0.1, 1.0, 10.0],
    },
    # Ridge regularization strength
    {
        'model__alpha': [0.1, 1.0, 10.0],
    },
    # KNN: neighbourhood size and distance metric.
    # The metric values must be distance-metric identifiers understood by
    # KNeighborsRegressor, not names of sklearn.metrics functions such as
    # 'jaccard_score' or 'cosine_similarity' (those make every fit fail).
    # NOTE(review): 'jaccard' is defined for boolean data — confirm it is
    # meaningful for the scaled features used here.
    {
        'model__n_neighbors': [3, 5, 7, 9],
        'model__metric': ['jaccard', 'hamming', 'manhattan', 'cosine'],
    },
    # RF on the Lasso-selected features
    {
        'model__max_depth': [5, 10, 20, 80, 90, 100],
        'model__max_features': [1, 2, 3],
        'model__n_estimators': [50, 80, 100],
    },
    # RF_all: same grid, applied to the random forest trained on all features
    {
        'model__max_depth': [5, 10, 20, 80, 90, 100],
        'model__max_features': [1, 2, 3],
        'model__n_estimators': [50, 80, 100],
    },
]

# Result tables: one row per target column of y.
# Model tables have one column per model setup, naive tables one per baseline.
_model_columns = ["LR", "Lasso", "Ridge", "KNN", "RF", "RF_all"]
_naive_columns = ["yavg", "xavg", "x"]

mae_df = pd.DataFrame(index=y.columns, columns=_model_columns)
std_df = pd.DataFrame(index=y.columns, columns=_model_columns)
mae_df_naive = pd.DataFrame(index=y.columns, columns=_naive_columns)
std_df_naive = pd.DataFrame(index=y.columns, columns=_naive_columns)

# Store results
outer_results = []

# Per-target accumulators for the naive baselines (filled by the CV loop)
naive_maesyavg, naive_stdsyavg = [], []
naive_maesx, naive_stdsx = [], []
naive_maesxavg, naive_stdsxavg = [], []

# Nested cross-validation.
# For every target column of y:
#   1. score three naive baselines on the outer folds, and
#   2. for every model setup, tune it with an inner 3-fold grid search on each
#      outer training split and score the tuned model on the outer test split.
# Per-target results are the fold averages of the MAE and of the standard
# deviation of the residuals.
assert len(models) == len(pipelines) == len(param_grids), \
    "models, pipelines and param_grids must be aligned one-to-one"

# The inner splitter does not depend on the fold, so it is created once.
inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)

# Iterate over all targets instead of assuming there are exactly 16 of them.
for i in range(y.shape[1]):
    y_i = y.iloc[:, i]
    target_name = y.columns[i]
    # Feature column that the "x" and "xavg" baselines use for this target.
    fq_feature = constants.FQ_FEATURES[i]

    # ---- Naive baselines -------------------------------------------------
    # They do not depend on the model, so they get their own pass over the
    # outer folds. outer_cv is expected to use a fixed integer seed, so every
    # call to split() yields the same folds as the model loop below.
    naive_maesyavg_folds = []
    naive_stdsyavg_folds = []
    naive_maesx_folds = []
    naive_stdsx_folds = []
    naive_maesxavg_folds = []
    naive_stdsxavg_folds = []
    for train_idx, test_idx in outer_cv.split(X, y_i):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y_i.iloc[train_idx], y_i.iloc[test_idx]
        n_test = y_test.shape[0]

        # "yavg": always predict the training mean of the target.
        yavg_pred = np.full(n_test, np.mean(y_train), dtype=float)
        naive_maesyavg_folds.append(mean_absolute_error(y_test, yavg_pred))
        naive_stdsyavg_folds.append(np.std(y_test - yavg_pred))

        # "x": predict the test value of the associated feature itself.
        x_naive = X_test[fq_feature]
        naive_maesx_folds.append(mean_absolute_error(y_test, x_naive))
        naive_stdsx_folds.append(np.std(y_test - x_naive))

        # "xavg": always predict the training mean of the associated feature.
        # Its residual std equals the "yavg" one, because subtracting a
        # constant does not change the spread.
        xavg_pred = np.full(n_test, np.mean(X_train[fq_feature]), dtype=float)
        naive_maesxavg_folds.append(mean_absolute_error(y_test, xavg_pred))
        naive_stdsxavg_folds.append(np.std(y_test - xavg_pred))

    # ---- Tuned models ----------------------------------------------------
    tmp_maes = []
    tmp_stds = []
    for model_name, pipeline, param_grid in zip(models, pipelines, param_grids):
        print(f"Model: {model_name}, Target: {target_name}")
        tmp_maes_folds = []
        tmp_stds_folds = []
        # Outer CV loop
        for train_idx, test_idx in outer_cv.split(X, y_i):
            # Split data into training and test sets for this fold
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y_i.iloc[train_idx], y_i.iloc[test_idx]

            # Inner CV: hyperparameter tuning on the outer training split only
            grid_search = GridSearchCV(pipeline, param_grid, cv=inner_cv, scoring='neg_mean_absolute_error')
            grid_search.fit(X_train, y_train)

            # Evaluate the refitted best estimator on the outer test split
            best_model = grid_search.best_estimator_
            print(f"Best Model: {best_model}")
            y_pred = best_model.predict(X_test)
            mae = mean_absolute_error(y_test, y_pred)
            std = np.std(y_test - y_pred)

            tmp_maes_folds.append(mae)
            tmp_stds_folds.append(std)

            # Print results for this outer fold
            print(f"Fold MAE: {mae:.4f}, Best Params: {grid_search.best_params_}")
        tmp_maes.append(np.mean(tmp_maes_folds))
        tmp_stds.append(np.mean(tmp_stds_folds))
        print(f"Mean MAE across folds: {np.mean(tmp_maes_folds):.4f}")

    # ---- Collect the per-target averages ----------------------------------
    naive_maesyavg.append(np.mean(naive_maesyavg_folds))
    naive_stdsyavg.append(np.mean(naive_stdsyavg_folds))
    naive_maesx.append(np.mean(naive_maesx_folds))
    naive_stdsx.append(np.mean(naive_stdsx_folds))
    naive_maesxavg.append(np.mean(naive_maesxavg_folds))
    naive_stdsxavg.append(np.mean(naive_stdsxavg_folds))
    mae_df.iloc[i] = tmp_maes
    std_df.iloc[i] = tmp_stds

# Fill the naive-baseline tables (one value per target, in column order of y)
mae_df_naive["yavg"] = naive_maesyavg
mae_df_naive["x"] = naive_maesx
mae_df_naive["xavg"] = naive_maesxavg

std_df_naive["yavg"] = naive_stdsyavg
std_df_naive["x"] = naive_stdsx
std_df_naive["xavg"] = naive_stdsxavg



In [15]:
# Show the naive-baseline tables: fold-averaged MAE first, then the
# fold-averaged standard deviation of the residuals.
for naive_table in (mae_df_naive, std_df_naive):
    print(naive_table)

                  yavg       xavg          x
125AftAir    10.506136  25.218657  24.725806
250AftAir     9.163248  28.072474  27.196237
500AftAir     9.019908  30.677088  30.034946
1000AftAir    9.111491  26.587015  25.754032
1500AftAir    9.754471  22.317662  20.443548
2000AftAir   11.361296  22.595109  20.854839
3000AftAir   14.308984  21.829564  18.815860
4000AftAir   15.696032  20.145025  16.184140
6000AftAir   17.845664  19.312539  14.901882
8000AftAir   19.567808  19.571607  12.263441
500AftBone    5.329087   7.505084   4.790323
1000AftBone   6.608725   8.300598   5.198925
1500AftBone   9.805156  10.975923   6.788978
2000AftBone  11.454097  12.435377   6.387097
3000AftBone  13.413414  14.062734   5.700269
4000AftBone  14.024301  14.013324   5.211022
                  yavg       xavg          x
125AftAir    12.651983  12.651983  13.383100
250AftAir    12.011830  12.011830  12.416251
500AftAir    11.730643  11.730643  12.694755
1000AftAir   11.608666  11.608666  11.032491
1500AftAir

In [27]:
# Export the per-target model MAE / STD tables to an Excel workbook,
# one sheet per table.
results_path = Path("test_results_cv.xlsx")
startcol = 0

# Append to an existing workbook, replacing stale MAE/STD sheets so the cell
# can be re-run. If the workbook is missing (or is a zero-byte placeholder),
# create it from scratch, so no file has to be prepared by hand beforehand.
# `if_sheet_exists` is only accepted by pandas in append mode.
if results_path.exists() and results_path.stat().st_size > 0:
    writer_kwargs = {"mode": "a", "if_sheet_exists": "replace"}
else:
    writer_kwargs = {"mode": "w"}

with pd.ExcelWriter(results_path, engine='openpyxl', **writer_kwargs) as writer:
    # Write each DataFrame to a separate sheet.
    # NOTE(review): index=False omits the row labels (the target column
    # names of y) from the sheets — confirm this is intended.
    mae_df.to_excel(writer, sheet_name='MAE', startcol=startcol, index=False)
    std_df.to_excel(writer, sheet_name='STD', startcol=startcol, index=False)
