## Testing

In [1]:
import importlib
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, LassoCV, Ridge, RidgeCV
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor

import funct

# Re-import the local helper module so edits made to it are picked up.
importlib.reload(funct)

# Locations of the persisted inputs.
var_path = "variables/"
col_path = "variables/class_columns"

# Train/test splits, read in the order x_train, y_train, x_test, y_test.
x_train, y_train, x_test, y_test = (
    pd.read_csv(var_path + csv_name)
    for csv_name in ('x_train.csv', 'y_train.csv', 'x_test.csv', 'y_test.csv')
)

# filters[i] is used below to pick the feature columns for the i-th target.
filters = funct.load_pickle(col_path)

outer_cv = KFold(n_splits=10, shuffle=True, random_state=0)

# One row per target column of y_train, one column per evaluated model.
all_model_columns = ["Lasso","Ridge","KNN","RF","RF_all"]
mae_df, mae_rd_df, std_df = (
    pd.DataFrame(index=y_train.columns, columns=all_model_columns)
    for _ in range(3)
)
# R2 is only tabulated for the two linear models.
r2_df = pd.DataFrame(index=y_train.columns, columns=["Lasso","Ridge"])

# Per-target accumulators filled by the training loop.
all_features = np.zeros(16)
best_models = []

def evaluate_on_test(model, x_eval, y_true, label):
    """Score an already-fitted regressor on held-out data and print a summary.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    x_eval : feature table to predict on.
    y_true : pandas Series with the observed target values for ``x_eval``.
    label : str
        Prefix used in the printed metric names (e.g. ``"lasso"``).

    Returns
    -------
    tuple
        ``(mae, mae_rounded, std, r2)``. ``mae_rounded`` is the MAE of the
        predictions returned by ``funct.predict_and_round``; ``std`` is
        ``np.std`` of the residuals ``y_true - prediction``.
    """
    predicted = model.predict(x_eval)
    predicted_rounded = funct.predict_and_round(model, x_eval)

    mae = mean_absolute_error(y_true, predicted)
    mae_rounded = mean_absolute_error(y_true, predicted_rounded)
    r2 = r2_score(y_true, predicted)
    std = np.std(y_true - predicted)

    # Same text the per-model f"{name = }" blocks used to print.
    print(
        f"{label}_test_mae = {mae!r}\n"
        f"{label}_test_r2 = {r2!r}\n"
        f"{label}_test_std = {std!r}\n"
    )
    return mae, mae_rounded, std, r2


# Hyper-parameter grids do not depend on the target, so build them once.
ridge_alphas = np.arange(10) + 1
knn_param_grid = {'n_neighbors': [3, 5, 7, 9]}
rf_param_grid = {
    'max_depth': [5, 10, 20, 80, 90, 100],
    'max_features': [1, 2, 3],
    'n_estimators': [50, 80, 100]
}

# One iteration per target; all_features holds one flag per target.
for i in range(len(all_features)):
    x_cols = filters[i]
    x_train_sel = x_train[x_cols]
    x_test_sel = x_test[x_cols]
    y_train_i = y_train.iloc[:, i]
    y_test_i = y_test.iloc[:, i]

    print(x_cols)
    print(y_train_i.name)

    # Parallel lists, kept in the column order Lasso, Ridge, KNN, RF, RF_all.
    fitted_models = []
    scores = []

    # --- Lasso: alpha chosen by 10-fold CV, then a plain Lasso is refit so the
    # stored model is a Lasso rather than a LassoCV.
    lasso_model_opt = LassoCV(cv=10, random_state=1).fit(x_train_sel, y_train_i)
    lasso_model = Lasso(lasso_model_opt.alpha_)
    lasso_model.fit(x_train_sel, y_train_i)
    fitted_models.append(lasso_model)
    scores.append(evaluate_on_test(lasso_model, x_test_sel, y_test_i, "lasso"))

    # --- Ridge: alpha chosen by RidgeCV's default cross-validation (cv=None).
    # The per-sample CV values were never read, so they are no longer stored.
    ridge_model_opt = RidgeCV(alphas=ridge_alphas, cv=None).fit(x_train_sel, y_train_i)
    ridge_model = Ridge(ridge_model_opt.alpha_)
    ridge_model.fit(x_train_sel, y_train_i)
    fitted_models.append(ridge_model)
    scores.append(evaluate_on_test(ridge_model, x_test_sel, y_test_i, "ridge"))

    # --- KNN: GridSearchCV refits the best configuration on the full training
    # data (refit=True by default), so best_estimator_ is used directly.
    knn_search = GridSearchCV(KNeighborsRegressor(), knn_param_grid, cv=5)
    knn_search.fit(x_train_sel, y_train_i)
    print(knn_search.best_params_)
    knn_model = knn_search.best_estimator_
    fitted_models.append(knn_model)
    scores.append(evaluate_on_test(knn_model, x_test_sel, y_test_i, "knn"))

    # --- Random forest on the selected feature subset.
    rf_search = GridSearchCV(
        estimator=RandomForestRegressor(random_state=33),
        param_grid=rf_param_grid,
        cv=3,
    )
    rf_search.fit(x_train_sel, y_train_i)
    rf_model = rf_search.best_estimator_
    fitted_models.append(rf_model)
    scores.append(evaluate_on_test(rf_model, x_test_sel, y_test_i, "rf"))

    # --- Random forest on every available feature column.
    rfall_search = GridSearchCV(
        estimator=RandomForestRegressor(random_state=33),
        param_grid=rf_param_grid,
        cv=3,
    )
    rfall_search.fit(x_train, y_train_i)
    print(rfall_search.best_params_)
    rfall_model = rfall_search.best_estimator_
    fitted_models.append(rfall_model)
    print("Random Forest with all features")
    scores.append(evaluate_on_test(rfall_model, x_test, y_test_i, "rf"))

    # Transpose the per-model score tuples into per-metric rows.
    tmp_maes, tmp_maes_rd, tmp_stds, tmp_r2s = (list(metric) for metric in zip(*scores))
    mae_df.iloc[i] = tmp_maes
    mae_rd_df.iloc[i] = tmp_maes_rd
    std_df.iloc[i] = tmp_stds
    # r2_df only has the Lasso and Ridge columns.
    r2_df.iloc[i] = tmp_r2s[:2]

    # NOTE(review): the winner is picked by MAE on the test split, so the
    # reported test scores of the selected models are optimistically biased.
    # Consider selecting on a validation split / CV score instead.
    bestmodelindex = tmp_maes.index(min(tmp_maes))
    best_model = fitted_models[bestmodelindex]
    best_models.append(best_model)
    if best_model is rfall_model:
        # Flag targets whose best model was trained on all feature columns.
        all_features[i] = 1

    

Index(['1500_2000BA', '125BefAir', '500BefAir', '6000BefAir', '1000BefBone'], dtype='object')
125AftAir
lasso_test_mae = 10.056816468746321
lasso_test_r2 = 0.146479756954766
lasso_test_std = 11.944662186390877

ridge_test_mae = 10.076261977613756
ridge_test_r2 = 0.14692175253667106
ridge_test_std = 11.945037938708811

{'n_neighbors': 9}
knn_test_mae = 12.244444444444444
knn_test_r2 = -0.08191045279319353
knn_test_std = 13.6902957430022

rf_test_mae = 10.473287380037066
rf_test_r2 = 0.11912699760480971
rf_test_std = 12.648425009900476

{'max_depth': 5, 'max_features': 3, 'n_estimators': 50}
Random Forest with all features
rf_test_mae = 10.807314671844216
rf_test_r2 = 0.06944619155291054
rf_test_std = 12.979100267245787

Index(['250BefAir', '500BefAir', '2000BefAir', '8000BefAir', '1000BefBone'], dtype='object')
250AftAir
lasso_test_mae = 10.810560549918783
lasso_test_r2 = 0.12062718152803575
lasso_test_std = 13.847439905015166

ridge_test_mae = 10.686168002275162
ridge_test_r2 = 0.13822

In [2]:
# Show the estimator kept for each target, in the order the loop appended them.
print(best_models)

[Lasso(alpha=0.006321188026816793), RandomForestRegressor(max_depth=5, max_features=1, n_estimators=50,
                      random_state=33), RandomForestRegressor(max_depth=10, max_features=3, random_state=33), Lasso(alpha=0.08708368902613214), Ridge(alpha=10), RandomForestRegressor(max_depth=20, max_features=3, n_estimators=80,
                      random_state=33), Lasso(alpha=0.01893178403936121), RandomForestRegressor(max_depth=20, max_features=2, n_estimators=50,
                      random_state=33), Lasso(alpha=0.013576428257668622), KNeighborsRegressor(n_neighbors=9), RandomForestRegressor(max_depth=5, max_features=3, random_state=33), Ridge(alpha=2), RandomForestRegressor(max_depth=5, max_features=3, random_state=33), RandomForestRegressor(max_depth=5, max_features=2, n_estimators=50,
                      random_state=33), Ridge(alpha=1), RandomForestRegressor(max_depth=5, max_features=2, random_state=33)]


In [4]:
startcol = 0

results_path = Path("test_results.xlsx")
# Append mode requires an existing workbook and, by default, raises when a
# sheet with the same name is already present (i.e. on every re-run). Create
# the workbook when it is missing, otherwise append and replace our sheets.
if results_path.exists():
    writer_options = {"mode": "a", "if_sheet_exists": "replace"}
else:
    writer_options = {"mode": "w"}

# Sheet name -> metric table, written in this order.
result_sheets = {
    'MAE': mae_df,
    'MAE_RD': mae_rd_df,
    'R2': r2_df,
    'STD': std_df,
}

with pd.ExcelWriter(results_path, engine='openpyxl', **writer_options) as writer:
    # Write each DataFrame to a separate sheet.
    # NOTE(review): index=False drops the row labels (the y_train column
    # names), so rows in the workbook are identified by position only.
    for sheet_name, table in result_sheets.items():
        table.to_excel(writer, sheet_name=sheet_name, startcol=startcol, index=False)


# Persist the selected models and the per-target "all features" flags.
funct.save_pickle(best_models,'models/best_models',True)
funct.save_np(all_features,"variables/ignore_filters",True)