In [None]:
import pandas as pd
import matplotlib.pyplot as plt 

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GroupKFold
from sklearn.pipeline import Pipeline

from skopt import BayesSearchCV
import shap

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides any warnings raised while fitting the models
# below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [None]:
### IMPORT PREPROCESSED DATAFRAMES ###

# TODO: supply the CSV locations — pd.read_csv() needs a file path or buffer as
# its first argument, so both calls fail as written.
# Later cells read the columns 'Dyad', 'Baseline MMSE' and 'MMSE_ROC_12' from
# `mmse`, and 'Dyad', 'Baseline BADL' and 'BADL ROC' from `badl`.
mmse = pd.read_csv()
badl = pd.read_csv()

In [None]:
# Build the `*_12` targets by adding the rate-of-change column to the baseline
# score, then drop the rate-of-change column so it is not left among the features.
# Each step is guarded on the target column so the cell can be re-run safely:
# after the first run the source column is gone and recomputing would raise KeyError.
if 'MMSE_12' not in mmse.columns:
    mmse = (
        mmse
        .assign(MMSE_12=mmse['MMSE_ROC_12'] + mmse['Baseline MMSE'])
        .drop(columns=['MMSE_ROC_12'])
    )

if 'BADL_12' not in badl.columns:
    badl = (
        badl
        .assign(BADL_12=badl['BADL ROC'] + badl['Baseline BADL'])
        .drop(columns=['BADL ROC'])
    )

In [None]:
# Dyad identifiers held out for testing (empty here — fill in before running).
test_dyad = []

# Split on Dyad membership: every row of a listed dyad goes to the test set,
# all remaining rows go to the training set.
_mmse_in_test = mmse['Dyad'].isin(test_dyad)
mmse_train = mmse.loc[~_mmse_in_test]
mmse_test = mmse.loc[_mmse_in_test]

# Features exclude the target and the grouping column; Dyad is kept separately
# so it can be passed as `groups` to group-aware cross-validation.
X_mmse_train, y_mmse_train, groups_mmse_train = (
    mmse_train.drop(columns=['MMSE_12', 'Dyad']),
    mmse_train['MMSE_12'],
    mmse_train['Dyad'],
)

X_mmse_test, y_mmse_test, groups_mmse_test = (
    mmse_test.drop(columns=['MMSE_12', 'Dyad']),
    mmse_test['MMSE_12'],
    mmse_test['Dyad'],
)

In [None]:
def test_model(model, X_train, y_train, X_test, y_test):
    """Fit a scaler -> RFE -> estimator pipeline and score it on held-out data.

    Parameters
    ----------
    model : estimator
        Used both as the estimator handed to RFE and as the final pipeline step.
    X_train, y_train
        Training features and target passed to ``Pipeline.fit``.
    X_test, y_test
        Held-out features and target. ``y_test`` must expose ``.values``
        (it is read when building the results frame).

    Returns
    -------
    tuple
        ``(pipeline, mse, mae, results_df)``: the fitted pipeline, the mean
        squared error, the mean absolute error, and a DataFrame with the
        columns ``y_test`` and ``y_pred``.
    """
    steps = [
        ('scaler', StandardScaler()),
        ('rfe', RFE(model)),
        ('model', model),
    ]
    fitted_pipeline = Pipeline(steps)
    fitted_pipeline.fit(X_train, y_train)

    predictions = fitted_pipeline.predict(X_test)

    mse, mae = (
        mean_squared_error(y_test, predictions),
        mean_absolute_error(y_test, predictions),
    )

    # Side-by-side observed vs. predicted values; the original index of y_test
    # is discarded by taking `.values`.
    results_df = pd.DataFrame({'y_test': y_test.values, 'y_pred': predictions})

    return fitted_pipeline, mse, mae, results_df

In [None]:
# Search space for the final ElasticNet step only ("model__" prefix); the
# ElasticNet used inside RFE is not part of the search.
param_grid = {
    "model__alpha": (1e-3, 10.0, "log-uniform"),
    "model__l1_ratio": (0.05, 0.5, "uniform"),
}

# Group-aware folds; the Dyad labels are supplied via `groups` in fit() below.
gkf = GroupKFold(n_splits=10)

pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('rfe', RFE(estimator=ElasticNet(random_state=42))),
    ('model', ElasticNet(random_state=42)),
])

bayes_search = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_grid,
    n_iter=50,
    scoring="neg_mean_squared_error",
    cv=gkf,
    n_jobs=-1,
    random_state=42,
)
bayes_search.fit(X_mmse_train, y_mmse_train, groups=groups_mmse_train)

# Score the search's predictions on the held-out dyads.
y_mmse_pred = bayes_search.predict(X_mmse_test)
MSE, MAE = (
    mean_squared_error(y_mmse_test, y_mmse_pred),
    mean_absolute_error(y_mmse_test, y_mmse_pred),
)

print(f"MSE: {MSE}, MAE: {MAE}")
print(f"Best parameters are: {bayes_search.best_params_}")

In [None]:
### TRAIN MODEL WITH BEST PARAMETERS ###

# Reuse the tuned values from the MMSE search so they do not have to be
# transcribed by hand from the printed output.
best_mmse_params = bayes_search.best_params_
model_mmse = ElasticNet(
    alpha=best_mmse_params["model__alpha"],
    l1_ratio=best_mmse_params["model__l1_ratio"],
)

# NOTE(review): test_model passes this tuned estimator to RFE as well, whereas
# the search ranked features with a default-parameter ElasticNet — confirm that
# this difference is intended.
pipeline, mse, mae, results_df = test_model(model_mmse, X_mmse_train, y_mmse_train, X_mmse_test, y_mmse_test)
print(f"MSE: {mse}, MAE: {mae}")

In [None]:
# Pull the fitted steps out of the pipeline returned by test_model.
scaler = pipeline.named_steps['scaler']
selected_features_mask = pipeline.named_steps['rfe'].support_
model = pipeline.named_steps['model']

# Re-apply the fitted scaler and restore the column names so the RFE mask can
# be applied by label.
X_scaled = pd.DataFrame(scaler.transform(X_mmse_test), columns=X_mmse_test.columns)
X_scaled_train = pd.DataFrame(scaler.transform(X_mmse_train), columns=X_mmse_train.columns)

selected_features = X_mmse_train.columns[selected_features_mask]
X_selected = X_scaled[selected_features]

# Use the scaled training rows as the SHAP background so the baseline reflects
# the data the model was fitted on rather than the held-out rows being explained.
background = X_scaled_train[selected_features]

explainer = shap.Explainer(model, background)
shap_values = explainer(X_selected)

# Waterfall for the first held-out row only.
plt.figure(figsize=(20,5))
shap.waterfall_plot(shap_values[0], max_display=10)

plt.show()

### Repeat for BADL prediction

In [None]:
# Dyad identifiers held out for testing (empty here — fill in before running).
test_dyad = []

# Split on Dyad membership: every row of a listed dyad goes to the test set,
# all remaining rows go to the training set.
_badl_in_test = badl['Dyad'].isin(test_dyad)
badl_train = badl.loc[~_badl_in_test]
badl_test = badl.loc[_badl_in_test]

# Features exclude the target and the grouping column; Dyad is kept separately
# so it can be passed as `groups` to group-aware cross-validation.
X_badl_train, y_badl_train, groups_badl_train = (
    badl_train.drop(columns=['BADL_12', 'Dyad']),
    badl_train['BADL_12'],
    badl_train['Dyad'],
)

X_badl_test, y_badl_test, groups_badl_test = (
    badl_test.drop(columns=['BADL_12', 'Dyad']),
    badl_test['BADL_12'],
    badl_test['Dyad'],
)

In [None]:
# Search space for the final ElasticNet step only ("model__" prefix); the
# ElasticNet used inside RFE is not part of the search.
param_grid = {
    "model__alpha": (1e-3, 10.0, "log-uniform"),
    "model__l1_ratio": (0.05, 0.5, "uniform"),
}

# Group-aware folds; the Dyad labels are supplied via `groups` in fit() below.
gkf = GroupKFold(n_splits=10)

pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('rfe', RFE(estimator=ElasticNet(random_state=42))),
    ('model', ElasticNet(random_state=42)),
])

bayes_search = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_grid,
    n_iter=50,
    scoring="neg_mean_squared_error",
    cv=gkf,
    n_jobs=-1,
    random_state=42,
)
bayes_search.fit(X_badl_train, y_badl_train, groups=groups_badl_train)

# Score the search's predictions on the held-out dyads.
y_badl_pred = bayes_search.predict(X_badl_test)
MSE, MAE = (
    mean_squared_error(y_badl_test, y_badl_pred),
    mean_absolute_error(y_badl_test, y_badl_pred),
)

print(f"MSE: {MSE}, MAE: {MAE}")
print(f"Best parameters are: {bayes_search.best_params_}")

In [None]:
### TRAIN MODEL USING BEST PARAMETERS ###

# Reuse the tuned values from the BADL search so they do not have to be
# transcribed by hand from the printed output.
best_badl_params = bayes_search.best_params_
model_badl = ElasticNet(
    alpha=best_badl_params["model__alpha"],
    l1_ratio=best_badl_params["model__l1_ratio"],
)

# NOTE(review): test_model passes this tuned estimator to RFE as well, whereas
# the search ranked features with a default-parameter ElasticNet — confirm that
# this difference is intended.
pipeline_badl, mse_badl, mae_badl, results_df_badl = test_model(model_badl, X_badl_train, y_badl_train, X_badl_test, y_badl_test)
print(f"MSE: {mse_badl}, MAE: {mae_badl}")

In [None]:
# pipeline_badl was already fitted on the training data inside test_model, so
# its steps are used directly here instead of fitting the pipeline a second time.
scaler = pipeline_badl.named_steps['scaler']
model = pipeline_badl.named_steps['model']

# Re-apply the fitted scaler and restore the column names so the RFE mask can
# be applied by label.
X_scaled_test = pd.DataFrame(scaler.transform(X_badl_test), columns=X_badl_test.columns)
X_scaled_train = pd.DataFrame(scaler.transform(X_badl_train), columns=X_badl_train.columns)

selected_features_mask = pipeline_badl.named_steps['rfe'].support_
selected_features = X_badl_train.columns[selected_features_mask]

# Scaled training rows, restricted to the RFE-selected columns, serve as the
# SHAP background; the held-out rows are the ones being explained.
background = X_scaled_train[selected_features]

explainer = shap.Explainer(model, background)
shap_values = explainer(X_scaled_test[selected_features])

# Waterfall for the first held-out row only.
plt.figure(figsize=(20,10))
shap.waterfall_plot(shap_values[0], max_display=10)

plt.show()