# Opioid Misuse Prediction Models

*Yiyu Wang 2024/04/19*

In [None]:
# Project-relative input/output directories. Each ends with '/' because the
# cells below build file paths by plain string concatenation.
data_dir = '../data/'
figures_dir = '../figures/'
model_dir = '../model_results/'


In [None]:
# Raw predictor column names, before renaming to readable labels.
raw_PREDICTOR_COLUMNS=['k23_age', 'demo_hispanic', 'demo_ethnicity_1', 'demo_ethnicity_2', 'demo_ethnicity_3',
       'demo_ethnicity_4', 'demo_ethnicity_5', 'demo_ethnicity_6',
       'demo_ethnicity_99', 'demo_gender_1', 'demo_gender_2', 'demo_gender_99',
       'demo_income', 'demo_education', 'demo_legal', 'demo_employment___1',
       'demo_employment___2', 'demo_employment___3', 'demo_employment___4',
       'demo_employment___5', 'demo_employment___6', 'demo_employment___7',
       'demo_employment___8', 'demo_employment___9', 'demo_employment___99',
       'demo_disability', 'demo_marital', 'mh_accident', 'mh_pain_duration',
       'promis_pi_01', 'promis_pi_02', 'promis_pi_03', 'opioid_years_v2',
       'meds_more_v2', 'PainInT', 'AngerT', 'AnxietyT', 'DepressT', 'FatigueT',
       'GlobalpT', 'GlobalmT', 'PhyFxT', 'SleepDisT', 'audittot', 'AUDITpos',
       'pcstotal', 'pcs_help', 'pcs_rum', 'pcs_mag', 'dasttot', 'c_eactotl',
       'aeqtot', 'ctq_emo_abu', 'ctq_phy_abu', 'ctq_emo_neg', 'ctq_phy_neg',
       'ctq_sex_abu', 'ctqtot', 'mh_psychological_yes_binary']

# Raw column name -> readable label. Applied to the dataframe with
# df.rename(columns=rename_dict) and used to build PREDICTOR_COLUMNS below.
# NOTE(review): 'ctq_SexualAbuse' starts with lower-case 'ctq_' while the other
# CTQ subscales use 'CTQ_'. Confirm before changing it: saved results and
# figures may already be keyed by this label.
rename_dict = {'k23_age': 'age', 
              'demo_hispanic': 'Hispanic', 
              'demo_ethnicity_1':'Asian',
              'demo_ethnicity_2':'Caucasian', 
              'demo_ethnicity_3':'NativeHawaiian', 
              'demo_ethnicity_4':'Black', 
              'demo_ethnicity_5':'AmericanIndian', 
              'demo_ethnicity_6':'MoreThanOneRace', 
              'demo_ethnicity_99':'OtherEthnicity',
              'demo_income':'income', 'demo_education':'education',
              'demo_legal':'legal', 
              'demo_gender_1': 'male', 'demo_gender_2': 'female', 'demo_gender_99':'OtherGender',
              'demo_employment___1':'part_time', 'demo_employment___2':'full_time',
              'demo_employment___3':'not_employed', 'demo_employment___4':'homemaker',
              'demo_employment___5':'temp_unemployed',
              'demo_employment___6':'unemployed', 'demo_employment___7':'looking_unemployed',
              'demo_employment___8':'disabled', 'demo_employment___9':'retired',
              'demo_employment___99':'OtherEmployment', 'demo_disability':'disability',
              'demo_marital':'marital', 'opioid_years_v2': 'opioid_years', 'meds_more_v2':'meds_more',
              'mh_accident':'accident', 'mh_pain_duration':'pain_duration', 'mh_psychological_yes_binary':'psychological_treatment_yes',
              'promis_pi_01':'past_pain_intensity', 'promis_pi_02':'worst_pain_intensity', 'promis_pi_03':'current_pain_intensity',
              'PainInT':'PainInterference', 'AngerT':'Anger', 'AnxietyT':'Anxiety', 'DepressT':'Depression', 'FatigueT':'Fatigue',
              'GlobalpT':'GlobalPhysical', 'GlobalmT':'GlobalMental', 'PhyFxT':'PhysicalFunction', 'SleepDisT':'SleepDisturbance',
              'audittot':'AlcoholUseScore', 'AUDITpos':'AlcoholUserBinary', 
              'pcstotal':'PainCatastrophizing_total', 'pcs_help':'PCS_helplessness', 'pcs_rum':'PCS_rumination', 'pcs_mag':'PCS_magnification',
              'dasttot':'DrugUseScore', 'c_eactotl':'CocaineUseScore', 'aeqtot':'AmbivalenceEmotion',
              'ctqtot': 'ChildhoodTrauma_total', 'ctq_emo_abu':'CTQ_EmotionalAbuse', 'ctq_phy_abu':'CTQ_PhysicalAbuse', 'ctq_emo_neg':'CTQ_EmotionalNeglect', 'ctq_phy_neg':'CTQ_PhysicalNeglect','ctq_sex_abu':'ctq_SexualAbuse'}

# Readable predictor names in the same order as raw_PREDICTOR_COLUMNS.
# A raw name without an entry in rename_dict raises KeyError here.
PREDICTOR_COLUMNS = [rename_dict[col] for col in raw_PREDICTOR_COLUMNS]
print(PREDICTOR_COLUMNS)
print('n predictors =', len(PREDICTOR_COLUMNS))
# Module-level defaults: test fraction for the single hold-out splits and the
# base random seed.
test_size = 0.2
SEED = 100

In [None]:
import numpy as np
import glob
from sklearn.metrics import roc_auc_score, mean_squared_error
import matplotlib.pyplot as plt
import joblib
import seaborn as sns


from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.neural_network import MLPRegressor
import warnings
warnings.filterwarnings('ignore')


# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score



In [None]:

def split_data(df, split_data = 'split', outcome = 'ouddx', predictor_columns = PREDICTOR_COLUMNS, test_size = 0.2, SEED = SEED):
    """Return train/test predictors and outcome taken from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``predictor_columns`` and ``outcome``; a ``'cohort'``
        column is also needed when ``split_data == 'cohort'``.
    split_data : {'split', 'cohort'}
        ``'split'`` draws a random split with ``train_test_split``.
        ``'cohort'`` selects rows whose ``df['cohort']`` is ``'train'`` or
        ``'test'``.
    outcome : str
        Name of the target column.
    predictor_columns : list of str
        Names of the feature columns.
    test_size : float
        Test fraction, used only in ``'split'`` mode.
    SEED : int
        ``random_state`` for ``train_test_split``, used only in ``'split'`` mode.

    Returns
    -------
    X_train, X_test, y_train, y_test

    Raises
    ------
    ValueError
        If ``split_data`` is neither ``'split'`` nor ``'cohort'``.
    """
    if split_data == 'cohort':
        cohort = df['cohort']
        train_rows = cohort == 'train'
        test_rows = cohort == 'test'
        # The validation rows used to be selected with 'val' for y and
        # 'validation' for X. Accept either spelling so the subset is
        # consistent; it is only reported, never returned.
        val_rows = cohort.isin(['val', 'validation'])

        X_train, y_train = df.loc[train_rows, predictor_columns], df.loc[train_rows, outcome]
        X_test, y_test = df.loc[test_rows, predictor_columns], df.loc[test_rows, outcome]
        X_val = df.loc[val_rows, predictor_columns]

        print(X_train.shape, X_test.shape, X_val.shape)
    elif split_data == 'split':
        X = df[predictor_columns]
        y = df[outcome]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=SEED)
    else:
        # Fail here rather than hand empty lists to a model's fit().
        raise ValueError(f'split_data must be either "split" or "cohort", got {split_data!r}')
    return X_train, X_test, y_train, y_test




In [None]:
# Load the reduced, imputed dataset and switch to the readable column names.
df = pd.read_csv(data_dir + 'M_K23_ML_reduced_imputed.csv')
df = df.rename(columns=rename_dict)
df.head()  # notebook preview

In [None]:
# Restrict the analysis to one COMM subgroup and cache that subset on disk.
#   'high' -> commtot > 9, 'low' -> commtot <= 9, 'all' -> no filtering.
# NOTE: this overwrites `df`; reload the full dataset before switching groups.
which_group = 'high'
if which_group == 'high':
    df = df.loc[df['commtot'] > 9].reset_index(drop=True)
elif which_group == 'low':
    df = df.loc[df['commtot'] <= 9].reset_index(drop=True)
elif which_group != 'all':
    # An unknown group used to fall through silently and write only the summary.
    raise ValueError(f"which_group must be 'high', 'low' or 'all', got {which_group!r}")

# Same file name pattern for every group, so the write is done once here.
df.to_csv(data_dir + f'M_K23_ML_group-{which_group}_reduced_imputed.csv', index=False)

# Descriptive statistics: saved next to the data and shown as the cell output.
group_summary = df.describe()
group_summary.to_csv(data_dir + f'M_K23_ML_group-{which_group}_reduced_imputed_describe.csv')
group_summary

In [None]:
def monte_carlo_cv(model, df, split, outcome, n_splits=100, test_size=0.3, seed_start=SEED, predictor_columns=PREDICTOR_COLUMNS, keep_models=True):
    """Monte Carlo cross-validation: repeat split -> fit -> score ``n_splits`` times.

    Split ``k`` uses seed ``seed_start + k`` both for the data split and, when
    the estimator exposes a ``random_state`` parameter, for the estimator.

    Parameters
    ----------
    model : estimator or None
        Unfitted scikit-learn style estimator. It is cloned for every split, so
        the caller's object is left untouched. ``None`` selects a default
        ``RandomForestClassifier``.
    df : pandas.DataFrame
        Data containing ``predictor_columns`` and ``outcome``.
    split : str
        Passed to ``split_data`` as its ``split_data`` mode.
    outcome : {'commtot', 'ouddx'}
        ``'commtot'`` is scored as regression (MSE, Pearson r);
        ``'ouddx'`` as binary classification (AUC, accuracy).
    n_splits : int
        Number of repetitions (at least 1).
    test_size : float
        Test fraction forwarded to ``split_data``.
    seed_start : int
        Seed of the first repetition.
    predictor_columns : list of str
        Feature columns forwarded to ``split_data``.
    keep_models : bool
        If True (default), every fitted estimator is returned under
        ``'models'``. This can use a lot of memory for large ensembles; pass
        False to return an empty list instead.

    Returns
    -------
    dict
        Per-split scores and summary statistics (mean and 2.5/97.5
        percentiles). For ``'commtot'``, ``r2_scores`` are squared Pearson
        correlations, and r-based statistics ignore NaN correlations (these
        occur when predictions are constant).

    Raises
    ------
    ValueError
        If ``outcome`` is not supported or ``n_splits`` is smaller than 1.
    """
    # Local import, like the RandomForestClassifier import below.
    from sklearn.base import clone

    # Validate before any fitting happens.
    if outcome not in ('commtot', 'ouddx'):
        raise ValueError(f'Invalid outcome: {outcome}')
    if n_splits < 1:
        raise ValueError(f'n_splits must be at least 1, got {n_splits}')

    if model is None:
        from sklearn.ensemble import RandomForestClassifier
        model = RandomForestClassifier(
            n_estimators=200,          # Number of trees in the forest
            max_depth=4,               # Maximum depth of the trees
            min_samples_split=4,       # Minimum samples required to split an internal node
            min_samples_leaf=2,        # Minimum samples required at a leaf node
        )

    auc_scores, acc_scores, mse_scores, r_scores = [], [], [], []
    models = []
    for seed in range(seed_start, seed_start + n_splits):
        X_train, X_test, y_train, y_test = split_data(df, split_data = split, outcome = outcome, predictor_columns=predictor_columns, test_size = test_size, SEED = seed)

        # A fresh clone per split: refitting one shared object would leave
        # `models` holding n references to the last fit.
        estimator = clone(model)
        if 'random_state' in estimator.get_params():
            estimator.set_params(random_state=seed)
        estimator.fit(X_train, y_train)
        if keep_models:
            models.append(estimator)

        y_pred = estimator.predict(X_test)
        if outcome == 'commtot':
            mse_scores.append(mean_squared_error(y_test, y_pred))
            # NaN when y_pred is constant; handled by the NaN-aware summary below.
            r_scores.append(np.corrcoef(y_test, y_pred)[0, 1])
        else:  # 'ouddx'
            y_pred_proba = estimator.predict_proba(X_test)[:, 1]
            auc_scores.append(roc_auc_score(y_test, y_pred_proba))
            acc_scores.append(np.mean(y_pred == y_test))

    if outcome == 'commtot':
        # Mean MSE and 95% interval across splits
        mse_ci_lower, mse_ci_upper = np.percentile(mse_scores, [2.5, 97.5])
        # Mean r and 95% interval across splits, ignoring NaN throughout
        r_ci_lower, r_ci_upper = np.nanpercentile(r_scores, [2.5, 97.5])
        r2_scores = [r**2 for r in r_scores]
        return {
            'models': models,
            'mean_mse': np.mean(mse_scores),
            'mse_ci_lower': mse_ci_lower,
            'mse_ci_upper': mse_ci_upper,
            'mean_r': np.nanmean(r_scores),
            'median_r': np.nanmedian(r_scores),
            'r_ci_lower': r_ci_lower,
            'r_ci_upper': r_ci_upper,
            'mse_scores': mse_scores,
            'r_scores': r_scores,
            'r2_scores': r2_scores,
            'mean_r2': np.nanmean(r2_scores),
            'median_r2': np.nanmedian(r2_scores)
        }

    # outcome == 'ouddx': mean AUC / accuracy and their 95% intervals
    auc_ci_lower, auc_ci_upper = np.percentile(auc_scores, [2.5, 97.5])
    accuracy_ci_lower, accuracy_ci_upper = np.percentile(acc_scores, [2.5, 97.5])
    return {
        'models': models,
        'mean_auc': np.mean(auc_scores),
        'auc_ci_lower': auc_ci_lower,
        'auc_ci_upper': auc_ci_upper,
        'mean_accuracy': np.mean(acc_scores),
        'accuracy_ci_lower': accuracy_ci_lower,
        'accuracy_ci_upper': accuracy_ci_upper,
        'auc_scores': auc_scores,
        'acc_scores': acc_scores,
    }


def mc_cv_ml_pipeline(model, df, split, outcome, test_size=0.3, n_split=100, seed_start=0, predictor_columns = PREDICTOR_COLUMNS):
    """Run ``monte_carlo_cv`` and print a short summary of the result.

    All arguments are forwarded to ``monte_carlo_cv`` (``n_split`` is passed as
    its ``n_splits``). Note that ``seed_start`` defaults to 0 here.

    Returns
    -------
    dict
        The result dictionary from ``monte_carlo_cv``, unchanged.
    """
    result = monte_carlo_cv(model, df, split, outcome, n_splits=n_split, test_size=test_size, seed_start=seed_start, predictor_columns=predictor_columns)
    if outcome == 'ouddx':
        print(f"Mean AUC: {result['mean_auc']:.3f}, 95% CI: [{result['auc_ci_lower']:.3f}, {result['auc_ci_upper']:.3f}]")
        print(f"Mean Accuracy: {result['mean_accuracy']:.3f}, 95% CI: [{result['accuracy_ci_lower']:.3f}, {result['accuracy_ci_upper']:.3f}]")
    elif outcome == 'commtot':
        print(f"Mean MSE: {result['mean_mse']:.3f}, 95% CI: [{result['mse_ci_lower']:.3f}, {result['mse_ci_upper']:.3f}]")
        print(f"Mean R: {result['mean_r']:.3f}, 95% CI: [{result['r_ci_lower']:.3f}, {result['r_ci_upper']:.3f}]")
        # Print the stored NaN-aware median, so the output matches the
        # 'median_r' value that is saved and plotted.
        print(f"Median R: {result['median_r']:.3f}")
    return result




# fit models

In [None]:
# Candidate regressors and one hyperparameter grid per regressor.
# The two lists are paired by position: param_grid_list[k] is the grid for
# models_list[k], so both must keep the same order and length.

models_list = [RandomForestRegressor(), XGBRegressor(), SVR(), Lasso(), Ridge(), ElasticNet(), MLPRegressor()]
param_grid_list = [
    # RandomForestRegressor
    {
        'n_estimators': list(range(50, 500, 50)),
        'max_depth': [4, 8, 16],
        'min_samples_split': [2, 4, 8],
        'min_samples_leaf': [1, 2, 4]
    },
    # XGBRegressor
    {
        'n_estimators': list(range(50, 500, 50)),
        'max_depth': [4, 8, 16],
        'learning_rate': [0.01, 0.1, 0.3]
    },
    # SVR
    {
        'C': [0.1, 1, 10],
        'gamma': [0.01, 0.1, 1],
        'kernel': ['rbf', 'linear']
    },
    # Lasso
    {
        'alpha': [0.1, 0.5, 1, 5, 10],
        'max_iter': [100, 200, 300, 400]
    },
    # Ridge
    {
        'alpha': [0.1, 0.5, 1, 5, 10],
        'max_iter': [100, 200, 300, 400]
    },
    # ElasticNet
    {
        'alpha': [0.1, 0.5, 1, 5, 10],
        'l1_ratio': [0.1, 0.5, 0.9],
        'max_iter': [100, 200, 300, 400]
    },
    # MLPRegressor
    {
        'hidden_layer_sizes': [(100,), (200,), (300,)],
        'activation': ['tanh', 'relu', 'logistic'],
        'solver': ['adam', 'sgd'],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate': ['constant', 'adaptive', 'invscaling'],
        'learning_rate_init': [0.0001, 0.001, 0.005, 0.01, 0.05]
    }
]

# One hold-out split; the grid search cross-validates on its training part.
X_train, X_test, y_train, y_test = split_data(df, split_data = 'split', outcome = 'commtot', predictor_columns=PREDICTOR_COLUMNS, test_size = test_size, SEED = SEED)

# The lists are paired by position; zip() would silently drop unmatched entries.
if len(models_list) != len(param_grid_list):
    raise ValueError('models_list and param_grid_list must have the same length')

for model, param_grid in zip(models_list, param_grid_list):
    model_name = model.__class__.__name__
    print(model_name)
    # 5-fold grid search scored on r2 and negative MSE; refit on the best r2.
    GS = GridSearchCV(model, param_grid, cv=5, scoring=['r2', 'neg_mean_squared_error'], refit = 'r2', n_jobs=-1, verbose =0)
    GS.fit(X_train, y_train)

    # Full CV table per model and group; later cells read the best parameters from it.
    cv_df = pd.DataFrame(GS.cv_results_)
    cv_df.to_csv(model_dir + f'{model_name}_grid_search_results_group-{which_group}.csv', index=False)
    # NOTE(review): unlike the CSV above, this file name carries no group
    # suffix, so runs for different groups overwrite each other.
    joblib.dump(GS, model_dir + f'{model_name}_GS_best_model.pkl')


    


In [None]:
# For every candidate model: take the grid-search row ranked best on r2,
# configure the estimator with those parameters, save it, and evaluate it with
# 100 Monte Carlo CV splits on `commtot`.
model_results = {}
for model in models_list:
    model_name = model.__class__.__name__
    print(model_name)
    cv_df = pd.read_csv(model_dir + f'{model_name}_grid_search_results_group-{which_group}.csv')
    # rank_test_r2 == 1 marks the best parameter set, hence idxmin().
    best_model_index = cv_df['rank_test_r2'].idxmin()
    # NOTE(review): eval() executes whatever text is stored in the CSV's
    # 'params' column. Only run this on files written by this notebook;
    # ast.literal_eval would be safer if the column holds plain literals only.
    best_model_param = eval(cv_df.loc[best_model_index, 'params'])
    best_model = model.set_params(**best_model_param)
    joblib.dump(best_model, model_dir + f'{model_name}_rank_best_model_group-{which_group}.pkl')
    # seed_start is not passed, so mc_cv_ml_pipeline's default is used.
    model_results[model_name] = mc_cv_ml_pipeline(best_model, df, split='split', outcome='commtot', test_size=0.3, n_split=100, predictor_columns=PREDICTOR_COLUMNS)

# Persist all Monte Carlo results for the plotting cells below.
joblib.dump(model_results, model_dir + f'rank_model_mc_results_group-{which_group}.pkl')



# Make box plots:

In [None]:
# Box plot of the per-split Pearson r of every model (best r2-ranked
# hyperparameters); the model with the highest mean r is drawn in orange.
which_group = 'high'
model_results = joblib.load(model_dir + f'rank_model_mc_results_group-{which_group}.pkl')

# One list of per-split r values and one short tick label per model.
r_scores = [model_results[name]['r_scores'] for name in model_results]
model_labels = [name.replace('Regressor', '') for name in model_results]

# Position of the model with the largest mean r.
max_mean_index = np.argmax([model_results[name]['mean_r'] for name in model_results])

plt.boxplot(r_scores, labels=model_labels, medianprops = dict(color = "orange", linewidth = 3))

# Overlay the individual splits as horizontally jittered points.
for position, scores in enumerate(r_scores, start=1):
    is_best = (position - 1) == max_mean_index
    if is_best:
        print(np.nanmean(scores))
    jitter = np.random.normal(position, 0.04, size=len(scores))
    plt.scatter(jitter, scores, alpha=0.5, color='orange' if is_best else 'gray')

# Font sizes
axis_fontsize = 14
xtick_fontsize = 12
ytick_fontsize = 12
title_fontsize = 16

plt.title('Model Performance', fontsize=title_fontsize)
plt.xlabel('Models', fontsize=axis_fontsize)
plt.ylabel("Pearson's correlation (r)", fontsize=axis_fontsize)

plt.xticks(fontsize=xtick_fontsize, rotation=30)
plt.yticks(fontsize=ytick_fontsize)

# Save before show(), which finalizes the figure.
plt.savefig(figures_dir + f'rank_models_box_plot_mean_r_score_group-{which_group}.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Same box plot of per-split Pearson r, but the highlighted model is the one
# with the highest *median* r.
model_results = joblib.load(model_dir + f'rank_model_mc_results_group-{which_group}.pkl')

# One list of per-split r values and one short tick label per model.
r_scores = [model_results[name]['r_scores'] for name in model_results]
model_labels = [name.replace('Regressor', '') for name in model_results]

# Position of the model with the largest median r.
max_mean_index = np.argmax([model_results[name]['median_r'] for name in model_results])

plt.boxplot(r_scores, labels=model_labels, medianprops = dict(color = "orange", linewidth = 3))

# Overlay the individual splits as horizontally jittered points.
for position, scores in enumerate(r_scores, start=1):
    is_best = (position - 1) == max_mean_index
    if is_best:
        print(np.median(scores))
    jitter = np.random.normal(position, 0.04, size=len(scores))
    plt.scatter(jitter, scores, alpha=0.5, color='orange' if is_best else 'gray')

plt.title('Box Plot of r_score for Each Model')
plt.xlabel('Model')
plt.ylabel('r_score')

# Save before show(), which finalizes the figure.
plt.savefig(figures_dir + f'rank_models_box_plot_median_r_score_group-{which_group}.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Box plot of the per-split r2 (squared Pearson r) of every model; the model
# with the highest median r2 is drawn in orange.

model_results = joblib.load(model_dir + f'rank_model_mc_results_group-{which_group}.pkl')
# r_scores is loaded for reference but not used in this cell.
r_scores = [result['r_scores'] for result in model_results.values()]
r2_scores = [result['r2_scores'] for result in model_results.values()]
# Short tick labels: model class names without the 'Regressor' suffix.
model_labels = [model for model in model_results.keys()]
model_labels = [model.replace('Regressor', '') for model in model_labels]
# Position of the model with the highest median r2 (the variable name says
# "mean" for historical reasons).
max_mean_index = np.argmax([results['median_r2'] for results in model_results.values()])

# Create a box plot
plt.boxplot(r2_scores, labels=model_labels, medianprops = dict(color = "orange", linewidth = 3))

# Overlay the individual splits as horizontally jittered points.
# NOTE(review): the loop variable `r2_score` rebinds the name imported from
# sklearn.metrics at the top of the notebook, and the confusion-matrix cell
# reads `len(r2_score)` afterwards. Do not rename it without fixing that cell.
for i, r2_score in enumerate(r2_scores):
    if i == max_mean_index:
        color = 'orange'  # model with the highest median r2
        print(np.nanmedian(r2_score))
        
    else:
        color = 'gray'  # all other models
    x = np.random.normal(i + 1, 0.04, size=len(r2_score))
    plt.scatter(x, r2_score, alpha=0.5, color=color)

plt.xlabel('Model')
plt.ylabel('r2')
plt.title('Box Plot of median r2 for Each Model')

# Save before show(), which finalizes the figure.
plt.savefig(figures_dir + f'rank_models_box_plot_median_r2_group-{which_group}.png', dpi=300, bbox_inches='tight')
plt.show()

## Confusion matrix when the model is trained on all participants

In [None]:
def MakeAucPlot(auc_score, fpr,tpr, title):
    """Draw a ROC curve in a new figure and show it.

    Parameters
    ----------
    auc_score : float
        Area under the curve, shown in the legend with two decimals.
    fpr, tpr : array-like
        False and true positive rates to plot against each other.
    title : str
        Figure title.
    """
    plt.figure()

    # ROC curve first, then the dashed diagonal as a visual reference.
    roc_label = f'AUC = {auc_score:.2f}'
    plt.plot(fpr, tpr, color='orange', lw=2, label=roc_label)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

    # Axis labels and title
    plt.title(title)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

    # Fixed axis ranges, with a little headroom above TPR = 1.
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])

    plt.legend(loc="lower right")
    plt.show()

# Refit the selected model on one train/test split of the full sample, report
# regression metrics, then threshold COMM at 9 (actual and predicted) to get
# ROC/AUC, a confusion matrix, accuracy, sensitivity and specificity.
which_group = 'all'
if which_group == 'all':
    df = pd.read_csv(data_dir + f'M_K23_ML_group-{which_group}_reduced_imputed.csv')
    from sklearn.metrics import roc_auc_score, confusion_matrix, roc_curve

    # Rebuild the chosen estimator with its best grid-search hyperparameters.
    final_best_model_name = 'ElasticNet'
    models_list = [RandomForestRegressor(), XGBRegressor(), SVR(), Lasso(), Ridge(), ElasticNet(), MLPRegressor()]
    model = models_list[[model.__class__.__name__ for model in models_list].index(final_best_model_name)]
    cv_df = pd.read_csv(model_dir + f'{final_best_model_name}_grid_search_results_group-{which_group}.csv')

    best_model_index = cv_df['rank_test_r2'].idxmin()
    # NOTE(review): eval() executes the text stored in the CSV; only use it on
    # files written by this notebook.
    best_model_param = eval(cv_df.loc[best_model_index, 'params'])
    final_best_model = model.set_params(**best_model_param)

    print(final_best_model)

    # Pick a Monte Carlo split whose r2 lies near the median of all splits.
    model_results = joblib.load(model_dir + f'rank_model_mc_results_group-{which_group}.pkl')
    r2_scores = model_results[final_best_model_name]['r2_scores']
    # Fixed: the length used to be read from `r2_score`, a name that only
    # exists as a list if the r2 box-plot cell was run first (otherwise it is
    # the sklearn function and len() fails).
    seed = np.argsort(r2_scores)[len(r2_scores)//2-2]
    print(f'seed: {seed}')

    # NOTE(review): `seed` is a position in r2_scores. It equals the split's
    # random seed only when the Monte Carlo run started at seed 0, and this
    # split uses the global `test_size` rather than the 0.3 used for the
    # Monte Carlo run. Confirm whether the exact split should be reproduced.
    X_train, X_test, y_train, y_test = split_data(df, split_data = 'split', outcome = 'commtot', predictor_columns=PREDICTOR_COLUMNS, test_size = test_size, SEED = seed)

    final_best_model.fit(X_train, y_train)
    y_pred = final_best_model.predict(X_test)

    print(f'MSE: {mean_squared_error(y_test, y_pred):.3f}')
    # "R2" here is the squared Pearson correlation.
    print(f'R2: {np.corrcoef(y_test, y_pred)[0,1]**2:.3f}')

    # Binarize actual and predicted COMM at the same cut-off (> 9).
    y_pred_bin = np.where(y_pred > 9, 1, 0)
    y_test_bin = np.where(y_test > 9, 1, 0)

    # Predicted vs actual scatter with a least-squares line.
    plt.scatter(y_pred, y_test, color='orange')
    plt.plot(np.unique(y_pred), np.poly1d(np.polyfit(y_pred, y_test, 1))(np.unique(y_pred)), color='green')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Predicted vs Actual')
    # NOTE(review): this scatter plot is saved under an "_AUC_" file name and
    # the ROC figure below is never saved. Confirm the intended names.
    plt.savefig(figures_dir + f'{final_best_model_name}_AUC_group-{which_group}.png', dpi=300, bbox_inches='tight')
    plt.show()

    # ROC/AUC: continuous predictions scored against the binarized truth.
    fpr, tpr, thresholds = roc_curve(y_test_bin, y_pred)

    auc_score = roc_auc_score(y_test_bin, y_pred)
    print(f'AUC: {auc_score:.3f}')
    # Title fixed to match the cut-off actually applied above (> 9, not >= 9).
    MakeAucPlot(auc_score, fpr, tpr, 'ROC Curve: binarize COMM > 9')

    cm = confusion_matrix(y_test_bin, y_pred_bin)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, cmap='BuPu', fmt='d', xticklabels=['Low COMM', 'High COMM'], yticklabels=['Low COMM', 'High COMM'],annot_kws={"size": 16})
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.title('Confusion Matrix')
    plt.savefig(figures_dir + f'{final_best_model_name}_ConfusionMatrix_group-{which_group}.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Accuracy of the thresholded predictions
    accuracy = np.mean(y_pred_bin == y_test_bin)
    print(f'Accuracy: {accuracy:.4f}')

    # Share of low-COMM cases, i.e. the accuracy of always predicting "low".
    print('chance level: ', 1 - np.mean(y_test_bin))

    # Sensitivity and specificity; cm rows are true labels, columns predictions.
    TP = cm[1,1]
    TN = cm[0,0]
    FP = cm[0,1]
    FN = cm[1,0]
    sensitivity = TP / (TP + FN)
    specificity = TN / (TN + FP)
    print(f'Sensitivity: {sensitivity:.4f}')
    print(f'Specificity: {specificity:.4f}')

    

# make ablation plot

In [None]:
# Rebuild the model chosen for the ablation and SHAP analyses, using the
# hyperparameters ranked best on r2 in that model's grid-search table.
# `which_group` keeps whatever value the previously run cell left it with.

final_best_model_name = 'RandomForestRegressor'
models_list = [RandomForestRegressor(), XGBRegressor(), SVR(), Lasso(), Ridge(), ElasticNet(), MLPRegressor()]
# Pick the fresh estimator whose class name matches final_best_model_name.
model = models_list[[model.__class__.__name__ for model in models_list].index(final_best_model_name)]

cv_df = pd.read_csv(model_dir + f'{final_best_model_name}_grid_search_results_group-{which_group}.csv')
best_model_index = cv_df['rank_test_r2'].idxmin()
# NOTE(review): eval() executes the text stored in the CSV's 'params' column;
# only use it on files written by this notebook.
best_model_param = eval(cv_df.loc[best_model_index, 'params'])

final_best_model = model.set_params(**best_model_param)

final_best_model

In [None]:
# Sanity check: show which participant group the following cells will use.
print(which_group)

In [None]:
# Feature ablation: set one predictor at a time to 0 and repeat the Monte Carlo
# CV with the selected model. Set `ablation` to True to recompute; otherwise
# the previously saved results are loaded.
ablation = False
if not ablation:
    ablation_results = joblib.load(model_dir + f'{final_best_model_name}_ablation_results_group-{which_group}.pkl')
else:
    ablation_results = {}
    df = pd.read_csv(data_dir + f'M_K23_ML_group-{which_group}_reduced_imputed.csv')
    df = df.rename(columns=rename_dict)

    for predictor in PREDICTOR_COLUMNS:
        print(predictor)
        # Copy of the data with only this predictor's column replaced by 0.
        ablated_df = df.assign(**{predictor: 0})
        ablation_results[predictor] = mc_cv_ml_pipeline(final_best_model, ablated_df, split='split', n_split=100, outcome='commtot')

    # Cache the results so the expensive loop can be skipped next time.
    joblib.dump(ablation_results, model_dir + f'{final_best_model_name}_ablation_results_group-{which_group}.pkl')

In [None]:
import scipy.stats as stats

# Box plot of the per-split Pearson r after ablating each predictor, ordered
# from the lowest to the highest mean r.
predictor_names = list(ablation_results.keys())
unsorted_r_scores = [ablation_results[name]['r_scores'] for name in predictor_names]

# One stable ordering drives both the scores and their labels. They used to be
# sorted separately (list.sort vs np.argsort), which can pair a box with the
# wrong predictor when means tie or are NaN.
sorted_ablation_index = np.argsort([np.mean(scores) for scores in unsorted_r_scores], kind='stable')
r_scores = [unsorted_r_scores[i] for i in sorted_ablation_index]
labels = [predictor_names[i] for i in sorted_ablation_index]

# Predictor whose removal leaves the lowest mean r, i.e. hurts performance most.
min_mean_index = np.argmin([np.mean(scores) for scores in r_scores])

plt.figure(figsize=(12, 8))
plt.boxplot(r_scores, labels=labels)

# Overlay the individual splits as horizontally jittered points.
for i, r_score in enumerate(r_scores):
    if i == min_mean_index:
        color = 'orange'  # predictor with the lowest mean r after ablation
    else:
        color = 'gray'  # all other predictors
    x = np.random.normal(i + 1, 0.04, size=len(r_score))

    plt.scatter(x, r_score, alpha=0.5, color=color)


plt.xlabel('Predictors')
plt.ylabel('r_score')
plt.title('Ablation Analysis for Predictor Impacts')
plt.xticks(rotation=90)

plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Horizontal bar chart of the mean change in r per ablated predictor.
# NOTE(review): `delta_r_list` and `delta_r_ci_list` are not defined anywhere
# in the visible notebook, so this cell raises NameError when run top to
# bottom. Presumably they hold, per predictor, the per-split change in r and
# its confidence half-width; restore the cell that computes them.
# `labels` comes from the ablation box-plot cell; the entries of delta_r_list
# must be in that same order for the bars to be labelled correctly.
x = labels
y = np.mean(delta_r_list, axis=1)
yerr = delta_r_ci_list

# Sort the data in ascending order
sorted_indices = np.argsort(y)
x_sorted = np.array(x)[sorted_indices]
y_sorted = y[sorted_indices]
yerr_sorted = np.array(yerr)[sorted_indices]


# Font sizes
axis_fontsize = 14
xtick_fontsize = 12
ytick_fontsize = 12
title_fontsize = 16


# Plotting horizontal bar graph with error bars (absolute values, since error
# bar lengths must be non-negative)
plt.figure(figsize=(6, 12))
plt.barh(x_sorted, y_sorted, xerr=np.abs(yerr_sorted), capsize=5, color='orange', edgecolor='black')
plt.ylabel('Predictors', fontsize=axis_fontsize)  # Horizontal bar plot has ylabel for predictors
plt.xlabel('delta r after ablation', fontsize=axis_fontsize)
plt.title('Feature Ablation Impacts on Prediction Performance', fontsize=title_fontsize)
plt.xticks(fontsize=xtick_fontsize)
plt.yticks(fontsize=ytick_fontsize)
# Save before show(), which finalizes the figure.
plt.savefig(figures_dir + f'{final_best_model_name}_ablation_analysis_group-{which_group}.png', dpi=300, bbox_inches='tight')
plt.show()



In [None]:
import shap

# SHAP summary plot for the selected model, fitted on the training part of a
# single hold-out split of the current group.
df = pd.read_csv(data_dir + f'M_K23_ML_group-{which_group}_reduced_imputed.csv')
X_train, X_test, y_train, y_test = split_data(df, split_data = 'split', outcome = 'commtot', predictor_columns=PREDICTOR_COLUMNS, test_size = test_size, SEED=SEED)
# Fixed: the best-parameter estimator is dumped with a '_group-<group>' suffix
# in its file name; the un-suffixed name loaded here before is never written
# by this notebook.
model = joblib.load(model_dir + f'{final_best_model_name}_rank_best_model_group-{which_group}.pkl')
model.fit(X_train, y_train)

# Explain the fitted model on the training data and plot the top 11 features.
explainer = shap.Explainer(model, X_train)
shap_values = explainer.shap_values(X_train)
# show=False keeps the figure open so it can be saved below.
shap.summary_plot(shap_values, X_train, show=False, max_display=11)

plt.savefig(figures_dir + f'{final_best_model_name}_shap_summary_plot_group-{which_group}.png', dpi=300, bbox_inches='tight')
plt.show()