# Opioid Misuse Prediction Models

## Data preprocessing (imputation) and visual data check

*Yiyu Wang 2024/04/02*

In [None]:
# Relative locations of the CSV inputs/outputs and of the saved figures.
data_dir, figures_dir = '../data/', '../figures/'


# use this code to find the abnormal character in the file

# # Open the file in binary mode and seek to a position a bit before 74036 to get some context
# with open(data_dir + 'M_K23_ML.csv', 'rb') as file:
#     file.seek(max(0, 74036 - 100))  # Go back 100 bytes for context, or to the start if too close to the beginning
#     data = file.read(200)  # Read enough bytes around the position for context

# print(data)

In [None]:
import numpy as np
import glob
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the raw export and the variable dictionary; both files use their
# first column as the row index.
raw_df = pd.read_csv(data_dir + 'M_K23_ML.csv', index_col=0)
variables_df = pd.read_csv(data_dir + 'Variable_Name.csv', index_col=0)

# Keep only the variables flagged with an 'X' in the 'Include' column.
include_mask = variables_df['Include'] == 'X'
variables_to_keep = variables_df[include_mask].index.tolist()
print(variables_to_keep)

df = raw_df[variables_to_keep]

# One-hot encode ethnicity and gender as 0/1 integer indicator columns
# (one '<column>_<code>' column per observed code). The original columns
# are replaced; ethnicity indicators come first, then gender indicators.
df = pd.get_dummies(df, columns=['demo_ethnicity', 'demo_gender'], dtype='int')

df.columns

In [None]:

# Recode the free-text column mh_psychological_yes into a binary indicator:
# 1 when any text was entered, 0 when the field is missing.
# notna() (not isna()) is required here, otherwise the coding is inverted.
df["mh_psychological_yes_binary"] = df["mh_psychological_yes"].notna().astype(int)
print(np.unique(df["mh_psychological_yes_binary"]))

# The raw text is no longer needed once the indicator exists.
df = df.drop(columns=["mh_psychological_yes"])

# Force every remaining column to a numeric dtype; values that cannot be
# parsed as numbers become NaN (errors='coerce').
df = df.apply(pd.to_numeric, errors='coerce')

print(df.columns)
print(len(df.columns))
df.to_csv(data_dir + 'M_K23_ML_reduced.csv')
df.head()


In [None]:
# Sample size and age descriptives.
N = len(df)
print(f'Total sample N: {N}')

age = df['k23_age']
print('mean age = ', age.mean())
print('std age = ', age.std())



In [None]:
# Report the number of missing values for every column that has at least one.
N = len(df)
print(f'Total sample N: {N}')

missing_counts = df.isna().sum()
for column, n_miss in missing_counts[missing_counts > 0].items():
    print(f'{column}: {n_miss}')

In [None]:
# Mean imputation: each missing value is replaced by the mean of its column,
# where the mean is computed over all rows of df.
column_means = df.mean()
imputed_df = df.fillna(value=column_means)

imputed_df.to_csv(data_dir + 'M_K23_ML_reduced_imputed.csv')

##  data visualization

In [None]:
# Heatmap of the pairwise correlations between the columns of df.

# NOTE: despite its name, cov_matrix holds correlations (DataFrame.corr),
# not covariances.
cov_matrix = df.corr()

# Mask the upper triangle (diagonal included) so each pair is drawn once.
mask = np.triu(np.ones(cov_matrix.shape, dtype=bool))

plt.figure(figsize=(10, 8))

# Diverging colour map centred on 0, square cells, no cell annotations.
sns.heatmap(
    cov_matrix,
    mask=mask,
    cmap='coolwarm',
    center=0,
    vmax=1,
    square=True,
    linewidths=.5,
    annot=False,
)

plt.show()


In [None]:
# Histogram of every column of df, one subplot per column.

# Grid layout; num_rows * num_cols must be at least the number of columns,
# otherwise indexing into the flattened axes below raises IndexError.
num_rows = 11
num_cols = 6

fig, axes = plt.subplots(num_rows, num_cols, figsize=(14, 20))

# Flatten so that the i-th column maps to the i-th subplot.
axes = axes.flatten()

for i, column in enumerate(df.columns):
    ax = axes[i]
    ax.hist(df[column], bins=30, alpha=0.5, label='df1')
    # Dashed vertical line at the column mean.
    ax.axvline(df[column].mean(), color='b', linestyle='dashed', linewidth=1, label='df mean')
    ax.title.set_text(column)
    # Only the left-most subplot of each row gets a y-axis label.
    if i % num_cols == 0:
        ax.set_ylabel('Frequency')

# Hide the unused subplots at the end of the grid.
for j in range(len(df.columns), num_rows * num_cols):
    axes[j].axis('off')

# Extra vertical room so the subplot titles do not overlap.
plt.subplots_adjust(hspace=0.5)

# Save BEFORE showing: once plt.show() returns the figure may already be
# closed, and a later plt.savefig() would then write an empty image.
# NOTE(review): the per-cohort histogram cell saves to this same file name,
# so this image is overwritten when that cell runs.
fig.savefig(figures_dir + 'factor_distribution_by_dataset.png')

plt.show()





In [None]:
# NOTE(review): in the visible part of the notebook, df1 is first assigned in
# the following cell, so this cell raises NameError on a fresh top-to-bottom run.
df1

In [None]:
# Per-cohort histograms: one subplot per column, with the A, B and C
# sub-samples (selected by the letter contained in record_id) overlaid.

# Grid layout; num_rows * num_cols must be at least the number of columns.
num_rows = 11
num_cols = 6

fig, axes = plt.subplots(num_rows, num_cols, figsize=(18, 16))

# Flatten so that the i-th column maps to the i-th subplot.
axes = axes.flatten()

# Move record_id from the index into a regular column so it can be searched.
new_df = df.reset_index()
df1 = new_df[new_df['record_id'].str.contains('A')]
df2 = new_df[new_df['record_id'].str.contains('B')]
df3 = new_df[new_df['record_id'].str.contains('C')]

# drop=True discards the old row positions. Without it reset_index() adds an
# 'index' column, which would then be plotted as if it were a variable.
df1 = df1.reset_index(drop=True).drop(columns=['record_id'])
df2 = df2.reset_index(drop=True).drop(columns=['record_id'])
df3 = df3.reset_index(drop=True).drop(columns=['record_id'])

for i, column in enumerate(df1.columns):
    ax = axes[i]
    ax.hist(df1[column], bins=30, alpha=0.5, label='df A')
    ax.hist(df2[column], bins=30, alpha=0.5, label='df B')
    ax.hist(df3[column], bins=30, alpha=0.5, label='df C')
    # Dashed vertical lines at the per-cohort means.
    ax.axvline(df1[column].mean(), color='b', linestyle='dashed', linewidth=1, label='df A mean')
    ax.axvline(df2[column].mean(), color='orange', linestyle='dashed', linewidth=1, label='df B mean')
    ax.axvline(df3[column].mean(), color='green', linestyle='dashed', linewidth=1, label='df C mean')
    ax.title.set_text(column)
    # Only the left-most subplot of each row gets a y-axis label.
    if i % num_cols == 0:
        ax.set_ylabel('Frequency')

# Hide the unused subplots at the end of the grid.
for j in range(len(df1.columns), num_rows * num_cols):
    axes[j].axis('off')

# Extra vertical room so the subplot titles do not overlap.
plt.subplots_adjust(hspace=0.5)

# One shared legend for the whole figure, built from the last populated subplot.
# NOTE(review): the labels are overridden by position; this assumes the handles
# come back as the three histograms followed by the three mean lines — confirm
# against the installed matplotlib version.
handles, labels = ax.get_legend_handles_labels()
labels = ['df A', 'df B', 'df C', 'A mean', 'B mean', 'C mean']
legend_loc = (0.87, 0.05)
fig.legend(handles, labels, loc=legend_loc, ncol=1)

# Save BEFORE showing: once plt.show() returns the figure may already be
# closed, and a later plt.savefig() would then write an empty image.
fig.savefig(figures_dir + 'factor_distribution_by_dataset.png')

plt.show()





In [None]:
# Label every row with a cohort derived from the letter contained in its
# record_id: A -> train, B -> test, C -> validation. Rows whose record_id
# contains none of these letters keep the default 'train'; when several
# letters match, the later entry in the mapping wins.
cohort_by_letter = {'A': 'train', 'B': 'test', 'C': 'validation'}

new_df['cohort'] = 'train'
for letter, cohort_name in cohort_by_letter.items():
    has_letter = new_df['record_id'].str.contains(letter)
    new_df.loc[has_letter, 'cohort'] = cohort_name

new_df.to_csv(data_dir + 'M_K23_ML_split.csv', index=False)

# df with collapsed demographic factors

In [None]:
# Reload the mean-imputed table from disk (read without index_col) as the
# starting point for renaming and collapsing columns.
imputed_csv = data_dir + 'M_K23_ML_reduced_imputed.csv'
df = pd.read_csv(imputed_csv)
df

In [None]:
# Original (pre-rename) names of the predictor columns, grouped by topic.
raw_PREDICTOR_COLUMNS = [
    # demographics
    'k23_age',
    'demo_hispanic',
    'demo_ethnicity_1',
    'demo_ethnicity_2',
    'demo_ethnicity_3',
    'demo_ethnicity_4',
    'demo_ethnicity_5',
    'demo_ethnicity_6',
    'demo_ethnicity_99',
    'demo_gender_1',
    'demo_gender_2',
    'demo_gender_99',
    'demo_income',
    'demo_education',
    'demo_legal',
    # employment indicator columns
    'demo_employment___1',
    'demo_employment___2',
    'demo_employment___3',
    'demo_employment___4',
    'demo_employment___5',
    'demo_employment___6',
    'demo_employment___7',
    'demo_employment___8',
    'demo_employment___9',
    'demo_employment___99',
    'demo_disability',
    'demo_marital',
    # medical history, pain and medication
    'mh_accident',
    'mh_pain_duration',
    'promis_pi_01',
    'promis_pi_02',
    'promis_pi_03',
    'opioid_years_v2',
    'meds_more_v2',
    # T-score columns
    'PainInT',
    'AngerT',
    'AnxietyT',
    'DepressT',
    'FatigueT',
    'GlobalpT',
    'GlobalmT',
    'PhyFxT',
    'SleepDisT',
    # questionnaire totals and subscales
    'audittot',
    'AUDITpos',
    'pcstotal',
    'pcs_help',
    'pcs_rum',
    'pcs_mag',
    'dasttot',
    'c_eactotl',
    'aeqtot',
    'ctq_emo_abu',
    'ctq_phy_abu',
    'ctq_emo_neg',
    'ctq_phy_neg',
    'ctq_sex_abu',
    'ctqtot',
    'mh_psychological_yes_binary',
]

# Map the raw column names to readable names, then apply the mapping.
rename_dict = {
    # demographics
    'k23_age': 'age',
    'demo_hispanic': 'Hispanic',
    'demo_ethnicity_1': 'Asian',
    'demo_ethnicity_2': 'Caucasian',
    'demo_ethnicity_3': 'NativeHawaiian',
    'demo_ethnicity_4': 'Black',
    'demo_ethnicity_5': 'AmericanIndian',
    'demo_ethnicity_6': 'MoreThanOneRace',
    'demo_ethnicity_99': 'OtherEthnicity',
    'demo_income': 'income',
    'demo_education': 'education',
    'demo_legal': 'legal',
    'demo_gender_1': 'male',
    'demo_gender_2': 'female',
    'demo_gender_99': 'OtherGender',
    # employment indicator columns
    'demo_employment___1': 'part_time',
    'demo_employment___2': 'full_time',
    'demo_employment___3': 'not_employed',
    'demo_employment___4': 'homemaker',
    'demo_employment___5': 'temp_unemployed',
    'demo_employment___6': 'unemployed',
    'demo_employment___7': 'looking_unemployed',
    'demo_employment___8': 'disabled',
    'demo_employment___9': 'retired',
    'demo_employment___99': 'OtherEmployment',
    'demo_disability': 'disability',
    'demo_marital': 'marital',
    # medication and medical history
    'opioid_years_v2': 'opioid_years',
    'meds_more_v2': 'meds_more',
    'mh_accident': 'accident',
    'mh_pain_duration': 'pain_duration',
    'mh_psychological_yes_binary': 'psychological_treatment_yes',
    # pain intensity items
    'promis_pi_01': 'past_pain_intensity',
    'promis_pi_02': 'worst_pain_intensity',
    'promis_pi_03': 'current_pain_intensity',
    # T-score columns
    'PainInT': 'PainInterference',
    'AngerT': 'Anger',
    'AnxietyT': 'Anxiety',
    'DepressT': 'Depression',
    'FatigueT': 'Fatigue',
    'GlobalpT': 'GlobalPhysical',
    'GlobalmT': 'GlobalMental',
    'PhyFxT': 'PhysicalFunction',
    'SleepDisT': 'SleepDisturbance',
    # questionnaire totals and subscales
    'audittot': 'AlcoholUseScore',
    'AUDITpos': 'AlcoholUserBinary',
    'pcstotal': 'PCS_total',
    'pcs_help': 'PCS_helplessness',
    'pcs_rum': 'PCS_rumination',
    'pcs_mag': 'PCS_magnification',
    'dasttot': 'DrugUseScore',
    'c_eactotl': 'CocaineUseScore',
    'aeqtot': 'AmbivalenceEmotion',
    'ctqtot': 'CTQ_total',
    'ctq_emo_abu': 'CTQ_EmotionalAbuse',
    'ctq_phy_abu': 'CTQ_PhysicalAbuse',
    'ctq_emo_neg': 'CTQ_EmotionalNeglect',
    'ctq_phy_neg': 'CTQ_PhysicalNeglect',
    'ctq_sex_abu': 'CTQ_SexualAbuse',
}

# Columns that do not appear in the mapping keep their current name.
df = df.rename(columns=rename_dict)
df.head()

In [None]:
# Collapse the detailed demographic columns into binary 0/1 indicators.

# White: 1 when Caucasian == 1, otherwise 0.
df['White'] = (df['Caucasian'] == 1).astype('int64')

# Female: 1 when female == 1, otherwise 0.
df['Female'] = (df['female'] == 1).astype('int64')

# Married: 0 when marital is 1 or 6, 1 for every other value.
# NOTE(review): presumably codes 1 and 6 are the non-partnered categories —
# confirm against the codebook.
df['Married'] = (~df['marital'].isin([1, 6])).astype('int64')

# Not_employed: 0 when full_time == 1 or part_time == 1, otherwise 1.
has_job = (df['full_time'] == 1) | (df['part_time'] == 1)
df['Not_employed'] = (~has_job).astype('int64')




In [None]:
# Drop the detailed columns that were collapsed into indicators, together
# with the other columns excluded from the final table. Each label is listed
# exactly once.
columns_to_drop = [
    # gender
    'male', 'female', 'OtherGender',
    # ethnicity
    'Caucasian', 'Asian', 'NativeHawaiian', 'Black', 'AmericanIndian',
    'MoreThanOneRace', 'OtherEthnicity',
    # other excluded columns
    'legal', 'marital', 'accident', 'psychological_treatment_yes',
    'AlcoholUserBinary', 'ouddx', 'commpos', 'past_pain_intensity', 'meds_more',
    # employment
    'part_time', 'full_time', 'not_employed', 'homemaker', 'temp_unemployed',
    'unemployed', 'looking_unemployed', 'disabled', 'retired', 'OtherEmployment',
]

# Raises KeyError if any listed column is absent from df.
df = df.drop(columns=columns_to_drop)
df

In [None]:
# Inspect the column labels currently in df.
df.columns

In [None]:
# Number of columns currently in df.
len(df.columns)

In [None]:
# Write the collapsed table to disk without the row index.
collapsed_csv = data_dir + 'M_K23_ML_reduced_imputed_collapsed.csv'
df.to_csv(collapsed_csv, index=False)