# JCOIN Tracking Opioid Stigma

*Please note: This notebook uses controlled access data*    
*Please note: the JCOIN Google login on the BRH Profile Page must be authorized before running this notebook*


- JCOIN consortium - MAARC Survey Core - Survey Project 1 of 4
- Cross-sectional Repeated Brief Survey tracking stigma related to OUD
- **Significance:** Stigma related to OUD, and public attitudes towards persons with OUD and towards treatment for them, may impact or modify the effect of OUD interventions.

### Import modules

In [None]:
import os
import pandas as pd
import numpy as np
import pyreadstat
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

# Turn off pandas' chained-assignment check for the whole notebook
# (None means "no action": neither warn nor raise).
pd.options.mode.chained_assignment = None

### Import data
- 4 survey time-points in 2020; Feb, Apr, Jun, Oct
- roughly 1000 respondents per survey time-point
- 1st survey time-point before COVID, all other survey time-points after COVID

In [None]:
# Pull file objects using the Gen3 SDK command-line client (`gen3 drs-pull object <object id>`).
# Each `!` line is run as a shell command by the notebook, one pull per object id.
# NOTE(review): presumably these four objects are the four survey .sav files read in the
# next cell -- the object-id-to-filename mapping is not visible here, confirm.
# The data is controlled access, so the login described at the top of the notebook must be authorized first.
!gen3 drs-pull object dg.6VTS/b96018c5-db06-4af8-a195-28e339ba815e
!gen3 drs-pull object dg.6VTS/6d3eb293-8388-4c5d-83ef-d0c2bd5ba604
!gen3 drs-pull object dg.6VTS/6f9a924f-9d83-4597-8f66-fe7d3021729f
!gen3 drs-pull object dg.6VTS/0e618fef-e359-424b-b844-0ca320105176

In [None]:
# Load the four SPSS (.sav) survey files with pyreadstat.
# read_sav returns a (DataFrame, metadata) pair per file; apply_value_formats=True is passed
# so that coded values come back with their value formats applied.
sav_files = (
    './JCOIN_NORC_Omnibus_SURVEY1_Feb2020.sav',
    './JCOIN_NORC_Omnibus_SURVEY2_April2020.sav',
    './JCOIN_NORC_Omnibus_SURVEY3_June2020.sav',
    './JCOIN_NORC_Omnibus_SURVEY4_Oct2020.sav',
)
(df1, meta1), (df2, meta2), (df3, meta3), (df4, meta4) = [
    pyreadstat.read_sav(sav_file, apply_value_formats=True) for sav_file in sav_files
]

### Clean data
- column names to lowercase
- get long, combined dataset; keep only variables available for each survey time-point
- take a look at the data dictionary to check meaning of variable names and identify interesting variables that will let us look at stigma trajectory over time

In [None]:
# Lowercase every variable name, in the data frames and in their metadata objects,
# so the same variable is spelled identically across all four surveys.

for survey_df in (df1, df2, df3, df4):
    survey_df.columns = survey_df.columns.map(str.lower)


for survey_meta in (meta1, meta2, meta3, meta4):
    # list of variable names
    survey_meta.column_names = [name.lower() for name in survey_meta.column_names]
    # variable name -> variable label
    survey_meta.column_names_to_labels = {
        name.lower(): label
        for name, label in survey_meta.column_names_to_labels.items()
    }
    # variable name -> value labels
    survey_meta.variable_value_labels = {
        name.lower(): value_labels
        for name, value_labels in survey_meta.variable_value_labels.items()
    }

In [None]:
# Stack the four surveys into one long dataset.
#   - join="inner" keeps only the variables present in every survey
#   - keys tag each row with its survey; reset_index() then exposes that tag as the column
#     level_0 (renamed to "time-point") and the original row index as level_1
survey_keys = ['s1', 's2', 's3', 's4']
stacked_surveys = pd.concat([df1, df2, df3, df4], keys=survey_keys, join="inner")
all_df = stacked_surveys.reset_index().rename(columns={"level_0": "time-point"})

# preview the first rows, leaving out the row index and the geographic columns
hidden_in_preview = ['level_1', 'region4', 'region9']
all_df.drop(columns=hidden_in_preview).head()

In [None]:
# Data dictionary (variable name -> label) restricted to the variables that survived the
# inner join, using the labels from the first survey's metadata.
shared_vars = set(all_df.columns)
data_dictionary = {
    var_name: var_label
    for var_name, var_label in meta1.column_names_to_labels.items()
    if var_name in shared_vars
}
data_dictionary

# to see only the labels, use: list(data_dictionary.values())

### Get a subset of interesting variables that let us look at stigma trajectory over time
- check for missing
- impute missing stigma scale score values with median per timepoint
- impute missing personaluse_ever status with mode, 'No'

In [None]:
# Keep only the variables used in the stigma-trajectory analysis.
# .copy() makes sub_df_1 an independent frame: the cells below fill in and add columns on it,
# and those writes should not depend on pandas' chained-assignment check being silenced.
sub_df_1 = all_df[['time-point','weight','stigma_scale_score','age4','region4','personaluse_ever']].copy()

In [None]:
# --- Inspect the subset before imputing ---
# A cell only renders its last expression automatically, so every intermediate check is
# passed to display() (made available by the Jupyter/IPython kernel) to actually show it.

# missing values per column
display(sub_df_1.isnull().sum())

# summary of numeric vars (weight and stigma_scale_score)
display(sub_df_1.describe())

# frequency tables for the categorical vars (age4, region4, personaluse_ever);
# dropna=False keeps missing values as their own row
for cat_col in ['age4', 'region4', 'personaluse_ever']:
    display(sub_df_1[cat_col].value_counts(dropna=False))

# --- Impute missing values ---
# The filled column is assigned back to the frame instead of calling fillna(inplace=True) on a
# column selection: the in-place form writes to a temporary object and leaves the frame
# untouched when pandas Copy-on-Write is active.

# impute missing stigma scale score values as the median score by survey time-point
timepoint_median = sub_df_1.groupby('time-point')['stigma_scale_score'].transform('median')
sub_df_1['stigma_scale_score'] = sub_df_1['stigma_scale_score'].fillna(timepoint_median)
display(sub_df_1.isnull().sum())

# replace missing values of personaluse_ever with mode value of 'No'
sub_df_1['personaluse_ever'] = sub_df_1['personaluse_ever'].fillna('No')
display(sub_df_1.isnull().sum())

# print the df with geo info hidden
sub_df_1_print = sub_df_1.copy()
sub_df_1_print['region4'] = sub_df_1_print['region4'].replace(['Midwest','West','Northeast','South'],'hidden')

sub_df_1_print

### Unweighted and weighted stigma scale score over time
- full sample; sum of weights normalized to count of individuals
- by personal use status
- by geographic region
- by age group

In [None]:
# Per-respondent weighted score: the stigma score multiplied by the survey weight.
sub_df_1['w_stigma_scale_score'] = sub_df_1['stigma_scale_score'].mul(sub_df_1['weight'])

# Per time-point: number of weights, sum of weights, mean raw score and mean weighted score.
# Showing count next to sum lets the reader check that the weights add up to the number of
# respondents, which is what makes the plain mean of score*weight a weighted mean.
per_timepoint_aggs = {
    'weight': ['count', 'sum'],
    'stigma_scale_score': 'mean',
    'w_stigma_scale_score': 'mean',
}
stigma_all_df = sub_df_1.groupby('time-point').agg(per_timepoint_aggs)

# flatten the (variable, statistic) column pairs into names such as "weight_count"
stigma_all_df.columns = [
    '_'.join(col_levels).rstrip('_') for col_levels in stigma_all_df.columns
]
stigma_all_df

In [None]:
# sum of weights will not be equal to count of individuals when we look at sub-groups of the full population; 
# will have to formally calculate weighted average


def weighted_mean(x, **kws):
    """Return the weighted mean of a collection of (value, weight) pairs.

    Written to be passed as a custom ``estimator``: each observation carries its own
    weight, so value and weight travel together as one pair.

    Parameters
    ----------
    x : iterable of (value, weight) pairs
        For example a list of 2-tuples or an array whose rows are ``[value, weight]``.
    **kws
        Accepted and ignored, so callers may pass extra keyword arguments.

    Returns
    -------
    float
        ``sum(value * weight) / sum(weight)``; NaN when ``x`` is empty.
    """
    pairs = list(x)
    if not pairs:
        # nothing to average; return NaN instead of failing on tuple unpacking
        return np.nan
    # split the pairs into one array of values and one array of weights
    val, weight = map(np.asarray, zip(*pairs))
    return (val * weight).sum() / weight.sum()

# Pair each row's score with its weight so a single column can carry both to weighted_mean.
sub_df_1["score_and_weight"] = [
    (score, wt)
    for score, wt in zip(sub_df_1["stigma_scale_score"], sub_df_1["weight"])
]

In [None]:
# Two side-by-side panels on a shared y-axis: unweighted (left) and weighted (right).
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, sharey=True)
fig.suptitle('Stigma Scale Score over time', fontsize='x-large', fontweight='bold')

# NOTE(review): the panels share their y-axis and invert_yaxis() toggles the current
# orientation, so the second call below may undo the first -- confirm the intended
# orientation against the rendered figure.

# left panel: raw scores, seaborn's default estimator
sns.lineplot(data=sub_df_1, x='time-point', y='stigma_scale_score', ax=ax1)
ax1.set_title('Unweighted')
ax1.set_ylabel('Stigma Scale Score')
ax1.invert_yaxis()

# right panel: (score, weight) pairs aggregated with the weighted_mean estimator
sns.lineplot(data=sub_df_1, x='time-point', y='score_and_weight',
             estimator=weighted_mean, ax=ax2)
ax2.set_title('Weighted')
ax2.invert_yaxis()

# leave room under the figure title
fig.subplots_adjust(top=0.80)
plt.show()

### Stigma scale score by personal use ever status

In [None]:
# Unweighted summary statistics of the stigma scale score per time-point and personal-use status
(
    sub_df_1
    .groupby(['time-point', 'personaluse_ever'])['stigma_scale_score']
    .describe(percentiles=[0.25, 0.5, 0.75])
)

In [None]:
# Two side-by-side panels on a shared y-axis, one line per personal-use status.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, sharey=True)
fig.suptitle('Stigma Scale Score over time,\nby Personal Use Status', fontsize='x-large', fontweight='bold')

# arguments common to both panels
by_group = dict(data=sub_df_1, x='time-point', hue='personaluse_ever')

# NOTE(review): the panels share their y-axis and invert_yaxis() toggles the current
# orientation, so the second call below may undo the first -- confirm.

# left panel: raw scores, seaborn's default estimator
sns.lineplot(ax=ax1, y='stigma_scale_score', **by_group)
ax1.set_title('Unweighted')
ax1.set_ylabel('Stigma Scale Score')
ax1.invert_yaxis()

# right panel: (score, weight) pairs aggregated with the weighted_mean estimator
sns.lineplot(ax=ax2, y='score_and_weight', estimator=weighted_mean, **by_group)
ax2.set_title('Weighted')
ax2.invert_yaxis()

# replace the per-panel legends with a single frameless legend centred under the two panels
for panel_ax in (ax1, ax2):
    panel_ax.legend().remove()
leg = ax1.legend(loc='center', bbox_to_anchor=(1.1, -0.25), ncol=2, frameon=False, shadow=False)

# leave room under the two-line figure title
fig.subplots_adjust(top=0.80)
plt.show()

### Stigma scale score by geographic region

In [None]:
# Unweighted summary statistics of the stigma scale score per time-point and region
(
    sub_df_1
    .groupby(['time-point', 'region4'])['stigma_scale_score']
    .describe(percentiles=[0.25, 0.5, 0.75])
)

In [None]:
# Two side-by-side panels on a shared y-axis, one line per geographic region.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, sharey=True)
fig.suptitle('Stigma Scale Score over time,\nby Geographic Region', fontsize='x-large', fontweight='bold')

# arguments common to both panels
by_group = dict(data=sub_df_1, x='time-point', hue='region4')

# NOTE(review): the panels share their y-axis and invert_yaxis() toggles the current
# orientation, so the second call below may undo the first -- confirm.

# left panel: raw scores, seaborn's default estimator
sns.lineplot(ax=ax1, y='stigma_scale_score', **by_group)
ax1.set_title('Unweighted')
ax1.set_ylabel('Stigma Scale Score')
ax1.invert_yaxis()

# right panel: (score, weight) pairs aggregated with the weighted_mean estimator
sns.lineplot(ax=ax2, y='score_and_weight', estimator=weighted_mean, **by_group)
ax2.set_title('Weighted')
ax2.invert_yaxis()

# replace the per-panel legends with a single frameless legend centred under the two panels
for panel_ax in (ax1, ax2):
    panel_ax.legend().remove()
leg = ax1.legend(loc='center', bbox_to_anchor=(1.1, -0.25), ncol=4, frameon=False, shadow=False)

# leave room under the two-line figure title
fig.subplots_adjust(top=0.80)
plt.show()

### Stigma scale score by age group

In [None]:
# Unweighted summary statistics of the stigma scale score per time-point and age group
(
    sub_df_1
    .groupby(['time-point', 'age4'])['stigma_scale_score']
    .describe(percentiles=[0.25, 0.5, 0.75])
)

In [None]:
# Two side-by-side panels on a shared y-axis, one line per age group.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, sharey=True)
fig.suptitle('Stigma Scale Score over time,\nby Age Group', fontsize='x-large', fontweight='bold')

# arguments common to both panels
by_group = dict(data=sub_df_1, x='time-point', hue='age4')

# NOTE(review): the panels share their y-axis and invert_yaxis() toggles the current
# orientation, so the second call below may undo the first -- confirm.

# left panel: raw scores, seaborn's default estimator
sns.lineplot(ax=ax1, y='stigma_scale_score', **by_group)
ax1.set_title('Unweighted')
ax1.set_ylabel('Stigma Scale Score')
ax1.invert_yaxis()

# right panel: (score, weight) pairs aggregated with the weighted_mean estimator
sns.lineplot(ax=ax2, y='score_and_weight', estimator=weighted_mean, **by_group)
ax2.set_title('Weighted')
ax2.invert_yaxis()

# replace the per-panel legends with a single frameless legend centred under the two panels
for panel_ax in (ax1, ax2):
    panel_ax.legend().remove()
leg = ax1.legend(loc='center', bbox_to_anchor=(1.1, -0.25), ncol=4, frameon=False, shadow=False)

# leave room under the two-line figure title
fig.subplots_adjust(top=0.80)
plt.show()