In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer

# Read in Datasets

In [None]:
from google.colab import files

# Prompt for a manual file selection and upload (google.colab helper, so this
# cell is Colab-specific). The read_csv cells below use bare relative
# filenames, so presumably both CSV inputs are meant to be uploaded here.
uploaded = files.upload()


In [None]:
# Read in the matched updated cohort.
# Columns this notebook relies on later: 'subject_id', 'adrd', 'language',
# and an 'Unnamed: 0' column that is dropped further down.
df_cohort = pd.read_csv("matched_ad_only_cohort.csv")
df_cohort.head()

In [None]:
# Display the cohort's subject_id column; these ids are used below to restrict
# the diagnoses table to cohort members.
df_cohort['subject_id']

In [None]:
# Read diagnoses CSV file into a DataFrame.
# icd_code is pinned to str: left to type inference, numeric-looking ICD-9
# codes can be parsed as integers, which drops leading zeros and breaks the
# string prefix matching performed on this column later in the notebook.
df_icd = pd.read_csv(
    "diagnoses_icd.csv.gz",
    compression='gzip',
    dtype={'icd_code': str},
)
df_icd.head()

# Add Relevant Clinical Data

In [None]:
# List the cohort's columns before the comorbidity indicator columns are added.
df_cohort.columns

In [14]:
# ICD code prefixes for each condition (partial lists, adjust as needed).
# Prefixes are written without dots; digit-led entries are ICD-9 prefixes and
# letter-led entries are ICD-10 prefixes.
stroke_codes = [
    '430', '431', '432', '433', '434', '436',
    'I63', 'I64',
]

# Myocardial infarction
mi_codes = ['410', '412', 'I21', 'I22']

# Peripheral vascular disease
pvd_codes = ['4439', '440', 'I73', 'I70']

# Cerebrovascular disease = every stroke prefix plus TIA
cvd_codes = [*stroke_codes, '435']

diabetes_codes = ['250', 'E08', 'E09', 'E10', 'E11', 'E13']

# ICD-9 140-209 as three-digit prefixes, plus any ICD-10 code starting with 'C'
cancer_codes = list(map(str, range(140, 210)))
cancer_codes.append('C')

def flag_condition(df_icd, cohort_ids, icd_list, colname):
    """Flag which cohort subjects have at least one diagnosis in a code set.

    Parameters
    ----------
    df_icd : pandas.DataFrame
        Diagnoses table with 'subject_id' and 'icd_code' columns.
    cohort_ids : pandas.Series or list-like
        Subject ids of the cohort. The result has one entry per element and,
        when a Series is passed, keeps that Series' index so it can be
        assigned straight back onto the cohort DataFrame.
    icd_list : iterable of str
        ICD code prefixes. A diagnosis matches when its normalized code
        (dots removed, upper-cased) starts with any of these prefixes.
    colname : str
        Name given to the returned Series.

    Returns
    -------
    pandas.Series
        Integer 0/1 indicator named ``colname``, aligned with ``cohort_ids``.
    """
    # Build the result from the ids that were passed in rather than from a
    # notebook-level DataFrame, so the function has no hidden dependencies.
    if not isinstance(cohort_ids, pd.Series):
        cohort_ids = pd.Series(cohort_ids)

    # Only consider diagnoses for patients in the cohort; rows without a code
    # cannot match any prefix and would otherwise break the string handling.
    in_cohort = df_icd['subject_id'].isin(cohort_ids)
    df_subset = df_icd.loc[in_cohort, ['subject_id', 'icd_code']].dropna(subset=['icd_code'])

    # Normalize ICD codes: remove dots, uppercase
    icd_clean = df_subset['icd_code'].astype(str).str.replace('.', '', regex=False).str.upper()

    # str.startswith accepts a tuple, so one call tests every prefix at once.
    # Prefixes get the same normalization as the codes they are compared with.
    prefixes = tuple(str(code).replace('.', '').upper() for code in icd_list)
    condition_mask = icd_clean.map(lambda code: code.startswith(prefixes)).astype(bool)

    # Get unique subject_ids with this condition
    matching_subjects = df_subset.loc[condition_mask, 'subject_id'].unique()

    # Binary indicator per cohort subject
    return pd.Series(cohort_ids.isin(matching_subjects).astype(int), name=colname)

cohort_ids = df_cohort['subject_id']

# Output column name -> ICD prefix set. Dict order is the order in which the
# indicator columns are appended to df_cohort.
condition_code_sets = {
    'Stroke_History': stroke_codes,
    'Myocardial_Infarction': mi_codes,
    'Peripheral_Vascular_Disease': pvd_codes,
    'Cerebrovascular_Disease': cvd_codes,
    'Diabetes_Mellitus': diabetes_codes,
    'Cancer': cancer_codes,
}

# One 0/1 indicator column per condition
for condition_col, condition_prefixes in condition_code_sets.items():
    df_cohort[condition_col] = flag_condition(df_icd, cohort_ids, condition_prefixes, condition_col)


In [None]:
# Re-list the columns to confirm the six condition indicator columns were added.
df_cohort.columns

In [16]:
# Drop the 'Unnamed: 0' column (presumably an index saved by an earlier CSV
# export - TODO confirm). Reassigning with errors='ignore' keeps this cell
# re-runnable: the previous in-place drop raised KeyError on a second run.
df_cohort = df_cohort.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Count missing values per column.
df_cohort.isna().sum()

In [18]:
# Fill missing 'language' entries with the column's most frequent value
# (the first mode when several values tie).
language_values = df_cohort['language']
mode_language = language_values.mode()[0]
df_cohort['language'] = language_values.fillna(mode_language)

In [19]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tabulate import tabulate
import warnings
# NOTE(review): this silences every warning raised after this point, including
# pandas/seaborn deprecation notices - consider narrowing the filter.
warnings.filterwarnings('ignore')

# Apply the seaborn theme first: sns.set() resets matplotlib rcParams
# (font sizes included), so the publication-style font overrides have to be
# applied afterwards or they are silently discarded.
sns.set(style="whitegrid")
plt.rcParams.update({'font.size': 12, 'axes.titlesize': 14})

# Independent copies of the cohort split on the 'adrd' indicator:
# adrd == 1 -> treated, adrd == 0 -> control, plus the full cohort.
df_treat = df_cohort[df_cohort['adrd'] == 1].copy()
df_ctrl = df_cohort[df_cohort['adrd'] == 0].copy()
df_all = df_cohort.copy()


In [20]:
def summarize_binary_or_categorical(df, feature):
    """Count occurrences of each value in ``df[feature]``.

    Missing values get their own entry. The returned Series is indexed by
    value and ordered by that index rather than by frequency.
    """
    frequencies = df[feature].value_counts(dropna=False)
    return frequencies.sort_index()

def summarize_numeric(df, feature):
    """Return descriptive statistics for the numeric column ``df[feature]``.

    The result is a dict with keys, in order: 'mean', 'std', 'min', '25%',
    'median', '75%', 'max'.
    """
    column = df[feature]

    # (label, zero-argument callable) pairs; list order sets the dict key order.
    reducers = [
        ('mean', column.mean),
        ('std', column.std),
        ('min', column.min),
        ('25%', lambda: column.quantile(0.25)),
        ('median', column.median),
        ('75%', lambda: column.quantile(0.75)),
        ('max', column.max),
    ]
    return {label: compute() for label, compute in reducers}


In [None]:
# Features that each get a grouped count plot (bars split by ADRD status)
plot_features = [
    'gender', 'admission_type', 'insurance',
    'language', 'marital_status', 'race',
    'Stroke_History', 'Myocardial_Infarction', 'Peripheral_Vascular_Disease',
    'Cerebrovascular_Disease', 'Diabetes_Mellitus', 'Cancer'
]

for feature in plot_features:
    # Cast the feature to str so numeric 0/1 flags are plotted as categories
    plot_data = df_cohort[[feature, 'adrd']].astype({feature: str})

    fig, ax = plt.subplots(figsize=(8, 4))
    sns.countplot(data=plot_data, x=feature, hue='adrd', ax=ax)
    ax.set_title(f"{feature} Distribution by ADRD Status")
    ax.set_xlabel(feature)
    ax.set_ylabel("Count")
    ax.tick_params(axis='x', labelrotation=30)
    fig.tight_layout()
    plt.show()


In [None]:
# Comorbidities plot: share of each group (in percent) with each binary flag set
binary_cols = ['Stroke_History', 'Myocardial_Infarction', 'Peripheral_Vascular_Disease',
               'Cerebrovascular_Disease', 'Diabetes_Mellitus', 'Cancer']

# Legend label -> cohort subset; treated rows come first, then control rows
prevalence_groups = {'Treated %': df_treat, 'Control %': df_ctrl}

# Long-format table: one row per (group, feature) pair
df_plot = pd.DataFrame(
    [
        {'Feature': col, 'Group': group_label, 'Percentage': group_df[col].mean() * 100}
        for group_label, group_df in prevalence_groups.items()
        for col in binary_cols
    ],
    columns=['Feature', 'Group', 'Percentage'],
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df_plot, x='Percentage', y='Feature', hue='Group', ax=ax)
ax.set_title("Comorbidity Prevalence by Group")
fig.tight_layout()
plt.show()


In [None]:
# List the available columns before choosing which features to summarize.
df_cohort.columns

In [24]:
# Features summarized as category counts: demographic/admission fields first,
# then the 0/1 comorbidity indicators.
demographic_features = [
    'gender', 'admission_type', 'insurance', 'language',
    'marital_status', 'race', 'race_group', 'language_group',
]
comorbidity_features = [
    'Stroke_History', 'Myocardial_Infarction', 'Peripheral_Vascular_Disease',
    'Cerebrovascular_Disease', 'Diabetes_Mellitus', 'Cancer',
]
categorical_features = demographic_features + comorbidity_features

# Features summarized with descriptive statistics
numeric_features = ['age', 'admityear']

In [None]:
from tabulate import tabulate

# Cohort subsets reported side by side; the keys double as table column headers.
summary_groups = {'Treated': df_treat, 'Control': df_ctrl, 'Overall': df_all}


def _count_and_share(counts, category, group_size):
    """Format a category's count as 'n (p%)', with p relative to group_size."""
    n = counts.get(category, 0)
    return f"{n} ({100 * n / group_size:.1f}%)"


# 1. Categorical features
for feature in categorical_features:
    counts_by_group = {
        label: summarize_binary_or_categorical(group_df, feature)
        for label, group_df in summary_groups.items()
    }

    # Ensure consistent categories: union of the values seen in any group,
    # ordered by their string form so mixed types can be compared.
    observed = set()
    for counts in counts_by_group.values():
        observed |= set(counts.index)
    keys = sorted(observed, key=str)

    table = [
        {
            'Category': val,
            **{
                label: _count_and_share(counts_by_group[label], val, len(group_df))
                for label, group_df in summary_groups.items()
            },
        }
        for val in keys
    ]

    print(f"\n📊 Summary of: {feature}")
    print(tabulate(table, headers='keys', tablefmt='fancy_grid'))

# 2. Numeric features
stat_order = ['mean', 'std', 'min', '25%', 'median', '75%', 'max']

for feature in numeric_features:
    stats_by_group = {
        label: summarize_numeric(group_df, feature)
        for label, group_df in summary_groups.items()
    }

    # One row per statistic, rounded to two decimals for display
    table = [
        {
            'Statistic': stat,
            **{label: round(stats[stat], 2) for label, stats in stats_by_group.items()},
        }
        for stat in stat_order
    ]

    print(f"\n📊 Summary of: {feature}")
    print(tabulate(table, headers='keys', tablefmt='fancy_grid'))


In [27]:
# Write the updated cohort to a CSV file in the working directory, without the
# DataFrame index (this only saves the file; it does not download it).
df_cohort.to_csv("df_cohort.csv", index=False)


In [None]:
from google.colab import files

# Pass the uncompressed "df_cohort.csv" (the filename written by the to_csv
# call above) to the google.colab download helper. Colab-specific.
files.download("df_cohort.csv")

