In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the matched cohort table and tally its rows per value of the 'pheno' column.
matched = pd.read_csv(filepath_or_buffer='data/5_matched_pheno.csv')
matched.loc[:, 'pheno'].value_counts()

In [None]:
# Read the demographics/diagnosis/notes table and keep only the rows whose
# subject_id occurs in the matched cohort.
data = (
    pd.read_csv('data/4_demographics_diagnosis_dsnotes.csv')
    .loc[lambda df: df['subject_id'].isin(matched['subject_id'])]
)
# Number of distinct subjects with case_status == 1.
print(data.loc[data['case_status'] == 1, 'subject_id'].nunique(dropna=False))
# Row counts per case_status value.
print(data['case_status'].value_counts())
data

## Demographics Tables

In [None]:
# Restrict `data` to the rows whose (subject_id, hadm_id) pair appears in `matched`.
matched_keys = set(zip(matched['subject_id'], matched['hadm_id']))
# Vectorised membership test on the key pair. This replaces a row-wise
# DataFrame.apply(axis=1), which ran a Python lambda once per row.
# NOTE(review): assumes neither key column contains missing values — TODO confirm.
in_matched = pd.MultiIndex.from_frame(data[['subject_id', 'hadm_id']]).isin(list(matched_keys))
demo = data[in_matched]
print(demo.shape)
demo.columns

In [None]:
# Cross-tabulate case_status (rows) against adrd_status (columns) over `demo`.
table = pd.crosstab(index=demo['case_status'], columns=demo['adrd_status'])
# Rows in descending order of count; columns are used left to right as sort keys.
table_sorted = table.sort_values(by=list(table.columns), ascending=False)
print(table_sorted)

### Race

In [None]:
# Row counts per race_group1 category, split by case_status.
table = pd.crosstab(index=demo['race_group1'], columns=demo['case_status'])
# Rows in descending order of count; columns are used left to right as sort keys.
table_sorted = table.sort_values(by=list(table.columns), ascending=False)
table_sorted

In [None]:
# Row counts per race_group2 category, split by case_status.
table = pd.crosstab(index=demo['race_group2'], columns=demo['case_status'])
# Rows in descending order of count; columns are used left to right as sort keys.
table_sorted = table.sort_values(by=list(table.columns), ascending=False)
table_sorted

In [None]:
# Row counts per race_group3 category, split by case_status.
table = pd.crosstab(index=demo['race_group3'], columns=demo['case_status'])
# Rows in descending order of count; columns are used left to right as sort keys.
table_sorted = table.sort_values(by=list(table.columns), ascending=False)
table_sorted

In [None]:
# Row counts per race_group4 category, split by case_status.
table = pd.crosstab(index=demo['race_group4'], columns=demo['case_status'])
# Rows in descending order of count; columns are used left to right as sort keys.
table_sorted = table.sort_values(by=list(table.columns), ascending=False)
table_sorted

#### Filter based on race

In [None]:
# race_group1 categories to retain.
races = ['White', 'African American', 'Hispanic/Latino', 'Asian', 'Other']
# subject_id of every row whose race_group1 is one of the retained categories.
keep_race = demo.loc[demo['race_group1'].isin(races), 'subject_id']
# The filter works at subject level: all rows of a retained subject are kept.
# NOTE(review): a subject with mixed race_group1 values across rows would keep
# rows outside `races` as well — confirm this is intended.
demo_filt = demo.loc[demo['subject_id'].isin(keep_race)]
demo_filt.shape

In [None]:
# Cross-tabulate case_status (rows) against adrd_status (columns) after the race filter.
table = pd.crosstab(index=demo_filt['case_status'], columns=demo_filt['adrd_status'])
# Rows in descending order of count; columns are used left to right as sort keys.
table_sorted = table.sort_values(by=list(table.columns), ascending=False)
print(table_sorted)

### Gender

In [None]:
# Row counts per gender value, split by case_status.
table = pd.crosstab(index=demo_filt['gender'], columns=demo_filt['case_status'])
# Rows in descending order of count; columns are used left to right as sort keys.
table_sorted = table.sort_values(by=list(table.columns), ascending=False)
table_sorted

### Age

In [None]:
# Box plot of age for each case_status value; hue mirrors x so the palette
# assigns one colour per box.
ax = sns.boxplot(data=demo_filt, x='case_status', y='age', hue='case_status', palette='Set2')
ax.set_title('Age distribution by case/control status')
ax.set_xlabel('Case Status')
ax.set_ylabel('Age')
plt.show()

In [None]:
# Number of rows with case_status == 1 and age below 65.
int(((demo_filt['case_status'] == 1) & (demo_filt['age'] < 65)).sum())

### Miscellaneous

In [None]:
# Row counts per marital_status value, split by case_status.
table = pd.crosstab(index=demo_filt['marital_status'], columns=demo_filt['case_status'])
# Rows in descending order of count; columns are used left to right as sort keys.
table_sorted = table.sort_values(by=list(table.columns), ascending=False)
table_sorted

In [None]:
# Row counts per insurance_group value, split by case_status.
table = pd.crosstab(index=demo_filt['insurance_group'], columns=demo_filt['case_status'])
# Rows in descending order of count; columns are used left to right as sort keys.
table_sorted = table.sort_values(by=list(table.columns), ascending=False)
table_sorted

In [None]:
# Row counts per language_group value, split by case_status.
table = pd.crosstab(index=demo_filt['language_group'], columns=demo_filt['case_status'])
# Rows in descending order of count; columns are used left to right as sort keys.
table_sorted = table.sort_values(by=list(table.columns), ascending=False)
table_sorted

In [None]:
# Row counts per admission_type value, split by case_status.
table = pd.crosstab(index=demo_filt['admission_type'], columns=demo_filt['case_status'])
# Rows in descending order of count; columns are used left to right as sort keys.
table_sorted = table.sort_values(by=list(table.columns), ascending=False)
table_sorted

### Interactions

In [None]:
def plot_age_by_feature(frame, feature, hue='case_status'):
    """Draw a box plot of age grouped by `feature`, with boxes split by `hue`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table containing an 'age' column plus the `feature` and `hue` columns.
    feature : str
        Column placed on the x axis; also used in the title and x label.
    hue : str, default 'case_status'
        Column used to split and colour the boxes within each x category.
    """
    # A fresh figure per call keeps successive plots from sharing axes.
    fig, ax = plt.subplots()
    sns.boxplot(x=feature, y='age', data=frame, palette='Set2', hue=hue, ax=ax)
    ax.set_title('Age distribution by %s' % feature)
    ax.set_xlabel(feature.title())
    ax.set_ylabel('Age')
    # Place the legend outside the axes so it does not cover the boxes.
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.show()


# The same age-by-feature plot was previously copy-pasted into six separate
# cells; it is now drawn once per feature, in the original order.
for feature in ['gender', 'marital_status', 'race_group1',
                'insurance_group', 'admission_type', 'language_group']:
    plot_age_by_feature(demo_filt, feature)

In [None]:
# Split the race-filtered demographics by case_status: 1 -> cases, 0 -> ctrls.
cases = demo_filt.loc[demo_filt['case_status'].eq(1)]
ctrls = demo_filt.loc[demo_filt['case_status'].eq(0)]

In [None]:
# Feature cross-tabulated against gender in this cell; the controls cell that
# follows reads the same `feature` variable.
feature = 'marital_status'
print('CASES')
pd.crosstab(index=cases[feature], columns=cases['gender'])

In [None]:
# Same cross-tabulation for the controls, using whichever `feature` was set last.
print('CONTROLS')
pd.crosstab(index=ctrls[feature], columns=ctrls['gender'])

In [None]:
# Feature cross-tabulated against gender in this cell; the controls cell that
# follows reads the same `feature` variable.
feature = 'race_group1'
print('CASES')
pd.crosstab(index=cases[feature], columns=cases['gender'])

In [None]:
# Same cross-tabulation for the controls, using whichever `feature` was set last.
print('CONTROLS')
pd.crosstab(index=ctrls[feature], columns=ctrls['gender'])

## Clinical Notes

### Discharge Notes

In [None]:
# Keep rows of `data` that have no missing value in any column and whose
# subject passed the race filter.
# NOTE(review): dropna() with no `subset` drops a row if ANY column is missing,
# not only the note text — confirm this is intended.
notes = data.dropna().loc[lambda df: df['subject_id'].isin(keep_race)]
# Distinct subjects among cases, then among controls.
print(notes.loc[notes['case_status'] == 1, 'subject_id'].nunique(dropna=False))
print(notes.loc[notes['case_status'] == 0, 'subject_id'].nunique(dropna=False))
# Row counts per case_status value.
print(notes['case_status'].value_counts())
notes.shape

In [None]:
# Intended filter: only notes from before the AD diagnosis (ad == 0) for cases,
# and all notes for controls. That filter is currently DISABLED (see the
# commented-out line), so notes_filt is the same object as `notes`.
# notes_filt = notes[notes['ad'] == 0]
notes_filt = notes
# Distinct subjects among cases, then among controls.
print(notes_filt.loc[notes_filt['case_status'] == 1, 'subject_id'].nunique(dropna=False))
print(notes_filt.loc[notes_filt['case_status'] == 0, 'subject_id'].nunique(dropna=False))
# Row counts per case_status value.
print(notes_filt['case_status'].value_counts())
notes_filt.shape

In [None]:
# Rebind cases/ctrls to the note-level rows; this overwrites the earlier
# demographics-based split that used the same names.
# NOTE(review): built from `notes`, not `notes_filt` — confirm which is intended.
cases = notes.loc[notes['case_status'].eq(1)]
ctrls = notes.loc[notes['case_status'].eq(0)]

In [None]:
# For each case subject, count rows with ad == 0 and rows with ad == 1, to see
# how many cases have no rows at all from before the AD flag (ad0 == 0).
# NOTE(review): assumes `ad` is coded numerically as 0/1 — TODO confirm.
ad_counts = (
    cases.groupby('subject_id')['ad']
    .value_counts()
    .unstack(fill_value=0)
    # Force exactly the two columns 0 and 1, in that order. Without this, the
    # rename below raises (or mislabels a column) whenever `ad` takes only one
    # value, or more than two values, among the cases.
    .reindex(columns=[0, 1], fill_value=0)
)
ad_counts.columns = ['ad0', 'ad1']
# Distribution over subjects of the number of ad == 0 rows.
ad_counts['ad0'].value_counts()

In [None]:
# Summary statistics of the number of rows per subject among cases.
notes_filt.loc[notes_filt['case_status'] == 1, 'subject_id'].value_counts().describe()

In [None]:
# Summary statistics of the number of rows per subject among controls.
notes_filt.loc[notes_filt['case_status'] == 0, 'subject_id'].value_counts().describe()

In [None]:
# One row per (case_status, subject_id) holding that subject's row count in notes_filt.
counts = notes_filt.groupby(['case_status', 'subject_id']).size().reset_index(name='count')
# Number of subjects at each per-subject count, per case status (not used by the plot below).
freq = counts.groupby(['case_status', 'count']).size().reset_index(name='num_subjects')

plt.figure(figsize=(8, 5))
# seaborn 0.13 deprecates `palette` without `hue`. Colour by the x variable and
# hide the redundant legend, consistent with the other box plots in this notebook.
sns.boxplot(data=counts, x='case_status', y='count',
            hue='case_status', palette='Set2', legend=False)
plt.title('Distribution of Subject Occurrence Counts by Case Status')
# plt.yscale('log')  # optional: log scale for the count axis
plt.xlabel('Case Status')
plt.ylabel('Occurrences per Subject')
plt.tight_layout()
plt.show()

In [None]:
# Flag subjects that have more than one row, then tabulate the flag by case_status.
counts['many'] = counts['count'].gt(1)
pd.crosstab(index=counts['many'], columns=counts['case_status'])

In [None]:
# Per-subject span between the earliest and latest admission date in notes_filt.
# `duration` is now bound only to the per-subject aggregate; it was previously
# bound first to a row-level copy of notes_filt and then overwritten.
duration = (
    notes_filt
    # Assemble a datetime from the separate year/month/day columns.
    .assign(admitdate=lambda df: pd.to_datetime(
        df[['admityear', 'admitmonth', 'admitday']].rename(
            columns={'admityear': 'year', 'admitmonth': 'month', 'admitday': 'day'})))
    .groupby('subject_id')
    .agg(
        first_date=('admitdate', 'min'),
        # Column renamed from the misspelled 'laste_date'.
        last_date=('admitdate', 'max'),
        duration_days=('admitdate', lambda x: (x.max() - x.min()).days),
        # First case_status seen for the subject.
        case_status=('case_status', 'first'))
)
print(duration['case_status'].value_counts())

In [None]:
# Box plot of the per-subject day span, one box per case_status value.
plt.figure(figsize=(8, 5))
# seaborn 0.13 deprecates `palette` without `hue`. Colour by the x variable and
# hide the redundant legend, consistent with the other box plots in this notebook.
sns.boxplot(data=duration, x='case_status', y='duration_days',
            hue='case_status', palette='Set2', legend=False)
plt.title('Distribution of Day Span Between Discharge Summaries by Case Status')
# plt.yscale('log')  # optional: log scale for the span axis
plt.xlabel('Case Status')
plt.ylabel('Span in Days')
plt.tight_layout()
plt.show()

In [None]:
# Summary statistics of the day span for subjects with case_status == 0.
duration.loc[duration['case_status'] == 0, 'duration_days'].describe()

In [None]:
# Summary statistics of the day span for subjects with case_status == 1.
duration.loc[duration['case_status'] == 1, 'duration_days'].describe()

In [None]:
# Write notes_filt to CSV without the index column; the header row is written by default.
notes_filt.to_csv('data/6_matched_demo_notes.csv', index=False)