### Radiology notes

```
awk 'BEGIN {FS=","; OFS="\t"} /^[0-9]{4,}/ {print $1,$2,$3,$4,$5,$6,$7}' radiology.csv > radiology_meta.tsv
```


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the note metadata that was extracted from radiology.csv. The TSV has no
# header row, so the seven column names are assigned explicitly.
# Open question carried over from the original author: want 2,321,355 rows - missing 1?
data = pd.read_csv('/Users/anya/Documents/ML4HC/mimic-iv-3.1/notes/radiology_meta.tsv', sep='\t', header=None)
data.columns = ['note_id', 'subject_id', 'hadm_id', 'note_type', 'note_seq', 'chart_time', 'store_time']

# Keep only rows whose note_type is 'RR' or 'AR'. `astype({...})` returns a new
# frame, so no column is assigned onto a filtered slice (the original
# `data['subject_id'] = ...` on the slice triggers SettingWithCopyWarning).
data = data[data['note_type'].isin(['RR', 'AR'])].astype({'subject_id': int})
print(data.shape)  # shape before rows with missing values are dropped

# Drop every row with a missing value in any column; hadm_id must be free of
# NaN before it can be cast to int.
data = data.dropna().astype({'hadm_id': int})
data

In [None]:
# Read the matched cohort table and strip its note-level columns so that only
# the remaining (non-note) columns are kept, then save that table to disk.
# NOTE(review): if the input holds one row per note, removing the note columns
# may leave repeated (subject_id, hadm_id) rows, which the later merge on those
# keys would multiply - confirm whether de-duplication is needed.
cohort = (
    pd.read_csv('data/7_matched_demo_notes_tabs.csv')
    .drop(columns=['note_id', 'note_type', 'note_seq', 'chart_time', 'store_time'])
)
cohort.to_csv('data/8_final_cohort-demo_comorb.csv', index=False)
cohort

In [None]:
# Attach note metadata to the cohort. The inner join keeps only
# (subject_id, hadm_id) pairs that occur in both frames.
notes = cohort.merge(data, how='inner', on=['subject_id', 'hadm_id'])
print(notes.shape)

# Remove any merged row that still has a missing value, and show the new shape.
notes = notes.dropna()
print(notes.shape)

# Number of distinct subjects with case_status == 1, then with case_status == 0.
print(len(notes.loc[notes['case_status'] == 1, 'subject_id'].unique()))
print(len(notes.loc[notes['case_status'] == 0, 'subject_id'].unique()))

# Row counts per case_status value.
print(notes['case_status'].value_counts())
notes.shape

In [None]:
# Intended filter (currently DISABLED): keep only notes from before the AD
# diagnosis for AD cases, and all notes for controls.
# notes_filt = notes[notes['ad'] == 0]
# With the filter commented out, notes_filt is simply another name for the
# same `notes` object (no copy, no rows removed).
notes_filt = notes

# Distinct subjects with case_status == 1, then with case_status == 0.
print(len(notes_filt.loc[notes_filt['case_status'] == 1, 'subject_id'].unique()))
print(len(notes_filt.loc[notes_filt['case_status'] == 0, 'subject_id'].unique()))

# Row counts per case_status value.
print(notes_filt['case_status'].value_counts())
notes_filt.shape

In [None]:
# Split the merged notes by case_status: rows with 1 go to `cases`,
# rows with 0 go to `ctrls`.
cases = notes.loc[notes['case_status'].eq(1)]
ctrls = notes.loc[notes['case_status'].eq(0)]

In [None]:
# For cases, how many have 0 notes from when they are not diagnosed with AD
# (i.e., how many patients have no ad=0 row)?
# Build a per-subject table of row counts for each value of `ad`, then
# tabulate the ad=0 counts; the entry for 0 is the number of such patients.
ad_counts = (
    cases.groupby('subject_id')['ad']
    .value_counts()
    .unstack(fill_value=0)
    # Guarantee both columns exist, in a fixed order. Without this, a cohort
    # in which only one `ad` value occurs yields a single column and the
    # two-name assignment below raises a length-mismatch ValueError.
    # Assumes `ad` is coded as 0/1 - TODO confirm.
    .reindex(columns=[0, 1], fill_value=0)
)
ad_counts.columns = ['ad0', 'ad1']
ad_counts['ad0'].value_counts()

In [None]:
# Summary statistics of the number of rows per subject, for case_status == 1.
notes_filt.loc[notes_filt['case_status'] == 1, 'subject_id'].value_counts().describe()

In [None]:
# Summary statistics of the number of rows per subject, for case_status == 0.
notes_filt.loc[notes_filt['case_status'] == 0, 'subject_id'].value_counts().describe()

In [None]:
# Number of rows per subject, keeping case_status next to each subject.
counts = notes_filt.groupby(['case_status', 'subject_id']).size().reset_index(name='count')
# Number of subjects sharing each (case_status, count) combination.
freq = counts.groupby(['case_status', 'count']).size().reset_index(name='num_subjects')

fig, ax = plt.subplots(figsize=(8, 5))
# seaborn 0.13 deprecates passing `palette` without `hue`. Mapping hue to the
# x variable keeps one colour per category; dodge=False keeps a single
# full-width box per category on older seaborn versions as well.
sns.boxplot(data=counts, x='case_status', y='count', hue='case_status',
            palette='Set2', dodge=False, ax=ax)
# The hue mapping can add a legend that only repeats the x tick labels.
legend = ax.get_legend()
if legend is not None:
    legend.remove()
ax.set_title('Distribution of Subject Occurrence Counts by Case Status')
# ax.set_yscale('log')
ax.set_xlabel('Case Status')
ax.set_ylabel('Occurrences per Subject')
fig.tight_layout()
plt.show()

In [None]:
# Flag subjects that occur in more than one row, then cross-tabulate that
# flag against case_status.
counts['many'] = counts['count'].gt(1)
pd.crosstab(counts['many'], counts['case_status'])

In [None]:
# Per-subject span of admission dates across the rows in notes_filt.
# Work on a copy so the added `admitdate` column does not leak into notes_filt.
duration = notes_filt.copy()
# pd.to_datetime assembles a date from columns named year / month / day,
# hence the rename of the three admit* columns.
duration['admitdate'] = pd.to_datetime(
    duration[['admityear', 'admitmonth', 'admitday']].rename(
        columns={'admityear': 'year', 'admitmonth': 'month', 'admitday': 'day'}))
# Collapse to one row per subject: earliest and latest admitdate, the number
# of days between them, and the subject's first case_status value.
duration = duration.groupby('subject_id').agg(
    first_date=('admitdate', 'min'),
    last_date=('admitdate', 'max'),  # column was misspelled `laste_date`
    duration_days=('admitdate', lambda x: (x.max() - x.min()).days),
    case_status=('case_status', 'first'))
print(duration['case_status'].value_counts())

In [None]:
# Box plot of the per-subject day span (duration_days, derived from admission
# dates) for each case_status value.
fig, ax = plt.subplots(figsize=(8, 5))
# seaborn 0.13 deprecates passing `palette` without `hue`. Mapping hue to the
# x variable keeps one colour per category; dodge=False keeps a single
# full-width box per category on older seaborn versions as well.
sns.boxplot(data=duration, x='case_status', y='duration_days', hue='case_status',
            palette='Set2', dodge=False, ax=ax)
# The hue mapping can add a legend that only repeats the x tick labels.
legend = ax.get_legend()
if legend is not None:
    legend.remove()
# Title previously said "Discharge Summaries"; this notebook works on the
# radiology note table, so the label is corrected.
ax.set_title('Distribution of Day Span Between Radiology Notes by Case Status')
# ax.set_yscale('log')
ax.set_xlabel('Case Status')
ax.set_ylabel('Span in Days')
fig.tight_layout()
plt.show()

In [None]:
# Summary statistics of duration_days for subjects with case_status == 0.
duration.loc[duration['case_status'] == 0, 'duration_days'].describe()

In [None]:
# Summary statistics of duration_days for subjects with case_status == 1.
duration.loc[duration['case_status'] == 1, 'duration_days'].describe()

In [None]:
# Write notes_filt to CSV with a header row and without the index column.
notes_filt.to_csv('data/9_radiology_notes.csv', index=False)