#### **Output**: matched, cleaned csv files in `workspace/longitudinal_data.csv` and `workspace/longitudinal_events.csv`

# Match data based on study date

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.colors as mcolors
from tqdm import tqdm as tqdm

In [2]:
RAW_DIR = "/storage/groups/ml01/datasets/raw/2018_LMUAugenklinik_niklas.koehler"
DATA_DIR = "/storage/groups/ml01/datasets/projects/20181610_eyeclinic_niklas.koehler"


raw_data_dir = os.path.join(RAW_DIR, 'joint_export/dwh_tables')
clean_data_dir = os.path.join(DATA_DIR, 'joint_export/dwh_tables_cleaned')

In [3]:
os.listdir(clean_data_dir)

['icd10cm_codes_2018.txt',
 'procedures_clean.csv',
 'naive_patients.csv',
 'octs_fundus_no_dups.csv',
 'diagnosis.csv',
 'octs_fundus_with_dups.csv',
 'visus_labels_clean.csv',
 'diagnosis_clean.csv',
 'diagnosis_longitudinal_clean.csv',
 'icd10cm_order_2018.txt']

In [4]:
octs = pd.read_csv(os.path.join(clean_data_dir, 'octs_fundus_no_dups.csv'), index_col=0)
octs.study_date = pd.to_datetime(octs.study_date, format="%Y%m%d")
octs.patient_id = octs.patient_id.astype(str)
octs = octs.sort_values("study_date")


visus = pd.read_csv(os.path.join(clean_data_dir, 'visus_labels_clean.csv'), index_col=0)
visus.study_date = pd.to_datetime(visus.study_date)
visus = visus.sort_values("study_date")
visus.patient_id = visus.patient_id.astype(int).astype(str)

injections = pd.read_csv(os.path.join(clean_data_dir, 'procedures_clean.csv'), index_col=0)
injections.study_date = pd.to_datetime(injections.study_date)
injections.patient_id = injections.patient_id.astype(str)

diagnosis = pd.read_csv(os.path.join(clean_data_dir, 'diagnosis_longitudinal_clean.csv'), index_col=0)
diagnosis = diagnosis.reset_index()
diagnosis.patient_id = diagnosis.patient_id.astype(str)


## Match OCTs, visus, thickness 
- OCTs and thickness based on study date
- visus and OCTs on same date (NEVER after injection)

number of matches increased when not accurately merging on date with with timespa

In [5]:
###### octs

In [6]:
merged = pd.merge(octs, visus, on=['patient_id', 'laterality', 'study_date'], how='outer') # keep all visus labels
print('Result: {} oct/visus pairs'.format(len(merged.dropna(subset=['oct_path', 'visual_acuity']))))

# comment out as it may cause the bug of duplicate records in sequences.csv
print('Merging {} octs with {} injections'.format(len(merged), len(injections)))#
merged = pd.merge(merged, injections, on=['study_date', 'patient_id', 'laterality'], how='outer') # keep all injections
print('Result: {} oct/visus/injections pairs'.format(len(merged.dropna(subset=['oct_path', 'visual_acuity', 'ICPML', 'ICPMK']))))

Result: 178717 oct/visus pairs
Merging 760047 octs with 315216 injections
Result: 30008 oct/visus/injections pairs


In [7]:
merged['thickness_path'] = np.nan

# make results columns nice
merged.loc[:,'oct?'] = ~merged['oct_path'].isna()
merged.loc[:,'visus?'] = ~merged['logMAR'].isna()
merged.loc[:,'thickness?'] = ~merged['thickness_path'].isna()
merged.loc[:,'fundus?'] = ~merged['fundus_path'].isna()

oct_measurements = merged.copy()
# clean up merged table
oct_measurements = oct_measurements.drop(columns=['visual_acuity_raw', 'logMAR_raw', 'visual_acuity_origin'])

In [8]:
disp_cols = ['patient_id', 'laterality', 'study_date', 'oct?', 'visus?', 'thickness?', 'fundus?']
oct_measurements[disp_cols]

Unnamed: 0,patient_id,laterality,study_date,oct?,visus?,thickness?,fundus?
0,280837,R,2012-09-24,True,False,False,False
1,43876,R,2012-09-24,True,True,False,False
2,43876,L,2012-09-24,True,True,False,False
3,241776,R,2012-09-24,True,False,False,False
4,280837,L,2012-09-24,True,False,False,False
...,...,...,...,...,...,...,...
1013115,375294,R,2018-11-19,False,False,False,False
1013116,375374,R,2018-11-19,False,False,False,False
1013117,375414,R,2018-11-26,False,False,False,False
1013118,375579,R,2018-11-21,False,False,False,False


In [9]:
# results statistics
# OCTs
mask = oct_measurements['oct?']
num = sum(mask)
pat = len(oct_measurements[mask].patient_id.unique())
print('OCTs: {} measurements from {} patients'.format(num, pat))

mask = (oct_measurements['oct?']) & (oct_measurements['visus?'])
num = sum(mask)
pat = len(oct_measurements[mask].patient_id.unique())
print('OCTs with va: {} measurements from {} patients'.format(num, pat))

OCTs: 213048 measurements from 34062 patients
OCTs with va: 179158 measurements from 29720 patients


## Prepare data for Times series modeling

### Limit oct measurements to longitudinal patients, add diagnosis

In [10]:
# merge diagnosis label to each patient instance (laterality + study date)
longitudinal_data = pd.merge(oct_measurements, 
                             diagnosis[['patient_id', 'laterality', 'diagnosis']], 
                             on=['patient_id', 'laterality'], 
                             how='inner').drop_duplicates()

In [11]:
longitudinal_data["MED"] = np.nan

### Create events table with injections and surgery dates

In [12]:
longitudinal_patients = diagnosis[['patient_id', 'laterality']].drop_duplicates(['patient_id', 'laterality'])

# add injections
events = pd.merge(longitudinal_patients, injections[['patient_id', 'laterality', 'study_date', 'ICPML']], 
                  on=['patient_id', 'laterality'], how='inner')

events.loc[:,'injection?'] = ~events['ICPML'].isna()

# add lens surgery data
lens_surgery = diagnosis[['patient_id', 'laterality', 'iol_date']].copy()
lens_surgery.rename(columns={'iol_date': 'study_date'}, inplace=True)
lens_surgery['iol?'] = ~pd.to_datetime(lens_surgery.study_date).isna()

lens_surgery = lens_surgery[lens_surgery['iol?']]

lens_surgery.loc[:, "study_date"] = pd.to_datetime(lens_surgery.study_date)

# limit to longitudinal data
lens_surgery = pd.merge(longitudinal_patients, lens_surgery, on=['patient_id', 'laterality'], how='inner').drop_duplicates()

# add to events
events = pd.merge(events, lens_surgery, on=["laterality", "patient_id", "study_date"], how="outer")

# we do not know medication at this point. Medications to be added
events["MED"] = np.nan

# drop non injection data and duplicates
# events = events.dropna(subset=["injection?"]).drop_duplicates(subset=["laterality", "patient_id", "study_date"])

### merge medication information

In [13]:

'''
All medication tables commented out until medication table is cleared
'''

## load tables
med_pd = pd.read_csv(os.path.join(raw_data_dir, "medications.csv"))
med_pd = med_pd[~med_pd.study_date.isin([-1])]

### pre process date columns
med_pd.loc[:, "study_date"] = pd.to_datetime(med_pd.study_date.astype(int).astype(str), format="%Y%m%d")
med_pd.loc[:, "DAT"] = pd.to_datetime(med_pd.DAT)
med_pd[["pseudo_id"]] = med_pd["pseudo_id"].astype(int).astype(str)

events_med_pd = pd.merge(events, med_pd[["pseudo_id", "AUGE", "DAT", "MED"]], 
         left_on=["patient_id", "laterality", "study_date"], 
         right_on=["pseudo_id", "AUGE", "DAT"], 
        how="left")


columns_final = ["patient_id", "laterality", "study_date", "ICPML", "injection?", "iol?", "MED_y"]

events_med_pd = events_med_pd[columns_final].drop_duplicates()
events_med_pd.rename(columns={"MED_y": "MED"}, inplace=True)
print("Event & MED records: ", events_med_pd.shape[0])

Event & MED records:  38180


In [14]:
events_med_pd

Unnamed: 0,patient_id,laterality,study_date,ICPML,injection?,iol?,MED
0,719,R,2018-08-21,5-156,True,,Lucentis
34,719,R,2018-06-19,5-156,True,,Lucentis
68,719,R,2018-07-17,5-156,True,,Lucentis
102,735,R,2015-04-07,5-156,True,,Lucentis
146,735,R,2015-06-11,5-156,True,,Lucentis
...,...,...,...,...,...,...,...
2151611,347179,L,2017-03-21,,,True,
2151612,347851,L,2016-12-22,,,True,
2151613,357432,L,2017-08-08,,,True,
2151614,358833,L,2017-09-11,,,True,


### save tables

In [18]:
if not os.path.exists(os.path.join(DATA_DIR, 'joint_export/longitudinal_data')):
    os.makedirs(os.path.join(DATA_DIR, 'joint_export/longitudinal_data'))
    
longitudinal_data.to_csv(os.path.join(DATA_DIR, 'joint_export/longitudinal_data/longitudinal_data.csv'))
events.to_csv(os.path.join(DATA_DIR, 'joint_export/longitudinal_data/longitudinal_events.csv'))

In [16]:
os.path.join(DATA_DIR, 'joint_export/longitudinal_data/longitudinal_data.csv')

'/storage/groups/ml01/datasets/projects/20181610_eyeclinic_niklas.koehler/joint_export/longitudinal_data/longitudinal_data.csv'

In [130]:
filtered_diagnosis = longitudinal_data.dropna(subset = ['diagnosis'])
filtered_oct_path = filtered_diagnosis.dropna(subset = ['oct_path'])

all_patients = filtered_oct_path.sort_values('study_date')

# all_patients = all_patients.loc[filtered_diagnosis.patient_id == 3897]

# drop all groups that do not have at least one OCT and one logMAR
grouped = all_patients.groupby(['patient_id', 'laterality'])

all_patients = grouped.filter(lambda x: x.oct_path.count() > 0 and x.logMAR.count() > 0)

grouped_patients = all_patients.groupby(['patient_id', 'laterality'])
grouped_events = events.groupby(['patient_id', 'laterality'])

In [132]:
grouped_patients.obj[["patient_id", "laterality"]].drop_duplicates()

Unnamed: 0,patient_id,laterality
0,43876,L
45,273001,R
17,274080,L
112,79124,R
581,53686,L
...,...,...
56601,376871,L
56609,382809,R
56631,383487,R
56635,378686,R



# get all patients with events before 2014-02-01
Does not belong to above matching

In [133]:
# load naive patients
RAW_DIR = "/storage/groups/ml01/datasets/raw/2018_LMUAugenklinik_niklas.koehler"
longitudinal_dir = os.path.join(RAW_DIR, "joint_export/longitudinal_tables")

naive_list = pd.read_csv(os.path.join(longitudinal_dir, 'full_naive_list.csv'), index_col=0)
naive_patients = naive_list[naive_list.Naive.astype(int) == 1].patient_id.drop_duplicates().astype(str).tolist()

In [134]:
events_before = events[events.study_date <= "2014-02-01"]
events_before = events_before[events_before.patient_id.isin(naive_patients)]
events_before_naive = events_before.sort_values(by="study_date").drop_duplicates(subset=["patient_id", 
                                                                                         "laterality"])  

rows_to_check = []
oct_dates = []
for row in events_before_naive.itertuples():
    patient = row[1]
    laterality = row[2]
    
    lrecord = longitudinal_data[(longitudinal_data.patient_id == f"{str(patient)}") & \
                                (longitudinal_data.laterality == f"{laterality}")]
    
    first_date_w_oct_and_logMAR = lrecord[lrecord.logMAR.isna()].dropna(subset=["oct_path"])
    
    first_date_w_oct_and_logMAR = first_date_w_oct_and_logMAR.sort_values(by=["study_date"])
    
    if first_date_w_oct_and_logMAR.size > 0:
        first_date_w_oct_and_logMAR = first_date_w_oct_and_logMAR.iloc[0].study_date

        if first_date_w_oct_and_logMAR < row[3]:
            rows_to_check.append(row)
            oct_dates.append(first_date_w_oct_and_logMAR)
            
    else:
        print(row)
              
to_check_va = pd.DataFrame(rows_to_check)[["patient_id","laterality","study_date"]]
to_check_va = to_check_va.rename(columns={"study_date": "first_injection_date"})
to_check_va["first_oct_date_registrered_wo_visual_acuity"] = oct_dates

to_check_va.to_csv(os.path.join(workspace_dir, 'joint_export/manual_visual_acuity_check.csv'))

Pandas(Index=12670, patient_id='210579', laterality='L', study_date=Timestamp('2008-02-18 00:00:00'), ICPML='5-156', _5=True, _6=nan, MED=nan)
Pandas(Index=25210, patient_id='83840', laterality='R', study_date=Timestamp('2010-05-11 00:00:00'), ICPML='5-156', _5=True, _6=nan, MED=nan)
Pandas(Index=8353, patient_id='92730', laterality='L', study_date=Timestamp('2011-01-19 00:00:00'), ICPML='5-144', _5=True, _6=True, MED=nan)
Pandas(Index=3137, patient_id='17177', laterality='L', study_date=Timestamp('2011-10-05 00:00:00'), ICPML='5-094', _5=True, _6=nan, MED=nan)
Pandas(Index=35031, patient_id='48212', laterality='R', study_date=Timestamp('2012-04-23 00:00:00'), ICPML=nan, _5=nan, _6=True, MED=nan)
Pandas(Index=22400, patient_id='47552', laterality='R', study_date=Timestamp('2013-04-10 00:00:00'), ICPML='5-144', _5=True, _6=True, MED=nan)
Pandas(Index=1984, patient_id='1557', laterality='L', study_date=Timestamp('2013-04-29 00:00:00'), ICPML='5-156', _5=True, _6=nan, MED=nan)
Pandas(Inde

NameError: name 'workspace_dir' is not defined