#  Clean Labels
Clean each label file individually
- Reads dwh tables from `/storage/groups/ml01/datasets/raw/2018_LMUAugenklinik_niklas.koehler/dwh_tables`
- Writes cleaned dwh tables to `/storage/groups/ml01/datasets/projects/20181610_eyeclinic_niklas.koehler/dwh_tables_cleaned`

In [21]:
import os
import pandas as pd
from ipywidgets import interact, fixed
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import re
from glob import glob
from datetime import datetime
from pydicom import read_file
from tqdm import tqdm
import imageio

import sys
sys.path.append('../../DeepRT/thickness_map_calculation')
import dicom_table as dt

In [22]:
RAW_DIR = "/storage/groups/ml01/datasets/raw/2018_LMUAugenklinik_niklas.koehler"

data_dir = os.path.join(RAW_DIR, 'joint_export/dwh_tables')
# data_dir = '../../raw_data/dwh_tables'  # for local data
clean_data_dir = '/storage/groups/ml01/datasets/projects/20181610_eyeclinic_niklas.koehler/joint_export/dwh_tables_cleaned'
# clean_data_dir = '../../raw_data/dwh_tables_cleaned'  # for local data
longitudinal_dir = os.path.join(RAW_DIR, "joint_export/longitudinal_tables")


In [23]:
diagnosis_check_1 = pd.read_csv(os.path.join(longitudinal_dir, 'longitudinal_records_with_date_CHECKED.csv'), 
                                 index_col=0)

diagnosis_check_2 = pd.read_csv(os.path.join(longitudinal_dir, 'check_naive_patients_CORRECTED.csv'), 
                                 index_col=0)

In [24]:
diagnosis_check_1.head()

Unnamed: 0,pseudo_id,first_injection_date,ICPML,LOK,Naive,Note,Cat,Cat-Date,Unnamed: 9
0,380441,2/12/2020,5-156.9,R,1,,2.0,,
1,379576,1/9/2020,5-156.9,L,0,Ozurdex,,,
2,378420,3/31/2020,5-156.9,L,0,Enophtalmitis,,,
3,380200,7/9/2018,5-156.9,L,0,DME,,,
4,382606,10/8/2020,5-156.9,R,1,,1.0,,


## Clean visus labels

In [25]:
def to_logMAR(va):
    # converts decimal va values to logMAR scale
    try:
        va = float(va)
        if va == 0:
            return np.inf
        else:
            return np.log10(1/va)
    except ValueError:
        return np.nan

def get_visual_acuity(group):
    # parses items in ordered group. If first value cannot be parsed, moves on to next value
    for i, raw in enumerate(list(group.visual_acuity_raw)):
        va = parse_visual_acuity(raw)
        if va is not None:
            res = list(group.iloc[i])
            res.append(va)
            return res
    return None

def parse_visual_acuity(raw):
    if pd.isna(raw):
        return None
    
    # match range of two comma/dot separated values
    m = re.match(r'.*(\d+)[,\.](\d+)[ -]+(\d+)[,\.](\d+).*$', raw)
    if m:
        return np.mean([float('{}.{}'.format(m.group(1), m.group(2))), 
                       float('{}.{}'.format(m.group(3), m.group(4)))])
    
    # match comma/dot separated value, allowing comma, space, "sc" as first character
    m = re.match(r'[ ,\.a-zA-Z]*([\do]+)[\., ]+(\d+).*$', raw)
    if m:
        return float('{}.{}'.format(m.group(1).replace('o', '0'), m.group(2)))
    
    # match two integer values separated with /, allowing space, "sc, HT" as first character
    m = re.match(r'[ a-zA-Z:]*(\d+)[ /]+(\d+)[a-zA-Z \.]*$', raw)
    if m:
        return int(m.group(1))/float(m.group(2))
        
    # match HBW
    if re.match(r'.*[hH]\.*[bB]\.*[wW]*\.*', raw):
        return 'HBW'
    # match FZ
    elif re.match(r'[fF]\.*[zZ]\.*', raw):
        return 'FZ'
    # match Lilo
    elif re.match(r'li(cht){0,1}(lo(kal){0,1}){0,1}', raw, flags=re.IGNORECASE):
        return 'LILO'
    # match Nulla LUX
    elif re.match(r'(n[ulla]*|kein)\.* *l([ux\.]*|[icht]*)', raw, flags=re.IGNORECASE):        
        return 'NL'
    # match FIX
    elif re.match(r'fix.*', raw, flags=re.IGNORECASE):
        return 'FIX'
    # match LUX
    elif re.match(r'.*lux.*', raw, flags=re.IGNORECASE):  
        return 'LUX'
    
    else:
        return "other"

In [26]:
visus_labels = pd.read_csv(os.path.join(data_dir, 'visus_labels.csv'), index_col=0)
print('Starting with {} measurements'.format(len(visus_labels)))

# rename columns
visus_labels.rename(columns={'AUGE': 'laterality_raw', 'MEASUREMENT_DATE':'study_date', 'PATNR':'patient_id',
                            'visual_acuity_VISUS': 'visual_acuity_raw', 'ORIGIN_TYPE': 'visual_acuity_origin'}, 
                    inplace=True)

# format columns 
visus_labels.study_date = pd.to_datetime(visus_labels.study_date)

# imputer origin value, ok?
visus_labels.visual_acuity_origin.fillna("OR", inplace=True)

visus_labels.visual_acuity_origin = pd.Categorical(visus_labels.visual_acuity_origin,
                                                   ['SR', 'OR', 'CC', 'STP', 'SC'])

# remove rows with missing study dates
visus_labels = visus_labels[~ visus_labels.study_date.isna()]

# duplicate entries for laterality=='B'
visus_labels['laterality'] = visus_labels['laterality_raw']
visus_labels.loc[visus_labels['laterality_raw'] == 'B', 'laterality'] = 'L'
visus_R = visus_labels.loc[visus_labels['laterality_raw'] == 'B'].copy().assign(laterality='R')
visus_labels = visus_labels.append(visus_R, ignore_index=True)

# get rid of exact duplicates
num_raw = visus_labels.shape[0]
visus_labels = visus_labels.drop_duplicates(keep='first')
num_raw_nodup = visus_labels.shape[0]
print('Dropped {} exact duplicates'.format(num_raw-num_raw_nodup))

# count duplicate measurements per time point
num_dups = sum(visus_labels.duplicated(['study_date', 'patient_id', 'laterality']))
print('Resolving {} duplicate measurements per time point'.format(num_dups))

# group by time point measurements
groups = visus_labels.groupby(['patient_id', 'laterality', 'study_date'])
keys = groups.groups.keys()

  interactivity=interactivity, compiler=compiler, result=result)
  mask |= (ar1 == a)


Starting with 1212908 measurements
Dropped 2 exact duplicates
Resolving 483504 duplicate measurements per time point


In [27]:
sum(visus_labels.CALCULATED_VALUE.isna()), sum(visus_labels.LOGMAR_VALUE.isna()), sum(visus_labels.MEASUREMENT_VALUE.isna()) 

(820689, 820689, 820005)

### merge new VA MEASUREMENT_VALUE in visual acuity RAW COLUMN

In [28]:
visus_labels.visual_acuity_raw.fillna(visus_labels.MEASUREMENT_VALUE, inplace=True)
visus_labels.visual_acuity_raw.fillna(visus_labels.CALCULATED_VALUE, inplace=True)

### Save away all measurements to use for imputation later

In [29]:
all_measurements = visus_labels[['patient_id', 'laterality', 'study_date', 
              "CALCULATED_VALUE", "LOGMAR_VALUE", "MEASUREMENT_VALUE"]]

In [30]:
visus_labels.drop(columns=["CALCULATED_VALUE", "LOGMAR_VALUE", "MEASUREMENT_VALUE"], inplace=True)

## Parse all visual acuity values into rows

In [31]:
print('Before visual acuity parsing: {} unique time/patient/laterality measurements'.format(len(keys)))

rows = []
for key in tqdm(keys):
    # sort values to ensure that correct VA value is taken 
    grp = groups.get_group(key).sort_values('visual_acuity_origin')
    res = get_visual_acuity(grp)
    if res is not None:
        rows.append(res)
        
'''

# Below code parallelizes the VA parsing accross 10 processors. Despite of copy handling speed is
# rouhly doubled compared to a simple for loop.

import time
from multiprocess import Pool
import multiprocessing

def loop_f(key):
    grp = groups.get_group(key).sort_values('visual_acuity_origin')
    res = get_visual_acuity(grp)
    if res is not None:
        return res
    
max_pool = 8

start = time.time()
with Pool(max_pool) as p:
    rows = list(
        tqdm(
            p.imap(loop_f,
                   list(keys)[0:1000]),
            total=len(keys)
        )
    ) 
'''

  0%|          | 0/730081 [00:00<?, ?it/s]

Before visual acuity parsing: 730081 unique time/patient/laterality measurements


100%|██████████| 730081/730081 [30:56<00:00, 393.36it/s] 


"\n\n# Below code parallelizes the VA parsing accross 10 processors. Despite of copy handling speed is\n# rouhly doubled compared to a simple for loop.\n\nimport time\nfrom multiprocess import Pool\nimport multiprocessing\n\ndef loop_f(key):\n    grp = groups.get_group(key).sort_values('visual_acuity_origin')\n    res = get_visual_acuity(grp)\n    if res is not None:\n        return res\n    \nmax_pool = 8\n\nstart = time.time()\nwith Pool(max_pool) as p:\n    rows = list(\n        tqdm(\n            p.imap(loop_f,\n                   list(keys)[0:1000]),\n            total=len(keys)\n        )\n    ) \n"

In [32]:
visus_labels_clean = pd.DataFrame.from_records(rows, columns=['laterality_raw', 'study_date', 'visual_acuity_origin',
                                                              'patient_id', 'visual_acuity_raw', 'laterality', 
                                                              'visual_acuity'])

print('Number of cleaned measurements {}'.format(visus_labels_clean.shape[0]))

Number of cleaned measurements 730080


In [37]:
visus_labels_clean[(visus_labels_clean.patient_id == 15696) & (visus_labels_clean.laterality == "R")]

Unnamed: 0,laterality_raw,study_date,visual_acuity_origin,patient_id,visual_acuity_raw,laterality,visual_acuity,logMAR_raw
32151,R,2012-08-17,SC,15696,1,R,other,
32152,R,2012-10-31,SC,15696,1,R,other,
32153,R,2012-11-28,OR,15696,1,R,other,
32154,R,2013-01-04,SC,15696,1,R,other,
32155,R,2013-05-29,OR,15696,0.63,R,0.63,0.200659
...,...,...,...,...,...,...,...,...
32233,R,2019-12-19,OR,15696,0.8,R,0.8,0.096910
32234,R,2020-01-30,OR,15696,1,R,other,
32235,R,2020-02-27,OR,15696,0.8,R,0.8,0.096910
32236,R,2020-03-27,OR,15696,0.8,R,0.8,0.096910


In [34]:
# add logMAR_raw column (unmapped, no string values)
visus_labels_clean['logMAR_raw'] = visus_labels_clean.visual_acuity.apply(to_logMAR)

In [43]:
visus_labels_clean.

Unnamed: 0,laterality_raw,study_date,visual_acuity_origin,patient_id,visual_acuity_raw,laterality,visual_acuity,logMAR_raw,logMAR
0,L,2014-03-19,SC,7,1/25,L,0.04,1.397940,1.397940
1,R,2014-03-19,SC,7,005,R,0.05,1.301030,1.301030
2,L,2015-01-22,OR,17,0.6,L,0.6,0.221849,0.221849
3,L,2015-12-10,SC,17,0.6,L,0.6,0.221849,0.221849
4,L,2016-01-29,OR,17,0.8,L,0.8,0.096910,0.096910
...,...,...,...,...,...,...,...,...,...
730075,R,2019-01-15,OR,384249,0.1,R,0.1,1.000000,1.000000
730076,R,2019-05-31,OR,384249,0.1,R,0.1,1.000000,1.000000
730077,R,2019-07-02,CC,384249,<0.1,R,other,,
730078,R,2019-08-27,SC,384249,0.05,R,0.05,1.301030,1.301030


In [38]:
va_num = visus_labels_clean['logMAR_raw']
va_str = visus_labels_clean['visual_acuity_raw']

# print frequency of logMAR values and strings
nums, bins = np.histogram(va_num, bins=np.concatenate([[-np.inf], np.arange(-0.4,2.1,0.2), [10, np.inf]]))
for i, num in enumerate(nums[:-1]):
    print('[{:.1f},{:.1f}):\t{}'.format(bins[i], bins[i+1], nums[i]))
print('[{:.1f},{:.1f}]:\t{}'.format(bins[-2], bins[-1], nums[-1]))

for s in ['FZ', 'HBW', 'FIX', 'LILO', 'LUX', 'NL']:
    print('{}:\t\t{}'.format(s, sum(va_str==s)))

[-inf,-0.4):	302
[-0.4,-0.2):	898
[-0.2,0.0):	15657
[0.0,0.2):	266241
[0.2,0.4):	206860
[0.4,0.6):	32617
[0.6,0.8):	46829
[0.8,1.0):	6442
[1.0,1.2):	31409
[1.2,1.4):	13551
[1.4,1.6):	4822
[1.6,1.8):	1900
[1.8,2.0):	233
[2.0,10.0):	32
[10.0,inf]:	8
FZ:		7377
HBW:		20001
FIX:		1084
LILO:		7
LUX:		2735
NL:		3630


map logMAR and string values: (using values from https://michaelbach.de/sci/acuity.html)
- logMAR < -0.3 gets mapped to -0.3
- logMAR == inf gets mapped to value for NL (va 0 means blind!)
- OLD: logMAR > 2.0 gets mapped to 2.0. This is not done anymore, values stay as is!
- FZ (count fingers) gets mapped to 1.9
- HBW gets mapped to 2.3 
- FIX/LILO gets mapped to 2.6
- LUX gets mapped to 2.8
- NL gets mapped to 3.0

In [39]:
visus_labels_clean['logMAR'] = visus_labels_clean.logMAR_raw
# mapping logMAR values
visus_labels_clean.loc[visus_labels_clean.logMAR_raw<-0.3, 'logMAR'] = -0.3
#visus_labels_clean.loc[visus_labels_clean.logMAR_raw>2.0, 'logMAR'] = 2.0
visus_labels_clean.loc[visus_labels_clean.logMAR_raw==np.inf, 'logMAR'] = 3.0
# string values
visus_labels_clean.loc[visus_labels_clean.visual_acuity=='FZ', 'logMAR'] = 1.9
visus_labels_clean.loc[visus_labels_clean.visual_acuity=='HBW', 'logMAR'] = 2.3
visus_labels_clean.loc[visus_labels_clean.visual_acuity=='FIX', 'logMAR'] = 2.6
visus_labels_clean.loc[visus_labels_clean.visual_acuity=='LILO', 'logMAR'] = 2.6
visus_labels_clean.loc[visus_labels_clean.visual_acuity=='LUX', 'logMAR'] = 2.8
visus_labels_clean.loc[visus_labels_clean.visual_acuity=='NL', 'logMAR'] = 3.0

## Match other measurements to potential CALCULATED Values

As only na values were replaced by "CALCULATED_VALUE" or "LOGMAR_VALUE" or "MEASUREMENT_VALUE". 

Some none na values were not successfully parsed and transformed to logMAR, thus the below code
replaces those we precomputed logMAR provided from the last export.


In [40]:
all_meas_columns = ["study_date", "patient_id", "laterality", "LOGMAR_VALUE"]
visus_labels_clean_ = pd.merge(visus_labels_clean, all_measurements[all_meas_columns].drop_duplicates(), 
                                 on=["study_date", "patient_id", "laterality"], how="left").drop_duplicates()

visus_labels_clean_.loc[:, "logMAR"] = visus_labels_clean_.logMAR.fillna(visus_labels_clean_.LOGMAR_VALUE)

visus_labels_clean_ = visus_labels_clean_[~visus_labels_clean_.logMAR.isna()]

In [42]:
visus_labels_clean[(visus_labels_clean.patient_id == 15696) & (visus_labels_clean.laterality == "R")]

Unnamed: 0,laterality_raw,study_date,visual_acuity_origin,patient_id,visual_acuity_raw,laterality,visual_acuity,logMAR_raw,logMAR
32151,R,2012-08-17,SC,15696,1,R,other,,
32152,R,2012-10-31,SC,15696,1,R,other,,
32153,R,2012-11-28,OR,15696,1,R,other,,
32154,R,2013-01-04,SC,15696,1,R,other,,
32155,R,2013-05-29,OR,15696,0.63,R,0.63,0.200659,0.200659
...,...,...,...,...,...,...,...,...,...
32233,R,2019-12-19,OR,15696,0.8,R,0.8,0.096910,0.096910
32234,R,2020-01-30,OR,15696,1,R,other,,
32235,R,2020-02-27,OR,15696,0.8,R,0.8,0.096910,0.096910
32236,R,2020-03-27,OR,15696,0.8,R,0.8,0.096910,0.096910


In [None]:
## Integrate supplement table

In [None]:
# print frequency of logMAR values and strings
nums, bins = np.histogram(visus_labels_clean_.logMAR, bins=np.concatenate([[-np.inf], np.arange(-0.401,3.1,0.2), [np.inf]]))
for i, num in enumerate(nums[:-1]):
    print('[{:.3f},{:.3f}):\t{}'.format(bins[i], bins[i+1], nums[i]))
print('[{:.3f},{:.3f}]:\t{}'.format(bins[-2], bins[-1], nums[-1]))

In [None]:
# save cleaned table
visus_labels_clean_.to_csv(os.path.join(clean_data_dir, 'visus_labels_clean.csv'))

## Diagnoses

In [None]:
diagnosis = pd.read_csv(os.path.join(data_dir, 'diagnosis.csv'), index_col=0)
print('Starting with {} measurements'.format(len(diagnosis)))

# rename columns
diagnosis.rename(columns={'LOK': 'laterality_raw', 'DAT':'study_date', 'PATNR':'patient_id', 'DKAT':'DKAT', 'DKEY':'diagnosis'}, inplace=True)

# format columns 
diagnosis.study_date = pd.to_datetime(diagnosis.study_date)
diagnosis.diagnosis = diagnosis.diagnosis.astype('str')

# remove non-eye related diagnoses
diagnosis['category'] = diagnosis.diagnosis.apply(lambda x: x[0])
diagnosis = diagnosis[diagnosis.category == 'H']  # TODO need to include 'E' as well for diabetic retinopathy
print('Starting with {} diagnoses of eye diseases'.format(len(diagnosis)))

# removing duplicates
diagnosis = diagnosis.drop_duplicates()
print('After removing duplicates: {} diagnoses'.format(len(diagnosis)))

# todo drop nan laterality diagnoses? 
# put duplicate diagnoses in list

In [None]:
# load descriptions of diagnoses
diagnosis_code = pd.read_csv(os.path.join(clean_data_dir, 'icd10cm_order_2018.txt'), sep='\t', header=None)

codes = []
desc = []
for i in range(len(diagnosis_code)):
    code = diagnosis_code[0][i]
    codes.append(code[6:14].strip())
    desc.append(code[16:77].strip())
    
diagnosis_code['diagnosis'] = codes
diagnosis_code['description'] = desc
diagnosis_code = diagnosis_code[['diagnosis', 'description']]   

In [None]:
# statistics
diag = list(diagnosis['diagnosis'])
diag_cl = []
for d in diag:
    if len(re.findall(r'H\d\d\.\d', d)) > 0:
        diag_cl.append(d)
        
unique, counts = np.unique(diag_cl, return_counts=True)

# sort by frequency
unique_sorted = [x for _,x in sorted(zip(counts,unique), reverse=True)]
counts_sorted = sorted(counts, reverse=True)

In [None]:
for i in range(20):
    desc = diagnosis_code[diagnosis_code.diagnosis==unique_sorted[i].replace('.','')].description
    try:
        desc = desc.iloc[0]
    except IndexError:
        desc = ''
    print('{}: {}: {}'.format(counts_sorted[i], unique_sorted[i], desc))
    

In [None]:
plt.plot(range(len(counts_sorted)), [int(u) for u in counts_sorted])
plt.xlim(0,200)

## Process new confirmed diagnoses from Karsten

In [None]:
diagnosis_export_2 = pd.read_csv(os.path.join(longitudinal_dir, 'longitudinal_records_with_date_CHECKED.csv'), 
                                 index_col=0)

diagnosis_export_2 = diagnosis_export_2.rename(columns={"pseudo_id": "patient_id", 
                                                        "first_injection_date": "iol_date",
                                                        "LOK": "laterality"})

diagnosis_export_2["DIAGNOSE0"] = "AMD"
diagnosis_export_2["DIAGNOSE1"] = "AMD"
diagnosis_export_2["diagnosis"] = "AMD"
diagnosis_export_2["diagnosis_raw"] = "AMD"
diagnosis_export_2["confirm"] = 1
diagnosis_export_2["AGE"] = np.nan
diagnosis_export_2["laterality_raw"] = diagnosis_export_2.laterality

diagnosis_export_2.drop(columns=["Note", "Naive", "Cat", "Cat-Date", "Unnamed: 9"], inplace=True)

## load left and right eye data frame from Karsten

In [None]:
diagnosis_left = pd.read_csv(os.path.join(longitudinal_dir, 'longitudinal_patients_left_eye.csv'), index_col=0)
diagnosis_right = pd.read_csv(os.path.join(longitudinal_dir, 'longitudinal_patients_right_eye.csv'), index_col=0)


# overwrite diagnosis from DWH to only analyse the longitudinal diagnosis records
diagnosis = diagnosis_left.append(diagnosis_right, ignore_index=True)
print('Starting with {} measurements'.format(len(diagnosis)))

# rename columns
diagnosis.rename(columns={'EYE': 'laterality_raw', 'Karsten':'diagnosis_raw', 'pseudo_id':'patient_id', 'IOL':'iol_date'}, inplace=True)
# format columns 
diagnosis.iol_date = pd.to_datetime(diagnosis.iol_date)
diagnosis.diagnosis_raw.loc[diagnosis.diagnosis_raw.isna()] = diagnosis.DIAGNOSE1.loc[diagnosis.diagnosis_raw.isna()]
diagnosis['laterality'] = diagnosis.laterality_raw.str.upper()
# add diagnosis column (only containing AMD and DR diagnosis for now)
diagnosis['diagnosis'] = np.nan
diagnosis.diagnosis.loc[diagnosis.diagnosis_raw.apply(lambda x: 'AMD' in x)] = 'AMD'
diagnosis.diagnosis.loc[diagnosis.diagnosis_raw.apply(lambda x: 'Diabetisches' in x)] = 'DR'

#diagnosis.diagnosis = diagnosis.diagnosis.astype('str')

print('Have {} longitudinal eyes, {} of which have diagnosis AMD, and {} have DR'.format(
    len(diagnosis), 
    (diagnosis.diagnosis == 'AMD').sum(), 
    (diagnosis.diagnosis == 'DR').sum()))


## load new longitudinal naive cases to be included

In [None]:
diagnosis_joint = pd.read_csv(os.path.join(longitudinal_dir, 'diagnosis_longitudinal_clean.csv'), index_col=0)

In [None]:
#### merge new longitudinal records

In [None]:
diagnosis = diagnosis_export_2.append(diagnosis, sort=True)

In [None]:
diagnosis.dropna(subset=["iol_date"]).sort_values("patient_id")

In [None]:
diagnosis_joint.dropna(subset=["iol_date"]).sort_values("patient_id")

# do not save this file as it contains not updated information

In [None]:

# save cleaned table
diagnosis_joint.to_csv(os.path.join(clean_data_dir, 'diagnosis_longitudinal_clean.csv'))

## Other procedures

In [None]:
procedures = pd.read_csv(os.path.join(data_dir, 'prozeduren.csv'), index_col=0)

# rename columns
procedures.rename(columns={'LOK': 'laterality_raw', 'DAT':'study_date', 'PATNR':'patient_id'}, inplace=True)
# format columns 
procedures.study_date = pd.to_datetime(procedures.study_date)

# duplicate entries for laterality=='B'
procedures['laterality_raw'].fillna('B', inplace=True)
procedures['laterality'] = procedures['laterality_raw']
procedures.loc[procedures['laterality_raw'] == 'B', 'laterality'] = 'L'
procedures_R = procedures.loc[procedures['laterality_raw'] == 'B'].copy().assign(laterality='R')
procedures = procedures.append(procedures_R, ignore_index=True)

# get rid of exact duplicates
num_raw = procedures.shape[0]
procedures = procedures.drop_duplicates(keep='first')
num_raw_nodup = procedures.shape[0]
print('Dropped {} exact duplicates'.format(num_raw-num_raw_nodup))

# take care of duplicate measurements per time point
num_dups = sum(procedures.duplicated(['study_date', 'patient_id', 'laterality']))
print('Resolving {} duplicate measurements per time point'.format(num_dups))

# remove everything not in chapter 5 08-16 (surgeries of the eye)
procedures_filtered = procedures[procedures.ICPML.apply(lambda x: x[:4] in ['5-08','5-09','5-10','5-11','5-12','5-13','5-14','5-15','5-16'])]
# reduce code to 3 decimals
procedures_filtered.ICPML = procedures_filtered.loc[:,'ICPML'].apply(lambda x: x[:5])
print('Number of filtered procedures {}'.format(len(procedures_filtered)))

In [None]:
# list most common procedures
unique, counts = np.unique(list(procedures_filtered.ICPML), return_counts=True)
unqiue = [u for _,u in sorted(zip(counts, unique), reverse=True)]
counts = sorted(counts, reverse=True)

for i in range(20):
    print(unqiue[i], counts[i])
    
plt.plot(range(len(counts)), [int(u) for u in counts])
plt.xlim(0,50)

codes with highest occurrence:
- 5-984 nicht relevant
- 5-156.9 - Injektionen
- 5-154 - Netzhautfixierung
- 5-144 - Extrakapsuläre Extraktion der Linse
- 5-158 - Vitrektomie
- 5-985 - Lasertechnik, nicht relevant
- 5-159 - Vitrektomie
- 5-155 - Destruktion von erkranktem Gewebe an Retina und Choroidea
- 5-010 - nicht relevant
- 5-091 - Exzision und Destruktion von (erkranktem) Gewebe des Augenlides, nicht relevant?

In [None]:
# save procedures - not quite clean
procedures_filtered.to_csv(os.path.join(clean_data_dir, 'procedures_clean.csv'))
print('Saved {} procedures'.format(len(procedures_filtered)))

In [None]:
procedures_filtered.head()

## Get OCT and fundus maps
### OCTs

In [None]:
oct_meta_information = pd.read_csv(os.path.join(RAW_DIR,"joint_export", 'oct_meta_information.csv'))

columns_oi = ["PATNR", "laterality", "study_date", "oct_path"]
octs = oct_meta_information[columns_oi]

# rename columns
octs = octs.rename(columns={"PATNR":"patient_id"})

# add non existing fundus path
octs["fundus_path"] = None

print("Number of oct paths before dropping duplicates: ", octs.shape[0])

# drop any duplicates
octs_no_dups = octs.drop_duplicates(subset=["patient_id", "laterality", "study_date"])

print("Number of oct paths after dropping duplicates: ", octs_no_dups.shape[0])

In [None]:
# save cleaned table
octs.to_csv(os.path.join(clean_data_dir, 'octs_fundus_with_dups.csv'))

### explore OCTs

In [None]:
def show_oct_slice(data_table, slice_no=24):
    plt.figure()
    plt.imshow(data_table.pixel_array[slice_no])
    
def interactive_show_oct_slice(data_table):
    interact(show_oct_slice, 
             slice_no=widgets.IntSlider(min=0, max=48, step=1, value=24), 
             data_table=fixed(data_table))
    
def get_oct_data(d):
    x_starts = []
    y_starts = []
    x_ends = []
    y_ends = []
    for i in range(0, len(d.PerFrameFunctionalGroupsSequence)):    
        y_starts.append(d.PerFrameFunctionalGroupsSequence[i].OphthalmicFrameLocationSequence[0].ReferenceCoordinates[0] )
        x_starts.append(d.PerFrameFunctionalGroupsSequence[i].OphthalmicFrameLocationSequence[0].ReferenceCoordinates[1] )
        y_ends.append(d.PerFrameFunctionalGroupsSequence[i].OphthalmicFrameLocationSequence[0].ReferenceCoordinates[2] )
        x_ends.append(d.PerFrameFunctionalGroupsSequence[i].OphthalmicFrameLocationSequence[0].ReferenceCoordinates[3] )
    return y_starts, x_starts, y_ends, x_ends


#### duplicate OCTs
- some are unreadable, or have the wrong format
- of the duplicates, often one will be of the optical nerve and the other of the macula

In [None]:
octs_nona = octs.dropna(subset=['patient_id', 'laterality', 'study_date'])

idx_cols = ['patient_id', 'laterality', 'study_date']
print('Have {} duplicated octs'.format(sum(octs_nona.duplicated(idx_cols))))

dup_octs = octs_nona[octs_nona.duplicated(idx_cols, keep=False)]
octs_grouped = dup_octs.groupby(idx_cols)
oct_keys = octs_grouped.groups.keys()


In [None]:
# show exemplary octs and fundus images
i=12
cur_octs = octs_grouped.get_group(list(oct_keys)[i])

fig, axes = plt.subplots(1, len(cur_octs))
for i, ax in enumerate(axes):
    f = np.zeros([768,768])
    y_starts, x_starts, y_ends, x_ends = get_oct_data(read_file(cur_octs.iloc[i].oct_path))
    for i in range(len(x_starts)):
        ax.plot([x_starts[i],x_ends[i]],[y_starts[i], y_ends[i]], 'w')
    # ax.imshow(f.pixel_array)
for i in range(len(cur_octs)):
    f = read_file(cur_octs.iloc[i].oct_path)
    interactive_show_oct_slice(f)

#### non duplicate OCTs
are they mostly of the macula? or are there also some of the optic nerve?
- seem to be macular octs only
- need to identify macular oct amongst duplicates

In [None]:
nodup_octs = octs_nona.drop_duplicates(idx_cols, keep=False)
print('Have {} non-duplicated octs'.format(len(nodup_octs)))

In [None]:
nodup_octs

In [None]:
# show exemplary octs and fundus images
i=np.random.randint(0,len(nodup_octs))
print(i)
cur = nodup_octs.iloc[i]

# f = read_file(cur.fundus_path)
d = read_file(cur.oct_path)
y_starts, x_starts, y_ends, x_ends = get_oct_data(d)
for i in range(len(x_starts)):
    plt.plot([x_starts[i],x_ends[i]],[y_starts[i], y_ends[i]], 'w')
# plt.imshow(f.pixel_array)
plt.show()
interactive_show_oct_slice(d)

### deal with duplicate octs
... by throwing them out
- in the future, might need to come back to this and do sth more advanced

In [None]:
nodup_octs

In [None]:
# save cleaned table
octs_no_dups.to_csv(os.path.join(clean_data_dir, 'octs_fundus_no_dups.csv'))

# Tensio

- from old notebook, untested
- drop exact duplicates
- take last measurement for duplicate measurements at the same day

In [None]:
#tensio = pd.read_csv(os.path.join(data_dir, 'dwh_tables/tensio.csv'), index_col=0)

# rename columns
#tensio.rename(columns={'AUGE': 'laterality', 'DAT':'study_date', 'PATNR':'patient_id',
                            'TENSIO': 'tensio'}, inplace=True)
# format columns 
#tensio.study_date = pd.to_datetime(tensio.study_date)
#tensio.tensio = [float(t.replace(',','.')) for t in tensio.tensio]

# get rid of exact duplicates
#num_raw = tensio.shape[0]
#tensio = tensio.drop_duplicates(keep='first')
#num_raw_nodup = tensio.shape[0]
#print('Dropped {} exact duplicates'.format(num_raw-num_raw_nodup))

# take care of duplicate measurements per time point
#num_dups = sum(tensio.duplicated(['study_date', 'patient_id', 'laterality']))
#print('Resolving {} duplicate measurements per time point'.format(num_dups))

#tensio_clean = tensio[~tensio.duplicated(['study_date', 'patient_id', 'laterality'], keep='last')]
#tensio_clean.to_csv(os.path.join(workspace_path, 'tensio_clean.csv'))
#print('Saved {} records to workspace'.format(len(tensio_clean)))