In [47]:
import os
import pandas as pd
import numpy as np
import glob
from pydicom import read_file

PROJ_DIR = "/storage/groups/ml01/datasets/raw/2018_LMUAugenklinik_niklas.koehler"

SAVE_PATH = "joint_export/dwh_tables"
EXPORT_1 = os.path.join(PROJ_DIR, "dwh_tables")
EXPORT_2 = os.path.join(PROJ_DIR, "Studies2_202012/dwh_tables")

OCT_DIR_EXPORT_1 = os.path.join(PROJ_DIR, "Studies/Optical Coherence Tomography Scanner")
OCT_DIR_EXPORT_2 = os.path.join(PROJ_DIR, "Studie2_202012/OPT")

DIR_EXPORT_1 = os.path.join(PROJ_DIR, "Studies")
DIR_EXPORT_2 = os.path.join(PROJ_DIR, "Studies2_202012")

## Diagnosis table

In [86]:
diagnosis_export_1 = pd.read_csv(os.path.join(EXPORT_1, "diagnosis.csv"))
diagnosis_export_2 = pd.read_csv(os.path.join(EXPORT_2, "diagnosen_anonymized.csv"))

In [87]:
diagnosis_export_2.head()

Unnamed: 0.1,Unnamed: 0,LOK,DAT,DKAT,DKEY,pseudo_id
0,0,,11-18-03,A1,C44.18,17
1,8,,03-15-06,4,J45.9,17
2,16,,03-15-06,AS,ASA1,17
3,24,,11-07-06,4,C44.1,17
4,32,,08-07-07,5,C44.1,17


In [88]:
diagnosis_export_1.head()

Unnamed: 0.1,Unnamed: 0,PATNR,LOK,DAT,DKAT,DKEY
0,0,1,,2009-01-04,07,H10.0
1,1,2,,2004-05-10,A2,H01.0
2,2,3,,2004-10-21,A2,H25.8
3,3,3,,2004-11-04,A2,H25.8
4,4,3,R,2004-11-05,A2,H25.8


#### Align columns

In [89]:
# rename pseudo ID to PATNR
diagnosis_export_2 = diagnosis_export_2.rename(columns={"pseudo_id":"PATNR"})

# set DAT column to pandas date time
diagnosis_export_2["DAT"] = pd.to_datetime(diagnosis_export_2["DAT"])

diagnosis_export_2 = diagnosis_export_2.drop(columns=["Unnamed: 0"])
diagnosis_export_1 = diagnosis_export_1.drop(columns=["Unnamed: 0"])

In [90]:
diagnosis_export_2.shape, diagnosis_export_1.shape

((343347, 5), (1820324, 5))

#### Append tables and drop duplicates 

In [91]:
diagnosis_joint = diagnosis_export_2.append(diagnosis_export_1, 'sort=True')

In [92]:
print("Number of records before dropping duplicates: ", diagnosis_joint.shape[0])

diagnosis_joint = diagnosis_joint.drop_duplicates(subset=["PATNR", "LOK", "DAT", "DKAT", "DKEY"])

print("Number of records after dropping duplicates: ", diagnosis_joint.shape[0])

Number of records before dropping duplicates:  2163671
Number of records after dropping duplicates:  2041980


#### Save new table to joint export directory

In [93]:
diagnosis_joint.to_csv(os.path.join(PROJ_DIR, SAVE_PATH, "diagnosis.csv"))

## Visus table

In [94]:
visus_export_1 = pd.read_csv(os.path.join(EXPORT_1, "visus_labels.csv"))
visus_export_2 = pd.read_csv(os.path.join(EXPORT_2, "visus_anonymized.csv"))

In [95]:
print("Number of records in export 1 table: ", visus_export_1.shape[0], 
      "Number of records in export 2 table: ", visus_export_2.shape[0])

Number of records in export 1 table:  843608 Number of records in export 2 table:  442017


In [96]:
visus_export_1.head()

Unnamed: 0.1,Unnamed: 0,PATNR,visual_acuity_VISUS,AUGE,MEASUREMENT_DATE,ORIGIN_TYPE
0,0,7,1/25,L,2014-03-19,SC
1,1,7,,L,2014-03-19,OR
2,2,7,005,R,2014-03-19,SC
3,3,7,,R,2014-03-19,OR
4,4,17,06,L,2015-01-22,OR


In [97]:
visus_export_2.head()

Unnamed: 0.1,Unnamed: 0,AUGE,ORIGIN,ORIGIN_TYPE,ORIGIN_KEY,MEASUREMENT_DATE,MEASUREMENT_TIME,MEASUREMENT_VALUE,CALCULATED_VALUE,LOGMAR_VALUE,pseudo_id
0,0,L,YMED_AU_VISUS_v2-PatnrAugeLfdnr,OR,0010121434L00004,01-22-15,2015-01-22 11:40:00.000,0.6,0.6,0.2,17
1,8,R,YMED_AU_VISUS_v2-PatnrAugeLfdnr,SC,0010121434R00011,12-10-15,2015-12-10 11:15:28.000,0.5,0.5,0.3,17
2,16,R,YMED_AU_VISUS_v2-PatnrAugeLfdnr,CC,0010121434R00002,01-22-15,2015-01-22 11:40:00.000,0.8,0.8,0.1,17
3,24,L,YMED_AU_VISUS_v2-PatnrAugeLfdnr,CC,0010121435L00007,10-24-17,2017-10-24 13:00:00.000,0.25,0.25,0.6,18
4,63,L,YMED_AU_VISUS_v2-PatnrAugeLfdnr,CC,0010121435L00021,03-13-18,2018-03-13 11:02:27.000,0.32,0.32,0.5,18


#### Rename, cast and drop columns

In [98]:
# drop coolumns
visus_export_2 = visus_export_2.drop(["ORIGIN_KEY", "MEASUREMENT_TIME", "ORIGIN", "Unnamed: 0"], axis=1)
visus_export_1 = visus_export_1.drop(["Unnamed: 0"], axis=1)

# set DAT column to pandas date time
visus_export_2["MEASUREMENT_DATE"] = pd.to_datetime(visus_export_2["MEASUREMENT_DATE"])

# rename id column
visus_export_2 = visus_export_2.rename(columns={"pseudo_id": "PATNR"})

In [99]:
# add missing columns
visus_export_2["visual_acuity_VISUS"] = np.nan
visus_export_1["MEASUREMENT_VALUE"] = np.nan
visus_export_1["CALCULATED_VALUE"] = np.nan
visus_export_1["LOGMAR_VALUE"] = np.nan

In [100]:
visus_export_1.shape, visus_export_2.shape

((843608, 8), (442017, 8))

#### Append tables and drop duplicates 

In [101]:
visus_export_joint = visus_export_2.append(visus_export_1, 'sort=True')

In [102]:
print("Number of records before dropping duplicates: ", visus_export_joint.shape[0])

visus_export_joint = visus_export_joint.drop_duplicates(subset=list(visus_export_joint.columns))

print("Number of records after dropping duplicates: ", visus_export_joint.shape[0])

Number of records before dropping duplicates:  1285625
Number of records after dropping duplicates:  1212908


#### Save new table to joint export directory

In [103]:
visus_export_joint.to_csv(os.path.join(PROJ_DIR, SAVE_PATH, "visus.csv"))

## Prozeduren table

In [104]:
prozeduren_export_1 = pd.read_csv(os.path.join(EXPORT_1, "prozeduren.csv"))
prozeduren_export_2 = pd.read_csv(os.path.join(EXPORT_2, "prozeduren_anonymized.csv"))

In [105]:
print("Number of records in export 1 table: ", prozeduren_export_1.shape[0], 
      "Number of records in export 2 table: ", prozeduren_export_2.shape[0])

Number of records in export 1 table:  910372 Number of records in export 2 table:  192828


In [106]:
prozeduren_export_1.head()

Unnamed: 0.1,Unnamed: 0,PATNR,LOK,DAT,ICPMK,ICPML,FALNR
0,0,3,,2004-11-05,P2,5-144.01,45805803
1,1,6,L,2011-01-20,P9,5-144.3A,49000113
2,2,6,,2011-01-20,P9,5-984,49000113
3,3,6,L,2011-03-10,P9,5-146.2A,49078466
4,4,6,,2011-03-10,P9,5-984,49078466


In [107]:
prozeduren_export_2.head()

Unnamed: 0.1,Unnamed: 0,ICPMK,ICPML,ICPHC,LOK,DAT,pseudo_id
0,0,P4,5-984,,L,03-15-06,17
1,8,PC,5-091.10,,L,05-21-14,17
2,16,PD,5-984,,,02-10-15,17
3,24,PD,3-300.0,X,,07-13-15,17
4,32,PG,5-156.9,X,R,01-08-18,18


#### Rename, cast and drop columns

In [108]:
# drop coolumns
prozeduren_export_2 = prozeduren_export_2.drop(["ICPHC", "Unnamed: 0"], axis=1)
prozeduren_export_1 = prozeduren_export_1.drop(["Unnamed: 0", "FALNR"], axis=1)

# set DAT column to pandas date time
prozeduren_export_2["DAT"] = pd.to_datetime(prozeduren_export_2["DAT"])

# rename id column
prozeduren_export_2 = prozeduren_export_2.rename(columns={"pseudo_id": "PATNR"})

In [109]:
prozeduren_export_1.shape, prozeduren_export_2.shape

((910372, 5), (192828, 5))

#### Append tables and drop duplicates 

In [110]:
prozeduren_export_joint = prozeduren_export_2.append(prozeduren_export_1, 'sort=True')

In [111]:
print("Number of records before dropping duplicates: ", prozeduren_export_joint.shape[0])

prozeduren_export_joint = prozeduren_export_joint.drop_duplicates(subset=list(prozeduren_export_joint.columns))

print("Number of records after dropping duplicates: ", prozeduren_export_joint.shape[0])

Number of records before dropping duplicates:  1103200
Number of records after dropping duplicates:  1055270


#### Save new table to joint export directory

In [112]:
prozeduren_export_joint.to_csv(os.path.join(PROJ_DIR, SAVE_PATH, "prozeduren.csv"))

## Tensio table

In [114]:
tensio_export_1 = pd.read_csv(os.path.join(EXPORT_1, "tensio.csv"))
tensio_export_2 = pd.read_csv(os.path.join(EXPORT_2, "tensio_anonymized.csv"))

In [115]:
print("Number of records in export 1 table: ", tensio_export_1.shape[0], 
      "Number of records in export 2 table: ", tensio_export_2.shape[0])

Number of records in export 1 table:  116374 Number of records in export 2 table:  283286


In [116]:
tensio_export_1.head()

Unnamed: 0.1,Unnamed: 0,PATNR,AUGE,DAT,TENSIO
0,0,18,L,2017-10-24,16
1,1,18,L,2017-11-22,15
2,2,18,L,2018-01-08,16
3,3,18,L,2018-02-07,16
4,4,18,L,2018-03-13,15


In [117]:
tensio_export_2.head()

Unnamed: 0.1,Unnamed: 0,AUGE,ORIGIN_TYPE,MEASUREMENT_DATE,MEASUREMENT_TIME,MEASUREMENT_VALUE,MAXVAL_DAY,CALCULATED_VALUE,ORIGIN_KEY,ORIGIN,pseudo_id
0,0,L,Luft,01-08-18,2018-01-08 11:10:00.000,16,16.0,16.0,0010121435L00014,YMED_AU_VISUS_v2-PatnrAugeLfdnr,18
1,39,L,Luft,05-08-18,2018-05-08 10:41:51.000,15,15.0,15.0,0010121435L00026,YMED_AU_VISUS_v2-PatnrAugeLfdnr,18
2,78,L,Luft,01-16-19,2019-01-16 14:20:09.000,17,17.0,17.0,0010121435L00058,YMED_AU_VISUS_v2-PatnrAugeLfdnr,18
3,117,L,Luft,12-17-19,2019-12-17 12:44:31.000,15,15.0,15.0,0010121435L00084,YMED_AU_VISUS_v2-PatnrAugeLfdnr,18
4,156,L,Luft,02-27-20,2020-02-27 11:04:41.000,14,14.0,14.0,0010121435L00088,YMED_AU_VISUS_v2-PatnrAugeLfdnr,18


#### Rename, cast and drop columns

In [118]:
# set DAT column to pandas date time
tensio_export_2["DAT"] = pd.to_datetime(tensio_export_2["MEASUREMENT_DATE"])

# drop coolumns
tensio_export_2 = tensio_export_2.drop(["ORIGIN_TYPE", "Unnamed: 0", "MEASUREMENT_TIME", 
                                                "MEASUREMENT_DATE", 
                                                "MAXVAL_DAY", "ORIGIN_KEY", "ORIGIN"], axis=1)

tensio_export_1 = tensio_export_1.drop(["Unnamed: 0"], axis=1)


# rename id column
tensio_export_2 = tensio_export_2.rename(columns={"pseudo_id": "PATNR"})

In [119]:
tensio_export_1.shape, tensio_export_2.shape

((116374, 4), (283286, 5))

#### Append tables and drop duplicates 

In [121]:
tensio_export_joint = tensio_export_2.append(tensio_export_1, 'sort=True')

In [122]:
print("Number of records before dropping duplicates: ", tensio_export_joint.shape[0])

tensio_export_joint = tensio_export_joint.drop_duplicates(subset=list(tensio_export_joint.columns))

print("Number of records after dropping duplicates: ", tensio_export_joint.shape[0])

Number of records before dropping duplicates:  399660
Number of records after dropping duplicates:  357829


#### Save new table to joint export directory

In [123]:
tensio_export_joint.to_csv(os.path.join(PROJ_DIR, SAVE_PATH, "tensio.csv"))

## Patient OCT meta information

In [68]:
oct_dicom_paths_export1 = pd.read_csv(os.path.join(DIR_EXPORT_1, "oct_paths.csv"), header=None)
oct_dicom_paths_export2 = pd.read_csv(os.path.join(DIR_EXPORT_2, "dicom_paths.csv"))

In [69]:
# process dicom path file 
oct_dicom_paths_export2[0] = PROJ_DIR + oct_dicom_paths_export2["path"]
oct_dicom_paths_export2[0] = oct_dicom_paths_export2[0].str.replace("koehler./", "koehler/")
oct_dicom_paths_export2 = oct_dicom_paths_export2[[0]]

In [86]:
class MetaLogging:
    def __init__(self, logdir):
        self.logdir = logdir

    def process(self, i):
        try:
            dc = read_file(i, stop_before_pixels = True)

            dicom_log = {"patient_id": dc.PatientID,
                         "patient_name": dc.PatientName,
                         "laterality": dc.ImageLaterality,
                         "study_date": dc.StudyDate,
                         "birthdate": dc.PatientBirthDate,
                         "image_type": dc.ImageType,
                         "gender": dc.PatientSex,
                         "modality": dc.Modality}

            return dicom_log
        except:
            self.write_log(record=i)
            print(f"record not working: {i}")

    def write_log(self, record):
        if not os.path.exists(self.logdir ):
            os.makedirs(self.logdir )

        if not os.path.exists(self.logdir  + '/logs/meta_error_log.txt'):
            with open('logs/meta_error_log.txt', 'w') as f:
                f.write("%s\n" % record)
        else:
            with open(self.logdir  + '/logs/meta_error_log.txt', 'a') as f:
                f.write("%s\n" % record)

In [93]:
from joblib import Parallel, delayed
from tqdm import tqdm

dicom_files = oct_dicom_paths_export2[0].tolist()

logdir = "/storage/groups/ml01/workspace/olle.holmberg/LODE/eye_clinic_general/notebooks_logs"

meta_logging = MetaLogging(logdir = logdir)
num_cores = -1
    
processed_list = Parallel(n_jobs = num_cores)(delayed(meta_logging.process)(i) for i in tqdm(dicom_files))
processed_list = [pl for pl in processed_list if pl is not None]

export_pd = pd.DataFrame.from_dict(processed_list)

  1%|          | 1284/245346 [00:24<1:35:10, 42.74it/s]

KeyboardInterrupt: 

In [71]:
dicom_files = oct_dicom_paths_export2[0].tolist()
dc = read_file(dicom_files[0])

In [89]:
meta_logging.process

<bound method MetaLogging.process of <__main__.MetaLogging object at 0x7f0fc1f88c50>>