In [1]:
import os
import pandas as pd
import numpy as np

# Column layout shared by every cleaned diagnosis table built in this notebook;
# each per-list frame is reduced to exactly these columns, in this order.
clean_d_columns = [
    "patient_id",
    "laterality",
    "diagnosis",
    "diagnosis_raw",
    "iol_date",
]

'''
the three naive lists are:

First export: check_naive_patients_CORRECTED
Second export (2 lists): Treatment naive list 08.04.21.xlsx & longitudinal_records_with_date_CHECKED.xlsm

In this document the naive lists are preprocessed into a common format and then exported as a
single full naive list to be used for filtering.
'''

# Root folders: RAW_DIR holds the source exports that are read by this notebook,
# DATA_DIR is the project folder that receives the cleaned CSV files.
RAW_DIR = "/storage/groups/ml01/datasets/raw/2018_LMUAugenklinik_niklas.koehler"
DATA_DIR = "/storage/groups/ml01/datasets/projects/20181610_eyeclinic_niklas.koehler"

# Input folder with the supplement tables (naive / longitudinal patient lists).
raw_data_dir = os.path.join(
    RAW_DIR,
    'joint_export/dwh_tables/supplement_tables',
)
# Output folder for the cleaned tables written by this notebook.
clean_data_dir = os.path.join(
    DATA_DIR,
    'joint_export/dwh_tables_cleaned',
)

## Longitudinal diagnosis clean

## first longitudinal list

In [2]:
# First longitudinal list, left eyes: load, harmonise the column names and
# label every row as AMD (both the cleaned and the raw diagnosis column).
karstens_long1L = (
    pd.read_csv(os.path.join(raw_data_dir, "longitudinal_patients_left_eye.csv"))
    .rename(columns={"pseudo_id": "patient_id", "EYE": "laterality", "IOL": "iol_date"})
    .assign(diagnosis="AMD", diagnosis_raw="AMD")
)

# Reduce to the shared clean-diagnosis column layout and preview the result.
karstens_long1L_dlong = karstens_long1L[clean_d_columns]
karstens_long1L_dlong.head()

Unnamed: 0,patient_id,laterality,diagnosis,diagnosis_raw,iol_date
0,502,L,AMD,AMD,2016-04-04
1,516,L,AMD,AMD,2015-09-21
2,709,L,AMD,AMD,2017-03-28
3,1163,L,AMD,AMD,2014-01-23
4,1263,L,AMD,AMD,2015-07-28


In [3]:
# First longitudinal list, right eyes: load, harmonise the column names and
# label every row as AMD (both the cleaned and the raw diagnosis column).
karstens_long1R = (
    pd.read_csv(os.path.join(raw_data_dir, "longitudinal_patients_right_eye.csv"))
    .rename(columns={"pseudo_id": "patient_id", "EYE": "laterality", "IOL": "iol_date"})
    .assign(diagnosis="AMD", diagnosis_raw="AMD")
)

# Reduce to the shared clean-diagnosis column layout and preview the result.
karstens_long1R_dlong = karstens_long1R[clean_d_columns]
karstens_long1R_dlong.head()

Unnamed: 0,patient_id,laterality,diagnosis,diagnosis_raw,iol_date
0,18,R,AMD,AMD,
1,176,R,AMD,AMD,2015-05-20
2,502,R,AMD,AMD,2016-11-30
3,709,R,AMD,AMD,
4,910,R,AMD,AMD,2009-12-08


## second longitudinal list

In [4]:
# Second longitudinal list: load the checked workbook.
naive_2 = pd.read_excel(
    os.path.join(raw_data_dir, "longitudinal_records_with_date_CHECKED.xlsm"),
    engine='openpyxl',
)

# Normalise "Cat-Date" to ISO date strings on the full frame BEFORE the column
# subset is taken. Selecting columns returns a new frame, so edits made to
# `naive_2` after the selection would never reach `naive_2_dlong`.
# '%Y' is used instead of the platform-specific '%#Y' flag.
# NOTE(review): unit='d' counts days from 1970-01-01; if the sheet stores Excel
# serial dates (origin 1899-12-30) an explicit `origin` is required - confirm.
naive_2["Cat-Date"] = (
    pd.to_datetime(naive_2["Cat-Date"], unit='d').dt.strftime('%Y-%m-%d').values
)

# Rows whose 8th sheet column equals 2 get the sentinel "prior" instead of a date.
# NOTE(review): the column is addressed by position (index 7) because its name
# is not visible here; it is presumably "Cat", as in the third list - confirm
# and switch to the column name.
prior_mask = naive_2.iloc[:, 7] == 2.0
naive_2.loc[prior_mask, "Cat-Date"] = "prior"

# Take an explicit copy of the relevant columns (avoids SettingWithCopyWarning),
# label every row as AMD and harmonise the column names.
naive_2_dlong = (
    naive_2[["pseudo_id", "LOK", "Cat-Date", "Cat", "Naive"]]
    .copy()
    .assign(diagnosis="AMD", diagnosis_raw="AMD")
    .rename(columns={"pseudo_id": "patient_id", "LOK": "laterality",
                     "Cat-Date": "iol_date"})
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


## Third longitudinal list

In [5]:
# Third longitudinal list: load the treatment-naive workbook.
naive_3 = pd.read_excel(
    os.path.join(raw_data_dir, "Treatment naive list 08.04.21.xlsx"),
    engine='openpyxl',
)

# Work on an explicit copy of the needed columns so the assignments below
# modify this frame and not a view of the full sheet.
naive_3 = naive_3[["pseudo_id", "Unnamed: 1", "Unnamed: 3", "Naive", "Cat", "Cat-Date"]].copy()

# Unparseable dates become NaT/NaN; '%Y' replaces the platform-specific '%#Y'.
# NOTE(review): unit='d' counts days from 1970-01-01; if the sheet stores Excel
# serial dates (origin 1899-12-30) an explicit `origin` is required - confirm.
naive_3["Cat-Date"] = (
    pd.to_datetime(naive_3["Cat-Date"], errors='coerce', unit='d')
    .dt.strftime('%Y-%m-%d')
    .values
)

naive_3 = naive_3.rename(columns={"pseudo_id": "patient_id",
                                  "Unnamed: 1": "first_injection_date",
                                  "Unnamed: 3": "laterality",
                                  "Cat-Date": "iol_date"})

# Laterality counts before the correction.
print(np.unique(naive_3.laterality, return_counts=True))
laterality_switch = {"L": "R", "R": "L"}

# Rows flagged Naive == 4 were recorded for the wrong eye: flip the laterality
# and relabel them as Naive == 1. Columns are addressed by name so the logic
# does not depend on the column order.
wrong_eye = naive_3["Naive"] == 4.0
flipped = naive_3.loc[wrong_eye, "laterality"].map(laterality_switch)
if flipped.isna().any():
    # Fail loudly instead of silently writing NaN for an unknown eye code.
    raise KeyError("unexpected laterality value in a row flagged Naive == 4")
naive_3.loc[wrong_eye, "laterality"] = flipped
naive_3.loc[wrong_eye, "Naive"] = 1.0

# Rows with Cat == 2 get the sentinel "prior" instead of a date.
naive_3.loc[naive_3["Cat"] == 2.0, "iol_date"] = "prior"

# Laterality counts after the correction.
print(np.unique(naive_3.laterality, return_counts=True))

# Label every row as AMD (cleaned and raw diagnosis column).
naive_3["diagnosis"] = "AMD"
naive_3["diagnosis_raw"] = "AMD"

(array(['L', 'R'], dtype=object), array([212, 179]))
(array(['L', 'R'], dtype=object), array([208, 183]))


## Merge joint clean diagnosis table and save

In [6]:
# Per-list frames reduced to the shared clean-diagnosis column layout.
t2 = naive_2_dlong[clean_d_columns]
# Third list: only rows that carry a Naive label.
t3 = naive_3.dropna(subset=["Naive"])[clean_d_columns]
t1L = karstens_long1L_dlong
t1R = karstens_long1R_dlong

# Stack all lists exactly once each (third, second, first-left, first-right).
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
joing_clean_diagnosis_table = pd.concat([t3, t2, t1L, t1R], ignore_index=True)

# Parse iol_date; non-date entries such as the "prior" sentinel become NaT.
# Plain column assignment replaces the column so it gets a datetime dtype.
joing_clean_diagnosis_table["iol_date"] = pd.to_datetime(
    joing_clean_diagnosis_table["iol_date"], errors='coerce'
)

joing_clean_diagnosis_table.to_csv(
    os.path.join(clean_data_dir, "diagnosis_longitudinal_clean.csv"), index=False
)

# Get final Naive patients list

In [7]:
# Keep only the rows of the third list that carry a Naive label.
naive_3 = naive_3[naive_3["Naive"].notna()]

## Load in naive records from first export

In [8]:
# Naive annotations of the first export. Rows in which all four annotation
# columns are empty are discarded, then the IOL date is looked up per
# (patient_id, laterality) in the joint clean diagnosis table. The left join
# keeps first-export rows that have no match (their iol_date stays missing).
naive_first_export = (
    pd.read_csv(os.path.join(raw_data_dir, "check_naive_patients_CORRECTED.csv"))
    .dropna(subset=["patient_id", "laterality", "first_injection_date", "Naive"], how="all")
    .loc[:, ["patient_id", "laterality", "first_injection_date", "Naive"]]
    .merge(
        joing_clean_diagnosis_table[["patient_id", "laterality", "iol_date"]],
        on=["patient_id", "laterality"],
        how="left",
    )
    .loc[:, ["patient_id", "laterality", "Naive", "iol_date"]]
)

## Merge all naive patients in Joint export

In [9]:
# Stack the naive lists: first export, then the third and the second list.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
# first_injection_date is dropped from the later lists so that the result
# does not carry that column.
later_exports = pd.concat([naive_3, naive_2_dlong]).drop(columns="first_injection_date")
naive_list_latest = pd.concat([naive_first_export, later_exports], ignore_index=True)

In [10]:
# Drop rows whose Naive flag is 0 or 2; every other value (including missing)
# is retained.
is_excluded = (naive_list_latest.Naive == 2) | (naive_list_latest.Naive == 0)
naive_list_latest = naive_list_latest[~is_excluded]

In [11]:
# Cast patient_id to int and keep the first row per (patient_id, laterality).
# astype() returns a new frame, so the cast really changes the column dtype and
# no SettingWithCopyWarning is raised on the previously filtered frame.
naive_list_latest = (
    naive_list_latest
    .astype({"patient_id": int})
    .drop_duplicates(subset=["patient_id", "laterality"])
)

In [12]:
# Final naive patient list: written without the index column.
full_naive_list = naive_list_latest
naive_patients_csv = os.path.join(clean_data_dir, "naive_patients.csv")
full_naive_list.to_csv(naive_patients_csv, index=False)