In [2]:
import numpy as np
#import random
import pandas as pd
from IPython.display import Image
import matplotlib.pyplot as plt
import os
import seaborn as sns
from collections import defaultdict

In [3]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [57]:
DATA_DIR = os.path.join('.',"data")

## 1.1 Checking the patient dataset

In [310]:
data_pt = pd.read_csv(os.path.join(DATA_DIR, 'patient.csv'), sep='\t')
data_pt.rename(columns={'patientid': 'pt_id'}, inplace=True)
data_pt.head()
data_pt.shape

Unnamed: 0,pt_id,sex
0,13151,Female
1,43463,Female
2,42834,Male
3,37531,Female
4,31613,Female


(945, 2)

In [311]:
data_pt.drop_duplicates(inplace=True)
data_pt.shape

(940, 2)

In [312]:
data_pt.drop_duplicates(subset=['pt_id'], inplace=True)
data_pt.shape

(932, 2)

In [313]:
data_pt['sex'].unique()

array(['Female', 'Male', 'female', 'f', 'male', 'Not Reported', 'M', 'm'],
      dtype=object)

In [314]:
data_pt['sex'] = data_pt['sex'].astype(str).str.upper().str[0]
data_pt['sex'].unique()

array(['F', 'M', 'N'], dtype=object)

In [315]:
#Check missing values in specified columns. Note the missing value in Treatment Plan is not counted. 
columns_to_check =['pt_id', 'sex']
row_with_missing_cx = [row_idx for row_idx, row in data_pt[columns_to_check].isnull().iterrows() if True in row.values]
print("\nMissing Value Summary\n{}".format("-"*35))
print(data_pt.isnull().sum(axis = 0))
if len(row_with_missing_cx) > 0:
    print("There were {} rows with missing diagnosis in data_pct dataset".format(len(row_with_missing_cx)))
else:
    print("There were no rows with missing data")


Missing Value Summary
-----------------------------------
pt_id    0
sex      0
dtype: int64
There were no rows with missing data


## 1.2. Check the diagnosis dataset

In [316]:
data_dx = pd.read_csv(os.path.join(DATA_DIR, 'diagnosis.csv'), sep='\t')
data_dx.head()
data_dx.shape
data_dx.dtypes

Unnamed: 0,pt_id,dos,dx_code
0,13151,2018-04-10,H35.3231
1,13151,2018-05-22,H35.3231
2,13151,2018-07-17,H35.3220
3,13151,2018-09-25,H35.3220
4,13151,2018-12-18,H35.3220


(4672, 3)

pt_id       int64
dos        object
dx_code    object
dtype: object

In [317]:
data_dx["dos"] = pd.to_datetime(data_dx['dos'])
data_dx['year']=data_dx['dos'].dt.year
data_dx.head()

Unnamed: 0,pt_id,dos,dx_code,year
0,13151,2018-04-10,H35.3231,2018
1,13151,2018-05-22,H35.3231,2018
2,13151,2018-07-17,H35.3220,2018
3,13151,2018-09-25,H35.3220,2018
4,13151,2018-12-18,H35.3220,2018


In [106]:
data_dx.drop_duplicates(inplace=True)
data_dx.shape

(4484, 4)

In [107]:
#Check missing values in specified columns. Note the missing value in Treatment Plan is not counted. 
columns_to_check =['pt_id', 'dos', 'dx_code']
row_with_missing_cx = [row_idx for row_idx, row in data_dx[columns_to_check].isnull().iterrows() if True in row.values]
print("\nMissing Value Summary\n{}".format("-"*35))
print(data_dx.isnull().sum(axis = 0))
if len(row_with_missing_cx) > 0:
    print("There were {} rows with missing diagnosis in data_pct dataset".format(len(row_with_missing_cx)))
else:
    print("There were no rows with missing data")


Missing Value Summary
-----------------------------------
pt_id      0
dos        0
dx_code    0
year       0
dtype: int64
There were no rows with missing data


In [108]:
columns_to_show =['pt_id']
dx_cnt = pd.pivot_table(data_dx, index = ['dx_code'], values=columns_to_show, aggfunc='count')
dx_cnt.sort_values(by=['pt_id'], ascending=False)[:5]
#print('The number of patients having Wet Age-Related Macular Degeneration (wAMD) is {}'.format(dx_cnt.loc['H35.32'].values))

Unnamed: 0_level_0,pt_id
dx_code,Unnamed: 1_level_1
H35.32,402
362.52,395
E11.9,244
H35.033,197
H35.3220,155


## 1.3. Checking and prepocessing the procedure dataset

In [246]:
data_pct = pd.read_csv(os.path.join(DATA_DIR, 'procedure.csv'), sep='\t')
data_pct.head()
data_pct.shape
data_pct.dtypes

Unnamed: 0,patient_id,dos,procedure_code,mod
0,10026,12/14/19,2027F,
1,10026,11/13/19,92014,25.0
2,10026,11/6/19,G8427,
3,10026,10/2/19,92134,
4,10026,9/23/19,92014,25.0


(5222, 4)

patient_id         int64
dos               object
procedure_code    object
mod               object
dtype: object

In [247]:
data_pct.drop_duplicates(inplace=True)
data_pct.shape

(5149, 4)

In [248]:
data_pct["dos"] = pd.to_datetime(data_pct['dos'])
data_pct['year']=data_pct['dos'].dt.year
data_pct.head()

Unnamed: 0,patient_id,dos,procedure_code,mod,year
0,10026,2019-12-14,2027F,,2019
1,10026,2019-11-13,92014,25.0,2019
2,10026,2019-11-06,G8427,,2019
3,10026,2019-10-02,92134,,2019
4,10026,2019-09-23,92014,25.0,2019


In [249]:
data_pct['procedure_code'].unique()

array(['2027F', '92014', 'G8427', '92134', '2026F', '1036F', '92012',
       '92226', 'EYLEAX1', '99213', '92015', '99212', '3072F', '2019F',
       '4177F', '92083', '67210', '92250', 'C9257', '92235', 'G8397',
       '92135', '99214', '76512', '67228', '92133', '2022F', '5010F',
       '2024F', '4040F', '3284F', '92225', '67028', 'J2778PF', 'J3590',
       'J9035', '92273', 'G8482', '67040', '92020', 'G8420', '99499',
       '92136', 'G8950', '67028MCR', 'J0178OU', 'J0178', 'LACS', '92002',
       '99024', 'G8753', 'LUC5SYRX1', 'G8756', 'G8754', 'G9974', 'J2778',
       'J7312', '92242', '67041', '92081', 'G9744', '66984', 'J7999',
       '3285F', '2021F', 'J2778-5', 'J2778,05MG', 'J2778-5P', 'G8752',
       'AVASTIN', 'G9903', 'RS000', '66821', 'G8918', 'G8907', 'J3490',
       '1', 'J9035,J3490,Q9977', 'D0000', '92283', 'J2778P', 'LUC5MG',
       'G8428', 'J2778DME', '92240', 'J2778POU'], dtype=object)

In [250]:
#Check missing values in specified columns. Note the missing value in Treatment Plan is not counted. 
columns_to_check =['patient_id', 'dos', 'procedure_code', 'mod']
row_with_missing_cx = [row_idx for row_idx,row in data_pct[columns_to_check].isnull().iterrows() if True in row.values]
print("\nMissing Value Summary\n{}".format("-"*35))
print(data_pct.isnull().sum(axis = 0))
if len(row_with_missing_cx) > 0:
    print("There were {} rows with missing modifier in data_pct dataset".format(len(row_with_missing_cx)))
else:
    print("There were no rows with missing data")


Missing Value Summary
-----------------------------------
patient_id           0
dos                  0
procedure_code       0
mod               3709
year                 0
dtype: int64
There were 3709 rows with missing modifier in data_pct dataset


In [251]:
data_pct = data_pct.fillna("NA").reset_index(drop = True)

# 2. Answers to questions

### Q2. How many patients have Wet Age-Related Macular Degeneration (wAMD) in the given dataset?

In [111]:
h=data_dx[data_dx['dx_code'].str.contains('H35.32', case=False)]
h.shape

(856, 4)

In [112]:
hcnt=h.groupby(['pt_id'],as_index=False).agg(order=('dx_code', 'count'))
print('The numbe of patients having Wet Age-Related Macular Degeneration (wAMD) is {}'.format(len(hcnt)))

The numbe of patients having Wet Age-Related Macular Degeneration (wAMD) is 233


### Q3. How many patients have wAMD in 2019?

In [116]:
hcnt_2019=h[h['year']==2019].groupby(['pt_id'],as_index=False).agg(order=('dx_code', 'count'))
print('The numbe of patients having Wet Age-Related Macular Degeneration (wAMD) in 2019 is {}'.format(len(hcnt_2019)))

The numbe of patients having Wet Age-Related Macular Degeneration (wAMD) in 2019 is 66


### Q4. How many patients have wAMD between 2014-2017, stratified by sex?

In [117]:
pt_h = pd.merge(data_pt, h, on='pt_id', how='inner')
pt_h.head()
pt_h.shape

Unnamed: 0,pt_id,sex,dos,dx_code,year
0,13151,F,2018-04-10,H35.3231,2018
1,13151,F,2018-05-22,H35.3231,2018
2,13151,F,2018-07-17,H35.3220,2018
3,13151,F,2018-09-25,H35.3220,2018
4,13151,F,2018-12-18,H35.3220,2018


(856, 5)

In [136]:
pt_h_14to17=pt_h[(pt_h['year']>=2014) & (pt_h['year']<=2017)]
print('The numbe of patients having Wet Age-Related Macular Degeneration (wAMD) between 2014 and 2017, stratified by sex')
pt_h_14to17.groupby(['sex'],as_index=False).agg(order=('pt_id', 'count'))

The numbe of patients having Wet Age-Related Macular Degeneration (wAMD) between 2014 and 2017, stratified by sex


Unnamed: 0,sex,order
0,F,174
1,M,214
2,N,1


### Q5. How would you determine if sex is associated with an increased risk of wAMD?

In [244]:
pt_h_all=pt_h.groupby(['sex'],as_index=False).agg(order=('pt_id', 'count'))
#print('The numbe of patients having Wet Age-Related Macular Degeneration (wAMD), stratified by sex')
pt_h_all

Unnamed: 0,sex,order
0,F,423
1,M,429
2,N,4


### Q6. How many women diagnosed with wAMD between 2014-2017 also had an intravitreal injection during that time?

In [2]:
data_pct.rename(columns={'patient_id': 'pt_id'}, inplace=True)
pt_h_pct = pd.merge(pt_h_14to17, data_pct, on=['pt_id','year'], how='inner')
pt_h_pct_F=pt_h_pct[pt_h_pct['sex']=='F']

NameError: name 'data_pct' is not defined

### Q7. What is the most common type of intravitreal injection in women diagnosed with wAMD between 2014-2017?

In [253]:
pt_h_pct_F[pt_h_pct_F['procedure_code'].isin(['67028'])].groupby('mod').agg(order=('pt_id','count')).sort_values(by="order",ascending=False)

Unnamed: 0_level_0,order
mod,Unnamed: 1_level_1
RT,148
,125
LT,28


### Q8. Stratify the type and count of intravitreal injections by eye laterality (right, left, unspecified) in 2014-2017 for patients with wAMD by year.

In [254]:
df=pt_h_pct[pt_h_pct['procedure_code'].isin(['67028'])].copy()
df['mod'].unique()

array(['LT', 'NA', 'RT', '50', '18944008'], dtype=object)

In [255]:
df['mod']=df['mod'].map(lambda x: x.replace('18944008','RT').replace('50','NA'))
df['mod'].unique()

array(['LT', 'NA', 'RT'], dtype=object)

In [256]:
columns_to_show =['dos_y']
pd.pivot_table(df, index = ['year', 'mod'], values=columns_to_show, aggfunc='count')

Unnamed: 0_level_0,Unnamed: 1_level_0,dos_y
year,mod,Unnamed: 2_level_1
2015,,23
2015,RT,1
2016,LT,5
2016,,185
2016,RT,27
2017,LT,180
2017,,145
2017,RT,145


### Q9. Find the ratio of patient diagnosis dates that have a corresponding procedure date? Does this ratio tell you anything about the data? If so, what might it indicate?

In [290]:
pt_dx = pd.merge(data_pt, data_dx, on='pt_id', how='inner')
pt_dx.head()
pt_dx.shape

Unnamed: 0,pt_id,sex,dos,dx_code,year
0,13151,F,2018-04-10,H35.3231,2018
1,13151,F,2018-05-22,H35.3231,2018
2,13151,F,2018-07-17,H35.3220,2018
3,13151,F,2018-09-25,H35.3220,2018
4,13151,F,2018-12-18,H35.3220,2018


(4484, 5)

In [299]:
pt_dx_pct_all = pd.merge(pt_dx, data_pct, on=['pt_id', 'dos'], how='right')
pt_dx_pct_all

Unnamed: 0,pt_id,sex,dos,dx_code,year_x,procedure_code,mod,year_y
0,10026,M,2019-12-14,H35.32,2019.0,2027F,,2019
1,10026,M,2019-11-13,H43.813,2019.0,92014,25,2019
2,10026,M,2019-11-06,E11.9,2019.0,G8427,,2019
3,10026,M,2019-10-02,H35.81,2019.0,92134,,2019
4,10026,M,2019-09-23,H35.033,2019.0,92014,25,2019
...,...,...,...,...,...,...,...,...
5890,43502,M,2016-04-14,H35.3210,2016.0,67028,18944008,2016
5891,43502,M,2016-05-19,H35.3210,2016.0,67028,18944008,2016
5892,43502,M,2016-07-21,H35.3210,2016.0,67028,18944008,2016
5893,43502,M,2016-09-01,H35.3210,2016.0,67028,18944008,2016


In [300]:
#Check missing values in specified columns. 
columns_to_check =['dx_code']
row_with_missing = [row_idx for row_idx, row in pt_dx_pct_all[columns_to_check].isnull().iterrows() if True in row.values]
print("\nMissing Value Summary\n{}".format("-"*35))
print(pt_dx_pct_all.isnull().sum(axis = 0))


Missing Value Summary
-----------------------------------
pt_id              0
sex               10
dos                0
dx_code           10
year_x            10
procedure_code     0
mod                0
year_y             0
dtype: int64


In [298]:
print('The ratio of patient diagnosis dates that have a corresponding procedure date is {:.3f}'.format((len(pt_h_pct_all)-len(row_with_missing))/len(pt_h_pct_all)))

The ratio of patient diagnosis dates that have a corresponding procedure date is 0.998


### Q10. Are there any issues with the data? If so, what issues did you notice?

We notice the following data quality issses:
1. There are duplicate values in patient, diagnosis, and procedures data
2. In patient data, there are eight distinct values in the sex category. They are normalized to two
3. In the patient data, there are 8 patient_id having different sex 
4. 

### Q11. How would you define completeness of patient notes and how would you go about validating it?

### Q12. If you were to build a threshold for acceptable data quality, what would you take into consideration and how would you approach it?