## Jan 2023
## Clean up OBS table

In [1]:
!pip install fastparquet



In [2]:
import pandas as pd
import numpy as np
import os as os

pd.set_option('display.max_rows', 500)

import seaborn as sns
import matplotlib as plt

datadir = '/challenge/seeing-through-the-fog/data/train_data'

In [3]:
df_obs = pd.read_parquet(datadir + "/" + "observations.parquet")

## df_obs

In [4]:
df_obs.head()

Unnamed: 0,patientid,obs_type,obs_result,obs_unit,days_to_covid_diag
0,RAADC3-364095,DBP,56.0,mm Hg,12
1,RAADC3-364095,BMI,21.8,,12
2,RAADC3-364095,PULSE,72.0,bpm,28
3,RAADC3-364095,SBP,95.0,mm Hg,12
4,RAADC3-364095,DBP,58.0,mm Hg,28


### After dedup, only 1/3 of records remained

In [5]:
df_obs_dedup = df_obs.drop_duplicates()

In [6]:
print(df_obs.shape)
print(df_obs_dedup.shape)

(14734829, 5)
(5662458, 5)


In [7]:
df_obs_dedup['obs_type'].unique()

array(['DBP', 'BMI', 'PULSE', 'SBP', 'TEMP', 'RESP', 'WT', 'HT', 'SMOKE',
       'EXERCISE', 'ALCOHOL', 'HR', 'UROUT', 'PAIN',
       'ACO.PHQ-2.DEPRESSION SCREEN', 'ACO.BP FOLLOW-UP',
       'ADVANCED CARE DIRECTIVE: YES/NO', 'ACO.TOBACCO USE', 'LVEF',
       'SMOKE_CESS_CONSULT', 'PACK_YEARS',
       'SELF MONITORED BLOOD GLUCOSE (SMBG)',
       '6-CLICKS INPT DLY ASSESS:QN1(HELP PT CURRENT NEEDS TO PUT ON/TAKE OFF LOWER BODY CLOTHING?)',
       '6-CLICKS INPT DAILY ACTIVITY ASSESS:QN3(HELP PATIENT CURRENT NEEDS TOILETING?)',
       '6-CLICKS INPT DLY ASSESS:QN4(HELP PT CURRENT NEEDS TO PUT ON/TAKE OFF UPPER BODY CLOTHING?)',
       'ACO.PHQ-9.DEPRESSION SCREEN',
       '6-CLICKS INPATIENT BASIC MOBILITY ASSESSMENT: TOTAL SCORE',
       '6-CLICKS INPATIENT DAILY ACTIVITY ASSESSMENT: TOTAL SCORE',
       '6-CLICKS INPT BASIC MOBIL ASSESS: QN3(DIFFICULTY PT HAS MOVING LYING ON BACK TO SIT ON BEDSIDE?)',
       '6-CLICKS INPT DAILY ACTIVITY ASSESS:QN2(HELP PATIENT CURRENT NEEDS BATHING?

In [8]:
df_obs_top_obstype_count =  df_obs_dedup.groupby('obs_type')[['patientid']].count().\
reset_index().sort_values(by='patientid', ascending=False)

In [9]:
df_obs_top_obstype_count.shape

(106, 2)

In [10]:
df_obs_top_obstype_count

Unnamed: 0,obs_type,patientid
89,SBP,908449
40,DBP,815406
85,PULSE,750610
102,TEMP,425857
87,RESP,411511
38,BMI,346390
105,WT,345327
91,SMOKE,324166
76,HT,265536
75,HR,243456


In [11]:
## Only pick the top 15 "obs_type"

In [12]:
top_15_obs_type_list = df_obs_top_obstype_count.head(15)['obs_type'].to_list()

In [13]:
top_15_obs_type_list

['SBP',
 'DBP',
 'PULSE',
 'TEMP',
 'RESP',
 'BMI',
 'WT',
 'SMOKE',
 'HT',
 'HR',
 'ALCOHOL',
 'PAIN',
 'ACO.TOBACCO USE',
 'UROUT',
 'PACK_YEARS']

In [14]:
df_obs_dedup_topObs = df_obs_dedup[df_obs_dedup['obs_type'].isin(top_15_obs_type_list)]

In [15]:
print(df_obs_dedup.shape)
print(df_obs_dedup_topObs.shape)

(5662458, 5)
(5534168, 5)


In [16]:
df_obs_dedup_topObs.head()

Unnamed: 0,patientid,obs_type,obs_result,obs_unit,days_to_covid_diag
0,RAADC3-364095,DBP,56.0,mm Hg,12
1,RAADC3-364095,BMI,21.8,,12
2,RAADC3-364095,PULSE,72.0,bpm,28
3,RAADC3-364095,SBP,95.0,mm Hg,12
4,RAADC3-364095,DBP,58.0,mm Hg,28


In [17]:
df_obs_dedup_topObs['obs_type_unit'] = df_obs_dedup_topObs['obs_type'].astype(str) + "(" \
+ df_obs_dedup_topObs['obs_unit'].astype(str) + ")"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_obs_dedup_topObs['obs_type_unit'] = df_obs_dedup_topObs['obs_type'].astype(str) + "(" \


### Understand the top 15 obs_types
#### 1. SBP / DBP: blood pressure readings given in two numbers, top number is the max pressure, bottom number is the amount of pressure
#### The definition of normal BP (systolic blood pressure [SBP] < 140 mmHg and diastolic blood pressure [DBP] < 90 mmHg) was first proposed by the 3rd report of Joint National committee on Detection, Evaluation and Treatment of High Blood Pressure in 1984 (JNC III)

In [18]:
df_obs_dedup_topObs_combined = df_obs_dedup_topObs[['patientid', 'obs_type_unit', 'obs_result', 'days_to_covid_diag']]

In [19]:
df_obs_dedup_topObs_combined.head()

Unnamed: 0,patientid,obs_type_unit,obs_result,days_to_covid_diag
0,RAADC3-364095,DBP(mm Hg),56.0,12
1,RAADC3-364095,BMI(None),21.8,12
2,RAADC3-364095,PULSE(bpm),72.0,28
3,RAADC3-364095,SBP(mm Hg),95.0,12
4,RAADC3-364095,DBP(mm Hg),58.0,28


In [20]:
df_obs_top_obstype_count.head(15)

Unnamed: 0,obs_type,patientid
89,SBP,908449
40,DBP,815406
85,PULSE,750610
102,TEMP,425857
87,RESP,411511
38,BMI,346390
105,WT,345327
91,SMOKE,324166
76,HT,265536
75,HR,243456


In [21]:
df_obs_dedup_topObs_combined[df_obs_dedup_topObs_combined['patientid']=='RAADC3-364095'].\
sort_values(by='days_to_covid_diag')

Unnamed: 0,patientid,obs_type_unit,obs_result,days_to_covid_diag
0,RAADC3-364095,DBP(mm Hg),56.0,12
1,RAADC3-364095,BMI(None),21.8,12
22,RAADC3-364095,WT(kg),64.98,12
3,RAADC3-364095,SBP(mm Hg),95.0,12
20,RAADC3-364095,SBP(mm Hg),105.0,21
18,RAADC3-364095,TEMP(deg c),36.9,21
15,RAADC3-364095,DBP(mm Hg),67.0,21
8,RAADC3-364095,PULSE(bpm),73.0,21
14,RAADC3-364095,RESP(breaths/min),16.0,21
19,RAADC3-364095,HT(cm),172.72,28


In [22]:
df_obs_dedup_topObs_combined['obs_type_unit'].unique()

array(['DBP(mm Hg)', 'BMI(None)', 'PULSE(bpm)', 'SBP(mm Hg)',
       'TEMP(deg c)', 'RESP(breaths/min)', 'WT(kg)', 'HT(cm)',
       'SMOKE(None)', 'ALCOHOL(None)', 'HR(bpm)', 'UROUT(ml)',
       'PAIN(out of 10)', 'ACO.TOBACCO USE(None)',
       'PACK_YEARS(pack-years)'], dtype=object)

In [23]:
df_obs_dedup_topObs_combined[df_obs_dedup_topObs_combined['obs_type_unit']=='SMOKE(None)']

Unnamed: 0,patientid,obs_type_unit,obs_result,days_to_covid_diag
25,RAADC3-026938,SMOKE(None),Never smoked,34
53,RAADC3-593426,SMOKE(None),Never smoked,35
64,RAADC3-042337,SMOKE(None),Never smoked,50
65,RAADC3-042337,SMOKE(None),Never smoked,13
93,RAADC3-461210,SMOKE(None),Never smoked,29
...,...,...,...,...
14734567,RAADC3-079138,SMOKE(None),Never smoked,49
14734576,RAADC3-265909,SMOKE(None),Never smoked,54
14734624,RAADC3-488203,SMOKE(None),Previously smoked,14
14734669,RAADC3-268628,SMOKE(None),Previously smoked,25


In [24]:
df_obs_dedup_topObs_combined[df_obs_dedup_topObs_combined['obs_type_unit']=='UROUT(ml)']

Unnamed: 0,patientid,obs_type_unit,obs_result,days_to_covid_diag
81,RAADC3-461210,UROUT(ml),250,2
2404,RAADC3-646894,UROUT(ml),,1
3673,RAADC3-307110,UROUT(ml),0.0,33
3674,RAADC3-307110,UROUT(ml),600,33
3801,RAADC3-307110,UROUT(ml),,33
...,...,...,...,...
14734125,RAADC3-205515,UROUT(ml),100,31
14734126,RAADC3-205515,UROUT(ml),100,32
14734256,RAADC3-488203,UROUT(ml),1650,38
14734314,RAADC3-205515,UROUT(ml),900,31


In [25]:
df_obs_dedup_topObs_combined[df_obs_dedup_topObs_combined['obs_type_unit']=='ACO.TOBACCO USE(None)']

Unnamed: 0,patientid,obs_type_unit,obs_result,days_to_covid_diag
1009,RAADC3-293562,ACO.TOBACCO USE(None),3141.use.4,35
1013,RAADC3-293562,ACO.TOBACCO USE(None),3141.tobuser.4,6
1014,RAADC3-293562,ACO.TOBACCO USE(None),3141.smokeless.3,42
1053,RAADC3-293562,ACO.TOBACCO USE(None),3141.smokeless.3,28
1060,RAADC3-293562,ACO.TOBACCO USE(None),3141.use.4,6
...,...,...,...,...
14734723,RAADC3-181324,ACO.TOBACCO USE(None),3141.smokeless.3,51
14734739,RAADC3-346494,ACO.TOBACCO USE(None),7789.smokeless.3,16
14734745,RAADC3-072789,ACO.TOBACCO USE(None),10236.use.5,53
14734755,RAADC3-084140,ACO.TOBACCO USE(None),4121.smokeless.3,7


In [26]:
df_obs_dedup_topObs_combined[df_obs_dedup_topObs_combined['obs_type_unit']=='SMOKE(None)']

Unnamed: 0,patientid,obs_type_unit,obs_result,days_to_covid_diag
25,RAADC3-026938,SMOKE(None),Never smoked,34
53,RAADC3-593426,SMOKE(None),Never smoked,35
64,RAADC3-042337,SMOKE(None),Never smoked,50
65,RAADC3-042337,SMOKE(None),Never smoked,13
93,RAADC3-461210,SMOKE(None),Never smoked,29
...,...,...,...,...
14734567,RAADC3-079138,SMOKE(None),Never smoked,49
14734576,RAADC3-265909,SMOKE(None),Never smoked,54
14734624,RAADC3-488203,SMOKE(None),Previously smoked,14
14734669,RAADC3-268628,SMOKE(None),Previously smoked,25


In [27]:
df_obs_dedup_topObs_combined[df_obs_dedup_topObs_combined['obs_type_unit']=='PAIN(out of 10)']

Unnamed: 0,patientid,obs_type_unit,obs_result,days_to_covid_diag
108,RAADC3-461210,PAIN(out of 10),5,2
135,RAADC3-461210,PAIN(out of 10),2,29
141,RAADC3-461210,PAIN(out of 10),4,2
194,RAADC3-461210,PAIN(out of 10),6,2
267,RAADC3-461210,PAIN(out of 10),4.5,23
...,...,...,...,...
14733550,RAADC3-265909,PAIN(out of 10),5,60
14733880,RAADC3-083856,PAIN(out of 10),10,31
14734032,RAADC3-038276,PAIN(out of 10),0,9
14734087,RAADC3-256859,PAIN(out of 10),8.0,43


In [28]:
df_obs_dedup_topObs_combined[df_obs_dedup_topObs_combined['obs_type_unit']=='PACK_YEARS(pack-years)']

Unnamed: 0,patientid,obs_type_unit,obs_result,days_to_covid_diag
1262,RAADC3-190508,PACK_YEARS(pack-years),.5,13
1264,RAADC3-027277,PACK_YEARS(pack-years),30,4
1655,RAADC3-206530,PACK_YEARS(pack-years),3,57
1677,RAADC3-206530,PACK_YEARS(pack-years),3,28
1693,RAADC3-597597,PACK_YEARS(pack-years),12,6
...,...,...,...,...
14733296,RAADC3-338275,PACK_YEARS(pack-years),5.5,20
14734046,RAADC3-057271,PACK_YEARS(pack-years),1,13
14734298,RAADC3-294499,PACK_YEARS(pack-years),1.25,41
14734363,RAADC3-228214,PACK_YEARS(pack-years),45,23


### BMI
### df_obs_patient_BMI_max

In [29]:
df_obs_dedup_topObs_BMI = df_obs_dedup_topObs_combined[df_obs_dedup_topObs_combined\
                                                       ['obs_type_unit']=='BMI(None)'].dropna()

In [30]:
df_obs_dedup_topObs_BMI.head()

Unnamed: 0,patientid,obs_type_unit,obs_result,days_to_covid_diag
1,RAADC3-364095,BMI(None),21.8,12
5,RAADC3-364095,BMI(None),20.4,40
17,RAADC3-364095,BMI(None),19.7,61
24,RAADC3-364095,BMI(None),19.8,28
30,RAADC3-586023,BMI(None),33.3,20


In [31]:
df_obs_dedup_topObs_BMI.obs_result.unique()

array(['21.8', '20.4', '19.7', '19.8', '33.3', '33.5', '53.7', '22.5',
       '31.4', '29.7', '32.6', '30.9', '34.9', '36.2', '40.3', '40.4',
       '23.8', '23.3', '23.4', '17.9', '43.7', '36.6', '27.3', '33.0',
       '34.0', '34.2', '37.1', '38.2', '34.7', '22.2', '22.0', '22',
       '30.6', '35.9', '68.2', '67.8', '26.2', '28.8', '28.9', '28.2',
       '28.7', '29.0', '28.5', '26.3', '38.6', '32.3', '32.1', '31.2',
       '32', '33.1', '26.4', '27.7', '44.5', '44.2', '32.0', '30.7',
       '31.5', '33.2', '21.4', '22.3', '28.6', '43.8', '38.3', '43.4',
       '25.4', '33.4', '35', '35.0', '25.6', '45.2', '35.7', '38.8',
       '46.8', '41.5', '21.5', '26.0', '26', '27.2', '27', '31.0', '30.4',
       '30.2', '24.8', '25.2', '25.1', '31', '25.8', '23.0', '25.3', '23',
       '24.9', '27.5', '27.8', '20.8', '29', '33', '26.6', '32.9', '33.8',
       '34', '22.8', '18.8', '45.4', '45.0', '45', '39.7', '39.8', '43.5',
       '42.0', '43', '43.0', '41.1', '42', '30', '43.1', '19.9', '3

In [32]:
df_obs_patient_BMI_max = df_obs_dedup_topObs_BMI.groupby('patientid')[['obs_result']]\
.max().reset_index().sort_values(by='patientid')

df_obs_patient_BMI_max.columns = ['patientid', 'BMI_max']

In [33]:
df_obs_patient_BMI_max.head()

Unnamed: 0,patientid,BMI_max
0,RAADC3-000005,31.7
1,RAADC3-000006,25.2
2,RAADC3-000009,30.2
3,RAADC3-000011,27.9
4,RAADC3-000013,20.8


### HT
### df_obs_dedup_topObs_HT_max
### WT
### df_obs_dedup_topObs_WT_max

In [34]:
df_obs_dedup_topObs_HT = df_obs_dedup_topObs_combined[df_obs_dedup_topObs_combined\
                                                       ['obs_type_unit']=='HT(cm)'].dropna()

df_obs_dedup_topObs_HT_max = df_obs_dedup_topObs_HT.groupby('patientid')[['obs_result']]\
.max().reset_index().sort_values(by='patientid')

df_obs_dedup_topObs_HT_max.columns = ['patientid', 'HT_max']

In [35]:
df_obs_dedup_topObs_HT_max.head()

Unnamed: 0,patientid,HT_max
0,RAADC3-000006,162.56
1,RAADC3-000008,142.24
2,RAADC3-000009,160.02
3,RAADC3-000011,152.4
4,RAADC3-000013,170.18


In [36]:
df_obs_dedup_topObs_WT = df_obs_dedup_topObs_combined[df_obs_dedup_topObs_combined\
                                                       ['obs_type_unit']=='WT(kg)'].dropna()

df_obs_dedup_topObs_WT_max = df_obs_dedup_topObs_WT.groupby('patientid')[['obs_result']]\
.max().reset_index().sort_values(by='patientid')

df_obs_dedup_topObs_WT_max.columns = ['patientid', 'WT_max']

In [37]:
df_obs_dedup_topObs_WT_max.head()

Unnamed: 0,patientid,WT_max
0,RAADC3-000005,72.3
1,RAADC3-000006,66.68
2,RAADC3-000009,77.22
3,RAADC3-000011,64.86
4,RAADC3-000013,60.33


### Pulse
### df_obs_dedup_topObs_PULSE_max
### TEMP
### df_obs_dedup_topObs_TEMP_max
### RESP
### df_obs_dedup_topObs_RESP_max
### HR
### df_obs_dedup_topObs_HR_max

In [38]:
df_obs_dedup_topObs_PULSE = df_obs_dedup_topObs_combined[df_obs_dedup_topObs_combined\
                                                       ['obs_type_unit']=='PULSE(bpm)'].dropna()

df_obs_dedup_topObs_PULSE_max = df_obs_dedup_topObs_PULSE.groupby('patientid')[['obs_result']]\
.max().reset_index().sort_values(by='patientid')

df_obs_dedup_topObs_PULSE_max.columns = ['patientid', 'PULSE_max']

In [39]:
df_obs_dedup_topObs_PULSE_max.head()

Unnamed: 0,patientid,PULSE_max
0,RAADC3-000005,70
1,RAADC3-000006,83
2,RAADC3-000009,85
3,RAADC3-000011,77
4,RAADC3-000018,82


In [40]:
df_obs_dedup_topObs_TEMP = df_obs_dedup_topObs_combined[df_obs_dedup_topObs_combined\
                                                       ['obs_type_unit']=='TEMP(deg c)'].dropna()

df_obs_dedup_topObs_TEMP_max = df_obs_dedup_topObs_TEMP.groupby('patientid')[['obs_result']]\
.max().reset_index().sort_values(by='patientid')

df_obs_dedup_topObs_TEMP_max.columns = ['patientid', 'TEMP_max']

In [41]:
df_obs_dedup_topObs_TEMP_max.head()

Unnamed: 0,patientid,TEMP_max
0,RAADC3-000009,37.1
1,RAADC3-000011,36.5
2,RAADC3-000013,36.7
3,RAADC3-000018,36.8
4,RAADC3-000019,36.9


In [42]:
df_obs_dedup_topObs_RESP = df_obs_dedup_topObs_combined[df_obs_dedup_topObs_combined\
                                                       ['obs_type_unit']=='RESP(breaths/min)'].dropna()

df_obs_dedup_topObs_RESP_max = df_obs_dedup_topObs_RESP.groupby('patientid')[['obs_result']]\
.max().reset_index().sort_values(by='patientid')

df_obs_dedup_topObs_RESP_max.columns = ['patientid', 'RESP_max']

In [43]:
df_obs_dedup_topObs_RESP_max.head()

Unnamed: 0,patientid,RESP_max
0,RAADC3-000009,20.0
1,RAADC3-000018,18.0
2,RAADC3-000024,20.0
3,RAADC3-000036,16.0
4,RAADC3-000045,20.0


In [44]:
df_obs_dedup_topObs_HR = df_obs_dedup_topObs_combined[df_obs_dedup_topObs_combined\
                                                       ['obs_type_unit']=='HR(bpm)'].dropna()

df_obs_dedup_topObs_HR_max = df_obs_dedup_topObs_HR.groupby('patientid')[['obs_result']]\
.max().reset_index().sort_values(by='patientid')

df_obs_dedup_topObs_HR_max.columns = ['patientid', 'HR_max']

In [45]:
df_obs_dedup_topObs_HR_max.head()

Unnamed: 0,patientid,HR_max
0,RAADC3-000051,88
1,RAADC3-000055,88
2,RAADC3-000073,97
3,RAADC3-000130,61
4,RAADC3-000131,99


## Inspect tables
## Inconsistent records per patientid

### SMOKE
### df_obs_dedup_topObs_SMOKE
### ALCOHOL
### df_obs_dedup_topObs_ALCOHOL

In [167]:
df_obs_dedup_topObs_SMOKE = df_obs_dedup_topObs_combined[df_obs_dedup_topObs_combined\
                                                       ['obs_type_unit']=='SMOKE(None)']\
[['patientid', 'obs_result']].drop_duplicates().reset_index(drop=True)

df_obs_dedup_topObs_SMOKE.columns = ['patientid', 'SMOKE']

In [168]:
df_obs_dedup_topObs_SMOKE.head()

Unnamed: 0,patientid,SMOKE
0,RAADC3-026938,Never smoked
1,RAADC3-593426,Never smoked
2,RAADC3-042337,Never smoked
3,RAADC3-461210,Never smoked
4,RAADC3-326688,Never smoked


In [169]:
df_obs_smoke_count = df_obs_dedup_topObs_SMOKE.groupby('patientid')[['SMOKE']].count().\
reset_index().sort_values(by='SMOKE', ascending=False)

smoke_inconsistent_to_drop = df_obs_smoke_count[df_obs_smoke_count['SMOKE']!=1]['patientid'].to_list()

print(len(smoke_inconsistent_to_drop))

25233


In [170]:
print(df_obs_dedup_topObs_SMOKE.shape)
df_obs_dedup_topObs_SMOKE_dedup = df_obs_dedup_topObs_SMOKE[~df_obs_dedup_topObs_SMOKE['patientid'].\
                                                      isin(smoke_inconsistent_to_drop)]

print(df_obs_dedup_topObs_SMOKE_dedup.shape)

(172262, 2)
(118280, 2)


In [157]:
df_obs_dedup_topObs_SMOKE[df_obs_dedup_topObs_SMOKE['patientid']=='RAADC3-000013']

Unnamed: 0,patientid,SMOKE
101416,RAADC3-000013,Not currently smoking
101417,RAADC3-000013,Current smoker
101418,RAADC3-000013,Never smoked


In [149]:
df_obs_dedup_topObs_SMOKE.groupby('patientid')[['SMOKE']].count().\
reset_index().sort_values(by='patientid', ascending=True)

Unnamed: 0,patientid,SMOKE
0,RAADC3-000005,1
1,RAADC3-000006,1
2,RAADC3-000008,1
3,RAADC3-000011,1
4,RAADC3-000013,3
...,...,...
143507,RAADC3-727950,1
143508,RAADC3-727953,1
143509,RAADC3-727954,1
143510,RAADC3-727956,1


In [129]:
len(df_obs_dedup_topObs_SMOKE['patientid'].unique())

143513

In [171]:
df_obs_dedup_topObs_ALCOHOL = df_obs_dedup_topObs_combined[df_obs_dedup_topObs_combined\
                                                       ['obs_type_unit']=='ALCOHOL(None)'].dropna()\
[['patientid', 'obs_result']].drop_duplicates().reset_index(drop=True)

df_obs_dedup_topObs_ALCOHOL.columns = ['patientid', 'ALCOHOL']


In [172]:
df_obs_dedup_topObs_ALCOHOL.head()

Unnamed: 0,patientid,ALCOHOL
0,RAADC3-593426,Unknown alcohol consumption
1,RAADC3-593426,Consumes alcohol
2,RAADC3-326688,Consumes alcohol
3,RAADC3-230262,Does not consume alcohol
4,RAADC3-098476,Consumes alcohol


In [174]:
df_obs_alcohol_count = df_obs_dedup_topObs_ALCOHOL.groupby('patientid')[['ALCOHOL']].count().\
reset_index().sort_values(by='ALCOHOL', ascending=False)

alcohol_inconsistent_to_drop = df_obs_alcohol_count[df_obs_alcohol_count['ALCOHOL']!=1]['patientid'].to_list()

print(len(alcohol_inconsistent_to_drop))

15061


In [175]:
print(df_obs_dedup_topObs_ALCOHOL.shape)
df_obs_dedup_topObs_ALCOHOL_dedup = df_obs_dedup_topObs_ALCOHOL[~df_obs_dedup_topObs_ALCOHOL['patientid'].\
                                                      isin(alcohol_inconsistent_to_drop)]

print(df_obs_dedup_topObs_ALCOHOL_dedup.shape)

(109590, 2)
(78943, 2)


### PAIN
### df_obs_dedup_topObs_PAIN_max
### Pack-year
### df_obs_dedup_topObs_PACKYRS_max

In [50]:
df_obs_dedup_topObs_PAIN = df_obs_dedup_topObs_combined[df_obs_dedup_topObs_combined\
                                                       ['obs_type_unit']=='PAIN(out of 10)'].dropna()

df_obs_dedup_topObs_PAIN_max = df_obs_dedup_topObs_PAIN.groupby('patientid')[['obs_result']]\
.max().reset_index().sort_values(by='patientid')

df_obs_dedup_topObs_PAIN_max.columns = ['patientid', 'PAIN_max']

In [51]:
df_obs_dedup_topObs_PAIN.head()

Unnamed: 0,patientid,obs_type_unit,obs_result,days_to_covid_diag
108,RAADC3-461210,PAIN(out of 10),5.0,2
135,RAADC3-461210,PAIN(out of 10),2.0,29
141,RAADC3-461210,PAIN(out of 10),4.0,2
194,RAADC3-461210,PAIN(out of 10),6.0,2
267,RAADC3-461210,PAIN(out of 10),4.5,23


In [52]:
df_obs_dedup_topObs_PACKYRS = df_obs_dedup_topObs_combined[df_obs_dedup_topObs_combined\
                                                       ['obs_type_unit']=='PACK_YEARS(pack-years)'].dropna()

df_obs_dedup_topObs_PACKYRS_max = df_obs_dedup_topObs_PACKYRS.groupby('patientid')[['obs_result']]\
.max().reset_index().sort_values(by='patientid')

df_obs_dedup_topObs_PACKYRS_max.columns = ['patientid', 'PACKYRS_max']

In [53]:
df_obs_dedup_topObs_PACKYRS_max.head()

Unnamed: 0,patientid,PACKYRS_max
0,RAADC3-000008,0
1,RAADC3-000013,0
2,RAADC3-000044,0
3,RAADC3-000062,40
4,RAADC3-000066,0


### DBP
### df_obs_dedup_topObs_DBP_max
### SBP
### df_obs_dedup_topObs_SBP_max

In [54]:
df_obs_dedup_topObs_DBP = df_obs_dedup_topObs_combined[df_obs_dedup_topObs_combined\
                                                       ['obs_type_unit']=='DBP(mm Hg)'].dropna()

df_obs_dedup_topObs_DBP_max = df_obs_dedup_topObs_DBP.groupby('patientid')[['obs_result']]\
.max().reset_index().sort_values(by='patientid')

df_obs_dedup_topObs_DBP_max.columns = ['patientid', 'DBP_max']

In [55]:
df_obs_dedup_topObs_DBP_max.head()

Unnamed: 0,patientid,DBP_max
0,RAADC3-000005,80
1,RAADC3-000006,72
2,RAADC3-000009,70
3,RAADC3-000011,74
4,RAADC3-000013,80


In [56]:
df_obs_dedup_topObs_SBP = df_obs_dedup_topObs_combined[df_obs_dedup_topObs_combined\
                                                       ['obs_type_unit']=='SBP(mm Hg)'].dropna()

df_obs_dedup_topObs_SBP_max = df_obs_dedup_topObs_SBP.groupby('patientid')[['obs_result']]\
.max().reset_index().sort_values(by='patientid')

df_obs_dedup_topObs_SBP_max.columns = ['patientid', 'SBP_max']

In [57]:
df_obs_dedup_topObs_SBP_max.head()

Unnamed: 0,patientid,SBP_max
0,RAADC3-000005,138
1,RAADC3-000006,144
2,RAADC3-000009,130
3,RAADC3-000011,160
4,RAADC3-000013,122


## Aggregation of all tables after pivot and clean

In [132]:
### Each patient may have multiple observations over the visits. 
### For BMI: take max (done)
### For WT/HT: take max (done)
### For DBP/SBP: take highest (high BP), categorize
### For Pulse: take highest (done)
### For TEMP/RESP/HR: take highest (done)
### For SMOKE, Alcohol: drop duplicates (done)
### For ACO.TOBACCO USE (do not include)
### For PAIN: the the value from 1-10 (take max) (done)
### For Pack-year: take the value (done)


df_obs_dedup_topObs_combined['obs_type_unit'].unique()

array(['DBP(mm Hg)', 'BMI(None)', 'PULSE(bpm)', 'SBP(mm Hg)',
       'TEMP(deg c)', 'RESP(breaths/min)', 'WT(kg)', 'HT(cm)',
       'SMOKE(None)', 'ALCOHOL(None)', 'HR(bpm)', 'UROUT(ml)',
       'PAIN(out of 10)', 'ACO.TOBACCO USE(None)',
       'PACK_YEARS(pack-years)'], dtype=object)

In [133]:
df_obs_dedup_topObs_SMOKE.shape

(172262, 2)

In [134]:
df_obs_dedup_topObs_ALCOHOL.shape

(109590, 2)

In [176]:
obs_table_list = [df_obs_patient_BMI_max, df_obs_dedup_topObs_HT_max, df_obs_dedup_topObs_WT_max, \
                 df_obs_dedup_topObs_PULSE_max, df_obs_dedup_topObs_TEMP_max, df_obs_dedup_topObs_RESP_max, \
                 df_obs_dedup_topObs_HR_max, \
                 df_obs_dedup_topObs_PAIN_max, df_obs_dedup_topObs_PACKYRS_max, \
                 df_obs_dedup_topObs_DBP_max, df_obs_dedup_topObs_SBP_max, \
                 df_obs_dedup_topObs_SMOKE_dedup, df_obs_dedup_topObs_ALCOHOL_dedup]

In [177]:
df_obs_pid_for_left_merge = df_obs_dedup_topObs_combined[['patientid']].drop_duplicates()
df_obs_pid_for_left_merge['status'] ='PT_with_OBS'

In [178]:
df_obs_pid_for_left_merge.head()

Unnamed: 0,patientid,status
0,RAADC3-364095,PT_with_OBS
25,RAADC3-026938,PT_with_OBS
27,RAADC3-586023,PT_with_OBS
42,RAADC3-593426,PT_with_OBS
63,RAADC3-042337,PT_with_OBS


In [179]:
df_obs_pid_for_left_merge.shape

(178952, 2)

In [180]:
df_obs_patient_BMI_max.head()

Unnamed: 0,patientid,BMI_max
0,RAADC3-000005,31.7
1,RAADC3-000006,25.2
2,RAADC3-000009,30.2
3,RAADC3-000011,27.9
4,RAADC3-000013,20.8


In [181]:
print(df_obs_patient_BMI_max.shape)
print(df_obs_patient_BMI_max.drop_duplicates().shape)

(144702, 2)
(144702, 2)


In [182]:
df = df_obs_pid_for_left_merge
print(df.head())

        patientid       status
0   RAADC3-364095  PT_with_OBS
25  RAADC3-026938  PT_with_OBS
27  RAADC3-586023  PT_with_OBS
42  RAADC3-593426  PT_with_OBS
63  RAADC3-042337  PT_with_OBS


In [183]:
df_obs_pid_for_left_merge.head()

Unnamed: 0,patientid,status
0,RAADC3-364095,PT_with_OBS
25,RAADC3-026938,PT_with_OBS
27,RAADC3-586023,PT_with_OBS
42,RAADC3-593426,PT_with_OBS
63,RAADC3-042337,PT_with_OBS


In [184]:
df_obs_pid_for_left_merge.merge(df_obs_patient_BMI_max, on='patientid', how='left')

Unnamed: 0,patientid,status,BMI_max
0,RAADC3-364095,PT_with_OBS,21.8
1,RAADC3-026938,PT_with_OBS,
2,RAADC3-586023,PT_with_OBS,33.5
3,RAADC3-593426,PT_with_OBS,53.7
4,RAADC3-042337,PT_with_OBS,22.5
...,...,...,...
178947,RAADC3-260173,PT_with_OBS,
178948,RAADC3-529851,PT_with_OBS,
178949,RAADC3-642887,PT_with_OBS,
178950,RAADC3-251634,PT_with_OBS,


In [185]:
## left merge

df = df_obs_pid_for_left_merge
for obs_ind_table in obs_table_list:
    print("shape of the table to be merged", obs_ind_table.shape)
    print(obs_ind_table.head())
    df = df.merge(obs_ind_table, on='patientid', how='left').sort_values(by='patientid')
    print("shape of intermediate table", df.shape)
    print(df.head())
    print("========")

shape of the table to be merged (144702, 2)
       patientid BMI_max
0  RAADC3-000005    31.7
1  RAADC3-000006    25.2
2  RAADC3-000009    30.2
3  RAADC3-000011    27.9
4  RAADC3-000013    20.8
shape of intermediate table (178952, 3)
            patientid       status BMI_max
18496   RAADC3-000002  PT_with_OBS     NaN
114966  RAADC3-000005  PT_with_OBS    31.7
1620    RAADC3-000006  PT_with_OBS    25.2
161696  RAADC3-000008  PT_with_OBS     NaN
127635  RAADC3-000009  PT_with_OBS    30.2
shape of the table to be merged (135957, 2)
       patientid  HT_max
0  RAADC3-000006  162.56
1  RAADC3-000008  142.24
2  RAADC3-000009  160.02
3  RAADC3-000011   152.4
4  RAADC3-000013  170.18
shape of intermediate table (178952, 4)
       patientid       status BMI_max  HT_max
0  RAADC3-000002  PT_with_OBS     NaN     NaN
1  RAADC3-000005  PT_with_OBS    31.7     NaN
2  RAADC3-000006  PT_with_OBS    25.2  162.56
3  RAADC3-000008  PT_with_OBS     NaN  142.24
4  RAADC3-000009  PT_with_OBS    30.2  160.0

shape of intermediate table (178952, 14)
       patientid       status BMI_max  HT_max WT_max PULSE_max TEMP_max  \
0  RAADC3-000002  PT_with_OBS     NaN     NaN    NaN       NaN      NaN   
1  RAADC3-000005  PT_with_OBS    31.7     NaN   72.3        70      NaN   
2  RAADC3-000006  PT_with_OBS    25.2  162.56  66.68        83      NaN   
3  RAADC3-000008  PT_with_OBS     NaN  142.24    NaN       NaN      NaN   
4  RAADC3-000009  PT_with_OBS    30.2  160.02  77.22        85     37.1   

  RESP_max HR_max PAIN_max PACKYRS_max DBP_max SBP_max         SMOKE  
0      NaN    NaN        2         NaN     NaN     NaN           NaN  
1      NaN    NaN      NaN         NaN      80     138  Never smoked  
2      NaN    NaN        0         NaN      72     144  Never smoked  
3      NaN    NaN        0           0     NaN     NaN  Never smoked  
4     20.0    NaN      NaN         NaN      70     130           NaN  
shape of the table to be merged (78943, 2)
       patientid                   ALCO

In [82]:
df_obs_pivot_by_top15_obs = df.drop(['status'], axis=1)

In [83]:
df_obs_pivot_by_top15_obs.head()

Unnamed: 0,patientid,BMI_max,HT_max,WT_max,PULSE_max,TEMP_max,RESP_max,HR_max,SMOKE,ALCOHOL,PAIN_max,PACKYRS_max,DBP_max,SBP_max
0,RAADC3-364095,21.8,172.72,64.98,73.0,36.9,20.0,,,,,,67.0,95.0
1,RAADC3-026938,,,,,,,,Never smoked,,,,,
2,RAADC3-586023,33.5,162.56,88.45,86.0,36.9,16.0,,,,,,64.0,92.0
3,RAADC3-593426,53.7,162.6,141.97,100.0,36.9,18.0,,Never smoked,Unknown alcohol consumption,,,80.0,142.0
4,RAADC3-593426,53.7,162.6,141.97,100.0,36.9,18.0,,Never smoked,Consumes alcohol,,,80.0,142.0


In [84]:
df_obs_pivot_by_top15_obs.shape

(226785, 14)

In [85]:
df_obs_pid_for_left_merge.shape

(178952, 2)