## Jan 25, 2023
## Summary of preload-list generation for training/testing data preprocessing

In [1]:
!pip install fastparquet



In [2]:
# Standard library
import os

# Third-party
# NOTE: this was `import matplotlib as plt`, which binds the top-level package
# (so `plt.plot` / `plt.figure()` would fail); the conventional `plt` alias is pyplot.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Allow up to 500 rows to be shown when a DataFrame is displayed.
pd.set_option('display.max_rows', 500)

# Directory that holds the training-data parquet files read below.
datadir = '/challenge/seeing-through-the-fog/data/train_data'

In [3]:
# Read the six parquet tables from `datadir`, in the same order as the names on
# the left-hand side: diagnoses, observations, medication, labs, immunization, demo.
df_dia, df_obs, df_med, df_lab, df_imm, df_dem = (
    pd.read_parquet(f"{datadir}/{table}.parquet")
    for table in (
        "diagnoses",
        "observations",
        "medication",
        "labs",
        "immunization",
        "demo",
    )
)

In [8]:
def write_list_to_txt(output_file, lines):
    """Write every item of ``lines`` to ``output_file``, one item per line.

    The file is opened in write mode, so any existing content is replaced.
    Each item is formatted with an f-string and followed by a newline.

    Args:
        output_file: Path of the text file to create or overwrite.
        lines: Iterable of items to write.
    """
    with open(output_file, 'w') as handle:
        handle.writelines(f"{item}\n" for item in lines)
            
            
def read_txt_to_list(input_file):
    """Read ``input_file`` and return its lines as a list of strings.

    Trailing whitespace (including the newline) is stripped from every line;
    leading whitespace is kept. Blank lines come back as empty strings.

    Args:
        input_file: Path of the text file to read.

    Returns:
        A list with one string per line of the file.
    """
    with open(input_file) as handle:
        return [raw.rstrip() for raw in handle]

### Generate Top15_obs_type.txt

In [4]:
# Drop fully duplicated observation rows, then print the shape before and after
# (one tuple per line) to show how many rows were removed.
df_obs_dedup = df_obs.drop_duplicates()
print(df_obs.shape, df_obs_dedup.shape, sep="\n")

(14734829, 5)
(5662458, 5)


In [5]:
# Count de-duplicated observation rows per obs_type, most frequent first.
# After the count, the `patientid` column holds the number of non-null
# patientid entries per type (rows, not distinct patients).
df_obs_top_obstype_count = (
    df_obs_dedup
    .groupby('obs_type')[['patientid']]
    .count()
    .reset_index()
    .sort_values(by='patientid', ascending=False)
)

# Bare last expression so the notebook displays the shape of the summary.
df_obs_top_obstype_count.shape

(106, 2)

In [6]:
# Preview the 20 observation types with the highest row counts.
df_obs_top_obstype_count.head(n=20)

Unnamed: 0,obs_type,patientid
89,SBP,908449
40,DBP,815406
85,PULSE,750610
102,TEMP,425857
87,RESP,411511
38,BMI,346390
105,WT,345327
91,SMOKE,324166
76,HT,265536
75,HR,243456


In [7]:
# The summary frame is sorted by count (descending), so the first 15 entries of
# its obs_type column are the 15 most frequent observation types.
top_15_obs_type_list = df_obs_top_obstype_count['obs_type'].head(15).to_list()

In [9]:
# Persist the top-15 observation types, one per line.
write_list_to_txt(
    output_file="/home/huangz36/Top15_obs_type.txt",
    lines=top_15_obs_type_list,
)

### Generate Top40_labtests.txt and Top50_labtests.txt

In [10]:
# Remove fully duplicated lab rows before counting.
df_lab_drop = df_lab.drop_duplicates()

# Count de-duplicated lab rows per test_name, most frequent first. The
# `patientid` column of the result holds the per-test row count.
df_lab_drop_short_group = (
    df_lab_drop
    .groupby('test_name')[['patientid']]
    .count()
    .reset_index()
    .sort_values(by='patientid', ascending=False)
)

In [11]:
# Preview the 50 lab tests with the highest row counts.
df_lab_drop_short_group.head(n=50)

Unnamed: 0,test_name,patientid
1123,Oxygen saturation (SpO2).pulse oximetry,570326
657,Glucose.random,366950
1213,Potassium (K),203356
449,Creatinine,201129
707,Hemoglobin (HGB),196228
322,Calcium.total,193091
705,Hematocrit (HCT),192723
1516,White blood cell count (WBC),189030
1282,Red blood cell count (RBC),181645
937,Mean corpuscular hemoglobin concentration (MCHC),178592


In [12]:
# The summary frame is sorted by count (descending), so the first 50 test names
# are the top 50, and the top 40 are simply the first 40 of those.
Top_50_lab_test = df_lab_drop_short_group['test_name'].head(50).tolist()
Top_40_lab_test = Top_50_lab_test[:40]

In [13]:
# Persist both lab-test lists, one test name per line.
write_list_to_txt(
    output_file="/home/huangz36/Top50_labtests.txt",
    lines=Top_50_lab_test,
)
write_list_to_txt(
    output_file="/home/huangz36/Top40_labtests.txt",
    lines=Top_40_lab_test,
)

### Generate Top100_drug
### Drug details were then looked up and each drug was manually assigned an annotated category

In [14]:
# Drop fully duplicated medication rows, then print the shape before and after
# (one tuple per line) to show how many rows were removed.
df_med_dedup = df_med.drop_duplicates()
print(df_med.shape, df_med_dedup.shape, sep="\n")

(6021819, 4)
(4473741, 4)


In [15]:
# Peek at the first five de-duplicated medication rows to check the columns.
df_med_dedup.head(n=5)

Unnamed: 0,patientid,days_to_covid_diag,drug_name,ndc
0,RAADC3-504477,21,Mupirocin,68462018022
1,RAADC3-241668,15,Glucose Meter Test Control Strips,53885024450
2,RAADC3-125094,55,Ergocalciferol,69452015120
3,RAADC3-349386,15,Vitamin B Complex and Vitamin C,60258016001
4,RAADC3-556667,21,Tiagabine Hydrochloride,93503156


In [16]:
# Count de-duplicated medication rows per drug_name, most frequent first. The
# `patientid` column of the result holds the per-drug row count.
df_med_dedup_count = (
    df_med_dedup
    .groupby('drug_name')[['patientid']]
    .count()
    .reset_index()
    .sort_values(by='patientid', ascending=False)
)

In [17]:
# Preview the 100 drug names with the highest row counts.
df_med_dedup_count.head(n=100)

Unnamed: 0,drug_name,patientid
792,Acetaminophen,121654
1046,Aspirin,85551
1064,Atorvastatin Calcium,80620
10060,Sodium Chloride,72888
8918,Pantoprazole Sodium,72031
851,Albuterol Sulfate,71584
6715,Lisinopril,70013
933,Amlodipine Besylate,55707
6678,Levothyroxine Sodium,48536
4059,Enoxaparin Sodium,47594


In [None]:
# Keep only the 100 most frequent drug names. df_med_dedup_count is sorted by
# row count in descending order, so head(100) selects the top 100; without it
# the list would contain every drug name in the table.
top100_drug_list = df_med_dedup_count.head(100)['drug_name'].to_list()