<a href="https://colab.research.google.com/github/victormurcia/VCHAMPS/blob/main/Mapping_Immunizations_Encounter_IDs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#General utilities
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm  # Import tqdm for the progress bar
import glob,shutil,os,warnings,math,time,sys,re
from typing import List
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

#For converting states to their abbreviations
#!pip install us
#import us

#For performing UTC normalization on datetime columns based on the STATE column
import pytz

#For Slider viz
import ipywidgets as widgets
from IPython.display import display, clear_output,HTML

#For EDA
#!pip install dataprep
#from dataprep.eda import create_report

#Enable data to be extracted and downloaded from my Google Drive
from google.colab import drive, files
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Specify the path to the desired directory
directory_path = r'/content/drive/MyDrive/VCHAMPS - Train Cleaned'

# Change the current working directory to the desired directory
os.chdir(directory_path)

# Verify the current working directory
cwd = os.getcwd()


print(f"Current working directory: {cwd}")

Current working directory: /content/drive/MyDrive/VCHAMPS - Train Cleaned


# Generate Encounter IDs
Do this based on the inpatient_admisssions, ed_visits,and outpatient visits files.

I'll either use a Hash or a UUID to define the Encounter ID

In [3]:
import uuid
# Define a dictionary to store generated UUIDs
uuid_dict = {}

# Define a custom function to generate UUIDs and ensure uniqueness
def generate_uuid(row, df_val):
    if df_val == 1:
      columns = ['Internalpatientid', 'Ed visit start date', 'Discharge date ed']
    elif df_val == 2:
      columns = ['Internalpatientid', 'Admission date', 'Discharge date']
    elif df_val == 3:
      columns = ['Internalpatientid', 'Visit start date']

    unique_values = tuple(row[column] for column in columns)
    unique_values += (df_val,)  # Append the additional parameter to the unique_values tuple
    key = str(unique_values)

    # Check if UUID already exists in the dictionary
    if key in uuid_dict:
        return uuid_dict[key]

    # Generate a new UUID and store it in the dictionary
    new_uuid = str(uuid.uuid5(uuid.NAMESPACE_OID, key))
    uuid_dict[key] = new_uuid
    return new_uuid

I decided to use UUID instead of Hash since UUIDs used far less memory

# Load Encounter DFs

In [4]:
#Load the Dataframes
ed_visits_df            = dd.read_parquet('/content/drive/MyDrive/VCHAMPS - Train Cleaned-Mapped/ed_visits.parquet')
inpatient_admissions_df = dd.read_parquet('/content/drive/MyDrive/VCHAMPS - Train Cleaned-Mapped/inpatient_admissions.parquet')
outpatient_visits_df    = dd.read_parquet('/content/drive/MyDrive/VCHAMPS - Train Cleaned-Mapped/outpatient_visits.parquet')

ed_visits_df = ed_visits_df.compute()
inpatient_admissions_df = inpatient_admissions_df.compute()
outpatient_visits_df = outpatient_visits_df.compute()

# Mapping Vectorized

In [5]:
def map_encounter_id_vectorized(row, age_col, date_col):
    patient_id = row['Internalpatientid']
    patient_age = row[age_col]
    date_to_match = row[date_col]

    filtered_ed_visits = ed_visits_df[ed_visits_df['Internalpatientid'] == patient_id]
    ed_visit_match = (filtered_ed_visits['Ed visit start date'] <= date_to_match) & (filtered_ed_visits['Discharge date ed'] >= date_to_match) & (filtered_ed_visits['Age at ed visit'] <= patient_age)
    if ed_visit_match.any():
        return filtered_ed_visits.loc[ed_visit_match, 'Encounter ID'].iloc[0]

    filtered_inpatient_admissions = inpatient_admissions_df[inpatient_admissions_df['Internalpatientid'] == patient_id]
    inpatient_match = (filtered_inpatient_admissions['Admission date'] <= date_to_match) & (filtered_inpatient_admissions['Discharge date'] >= date_to_match) & (filtered_inpatient_admissions['Age at admission'] <= patient_age)
    if inpatient_match.any():
        return filtered_inpatient_admissions.loc[inpatient_match, 'Encounter ID'].iloc[0]

    filtered_outpatient_visits = outpatient_visits_df[outpatient_visits_df['Internalpatientid'] == patient_id]
    outpatient_match = (filtered_outpatient_visits['Visit start date'] <= date_to_match) & (filtered_outpatient_visits['Visit End Date'] >= date_to_match) & (filtered_outpatient_visits['Age at visit'] <= patient_age)
    if outpatient_match.any():
        return filtered_outpatient_visits.loc[outpatient_match, 'Encounter ID'].iloc[0]

    return str(uuid.uuid4())

# Mapping Immunizations DF

In [6]:
immunizations_df = dd.read_parquet('/content/drive/MyDrive/VCHAMPS - Train Cleaned/immunizations.parquet')
immunizations_df = immunizations_df.compute()
immunizations_df.columns

Index(['Internalpatientid', 'Age at immunization', 'Immunization date',
       'Immunization', 'Administered elsewhere', 'Cvx code', 'Series doses_1',
       'Series doses_2', 'Series doses_3', 'Series doses_4', 'Series doses_5',
       'Series doses_6', 'Series doses_Booster', 'Series doses_Complete',
       'Series doses_NS', 'Series doses_Partial'],
      dtype='object')

In [7]:
immunizations_df

Unnamed: 0,Internalpatientid,Age at immunization,Immunization date,Immunization,Administered elsewhere,Cvx code,Series doses_1,Series doses_2,Series doses_3,Series doses_4,Series doses_5,Series doses_6,Series doses_Booster,Series doses_Complete,Series doses_NS,Series doses_Partial
0,100000,64,2019-10-03 07:20:00,"PNEUMOCOCCAL, UNSPECIFIED FORMULATION",,109,0,0,0,0,0,0,0,0,1,0
0,100000,64,2019-10-03 07:20:00,"PNEUMOCOCCAL, UNSPECIFIED FORMULATION",,109,0,0,0,0,0,0,0,0,1,0
0,100000,64,2019-10-03 07:20:00,"PNEUMOCOCCAL, UNSPECIFIED FORMULATION",,109,0,0,0,0,0,0,0,0,1,0
0,162877,98,2013-10-17 00:23:48,"FLU,3 YRS (HISTORICAL)",,88,0,0,0,0,0,0,0,0,1,0
0,162877,98,2013-10-17 00:23:48,"FLU,3 YRS (HISTORICAL)",,88,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
616475,162868,92,2020-11-29 05:20:37,ZOSTER RECOMBINANT,,187,1,0,0,0,0,0,0,0,0,0
616476,162869,78,2007-03-16 22:39:20,ZZZINFLUENZA (HISTORICAL),,88,0,0,0,0,0,0,0,0,1,0
616477,162875,60,2010-11-27 21:25:26,TD(ADULT) UNSPECIFIED FORMULATION,,139,0,0,0,0,0,0,0,0,1,0
616478,162877,91,2007-01-24 05:40:58,TD(ADULT) UNSPECIFIED FORMULATION,,139,0,0,0,0,0,0,0,0,1,0


In [10]:
#Test on a small sample
immunizations_df_sm = immunizations_df[:100]

# Create an empty list to store the results
encounter_ids = []

# Iterate over rows and track progress using tqdm
for _, row in tqdm(immunizations_df_sm.iterrows(), total=immunizations_df_sm.shape[0], desc="Processing"):
    encounter_id = map_encounter_id_vectorized(row, 'Age at immunization', 'Immunization date')
    encounter_ids.append(encounter_id)

# Assign the encounter IDs to the dataframe
immunizations_df_sm['Encounter ID'] = encounter_ids

immunizations_df_sm

Processing: 100%|██████████| 100/100 [00:01<00:00, 50.74it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  immunizations_df_sm['Encounter ID'] = encounter_ids


Unnamed: 0,Internalpatientid,Age at immunization,Immunization date,Immunization,Administered elsewhere,Cvx code,Series doses_1,Series doses_2,Series doses_3,Series doses_4,Series doses_5,Series doses_6,Series doses_Booster,Series doses_Complete,Series doses_NS,Series doses_Partial,Encounter ID
0,100000,64,2019-10-03 07:20:00,"PNEUMOCOCCAL, UNSPECIFIED FORMULATION",,109,0,0,0,0,0,0,0,0,1,0,c85429da-0228-46ce-bbb7-97774f946107
0,100000,64,2019-10-03 07:20:00,"PNEUMOCOCCAL, UNSPECIFIED FORMULATION",,109,0,0,0,0,0,0,0,0,1,0,718fd9e4-36b1-41a6-b75c-c7b6718e96d7
0,100000,64,2019-10-03 07:20:00,"PNEUMOCOCCAL, UNSPECIFIED FORMULATION",,109,0,0,0,0,0,0,0,0,1,0,4751586b-c4f6-452e-8c96-39fba29e7bba
0,162877,98,2013-10-17 00:23:48,"FLU,3 YRS (HISTORICAL)",,88,0,0,0,0,0,0,0,0,1,0,58bf9989-ccd3-4cce-9458-d62460875cce
0,162877,98,2013-10-17 00:23:48,"FLU,3 YRS (HISTORICAL)",,88,0,0,0,0,0,0,0,0,1,0,0bd0763e-2a68-4f15-bb87-4565002643cc
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10,162895,71,2009-07-26 16:57:30,"NOVEL INFLUENZA-H1N1-09, ALL FORMULATIONS",Non-VA laboratory,128,0,0,0,0,0,0,0,0,1,0,c1dc57dc-8081-4b82-8ad5-8cb53c74715b
10,65200,79,2006-01-07 09:25:30,"PNEUMOCOCCAL, UNSPECIFIED FORMULATION",,109,0,0,0,0,0,0,0,0,1,0,f2b71435-7281-45b2-96d8-4362ca1d7d2f
10,65200,79,2006-01-07 09:25:30,"PNEUMOCOCCAL, UNSPECIFIED FORMULATION",,109,0,0,0,0,0,0,0,0,1,0,0dda591c-2f1e-4d6f-9bc5-1503de49eebd
10,65200,79,2006-01-07 09:25:30,"PNEUMOCOCCAL, UNSPECIFIED FORMULATION",,109,0,0,0,0,0,0,0,0,1,0,836cc104-9a13-4398-bd1d-eecf38099750


In [None]:
save_path = '/content/drive/MyDrive/VCHAMPS - Train Cleaned-Mapped/immunizations'
# Define the chunk size
chunk_size = 100000

# Calculate the number of chunks
num_chunks = math.ceil(len(immunizations_df) / chunk_size)

# Create an empty list to store the encounter IDs
encounter_ids = []

# Iterate over chunks
for i in range(num_chunks):
    start_idx = i * chunk_size
    end_idx = (i + 1) * chunk_size

    # Get the chunk of dataframe
    chunk_df = immunizations_df[start_idx:end_idx]

    # Process the chunk and track progress using tqdm
    for _, row in tqdm(chunk_df.iterrows(), total=chunk_df.shape[0], desc=f"Processing Chunk {i+1}/{num_chunks}"):
        encounter_id = map_encounter_id_vectorized(row, 'Age at immunization', 'Immunization date')
        encounter_ids.append(encounter_id)

    # Create a new DataFrame with the chunk results
    chunk_results_df = chunk_df.copy()
    chunk_results_df['Encounter ID'] = encounter_ids[start_idx:end_idx]

    # Save the results of the chunk to Parquet file
    chunk_results_df.to_parquet(f'{save_path}/immunizations{i+1}.parquet', index=False)

Processing Chunk 1/55: 100%|██████████| 100000/100000 [31:51<00:00, 52.31it/s]
Processing Chunk 2/55: 100%|██████████| 100000/100000 [32:23<00:00, 51.47it/s]
Processing Chunk 3/55: 100%|██████████| 100000/100000 [32:16<00:00, 51.64it/s]
Processing Chunk 4/55: 100%|██████████| 100000/100000 [32:32<00:00, 51.21it/s]
Processing Chunk 5/55: 100%|██████████| 100000/100000 [32:05<00:00, 51.94it/s]
Processing Chunk 6/55: 100%|██████████| 100000/100000 [32:07<00:00, 51.88it/s]
Processing Chunk 7/55: 100%|██████████| 100000/100000 [32:08<00:00, 51.85it/s]
Processing Chunk 8/55: 100%|██████████| 100000/100000 [31:56<00:00, 52.17it/s]
Processing Chunk 9/55: 100%|██████████| 100000/100000 [32:00<00:00, 52.06it/s]
Processing Chunk 10/55: 100%|██████████| 100000/100000 [31:49<00:00, 52.36it/s]
Processing Chunk 11/55: 100%|██████████| 100000/100000 [32:00<00:00, 52.06it/s]
Processing Chunk 12/55: 100%|██████████| 100000/100000 [31:47<00:00, 52.42it/s]
Processing Chunk 13/55: 100%|██████████| 100000/1

In [8]:
save_path = '/content/drive/MyDrive/VCHAMPS - Train Cleaned-Mapped/immunizations'
# Define the chunk size
chunk_size = 100000

# Calculate the number of chunks
num_chunks = math.ceil(len(immunizations_df) / chunk_size)

# Create an empty list to store the encounter IDs
encounter_ids = []
# Iterate over chunks starting from chunk 60
for i in range(44, num_chunks):
    start_idx = i * chunk_size
    end_idx = (i + 1) * chunk_size

    # Get the chunk of dataframe
    chunk_df = immunizations_df[start_idx:end_idx]

    # Create an empty list to store the encounter IDs for the current chunk
    chunk_encounter_ids = []

    # Process the chunk and track progress using tqdm
    for _, row in tqdm(chunk_df.iterrows(), total=chunk_df.shape[0], desc=f"Processing Chunk {i+1}/{num_chunks}"):
        encounter_id = map_encounter_id_vectorized(row, 'Age at immunization', 'Immunization date')
        chunk_encounter_ids.append(encounter_id)

    # Create a new DataFrame with the chunk results
    chunk_results_df = chunk_df.copy()
    chunk_results_df['Encounter ID'] = chunk_encounter_ids

    # Save the results of the chunk to Parquet file
    chunk_results_df.to_parquet(f'{save_path}/immunizations{i+1}.parquet', index=False)

Processing Chunk 45/55: 100%|██████████| 100000/100000 [30:47<00:00, 54.12it/s]
Processing Chunk 46/55: 100%|██████████| 100000/100000 [31:39<00:00, 52.66it/s]
Processing Chunk 47/55: 100%|██████████| 100000/100000 [31:37<00:00, 52.70it/s]
Processing Chunk 48/55: 100%|██████████| 100000/100000 [31:57<00:00, 52.15it/s]
Processing Chunk 49/55: 100%|██████████| 100000/100000 [32:13<00:00, 51.73it/s]
Processing Chunk 50/55: 100%|██████████| 100000/100000 [32:31<00:00, 51.25it/s]
Processing Chunk 51/55: 100%|██████████| 100000/100000 [32:23<00:00, 51.45it/s]
Processing Chunk 52/55: 100%|██████████| 100000/100000 [32:20<00:00, 51.53it/s]
Processing Chunk 53/55: 100%|██████████| 100000/100000 [32:26<00:00, 51.36it/s]
Processing Chunk 54/55: 100%|██████████| 100000/100000 [32:27<00:00, 51.35it/s]
Processing Chunk 55/55: 100%|██████████| 49767/49767 [16:07<00:00, 51.45it/s]
