<a href="https://colab.research.google.com/github/victormurcia/VCHAMPS/blob/main/Mapping_Encounter_IDs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#General utilities
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm  # Import tqdm for the progress bar
import glob,shutil,os,warnings,math,time,sys,re
from typing import List
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

#For converting states to their abbreviations
#!pip install us
#import us

#For performing UTC normalization on datetime columns based on the STATE column
import pytz

#For Slider viz
import ipywidgets as widgets
from IPython.display import display, clear_output,HTML

#For EDA
#!pip install dataprep
#from dataprep.eda import create_report

#Enable data to be extracted and downloaded from my Google Drive
from google.colab import drive, files
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Folder on the mounted Google Drive that holds the cleaned training tables
directory_path = r'/content/drive/MyDrive/VCHAMPS - Train Cleaned'

# Make that folder the working directory so relative paths resolve against it
# (the cells visible in this notebook still pass absolute paths to every read and write)
os.chdir(directory_path)

# Read the working directory back so the change can be confirmed in the output
cwd = os.getcwd()

print(f"Current working directory: {cwd}")

Current working directory: /content/drive/MyDrive/VCHAMPS - Train Cleaned


# Generate Encounter IDs
Do this based on the inpatient_admissions, ed_visits, and outpatient_visits files.

I'll either use a Hash or a UUID to define the Encounter ID

In [3]:
import uuid
# Cache of already generated UUIDs, keyed by the string form of the row's key tuple
uuid_dict = {}

def generate_uuid(row, df_val):
    """Return a deterministic UUID string identifying one visit row.

    The UUID is a version-5 UUID in the ``uuid.NAMESPACE_OID`` namespace, derived
    from ``str()`` of a tuple holding the row's key columns followed by ``df_val``.
    Identical key values therefore always yield the same UUID. Results are also
    memoized in the module-level ``uuid_dict``.

    Args:
        row: Any object supporting ``row[column_name]`` (e.g. a DataFrame row).
        df_val: Selects which key columns are read:
            1 -> 'Internalpatientid', 'Ed visit start date', 'Discharge date ed'
            2 -> 'Internalpatientid', 'Admission date', 'Discharge date'
            3 -> 'Internalpatientid', 'Visit start date'

    Returns:
        The UUID as a string.

    Raises:
        ValueError: If ``df_val`` is not 1, 2 or 3.
    """
    key_columns = {
        1: ['Internalpatientid', 'Ed visit start date', 'Discharge date ed'],
        2: ['Internalpatientid', 'Admission date', 'Discharge date'],
        3: ['Internalpatientid', 'Visit start date'],
    }
    # Fail with a clear message instead of leaving the column list unbound
    if df_val not in key_columns:
        raise ValueError(f"Unsupported df_val {df_val!r}; expected 1, 2 or 3")

    unique_values = tuple(row[column] for column in key_columns[df_val])
    unique_values += (df_val,)  # Keeps identical keys from different tables distinct
    key = str(unique_values)

    # Reuse the UUID if this key has been seen before
    if key in uuid_dict:
        return uuid_dict[key]

    # Generate a new UUID and remember it
    new_uuid = str(uuid.uuid5(uuid.NAMESPACE_OID, key))
    uuid_dict[key] = new_uuid
    return new_uuid

In [5]:
import hashlib

def generate_hash_id(row, df_val):
    """Return a SHA-256 hex digest identifying one visit row.

    The digest is computed over the ``str()`` of each key column, concatenated in
    order, followed by ``str(df_val)``.

    Args:
        row: Any object supporting ``row[column_name]`` (e.g. a DataFrame row).
        df_val: Selects which key columns are read:
            1 -> 'Internalpatientid', 'Ed visit start date', 'Discharge date ed'
            2 -> 'Internalpatientid', 'Admission date', 'Discharge date'
            3 -> 'Internalpatientid', 'Visit start date'

    Returns:
        The 64-character hexadecimal digest string.

    Raises:
        ValueError: If ``df_val`` is not 1, 2 or 3.
    """
    key_columns = {
        1: ['Internalpatientid', 'Ed visit start date', 'Discharge date ed'],
        2: ['Internalpatientid', 'Admission date', 'Discharge date'],
        3: ['Internalpatientid', 'Visit start date'],
    }
    # Fail with a clear message instead of leaving the column list unbound
    if df_val not in key_columns:
        raise ValueError(f"Unsupported df_val {df_val!r}; expected 1, 2 or 3")

    # NOTE: the fields are joined without a separator, so different field
    # combinations can in principle produce the same input string. The format is
    # left unchanged so digests stay identical to any already generated.
    data = ''.join(str(row[column]) for column in key_columns[df_val])
    data += str(df_val)

    return hashlib.sha256(data.encode()).hexdigest()

In [6]:
# Open the three visit tables with Dask. read_parquet is lazy: nothing is loaded
# until a later cell calls .head() or .compute().
# Note: ed_visits and outpatient_visits are read through a '*.parquet' glob inside
# a directory, while inpatient_admissions is read from the path itself.
ed_visits_df            = dd.read_parquet('/content/drive/MyDrive/VCHAMPS - Train Cleaned/ed_visits.parquet/*.parquet')
inpatient_admissions_df = dd.read_parquet('/content/drive/MyDrive/VCHAMPS - Train Cleaned/inpatient_admissions.parquet')
outpatient_visits_df    = dd.read_parquet('/content/drive/MyDrive/VCHAMPS - Train Cleaned/outpatient_visits.parquet/*.parquet')

I decided to use UUIDs instead of hashes, since the UUIDs used far less memory.

In [7]:
# Add an 'Encounter ID' column to each visit table via a row-wise apply of generate_uuid.
# args=(N,) is the df_val that selects the key columns:
#   1 = ED visits, 2 = inpatient admissions, 3 = outpatient visits.
# meta=(None, 'object') tells Dask the apply yields an unnamed object-dtype Series.
ed_visits_df['Encounter ID'] = ed_visits_df.apply(generate_uuid, args=(1,), axis=1, meta=(None, 'object'))
inpatient_admissions_df['Encounter ID'] = inpatient_admissions_df.apply(generate_uuid, args=(2,), axis=1, meta=(None, 'object'))
outpatient_visits_df['Encounter ID'] = outpatient_visits_df.apply(generate_uuid, args=(3,), axis=1, meta=(None, 'object'))

In [8]:
# Preview the first ED visit rows. On a Dask frame, head() is what makes the lazy
# row-wise UUID apply actually run for the rows it returns.
ed_visits_df.head()

KeyboardInterrupt: ignored

In [None]:
# Preview the first inpatient admission rows, including the new 'Encounter ID' column
inpatient_admissions_df.head()

In [None]:
# Preview the first outpatient visit rows, including the new 'Encounter ID' column
outpatient_visits_df.head()

Now that each of the visits has a UUID associated with it, I can start mapping the other dataframes to them.

In [None]:
# Materialize the three Dask frames as in-memory pandas DataFrames. The names are
# rebound, so from here on these variables hold pandas objects, not Dask ones.
ed_visits_df = ed_visits_df.compute()
inpatient_admissions_df = inpatient_admissions_df.compute()
outpatient_visits_df = outpatient_visits_df.compute()

In [None]:
# Write the three visit tables, now carrying 'Encounter ID', to the "-Mapped" folder.
# After the compute() calls earlier in the notebook these are pandas DataFrames
# (not Dask), written with the pyarrow engine.
ed_visits_df.to_parquet('/content/drive/MyDrive/VCHAMPS - Train Cleaned-Mapped/ed_visits.parquet', engine='pyarrow')
inpatient_admissions_df.to_parquet('/content/drive/MyDrive/VCHAMPS - Train Cleaned-Mapped/inpatient_admissions.parquet', engine='pyarrow')
outpatient_visits_df.to_parquet('/content/drive/MyDrive/VCHAMPS - Train Cleaned-Mapped/outpatient_visits.parquet', engine='pyarrow')

In [None]:
# Display the ED visits table to check the 'Encounter ID' column
ed_visits_df

In [None]:
# Display the inpatient admissions table to check the 'Encounter ID' column
inpatient_admissions_df

In [None]:
# Display the outpatient visits table to check the 'Encounter ID' column
outpatient_visits_df

In [None]:
# Gather every Encounter ID already in use across the three visit tables
# (ED visits first, then inpatient admissions, then outpatient visits).
# Will be used to ensure uniqueness later.
encounter_ids = (
    ed_visits_df['Encounter ID'].tolist()
    + inpatient_admissions_df['Encounter ID'].tolist()
    + outpatient_visits_df['Encounter ID'].tolist()
)

In [None]:
def _encounters_covering(visits_df, patient_id, patient_age, date_to_match, age_col, start_col, end_col):
    """Return the rows of visits_df for this patient whose age is <= patient_age and whose [start, end] dates contain date_to_match."""
    # Filter for matching patient ID and age conditions
    same_patient = visits_df[(visits_df['Internalpatientid'] == patient_id) & (visits_df[age_col] <= patient_age)]
    # Keep only the visits whose date range contains date_to_match (bounds inclusive)
    return same_patient[(same_patient[start_col] <= date_to_match) & (same_patient[end_col] >= date_to_match)]


def map_encounter_id(row, age_col, date_col):
    """Find the Encounter ID of the visit that a record falls into.

    The visit tables are searched in a fixed order: ``ed_visits_df``, then
    ``inpatient_admissions_df``, then ``outpatient_visits_df`` (module-level
    DataFrames). A visit matches when it belongs to the same patient, its age
    column is <= the record's age, and the record's date lies between the visit's
    start and end dates (inclusive). The first matching row of the first table
    with a match wins.

    Args:
        row: Record to map; must support ``row['Internalpatientid']``,
            ``row[age_col]`` and ``row[date_col]``.
        age_col: Name of the record's age column.
        date_col: Name of the record's date column.

    Returns:
        The matching visit's 'Encounter ID', or a fresh random ``uuid.uuid4()``
        string when no table contains a match (so the fallback differs on each
        run).
    """
    patient_id    = row['Internalpatientid']
    patient_age   = row[age_col]
    date_to_match = row[date_col]

    # 1. Check for matches in the ed_visits first
    matches = _encounters_covering(ed_visits_df, patient_id, patient_age, date_to_match,
                                   'Age at ed visit', 'Ed visit start date', 'Discharge date ed')
    if len(matches) > 0:
        return matches['Encounter ID'].iloc[0]

    # 2. Check the inpatient admissions if no match is found in ed_visits
    matches = _encounters_covering(inpatient_admissions_df, patient_id, patient_age, date_to_match,
                                   'Age at admission', 'Admission date', 'Discharge date')
    if len(matches) > 0:
        return matches['Encounter ID'].iloc[0]

    # 3. Check the outpatient visits if no match is found in the inpatient admissions
    matches = _encounters_covering(outpatient_visits_df, patient_id, patient_age, date_to_match,
                                   'Age at visit', 'Visit start date', 'Visit End Date')
    if len(matches) > 0:
        return matches['Encounter ID'].iloc[0]

    # No match found anywhere: produce a unique Encounter ID
    return str(uuid.uuid4())

In [None]:
# Trial run of map_encounter_id on a small subset of the death table.
# NOTE(review): small_death_df is not defined in any cell visible in this notebook,
# so this cell would raise NameError on a fresh kernel — confirm how the subset was built.
small_death_df['Encounter ID'] = small_death_df.apply(lambda row: map_encounter_id(row, 'Age at death', 'Death date'), axis=1)

In [None]:
# I tested the function on a subset just to ensure proper operation; it works well and is fairly fast.
small_death_df

# Mapping Death DF

In [None]:
# Open the death table lazily with Dask
death_df = dd.read_parquet('/content/drive/MyDrive/VCHAMPS - Train Cleaned/death.parquet')

In [None]:
# Load the death table into memory; death_df is a pandas DataFrame from here on
death_df = death_df.compute()

In [None]:
# Map each death record to an encounter using its age and date of death
# (row-wise apply: one map_encounter_id call per row)
death_df['Encounter ID'] = death_df.apply(lambda row: map_encounter_id(row, 'Age at death', 'Death date'), axis=1)

In [None]:
# Save the mapped death table as Parquet (death_df is a pandas DataFrame after compute())
death_df.to_parquet('/content/drive/MyDrive/VCHAMPS - Train Cleaned-Mapped/death.parquet', engine='pyarrow')

# Mapping Inpatient Locations DF

In [None]:
# Read the inpatient location table with Dask, load it into memory as a pandas
# DataFrame, and list its columns to find the age/date columns used for mapping
inpatient_location_df = dd.read_parquet('/content/drive/MyDrive/VCHAMPS - Train Cleaned/inpatient_location.parquet')
inpatient_location_df = inpatient_location_df.compute()
inpatient_location_df.columns

In [None]:
# Map each location record to an encounter using its age and location start date
# (row-wise apply: one map_encounter_id call per row)
inpatient_location_df['Encounter ID'] = inpatient_location_df.apply(lambda row: map_encounter_id(row, 'Age at location', 'Location start date'), axis=1)

In [None]:
# Display the inpatient location table to check the 'Encounter ID' column
inpatient_location_df

In [None]:
# Save the mapped inpatient location table as Parquet
# (inpatient_location_df is a pandas DataFrame after compute())
inpatient_location_df.to_parquet('/content/drive/MyDrive/VCHAMPS - Train Cleaned-Mapped/inpatient_location.parquet', engine='pyarrow')

# Mapping Inpatient Specialty DF

In [None]:
# Read the inpatient specialty table with Dask, load it into memory as a pandas
# DataFrame, and list its columns to find the age/date columns used for mapping
inpatient_specialty_df = dd.read_parquet('/content/drive/MyDrive/VCHAMPS - Train Cleaned/inpatient_specialty.parquet')
inpatient_specialty_df = inpatient_specialty_df.compute()
inpatient_specialty_df.columns

In [None]:
# Map each specialty record to an encounter using its age and specialty start date
# (row-wise apply: one map_encounter_id call per row)
inpatient_specialty_df['Encounter ID'] = inpatient_specialty_df.apply(lambda row: map_encounter_id(row, 'Age at specialty', 'Specialty start date'), axis=1)

In [None]:
# Save the mapped inpatient specialty table as Parquet
# (inpatient_specialty_df is a pandas DataFrame after compute())
inpatient_specialty_df.to_parquet('/content/drive/MyDrive/VCHAMPS - Train Cleaned-Mapped/inpatient_specialty.parquet', engine='pyarrow')

# Session Disconnected... Will continue from here...

In [4]:
# Reload the already-mapped visit tables from the "-Mapped" folder and pull each one
# straight into memory as a pandas DataFrame (Dask read followed by compute()).
ed_visits_df = dd.read_parquet(
    '/content/drive/MyDrive/VCHAMPS - Train Cleaned-Mapped/ed_visits.parquet'
).compute()
inpatient_admissions_df = dd.read_parquet(
    '/content/drive/MyDrive/VCHAMPS - Train Cleaned-Mapped/inpatient_admissions.parquet'
).compute()
outpatient_visits_df = dd.read_parquet(
    '/content/drive/MyDrive/VCHAMPS - Train Cleaned-Mapped/outpatient_visits.parquet'
).compute()

# Optimizing Mapping via Vectorization

In [5]:
def map_encounter_id_vectorized(row, age_col, date_col):
    """Return the Encounter ID of the visit that contains the given record.

    Looks through the module-level ``ed_visits_df``, ``inpatient_admissions_df``
    and ``outpatient_visits_df`` in that order. Within a table, a visit matches
    when it belongs to the record's patient, its start date is <= the record's
    date, its end date is >= the record's date, and its age column is <= the
    record's age. The first matching row of the first table with a match is used.

    Each call still handles a single record and filters whole tables by patient
    id, so the cost grows with the number of rows mapped.

    Args:
        row: Record supporting ``row['Internalpatientid']``, ``row[age_col]``
            and ``row[date_col]``.
        age_col: Name of the record's age column.
        date_col: Name of the record's date column.

    Returns:
        The matching 'Encounter ID', or a new random ``uuid.uuid4()`` string if
        no visit matches.
    """
    pid = row['Internalpatientid']
    age = row[age_col]
    when = row[date_col]

    def first_covering(visits, start_col, end_col, visit_age_col):
        """Return (found, encounter_id) for the patient's first visit covering the record."""
        own_visits = visits[visits['Internalpatientid'] == pid]
        covers = (
            (own_visits[start_col] <= when)
            & (own_visits[end_col] >= when)
            & (own_visits[visit_age_col] <= age)
        )
        if covers.any():
            return True, own_visits.loc[covers, 'Encounter ID'].iloc[0]
        return False, None

    # ED visits take priority
    found, encounter_id = first_covering(ed_visits_df, 'Ed visit start date', 'Discharge date ed', 'Age at ed visit')
    if found:
        return encounter_id

    # Then inpatient admissions
    found, encounter_id = first_covering(inpatient_admissions_df, 'Admission date', 'Discharge date', 'Age at admission')
    if found:
        return encounter_id

    # Then outpatient visits
    found, encounter_id = first_covering(outpatient_visits_df, 'Visit start date', 'Visit End Date', 'Age at visit')
    if found:
        return encounter_id

    # Nothing matched: mint a brand-new identifier
    return str(uuid.uuid4())

# Mapping Measurements DF

In [6]:
# Read every measurements partition with Dask, load the result into memory as one
# pandas DataFrame, and list its columns
measurements_df = dd.read_parquet('/content/drive/MyDrive/VCHAMPS - Train Cleaned/measurements.parquet/*.parquet')
measurements_df = measurements_df.compute()
measurements_df.columns

Index(['Internalpatientid', 'Age at measurement', 'Measurement date',
       'Measurement', 'Result numeric'],
      dtype='object')

In [7]:
# Display the measurements table (the saved output shows repeated index labels)
measurements_df

Unnamed: 0,Internalpatientid,Age at measurement,Measurement date,Measurement,Result numeric
0,1,59,2003-05-21 00:27:01,Temperature,95.804066
0,9713,68,2017-08-17 18:10:50,Pulse oximetry,88.000000
0,97124,72,2019-02-09 17:56:52,Pulse,66.000000
0,107718,63,2020-12-16 02:10:45,Temperature,95.979629
0,96334,72,2016-01-09 06:23:49,Height,68.793928
...,...,...,...,...,...
783886,101271,66,2011-05-23 02:24:16,Pain,0.000000
783887,101271,66,2011-05-23 02:24:16,Weight,180.451035
783889,101271,67,2012-03-29 18:28:09,Respiratory rate,18.000000
783890,101271,67,2012-03-29 18:28:09,Pain,0.000000


In [13]:
# I'll sample 10M rows of the dataframe. This should take ~20 hours to map.
# random_state=42 makes the sample reproducible; the index is reset to 0..n-1.
# NOTE(review): the chunked mapping loops later in this notebook iterate over
# measurements_df, not this sample — confirm which one was intended.
sampled_measurements_df = measurements_df.sample(n=10000000, random_state=42)
sampled_measurements_df = sampled_measurements_df.reset_index(drop=True)
sampled_measurements_df

Unnamed: 0,Internalpatientid,Age at measurement,Measurement date,Measurement,Result numeric
0,127057,63,2011-10-09 22:49:08,Pulse,75.000000
1,160816,95,2005-09-22 21:00:15,Pulse,66.000000
2,120458,77,2008-08-19 11:23:04,Temperature,97.282646
3,142658,66,2006-11-26 11:12:16,Pulse oximetry,98.000000
4,34927,64,2011-11-12 00:12:58,Pulse,58.000000
...,...,...,...,...,...
9999995,78496,86,2016-05-15 11:16:54,Pulse,71.000000
9999996,122619,58,2007-12-24 18:18:57,Respiratory rate,18.000000
9999997,107225,82,2002-08-14 17:44:17,Temperature,95.925387
9999998,2727,54,2008-09-12 08:23:02,Pain,3.000000


In [None]:
save_path = '/content/drive/MyDrive/VCHAMPS - Train Cleaned-Mapped/measurements'
# Make sure the output folder exists before the first (slow) chunk tries to write into it
os.makedirs(save_path, exist_ok=True)

# Define the chunk size
chunk_size = 100000

# Calculate the number of chunks
num_chunks = math.ceil(len(measurements_df) / chunk_size)

# Iterate over chunks; each chunk is mapped and saved on its own so finished work
# survives a session disconnect
for i in range(num_chunks):
    start_idx = i * chunk_size
    end_idx = (i + 1) * chunk_size

    # Get the chunk of dataframe (positional row slice)
    chunk_df = measurements_df[start_idx:end_idx]

    # Encounter IDs for this chunk only. Keeping the list per chunk avoids holding the
    # IDs of every processed row in memory and avoids re-slicing a global list.
    chunk_encounter_ids = []

    # Process the chunk and track progress using tqdm
    for _, row in tqdm(chunk_df.iterrows(), total=chunk_df.shape[0], desc=f"Processing Chunk {i+1}/{num_chunks}"):
        encounter_id = map_encounter_id_vectorized(row, 'Age at measurement', 'Measurement date')
        chunk_encounter_ids.append(encounter_id)

    # Create a new DataFrame with the chunk results
    chunk_results_df = chunk_df.copy()
    chunk_results_df['Encounter ID'] = chunk_encounter_ids

    # Save the results of the chunk to Parquet file (file names are 1-based)
    chunk_results_df.to_parquet(f'{save_path}/measurements{i+1}.parquet', index=False)

Processing Chunk 1/1024: 100%|██████████| 100000/100000 [19:04<00:00, 87.34it/s]
Processing Chunk 2/1024: 100%|██████████| 100000/100000 [18:44<00:00, 88.96it/s]
Processing Chunk 3/1024: 100%|██████████| 100000/100000 [18:48<00:00, 88.60it/s]
Processing Chunk 4/1024: 100%|██████████| 100000/100000 [19:51<00:00, 83.90it/s]
Processing Chunk 5/1024: 100%|██████████| 100000/100000 [19:23<00:00, 85.94it/s]
Processing Chunk 6/1024: 100%|██████████| 100000/100000 [19:09<00:00, 86.98it/s]
Processing Chunk 7/1024: 100%|██████████| 100000/100000 [19:12<00:00, 86.79it/s]
Processing Chunk 8/1024: 100%|██████████| 100000/100000 [18:54<00:00, 88.15it/s]
Processing Chunk 9/1024: 100%|██████████| 100000/100000 [19:18<00:00, 86.31it/s]
Processing Chunk 10/1024: 100%|██████████| 100000/100000 [18:37<00:00, 89.49it/s]
Processing Chunk 11/1024: 100%|██████████| 100000/100000 [19:26<00:00, 85.70it/s]
Processing Chunk 12/1024: 100%|██████████| 100000/100000 [18:55<00:00, 88.07it/s]
Processing Chunk 13/1024:

Hmmm let me pivot the table first

In [None]:
save_path = '/content/drive/MyDrive/VCHAMPS - Train Cleaned-Mapped/measurements'
# Make sure the output folder exists before a (slow) chunk tries to write into it
os.makedirs(save_path, exist_ok=True)

# Define the chunk size
chunk_size = 100000

# Calculate the number of chunks
num_chunks = math.ceil(len(measurements_df) / chunk_size)

# 0-based index of the first chunk to process when resuming an interrupted run.
# Progress labels and file names use i + 1, so 73 resumes at "Chunk 74" / measurements74.parquet.
start_chunk = 73

# Iterate over the remaining chunks
for i in range(start_chunk, num_chunks):
    start_idx = i * chunk_size
    end_idx = (i + 1) * chunk_size

    # Get the chunk of dataframe (positional row slice)
    chunk_df = measurements_df[start_idx:end_idx]

    # Create an empty list to store the encounter IDs for the current chunk
    chunk_encounter_ids = []

    # Process the chunk and track progress using tqdm
    for _, row in tqdm(chunk_df.iterrows(), total=chunk_df.shape[0], desc=f"Processing Chunk {i+1}/{num_chunks}"):
        encounter_id = map_encounter_id_vectorized(row, 'Age at measurement', 'Measurement date')
        chunk_encounter_ids.append(encounter_id)

    # Create a new DataFrame with the chunk results
    chunk_results_df = chunk_df.copy()
    chunk_results_df['Encounter ID'] = chunk_encounter_ids

    # Save the results of the chunk to Parquet file (file names are 1-based)
    chunk_results_df.to_parquet(f'{save_path}/measurements{i+1}.parquet', index=False)

Processing Chunk 74/1024: 100%|██████████| 100000/100000 [21:32<00:00, 77.38it/s]
Processing Chunk 75/1024: 100%|██████████| 100000/100000 [21:46<00:00, 76.53it/s]
Processing Chunk 76/1024: 100%|██████████| 100000/100000 [21:45<00:00, 76.59it/s]
Processing Chunk 77/1024: 100%|██████████| 100000/100000 [21:19<00:00, 78.13it/s]
Processing Chunk 78/1024: 100%|██████████| 100000/100000 [21:19<00:00, 78.16it/s]
Processing Chunk 79/1024: 100%|██████████| 100000/100000 [21:20<00:00, 78.12it/s]
Processing Chunk 80/1024: 100%|██████████| 100000/100000 [21:05<00:00, 79.05it/s]
Processing Chunk 81/1024: 100%|██████████| 100000/100000 [20:52<00:00, 79.81it/s]
Processing Chunk 82/1024: 100%|██████████| 100000/100000 [21:42<00:00, 76.75it/s]
Processing Chunk 83/1024: 100%|██████████| 100000/100000 [21:11<00:00, 78.62it/s]
Processing Chunk 84/1024: 100%|██████████| 100000/100000 [20:57<00:00, 79.53it/s]
Processing Chunk 85/1024: 100%|██████████| 100000/100000 [21:28<00:00, 77.61it/s]
Processing Chunk

In [None]:
save_path = '/content/drive/MyDrive/VCHAMPS - Train Cleaned-Mapped/measurements'
# Make sure the output folder exists before a (slow) chunk tries to write into it
os.makedirs(save_path, exist_ok=True)

# Define the chunk size
chunk_size = 100000

# Calculate the number of chunks
num_chunks = math.ceil(len(measurements_df) / chunk_size)

# 0-based index of the first chunk to process when resuming an interrupted run.
# Progress labels and file names use i + 1, so 140 resumes at "Chunk 141" / measurements141.parquet.
start_chunk = 140

# Iterate over the remaining chunks
for i in range(start_chunk, num_chunks):
    start_idx = i * chunk_size
    end_idx = (i + 1) * chunk_size

    # Get the chunk of dataframe (positional row slice)
    chunk_df = measurements_df[start_idx:end_idx]

    # Create an empty list to store the encounter IDs for the current chunk
    chunk_encounter_ids = []

    # Process the chunk and track progress using tqdm
    for _, row in tqdm(chunk_df.iterrows(), total=chunk_df.shape[0], desc=f"Processing Chunk {i+1}/{num_chunks}"):
        encounter_id = map_encounter_id_vectorized(row, 'Age at measurement', 'Measurement date')
        chunk_encounter_ids.append(encounter_id)

    # Create a new DataFrame with the chunk results
    chunk_results_df = chunk_df.copy()
    chunk_results_df['Encounter ID'] = chunk_encounter_ids

    # Save the results of the chunk to Parquet file (file names are 1-based)
    chunk_results_df.to_parquet(f'{save_path}/measurements{i+1}.parquet', index=False)

Processing Chunk 141/1024: 100%|██████████| 100000/100000 [24:03<00:00, 69.30it/s]
Processing Chunk 142/1024: 100%|██████████| 100000/100000 [23:30<00:00, 70.90it/s]
Processing Chunk 143/1024: 100%|██████████| 100000/100000 [24:01<00:00, 69.38it/s]
Processing Chunk 144/1024: 100%|██████████| 100000/100000 [24:29<00:00, 68.06it/s]
Processing Chunk 145/1024: 100%|██████████| 100000/100000 [23:15<00:00, 71.65it/s]
Processing Chunk 146/1024: 100%|██████████| 100000/100000 [23:13<00:00, 71.79it/s]
Processing Chunk 147/1024: 100%|██████████| 100000/100000 [23:53<00:00, 69.78it/s]
Processing Chunk 148/1024: 100%|██████████| 100000/100000 [23:43<00:00, 70.23it/s]
Processing Chunk 149/1024: 100%|██████████| 100000/100000 [23:35<00:00, 70.64it/s]
Processing Chunk 150/1024: 100%|██████████| 100000/100000 [23:32<00:00, 70.82it/s]
Processing Chunk 151/1024: 100%|██████████| 100000/100000 [23:50<00:00, 69.92it/s]
Processing Chunk 152/1024: 100%|██████████| 100000/100000 [23:45<00:00, 70.13it/s]
Proc

In [None]:
save_path = '/content/drive/MyDrive/VCHAMPS - Train Cleaned-Mapped/measurements'
# Make sure the output folder exists before a (slow) chunk tries to write into it
os.makedirs(save_path, exist_ok=True)

# Define the chunk size
chunk_size = 100000

# Calculate the number of chunks
num_chunks = math.ceil(len(measurements_df) / chunk_size)

# 0-based index of the first chunk to process when resuming an interrupted run.
# Progress labels and file names use i + 1, so 200 resumes at "Chunk 201" / measurements201.parquet.
start_chunk = 200

# Iterate over the remaining chunks
for i in range(start_chunk, num_chunks):
    start_idx = i * chunk_size
    end_idx = (i + 1) * chunk_size

    # Get the chunk of dataframe (positional row slice)
    chunk_df = measurements_df[start_idx:end_idx]

    # Create an empty list to store the encounter IDs for the current chunk
    chunk_encounter_ids = []

    # Process the chunk and track progress using tqdm
    for _, row in tqdm(chunk_df.iterrows(), total=chunk_df.shape[0], desc=f"Processing Chunk {i+1}/{num_chunks}"):
        encounter_id = map_encounter_id_vectorized(row, 'Age at measurement', 'Measurement date')
        chunk_encounter_ids.append(encounter_id)

    # Create a new DataFrame with the chunk results
    chunk_results_df = chunk_df.copy()
    chunk_results_df['Encounter ID'] = chunk_encounter_ids

    # Save the results of the chunk to Parquet file (file names are 1-based)
    chunk_results_df.to_parquet(f'{save_path}/measurements{i+1}.parquet', index=False)

Processing Chunk 201/1024: 100%|██████████| 100000/100000 [19:57<00:00, 83.53it/s]
Processing Chunk 202/1024: 100%|██████████| 100000/100000 [19:45<00:00, 84.33it/s]
Processing Chunk 203/1024: 100%|██████████| 100000/100000 [19:32<00:00, 85.29it/s]
Processing Chunk 204/1024: 100%|██████████| 100000/100000 [19:38<00:00, 84.86it/s]
Processing Chunk 205/1024: 100%|██████████| 100000/100000 [19:16<00:00, 86.48it/s]
Processing Chunk 206/1024: 100%|██████████| 100000/100000 [19:11<00:00, 86.88it/s]
Processing Chunk 207/1024: 100%|██████████| 100000/100000 [19:24<00:00, 85.90it/s]
Processing Chunk 208/1024: 100%|██████████| 100000/100000 [19:41<00:00, 84.62it/s]
Processing Chunk 209/1024: 100%|██████████| 100000/100000 [19:28<00:00, 85.55it/s]
Processing Chunk 210/1024: 100%|██████████| 100000/100000 [19:13<00:00, 86.66it/s]
Processing Chunk 211/1024: 100%|██████████| 100000/100000 [19:54<00:00, 83.70it/s]
Processing Chunk 212/1024: 100%|██████████| 100000/100000 [20:10<00:00, 82.61it/s]
Proc

In [None]:
#measurements_df['Encounter ID'] = measurements_df.apply(lambda row: map_encounter_id(row, 'Age at measurement', 'Measurement date'), axis=1)

In [None]:
# Save measurements_df as Parquet in the "-Mapped" folder (it is a pandas DataFrame after compute()).
# NOTE(review): in the cells visible here, 'Encounter ID' is only added to the per-chunk
# copies saved by the chunk loops, and the apply that would add it to measurements_df is
# commented out — so this appears to write the unmapped table. Confirm this is intended.
measurements_df.to_parquet('/content/drive/MyDrive/VCHAMPS - Train Cleaned-Mapped/measurements.parquet', engine='pyarrow')