In [1]:
#General utilities
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
from matplotlib.lines import Line2D
import seaborn as sns
from tqdm import tqdm  # Import tqdm for the progress bar
import glob,shutil,os,warnings,math
from typing import List
import dask.dataframe as dd

#For Slider viz
import ipywidgets as widgets
from IPython.display import display, clear_output,HTML

#DataPrep for Quick EDA
from ydata_profiling import ProfileReport

#For ML model stuff
from sklearn.datasets import load_iris,load_breast_cancer
from sklearn.model_selection import learning_curve, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_curve, roc_auc_score, confusion_matrix, log_loss
)
from sklearn.preprocessing import label_binarize
from sklearn.exceptions import ConvergenceWarning

## Loading the data
The data was provided to us in .csv files. Some of these files are rather large, however (file size > 30 GB). This is a problem for pandas, since we won't have enough memory to load the data or run any operations on it. I could try limiting the number of rows we load initially, or load the data in chunks. However, I think it'll be more advantageous to do the following:
* Convert the .csv files into .parquet files
* Use dask instead of pandas whenever possible
* Convert as many attributes with dtype=object as possible into something more specific. For instance, the dates included in the dataset should be converted to datetime objects.
* Convert as much numerical data as possible from int64 to the smallest integer type that does not compromise precision

These improvements should not only make much more efficient use of memory, but should also speed up all subsequent queries.

The cell below contains functions to create pandas dataframes from the original .csv files. I'll do that just to create comparisons between the processes. 

In [58]:
def load_csvs(path2data: str) -> List[str]:
  """
  List the CSV files found directly inside a directory.

  Despite the name, no file content is read here: only the paths are collected.

  Args:
      path2data (str): The directory path containing the CSV files.

  Returns:
      List[str]: Paths of every ``*.csv`` entry in ``path2data``, sorted
      alphabetically. The list is empty when nothing matches.

  """
  # glob returns matches in arbitrary order; sorting keeps anything that is
  # later indexed by position (data frames, cleaned names) stable between runs.
  return sorted(glob.glob(os.path.join(path2data, '*.csv')))

def make_df_list(csv_files: List[str]) -> List[pd.DataFrame]:
  """
  Parse each CSV path into its own pandas DataFrame.

  Args:
      csv_files (List[str]): A list of CSV file paths.

  Returns:
      List[pd.DataFrame]: One DataFrame per input path, in the same order
      as ``csv_files``.

  """
  # Every file is read with pandas' default CSV options.
  return [pd.read_csv(csv_path) for csv_path in csv_files]

def clean_filenames(csv_files: List[str]) -> List[str]:
  """
  Reduce each path to its bare file name: no directory part, no extension.

  Args:
      csv_files (List[str]): A list of CSV file paths.

  Returns:
      List[str]: The base name of every path with its final extension
      removed, in the same order as ``csv_files``.

  """
  # basename drops the directory part, splitext()[0] drops the final extension
  return [
      os.path.splitext(os.path.basename(file_path))[0]
      for file_path in csv_files
  ]

# Directory holding the quality-check CSV files; it is also the data location
directory_path = r'D:\VCHAMPS Data\Quality Check'
path2data = directory_path

# Make it the working directory and report where we ended up
os.chdir(directory_path)
current_directory = os.getcwd()
print(f"Current working directory: {current_directory}")

# Collect the paths of the .csv files in the data location
csv_files = load_csvs(path2data)
print(csv_files)

# Read every CSV into its own pandas DataFrame
df_list_csv = make_df_list(csv_files)

# Bare file names (no directory, no .csv extension), in the same order as csv_files
csv_file_names = clean_filenames(csv_files)

Current working directory: D:\VCHAMPS Data\Quality Check
['D:\\VCHAMPS Data\\Quality Check\\conditions_qual.csv', 'D:\\VCHAMPS Data\\Quality Check\\demographics_event_qual.csv', 'D:\\VCHAMPS Data\\Quality Check\\demographics_static_qual.csv', 'D:\\VCHAMPS Data\\Quality Check\\ed_visits_qual.csv', 'D:\\VCHAMPS Data\\Quality Check\\immunization_qual.csv', 'D:\\VCHAMPS Data\\Quality Check\\inpatient_admissions_qual.csv', 'D:\\VCHAMPS Data\\Quality Check\\inpatient_location_qual.csv', 'D:\\VCHAMPS Data\\Quality Check\\inpatient_specialty_qual.csv', 'D:\\VCHAMPS Data\\Quality Check\\lab_results_qual.csv', 'D:\\VCHAMPS Data\\Quality Check\\measurements_blood_pressure_qual.csv', 'D:\\VCHAMPS Data\\Quality Check\\measurements_qual.csv', 'D:\\VCHAMPS Data\\Quality Check\\medications_administered_qual.csv', 'D:\\VCHAMPS Data\\Quality Check\\medications_ordered_qual.csv', 'D:\\VCHAMPS Data\\Quality Check\\outpatient_visits_qual.csv', 'D:\\VCHAMPS Data\\Quality Check\\procedures_qual.csv']


The next cell will show how to convert a .csv file into a .parquet file using dask. I also compare the file size difference between a .csv and .parquet file. The differences are striking.

In [99]:
def _parquet_output_size(parquet_path):
    """Return the on-disk size, in bytes, of the Parquet output at ``parquet_path``."""
    if os.path.isdir(parquet_path):
        # Directory output: add up the .parquet part files and ignore anything else in there
        return sum(
            os.path.getsize(os.path.join(parquet_path, part))
            for part in os.listdir(parquet_path)
            if part.endswith('.parquet')
        )
    return os.path.getsize(parquet_path)


def convert_csv_to_parquet(current_directory):
    """
    Convert every CSV file in a directory to Parquet with dask and record the sizes.

    Each ``<name>.csv`` file found directly in ``current_directory`` is read with
    ``dd.read_csv`` and written next to it as ``<name>.parquet``. The size of each
    output is measured right after it is written, so a row can never pick up the
    size of another dataset or of a leftover ``.parquet`` directory.

    Args:
        current_directory (str): Directory that holds the CSV files and receives
            the Parquet output.

    Returns:
        pd.DataFrame: One row per CSV file, in alphabetical order, with columns
        'Filename' (file name without the .csv extension), 'CSV Size' and
        'Parquet Size' (both in bytes). The frame is empty, with the same three
        columns, when the directory contains no CSV files.
    """
    # Columns read as plain objects instead of letting dask infer a type.
    # NOTE(review): presumably these hold mixed or mostly-empty values that break
    # dask's sample-based inference - confirm against the raw files.
    column_dtypes = {
        'Administered elsewhere': 'object',
        'Dose unit': 'object',
        'Result textual': 'object',
        'Administration end date': 'object',
        'Agentorangeflag': 'object',
        'Combatflag': 'object',
        'Ionizingradiationflag': 'object',
        'Swasiaconditionsflag': 'object',
        'Procedure code': 'object'
    }

    # Sorted so the report has the same row order on every run and platform
    csv_files = sorted(
        entry for entry in os.listdir(current_directory)
        if entry.endswith('.csv') and os.path.isfile(os.path.join(current_directory, entry))
    )

    records = []
    for csv_file in csv_files:
        # splitext removes only the trailing extension; str.replace would also
        # strip a '.csv' appearing in the middle of the name
        base_name = os.path.splitext(csv_file)[0]
        input_file_path = os.path.join(current_directory, csv_file)
        output_file_path = os.path.join(current_directory, f"{base_name}.parquet")

        # Read the CSV file as a Dask DataFrame and write it out as Parquet
        df = dd.read_csv(input_file_path, dtype=column_dtypes)
        df.to_parquet(output_file_path, engine='pyarrow', write_index=False)

        records.append({
            'Filename': base_name,
            'CSV Size': os.path.getsize(input_file_path),
            'Parquet Size': _parquet_output_size(output_file_path),
        })

    print("CSV to Parquet conversion completed.")

    # Build the frame once instead of concatenating inside the loop
    return pd.DataFrame(records, columns=['Filename', 'CSV Size', 'Parquet Size'])

def calculate_reduction(file_sizes):
    """
    Summarise CSV vs. Parquet sizes in megabytes with the percentage saved.

    The input frame is left untouched; a new frame is returned.

    Args:
        file_sizes (pd.DataFrame): Frame with columns 'Filename', 'CSV Size'
            and 'Parquet Size', the two sizes being in bytes. A missing
            Parquet size (None/NaN) is allowed.

    Returns:
        pd.DataFrame: Same rows and index as ``file_sizes`` with columns
        'Filename', 'CSV Size [MB]', 'Parquet Size [MB]' and '%Reduction'.
        Rows with a missing size get NaN in the derived columns.
    """
    bytes_per_mb = 1024 * 1024

    # The size columns may arrive as object dtype (e.g. holding None placeholders);
    # coerce them so the arithmetic is numeric and missing values become NaN.
    csv_bytes = pd.to_numeric(file_sizes['CSV Size'])
    parquet_bytes = pd.to_numeric(file_sizes['Parquet Size'])

    return pd.DataFrame({
        'Filename': file_sizes['Filename'],
        'CSV Size [MB]': csv_bytes / bytes_per_mb,
        'Parquet Size [MB]': parquet_bytes / bytes_per_mb,
        '%Reduction': ((csv_bytes - parquet_bytes) / csv_bytes) * 100,
    })

# Convert the CSVs in the current directory to Parquet, then summarise the
# sizes in MB together with the percentage reduction
file_sizes = calculate_reduction(convert_csv_to_parquet(current_directory))
file_sizes

CSV to Parquet conversion completed.


Unnamed: 0,Filename,CSV Size [MB],Parquet Size [MB],%Reduction
0,conditions_qual,72.757038,13.019831,82.105057
1,demographics_event_qual,0.065379,0.037816,42.158851
2,demographics_static_qual,0.053,0.01626,69.320738
3,ed_visits_qual,0.40361,0.144735,64.139825
4,immunization_qual,1.707275,0.43032,74.79494
5,inpatient_admissions,1.047332,0.259045,75.266229
6,inpatient_location_qual,0.712869,0.338958,52.451582
7,inpatient_specialty_qual,1.157339,0.493114,57.392477
8,lab_results_qual,249.419017,48.029166,80.743583
9,measurements_blood_pressure_qual,14.068669,5.488777,60.985812


## Visualizing the Dataframes
I made a widget earlier to rapidly see the pandas dataframes. I've adapted it so that it can be used with dask dataframes and the new file structure as well.

In [104]:
# Find directories that end with .parquet (sorted so slider positions are stable)
parquet_directories = sorted(
    entry for entry in os.listdir(current_directory)
    if entry.endswith('.parquet') and os.path.isdir(os.path.join(current_directory, entry))
)

# Index of the dataset currently shown. It is initialised here so the cell also
# runs on a fresh kernel instead of relying on a leftover `val`.
val = 0

# Create a slider widget to control the value of val
slider = widgets.IntSlider(
    min=0,
    max=max(len(parquet_directories) - 1, 0),  # never below min, even with no datasets
    step=1,
    description='Select Value:',
    value=val,
    layout={'width': '400px'}
)

# Create an output widget to display the DataFrame
output = widgets.Output()

def update_dataframe(value):
    """
    Show the Parquet dataset at position ``value`` of ``parquet_directories``.

    Stores ``value`` in the module-level ``val``, reads the dataset with dask and
    prints its name, row count and column count followed by the first rows. Prints
    a short notice instead when no Parquet directory was found.

    Args:
        value (int): Index into ``parquet_directories``.
    """
    global val
    val = value  # Update the value of val

    with output:
        clear_output(wait=True)
        if not parquet_directories:
            print("No .parquet directories found in:", current_directory)
            return

        parquet_directory = parquet_directories[val]
        # Build the path from the directory that was listed above rather than
        # from whatever the working directory happens to be.
        df = dd.read_parquet(os.path.join(current_directory, parquet_directory, '*.parquet'))

        print("File Name:", parquet_directory)
        print("Number of rows:", len(df))
        print("Number of columns:", len(df.columns))
        display(df.head())

# Call the update_dataframe function to initialize the Dask DataFrame
update_dataframe(val)

def slider_observer(change):
    """Refresh the displayed dataset with the slider's new value."""
    update_dataframe(change.new)

# Add the observer to the slider widget
slider.observe(slider_observer, names='value')

# Display the slider widget and the output widget
display(slider)
display(output)

IntSlider(value=0, description='Select Value:', layout=Layout(width='400px'), max=14)

Output()

## Load the Training Data
Now that I have these routines in place, I'm going to attempt to load the provided training data.

In [106]:
# Specify the path to the desired directory
directory_path = r'D:\VCHAMPS Data\Train Data'

# Change the current working directory to the desired directory
os.chdir(directory_path)

# Verify the current working directory
current_directory = os.getcwd()
print(f"Current working directory: {current_directory}")

# Define data location. It is derived from directory_path so it cannot keep
# pointing at the previously used folder while this cell works on the training data.
path2data = directory_path
# List the .csv files found in the data location
csv_files = load_csvs(path2data)
print(csv_files)

# Convert the training CSVs to Parquet and summarise the file sizes
file_sizes = convert_csv_to_parquet(current_directory)
file_sizes = calculate_reduction(file_sizes)
file_sizes

Current working directory: D:\VCHAMPS Data\Train Data
['D:\\VCHAMPS Data\\Train Data\\conditions_train.csv', 'D:\\VCHAMPS Data\\Train Data\\death_train.csv', 'D:\\VCHAMPS Data\\Train Data\\demographics_event_train.csv', 'D:\\VCHAMPS Data\\Train Data\\demographics_static_train.csv', 'D:\\VCHAMPS Data\\Train Data\\ed_visits_train.csv', 'D:\\VCHAMPS Data\\Train Data\\immunization_train.csv', 'D:\\VCHAMPS Data\\Train Data\\inpatient_admissions_train.csv', 'D:\\VCHAMPS Data\\Train Data\\inpatient_location_train.csv', 'D:\\VCHAMPS Data\\Train Data\\inpatient_specialty_train.csv', 'D:\\VCHAMPS Data\\Train Data\\lab_results_train.csv', 'D:\\VCHAMPS Data\\Train Data\\measurements_blood_pressure_train.csv', 'D:\\VCHAMPS Data\\Train Data\\measurements_train.csv', 'D:\\VCHAMPS Data\\Train Data\\medications_administered_train.csv', 'D:\\VCHAMPS Data\\Train Data\\medications_ordered_train.csv', 'D:\\VCHAMPS Data\\Train Data\\outpatient_visits_train.csv', 'D:\\VCHAMPS Data\\Train Data\\procedures_tra

Unnamed: 0,Filename,CSV Size [MB],Parquet Size [MB],%Reduction
0,conditions_train,9830.91395,1789.354671,81.798695
1,death_train,4.944784,3.228118,34.716707
2,demographics_event_train,8.691981,4.468786,48.587254
3,demographics_static_train,7.080438,1.652361,76.663012
4,ed_visits_train,69.029263,21.160005,69.346327
5,immunization_train,232.495247,51.745118,77.74358
6,inpatient_admissions_train,124.159206,24.052683,80.627548
7,inpatient_location_train,78.85446,31.519588,60.028147
8,inpatient_specialty_train,147.459064,54.221644,63.229358
9,lab_results_train,32755.922693,6416.455462,80.411312


In [110]:
# Total sizes in MB, taken from the summary table in `file_sizes`. They are
# computed here so the cell does not depend on variables from cells that are
# no longer part of the notebook.
csv_size_sum = file_sizes['CSV Size [MB]'].sum()
parquet_size_sum = file_sizes['Parquet Size [MB]'].sum()

# Convert the sum of sizes from MB to GB
csv_size_sum_gb = csv_size_sum / 1024
parquet_size_sum_gb = parquet_size_sum / 1024

# Calculate the percent reduction in GB
percent_reduction_gb = ((csv_size_sum_gb - parquet_size_sum_gb) / csv_size_sum_gb) * 100

print("Sum of CSV Size [GB]:", csv_size_sum_gb)
print("Sum of Parquet Size [GB]:", parquet_size_sum_gb)
print("Percent Reduction [GB]: {:.2f}%".format(percent_reduction_gb))

Sum of CSV Size [GB]: 107.25956689100713
Sum of Parquet Size [GB]: 21.204981152899563
Percent Reduction [GB]: 80.23%


In [112]:
# Find directories that end with .parquet (sorted so slider positions are stable)
parquet_directories = sorted(
    entry for entry in os.listdir(current_directory)
    if entry.endswith('.parquet') and os.path.isdir(os.path.join(current_directory, entry))
)

# Index of the dataset currently shown. It is reset here so the cell neither
# needs a pre-existing `val` nor inherits one that is out of range for this listing.
val = 0

# Create a slider widget to control the value of val
slider = widgets.IntSlider(
    min=0,
    max=max(len(parquet_directories) - 1, 0),  # never below min, even with no datasets
    step=1,
    description='Select Value:',
    value=val,
    layout={'width': '400px'}
)

# Create an output widget to display the DataFrame
output = widgets.Output()

def update_dataframe(value):
    """
    Show the Parquet dataset at position ``value`` of ``parquet_directories``.

    Stores ``value`` in the module-level ``val``, reads the dataset with dask and
    prints its name, row count and column count followed by the first rows. Prints
    a short notice instead when no Parquet directory was found.

    Args:
        value (int): Index into ``parquet_directories``.
    """
    global val
    val = value  # Update the value of val

    with output:
        clear_output(wait=True)
        if not parquet_directories:
            print("No .parquet directories found in:", current_directory)
            return

        parquet_directory = parquet_directories[val]
        # Build the path from the directory that was listed above rather than
        # from whatever the working directory happens to be.
        df = dd.read_parquet(os.path.join(current_directory, parquet_directory, '*.parquet'))

        print("File Name:", parquet_directory)
        print("Number of rows:", len(df))
        print("Number of columns:", len(df.columns))
        display(df.head())

# Call the update_dataframe function to initialize the Dask DataFrame
update_dataframe(val)

def slider_observer(change):
    """Refresh the displayed dataset with the slider's new value."""
    update_dataframe(change.new)

# Add the observer to the slider widget
slider.observe(slider_observer, names='value')

# Display the slider widget and the output widget
display(slider)
display(output)

IntSlider(value=0, description='Select Value:', layout=Layout(width='400px'), max=15)

Output()