# Setup

In [13]:
# imports
from IPython.display import display, HTML
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl

import gzip
import shutil
from pathlib import Path


In [14]:
# generate synthetic data for performance testing
!python -m synthetic_data

Test data exists in multiple formats.


In [None]:
# data paths: every file used below lives under this directory
data_path = Path("data")

# list paths for data here for convenience (all derived from data_path)
data_csv = data_path / "python_dev_universe.csv"
data_csv_gz = data_path / "python_dev_universe.csv.gz"
data_parquet = data_path / "python_dev_universe.parquet"

# test pandas/polars read CSV and compressed CSV

In [5]:
# Path to the gzipped CSV (same location as data_csv_gz in the data-paths cell).
data_csv_gz = Path("data", "python_dev_universe.csv.gz")
# Load the gzipped CSV with pandas, stating the compression explicitly.
df_pandas = pd.read_csv(data_csv_gz, compression="gzip")

In [7]:
# Preview the first two rows of the pandas frame.
df_pandas.head(2)

Unnamed: 0,customer_id,age,their_lucky_number,occupation,psf_membership_status,education,date_started_python
0,A43321819,47,0,Data Engineer,Contributing,High School,1924-07-20
1,001338908,37,0,Rustacean,Managing,High School,2004-03-09


In [6]:
# Load the same gzipped CSV with polars; no compression argument is passed here.
df_polars = pl.read_csv(source=data_csv_gz)

In [8]:
# Preview the first two rows of the polars frame.
df_polars.head(2)

customer_id,age,their_lucky_number,occupation,psf_membership_status,education,date_started_python
str,i64,i64,str,str,str,str
"""A43321819""",47,0,"""Data Engineer""","""Contributing""","""High School""","""1924-07-20"""
"""001338908""",37,0,"""Rustacean""","""Managing""","""High School""","""2004-03-09"""


In [9]:
# Repeat the polars read with n_threads=1 (per the polars docs this sets the
# number of threads used for CSV parsing), then preview the first two rows.
df_polars = pl.read_csv(source=data_csv_gz, n_threads=1)
df_polars.head(2)

customer_id,age,their_lucky_number,occupation,psf_membership_status,education,date_started_python
str,i64,i64,str,str,str,str
"""A43321819""",47,0,"""Data Engineer""","""Contributing""","""High School""","""1924-07-20"""
"""001338908""",37,0,"""Rustacean""","""Managing""","""High School""","""2004-03-09"""


In [None]:
# Test elapsed time for multiple configurations

In [9]:
from log_timer_results import log_to_csv


@log_to_csv(filename="timer_log_results.csv", buffer_size=10, flush_on_return=True)
def pandas_read_csv_default(datafile=data_csv):
    """Read ``datafile`` with ``pd.read_csv`` and no extra options.

    Args:
        datafile: Path of the CSV file to load (defaults to ``data_csv``).

    Returns:
        Always ``True``; the first row of the loaded frame is printed.
    """
    first_row = pd.read_csv(datafile).head(1)
    print(first_row)
    return True

@log_to_csv(filename="timer_log_results.csv", buffer_size=10, flush_on_return=True)
def pandas_read_csv_gz(datafile=data_csv_gz, compression="gzip"):
    """Read a gzip-compressed CSV with ``pd.read_csv``.

    The previous version ignored both parameters and always read the global
    ``data_csv_gz``. The parameters are now honoured, and ``datafile``
    defaults to the gzipped file so a no-argument call reads the same file
    as before.

    Args:
        datafile: Path of the compressed CSV to load (defaults to
            ``data_csv_gz``).
        compression: Value forwarded to ``pd.read_csv(compression=...)``.

    Returns:
        Always ``True``; the first row of the loaded frame is printed.
    """
    df = pd.read_csv(datafile, compression=compression)  # NOTE: gzipped file here
    print(df.head(1))
    return True
    
@log_to_csv(filename="timer_log_results.csv", buffer_size=10, flush_on_return=True)
def pandas_read_csv_engine_c(datafile=data_csv):
    """Read ``datafile`` with ``pd.read_csv`` using ``engine="c"``.

    Args:
        datafile: Path of the CSV file to load (defaults to ``data_csv``).

    Returns:
        Always ``True``; the first row of the loaded frame is printed.
    """
    first_row = pd.read_csv(datafile, engine="c").head(1)
    print(first_row)
    return True

@log_to_csv(filename="timer_log_results.csv", buffer_size=10, flush_on_return=True)
def pandas_read_csv_engine_python(datafile=data_csv):
    """Read ``datafile`` with ``pd.read_csv`` using ``engine="python"``.

    Args:
        datafile: Path of the CSV file to load (defaults to ``data_csv``).

    Returns:
        Always ``True``; the first row of the loaded frame is printed.
    """
    first_row = pd.read_csv(datafile, engine="python").head(1)
    print(first_row)
    return True

@log_to_csv(filename="timer_log_results.csv", buffer_size=10, flush_on_return=True)
def pandas_read_csv_default_dtype_numpy(datafile=data_csv, dtype_backend="numpy_nullable"):
    """Read ``datafile`` with the default engine and an explicit dtype backend.

    The previous version accepted ``dtype_backend`` but never passed it to
    ``pd.read_csv``, so this benchmark measured the plain default read.

    Args:
        datafile: Path of the CSV file to load (defaults to ``data_csv``).
        dtype_backend: Value forwarded to ``pd.read_csv(dtype_backend=...)``.

    Returns:
        Always ``True``; the first row of the loaded frame is printed.
    """
    df = pd.read_csv(datafile, dtype_backend=dtype_backend)
    print(df.head(1))
    return True

@log_to_csv(filename="timer_log_results.csv", buffer_size=10, flush_on_return=True)
def pandas_read_csv_engine__c_dtype_numpy(datafile=data_csv):
    """Read ``datafile`` with ``engine="c"`` and ``dtype_backend="numpy_nullable"``.

    Args:
        datafile: Path of the CSV file to load (defaults to ``data_csv``).

    Returns:
        Always ``True``; the first row of the loaded frame is printed.
    """
    first_row = pd.read_csv(
        datafile,
        engine="c",
        dtype_backend="numpy_nullable",
    ).head(1)
    print(first_row)
    return True

@log_to_csv(filename="timer_log_results.csv", buffer_size=10, flush_on_return=True)
def pandas_read_csv_engine_python__dtype_numpy(datafile=data_csv):
    """Read ``datafile`` with ``engine="python"`` and ``dtype_backend="numpy_nullable"``.

    Args:
        datafile: Path of the CSV file to load (defaults to ``data_csv``).

    Returns:
        Always ``True``; the first row of the loaded frame is printed.
    """
    first_row = pd.read_csv(
        datafile,
        engine="python",
        dtype_backend="numpy_nullable",
    ).head(1)
    print(first_row)
    return True
    
@log_to_csv(filename="timer_log_results.csv", buffer_size=10, flush_on_return=True)
def pandas_read_csv_engine_pyarrow(datafile=data_csv, engine="pyarrow"):
    """Read ``datafile`` with ``pd.read_csv`` using the requested parser engine.

    The previous version accepted ``engine`` but never passed it to
    ``pd.read_csv``, so this benchmark measured the default engine rather
    than pyarrow.

    Args:
        datafile: Path of the CSV file to load (defaults to ``data_csv``).
        engine: Value forwarded to ``pd.read_csv(engine=...)``.

    Returns:
        Always ``True``; the first row of the loaded frame is printed.
    """
    df = pd.read_csv(datafile, engine=engine)
    print(df.head(1))
    return True

@log_to_csv(filename="timer_log_results.csv", buffer_size=10, flush_on_return=True)
def pandas_read_csv_engine_pyarrow_dtype_numpy(datafile=data_csv, engine="pyarrow", dtype_backend="numpy_nullable"):
    """Read ``datafile`` with the requested engine and dtype backend.

    The previous version accepted ``engine`` and ``dtype_backend`` but passed
    neither to ``pd.read_csv``, so this benchmark measured the plain default
    read.

    Args:
        datafile: Path of the CSV file to load (defaults to ``data_csv``).
        engine: Value forwarded to ``pd.read_csv(engine=...)``.
        dtype_backend: Value forwarded to ``pd.read_csv(dtype_backend=...)``.

    Returns:
        Always ``True``; the first row of the loaded frame is printed.
    """
    df = pd.read_csv(datafile, engine=engine, dtype_backend=dtype_backend)
    print(df.head(1))
    return True
    
@log_to_csv(filename="timer_log_results.csv", buffer_size=10, flush_on_return=True)
def pandas_read_csv_engine_pyarrow_dtype_pyarrow(datafile=data_csv, engine="pyarrow", dtype_backend="pyarrow"):
    """Read ``datafile`` with the requested engine and dtype backend.

    The previous version accepted ``engine`` and ``dtype_backend`` but passed
    neither to ``pd.read_csv``, so this benchmark measured the plain default
    read.

    Args:
        datafile: Path of the CSV file to load (defaults to ``data_csv``).
        engine: Value forwarded to ``pd.read_csv(engine=...)``.
        dtype_backend: Value forwarded to ``pd.read_csv(dtype_backend=...)``.

    Returns:
        Always ``True``; the first row of the loaded frame is printed.
    """
    df = pd.read_csv(datafile, engine=engine, dtype_backend=dtype_backend)
    print(df.head(1))
    return True
    
@log_to_csv(filename="timer_log_results.csv", buffer_size=10, flush_on_return=True)
def pandas_read_parquet(datafile=data_parquet):
    """Read ``datafile`` with ``pd.read_parquet`` and no extra options.

    Args:
        datafile: Path of the parquet file to load (defaults to
            ``data_parquet``).

    Returns:
        Always ``True``; the first row of the loaded frame is printed.
    """
    first_row = pd.read_parquet(datafile).head(1)
    print(first_row)
    return True


In [10]:
# Run every pandas reader benchmark once, each with its default arguments.
pandas_read_csv_default()
# The gzip variant was defined but never invoked, so it was missing from the run.
pandas_read_csv_gz()
pandas_read_csv_engine_c()
pandas_read_csv_engine_python()
pandas_read_csv_default_dtype_numpy()
pandas_read_csv_engine__c_dtype_numpy()
pandas_read_csv_engine_python__dtype_numpy()
pandas_read_csv_engine_pyarrow()
pandas_read_csv_engine_pyarrow_dtype_numpy()
pandas_read_csv_engine_pyarrow_dtype_pyarrow()
pandas_read_parquet()

  customer_id  age  their_lucky_number     occupation psf_membership_status  \
0   A43321819   47                   0  Data Engineer          Contributing   

     education date_started_python  
0  High School          1924-07-20  
  customer_id  age  their_lucky_number     occupation psf_membership_status  \
0   A43321819   47                   0  Data Engineer          Contributing   

     education date_started_python  
0  High School          1924-07-20  
  customer_id  age  their_lucky_number     occupation psf_membership_status  \
0   A43321819   47                   0  Data Engineer          Contributing   

     education date_started_python  
0  High School          1924-07-20  
  customer_id  age  their_lucky_number     occupation psf_membership_status  \
0   A43321819   47                   0  Data Engineer          Contributing   

     education date_started_python  
0  High School          1924-07-20  
  customer_id  age  their_lucky_number     occupation psf_membership

True

In [11]:
@log_to_csv(filename="timer_log_results.csv", buffer_size=10, flush_on_return=True)
def polars_read_csv(datafile=data_csv):
    """Read ``datafile`` with ``pl.read_csv`` and no extra options.

    Args:
        datafile: Path of the CSV file to load (defaults to ``data_csv``).

    Returns:
        Always ``True``; the first row of the loaded frame is printed.
    """
    first_row = pl.read_csv(source=datafile).head(1)
    print(first_row)
    return True

@log_to_csv(filename="timer_log_results.csv", buffer_size=10, flush_on_return=True)
def polars_read_csv_gz(datafile=data_csv_gz):
    """Read a gzip-compressed CSV with ``pl.read_csv``.

    The previous body (``df = pl.data_csv_gz)``) was a syntax error that
    stopped the whole cell from running. It now reads ``datafile``, which
    defaults to the gzipped file instead of the uncompressed CSV.

    Args:
        datafile: Path of the compressed CSV to load (defaults to
            ``data_csv_gz``).

    Returns:
        Always ``True``; the first row of the loaded frame is printed.
    """
    df = pl.read_csv(source=datafile)  # NOTE: gzipped file here
    print(df.head(1))
    return True
    
@log_to_csv(filename="timer_log_results.csv", buffer_size=10, flush_on_return=True)
def polars_read_parquet(datafile=data_parquet):
    """Read ``datafile`` with ``pl.read_parquet`` and no extra options.

    Args:
        datafile: Path of the parquet file to load (defaults to
            ``data_parquet``).

    Returns:
        Always ``True``; the first row of the loaded frame is printed.
    """
    first_row = pl.read_parquet(source=datafile).head(1)
    print(first_row)
    return True

In [13]:
# Run the polars reader benchmarks once each with their default arguments.
# NOTE(review): polars_read_csv_gz is defined in this notebook but is not
# invoked here — confirm whether that omission is intentional.
polars_read_csv()
polars_read_parquet()

shape: (1, 7)
┌─────────────┬─────┬────────────────┬───────────────┬───────────────┬─────────────┬───────────────┐
│ customer_id ┆ age ┆ their_lucky_nu ┆ occupation    ┆ psf_membershi ┆ education   ┆ date_started_ │
│ ---         ┆ --- ┆ mber           ┆ ---           ┆ p_status      ┆ ---         ┆ python        │
│ str         ┆ i64 ┆ ---            ┆ str           ┆ ---           ┆ str         ┆ ---           │
│             ┆     ┆ i64            ┆               ┆ str           ┆             ┆ str           │
╞═════════════╪═════╪════════════════╪═══════════════╪═══════════════╪═════════════╪═══════════════╡
│ A43321819   ┆ 47  ┆ 0              ┆ Data Engineer ┆ Contributing  ┆ High School ┆ 1924-07-20    │
└─────────────┴─────┴────────────────┴───────────────┴───────────────┴─────────────┴───────────────┘
shape: (1, 7)
┌─────────────┬─────┬────────────────┬───────────────┬───────────────┬─────────────┬───────────────┐
│ customer_id ┆ age ┆ their_lucky_nu ┆ occupation    ┆ psf_memb

True