# Stop persising pandas data frames to csv
In this notebook we will explore advantage and disadvanteges of persisting the pandas dataframe to CSV (`df.to_csv`) and compare with the other options which exists. We will look at:
* speed of saving
* speed of loading
* size of the persisted file
* preservation of data types

In [1]:
import pandas as pd
import numpy as np
import os
import time
from pathlib import Path
from sqlalchemy import create_engine
import plotly.express as pe

  PANDAS_TYPES = (pd.Series, pd.DataFrame, pd.Panel)


## Support functions
* `statistics` - calculates the data frame with information about 
 * data type preservation, how many % of columns remained original type after reading
 * compression/size - how big is the file in % of csv
 * write_time - how long does it take to write this format as % of csv writing time
 * read_time - how long does it take to read this format as % of csv reading time
* `repeat` - which measures average processing time, we do 7 repetitions
* `performance_test` - which inputs the defined dict of tests and runs the repetition and statistics

In [2]:
def statistics(df: pd.DataFrame, df_loaded: pd.DataFrame, new_file: str, 
               tp: str, benchmark: dict, out: dict, orig_size: int) -> pd.DataFrame:
    """Generate statistics based on performance test results"""
    
    comparison_df = pd.DataFrame({"orig": df.dtypes, "new": df_loaded.dtypes})
    comparison_df["same"] = comparison_df["orig"]==comparison_df["new"]
    #mismatched_types = comparison_df[comparison_df["same"]==False].index.to_list()
    
    new_size = Path(new_file).stat().st_size
    # we don't measure size of the SQL tables in the notebook
    if tp == "SQL":
        new_size = 0
    
    return pd.DataFrame({
    "dtype_preservation": comparison_df["same"].sum()/comparison_df["same"].count(),
    "compression": new_size/orig_size,
    "write_time": out["write"]/benchmark["write"],
    "read_time": out["read"]/benchmark["read"],
    #"mismatched_types": mismatched_types
    }, 
        index=[tp]), comparison_df

In [3]:
def repeat(f: dict, repetitions: int, df: pd.DataFrame, file: str) -> dict:
    """ Perform an operation specified by input dict number of repetition times on the data frame"""
    
    writes_elapsed = []
    reads_elapsed = []
    for r in range(repetitions):
        
        # write
        start_time = time.time()
        
        # first parameter is self (the dataframe), second the path and then **kwargs
        f["write_function"](df, file, **f["write_params"])
        write_elapsed = time.time() - start_time
        writes_elapsed.append(write_elapsed)
        
        # read
        start_time = time.time()
        df_loaded = f["read_function"](file, **f["read_params"])
        read_elapsed = time.time() - start_time
        reads_elapsed.append(read_elapsed)
    
    return {"write": sum(writes_elapsed)/len(writes_elapsed),
           "read": sum(reads_elapsed)/len(reads_elapsed),
           "df_loaded": df_loaded}

In [4]:
def performance_test(exporting_types: dict, df: pd.DataFrame, 
                     out_file: str="out", repetitions: int = 7) -> pd.DataFrame:
    
    """Run performance test for predefined dict of operations"""

    results = []
    for k,v in exporting_types.items():

        # create a file name
        new_file = out_file + v["extension"]
        
        try:
            # repeat the writing and reading several times
            out = repeat(v, repetitions, df, new_file)

            # CSV is the first one and it's set as benchmark for reading and writing times
            if v["type"] == "CSV":
                benchmark = out
                df.to_csv("benchmark.csv", index=False)
                orig_size = Path("benchmark.csv").stat().st_size

            # process the results - dtypes_preservation, compression, write and read_time
            results.append(statistics(df, out["df_loaded"], new_file, v["type"], benchmark, out, orig_size)[0])
        
        except Exception as e:
            print(f"{k} failed - {e}")

    return pd.concat(results)

# Test Data
Before we can run any test, we need some data to test on. We will create a random dataset including number of different types, random words (clusters of letters) and senetences formed based on these words and random dates.

In [5]:
import random
import string
import datetime

def get_random_string(length: int) -> str:
    """Generated random string up to the specific lenght"""
    
    letters = string.ascii_letters
    result_str = ''.join([random.choice(letters) for i in range(random.randint(3,length))])
    return result_str

def get_random_sentence(words: int=5) -> str:
    """Generate random sentence - list of random works"""
    
    words = []
    for i in range(5):
        words.append(get_random_string(10)[:random.randint(0,10)])
    return " ".join(words)

def random_date(start: str, end: str, format: str) -> str:
    """Generate random date of specified format"""
    
    earliest = datetime.datetime.strptime(start, format)
    latest  = datetime.datetime.strptime(end, format)
    delta = latest - earliest
    int_delta = (delta.days * 24 * 60 * 60) + delta.seconds    
    random_second = random.randrange(int_delta)
    return (earliest + datetime.timedelta(seconds = random_second)).strftime(format)

We use these function and random library to generate list of list containing random values

In [6]:
# https://numpy.org/doc/stable/user/basics.types.html
def generate_random_data(size: int) -> list:
    """Generates list of list containing random data of various formats"""
    data = []
    for i in range(size):
        data.append(
            [random.randint(-127,127),  # int8
             random.randint(-32768,32767),  # int16
             random.randint(-2147483648,2147483647),  # int32
             random.randint(-9223372036854775808 ,9223372036854775807),  # int64
             random.randint(0,255),  # uint8
             round(random.uniform(0,10000),2),
             round(random.uniform(0,1000000),2),
             get_random_string(10),
             get_random_sentence(5),
             random.choice([get_random_string(10) for i in range(25)]),
             random_date("1900-01-01","2020-05-01","%Y-%m-%d"),
             random_date("1900-01-01T00:00:00","2020-05-01T23:59:59","%Y-%m-%dT%H:%M:%S"),
             random.choice([True,False])])
        
    return data

These values are concatenated into the dataframe. Some persisting methods doesn't work well with some data types, mainly with `timedelta` and `datetime` with timezone so we can optionally remove them. We also measure benchmark size of the csv file created based on this data

In [7]:
def create_random_df(size: int, drop_timedelta: bool=False, drop_timezone: bool=False):
    """Generates dataframe with random values"""
    data = generate_random_data(size)
    column_names = ["Int8", "Int16", "Int32", "Int64", "UInt8", "Float32", "Float64", 
                    "String", "Sentence", "Category", "Date", "DateTime", "Bool"]
    df = pd.DataFrame(data, columns=column_names)
    df["Int8"] = df["Int8"].astype("int8")
    df["Int16"] = df["Int16"].astype("int16")
    df["Int32"] = df["Int32"].astype("int32")
    df["UInt8"] = df["UInt8"].astype("uint8")
    df["Float32"] = df["Float32"].astype("float32")
    df["Category"] = df["Category"].astype("category")
    df["Date"] = pd.to_datetime(df["Date"], format="%Y-%m-%d")
    df["DateTime"] = pd.to_datetime(df["DateTime"], format="%Y-%m-%dT%H:%M:%S")
    if not drop_timedelta:
        df["TimeDelta"] = df["DateTime"]-df["Date"]
    if not drop_timezone:
        df["DateTime+Zone"] = df["DateTime+Zone"].dt.tz_localize('Europe/Vienna')

    # store for size comparison
    df.to_csv("benchmark.csv", index=False)
    orig_size = Path("benchmark.csv").stat().st_size

    return df, orig_size

In [8]:
dataset_size = 50000
df, orig_size = create_random_df(dataset_size, True, True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 13 columns):
Int8        50000 non-null int8
Int16       50000 non-null int16
Int32       50000 non-null int32
Int64       50000 non-null int64
UInt8       50000 non-null uint8
Float32     50000 non-null float32
Float64     50000 non-null float64
String      50000 non-null object
Sentence    50000 non-null object
Category    50000 non-null category
Date        50000 non-null datetime64[ns]
DateTime    50000 non-null datetime64[ns]
Bool        50000 non-null bool
dtypes: bool(1), category(1), datetime64[ns](2), float32(1), float64(1), int16(1), int32(1), int64(1), int8(1), object(2), uint8(1)
memory usage: 4.7+ MB


# Run the peformance test
We define each test case by several parameters:
* type: type of persistance (used as descriptor of the result)
* extension: extension which is used for the file persisted on the disk
* write_function: which function is used to persist/write the dataframe
* write_params: parameters of this function, e.g. compression type or engine used
* read_function: fuction to read the persisted file
* read_parameters: parameters of the read function 

In [9]:
# define the types of transformation to use
exporting_types = {
    "csv": {
        "type": "CSV",
        "extension": ".csv",
        "write_function": pd.DataFrame.to_csv,
        "write_params": {"index": False},
        "read_function": pd.read_csv,
        "read_params": {}
    },
    "csv_zip": {
        "type": "CSV zip",
        "extension": ".zip",
        "write_function": pd.DataFrame.to_csv,
        "write_params": {"index": False, "compression": "zip"},
        "read_function": pd.read_csv,
        "read_params": {"compression": "zip"}
    },
    "picklea": {
        "type": "Pickle bz2",
        "extension": ".pkl.bz2",
        "write_function": pd.DataFrame.to_pickle,
        "write_params": {"compression": "bz2"},
        "read_function": pd.read_pickle,
        "read_params": {"compression": "bz2"}
    },
    "pickleb": {
        "type": "Pickle gzip",
        "extension": ".pkl.gzip",
        "write_function": pd.DataFrame.to_pickle,
        "write_params": {"compression": "gzip"},
        "read_function": pd.read_pickle,
        "read_params": {"compression": "gzip"}
    },
    "picklec": {
        "type": "Pickle zip",
        "extension": ".pkl.zip",
        "write_function": pd.DataFrame.to_pickle,
        "write_params": {"compression": "zip"},
        "read_function": pd.read_pickle,
        "read_params": {"compression": "zip"}
    },
    "pickled": {
        "type": "Pickle infer",
        "extension": ".pkl",
        "write_function": pd.DataFrame.to_pickle,
        "write_params": {},
        "read_function": pd.read_pickle,
        "read_params": {}
    },
    "picklee": {
        "type": "Pickle xz",
        "extension": ".pkl.xz",
        "write_function": pd.DataFrame.to_pickle,
        "write_params": {"compression": "xz"},
        "read_function": pd.read_pickle,
        "read_params": {"compression": "xz"}
    },
    "parquet_pyarrow": {
        "type": "Parquet via PyArrow",
        "extension": ".parquet.gzip",
        "write_function": pd.DataFrame.to_parquet,
        "write_params": {},
        "read_function": pd.read_parquet,
        "read_params": {}
    },
    "parquet_fastparquet": {
        "type": "Parquet via fastparquet",
        "extension": ".parquet.gzip",
        "write_function": pd.DataFrame.to_parquet,
        "write_params": {"engine":"fastparquet","compression":"GZIP"},
        "read_function": pd.read_parquet,
        "read_params": {"engine":"fastparquet"}
    },    
    "Hdf5f": {
        "type": "Hdf5 fixed",
        "extension": ".h5",
        "write_function": pd.DataFrame.to_hdf,
        "write_params": {"key":"df", "format":"fixed"},
        "read_function": pd.read_hdf,
        "read_params": {"key":"df"}
    },
    "Hdf5t": {
        "type": "Hdf5 table",
        "extension": ".h5",
        "write_function": pd.DataFrame.to_hdf,
        "write_params": {"key":"df", "format":"table"},
        "read_function": pd.read_hdf,
        "read_params": {"key":"df"}
    },
    "Hdf5c": {
        "type": "Hdf5 compressed",
        "extension": ".h5",
        "write_function": pd.DataFrame.to_hdf,
        "write_params": {"key":"df", "format":"table", "complevel": 9, "complib": "bzip2"},
        "read_function": pd.read_hdf,
        "read_params": {"key":"df"}
    },
    "Excel": {
        "type": "Excel",
        "extension": ".xlsx",
        "write_function": pd.DataFrame.to_excel,
        "write_params": {"index":False},
        "read_function": pd.read_excel,
        "read_params": {}
    },
    # to_sql and read_sql have too much different syntax
    #"SQL": {
    #    "type": "SQL",
    #    "extension": "",
    #    "write_function": pd.DataFrame.to_sql,
    #    "write_params": {"name": "pandas", "con": engine},
    #    "read_function": pd.read_sql,
    #    "read_params": {"name": "pandas", "con": engine}
    #},
    # feather file gets blocked by ArrowIO after the first run
    "Feather": {
        "type": "Feather",
        "extension": ".f",
        "write_function": pd.DataFrame.to_feather,
        "write_params": {},
        "read_function": pd.read_feather,
        "read_params": {}
    }
}

In [10]:
# performance test
performance_df = performance_test(exporting_types, df)

# results
performance_df.style.format("{:.2%}")

Hdf5f failed - Cannot store a category dtype in a HDF5 dataset that uses format="fixed". Use format="table".
Feather failed - Failed to open local file 'out.f', error: The requested operation cannot be performed on a file with a user-mapped section open.



Unnamed: 0,dtype_preservation,compression,write_time,read_time
CSV,38.46%,100.00%,100.00%,100.00%
CSV zip,38.46%,56.10%,156.66%,115.82%
Pickle bz2,100.00%,48.13%,70.25%,160.75%
Pickle gzip,100.00%,48.56%,98.65%,29.75%
Pickle zip,100.00%,48.56%,54.61%,27.60%
Pickle infer,100.00%,63.92%,5.57%,12.52%
Pickle xz,100.00%,44.20%,316.52%,191.97%
Parquet via PyArrow,100.00%,69.35%,12.54%,25.60%
Parquet via fastparquet,100.00%,48.44%,85.80%,90.82%
Hdf5 table,100.00%,3629.55%,36.17%,136.83%


## Display the results
Image is worth many words so we display the results using interactive Plotly.Express chart. Plotly.Express allow to create these chart in few lines of code. Click on the items in the legend to hide/reveal particular persistance option from the graph. Click and select an area in the chart to zoom.

In [None]:
fig = pe.bar(performance_df.T, 
             barmode='group', text="value")
fig.update_traces(texttempmlate='%{text:.2%}', textposition='auto')
fig.update_layout(title=f"Statistics for {dataset_size} records - performance test 7 repetititons", 
                  yaxis={"type": "log", "title": "value % (log scale)"})
fig.show()

Writing and reading speed and memory use in the process was previous explored by Ilia Zaitsev:  https://towardsdatascience.com/the-best-format-to-save-pandas-data-414dca023e0d

# Manual Runs
If you struggle to follow the function wrapped in the dictiory you can run each function separately in the code below. I also highlight which data types are not persisted. 
## To csv

In [12]:
out_file = "out"
repetitions = 7

In [13]:
type = "csv"
extension = ".csv"

new_file = out_file + extension
writes_elapsed, reads_elapsed = [], []
for r in range(repetitions):
    start_time = time.time()
    df.to_csv(new_file, index=False)
    write_elapsed = time.time() - start_time
    writes_elapsed.append(write_elapsed)
    
    start_time = time.time()
    df_loaded = pd.read_csv(new_file)
    #df_loaded = pd.read_csv(new_file, parse_dates=["period","accepted"], infer_datetime_format=True)
    read_elapsed = time.time() - start_time
    reads_elapsed.append(read_elapsed)
    
out = {"read": sum(reads_elapsed)/len(reads_elapsed),
      "write": sum(writes_elapsed)/len(writes_elapsed)}

In [16]:
# set up csv as your benchmark
df.to_csv("benchmark.csv", index=False)
orig_size =  Path("benchmark.csv").stat().st_size
benchmark = out

In [17]:
statistics(df, df_loaded, new_file, type, benchmark, out=out, orig_size=orig_size)[1]\
    .style.applymap(lambda x: "color: red" if isinstance(x,bool) and x==False else "black")

Unnamed: 0,orig,new,same
Int8,int8,int64,False
Int16,int16,int64,False
Int32,int32,int64,False
Int64,int64,int64,True
UInt8,uint8,int64,False
Float32,float32,float64,False
Float64,float64,float64,True
String,object,object,True
Sentence,object,object,True
Category,category,object,False


In [18]:
statistics(df, df_loaded, new_file, type, benchmark, out=out, orig_size=orig_size)[0]

Unnamed: 0,dtype_preservation,compression,write_time,read_time
csv,0.384615,1.0,1.0,1.0


In [19]:
# to keep the data types while reading csv you must specify them using `dtype` parameter
df_loaded = pd.read_csv(new_file,
                       dtype={"Int8": "int8",
                             "Int16": "int16",
                             "Uint8": "uint8",
                             "Category": "category"},
                       parse_dates=["Date","DateTime"])
df_loaded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 13 columns):
Int8        50000 non-null int8
Int16       50000 non-null int16
Int32       50000 non-null int64
Int64       50000 non-null int64
UInt8       50000 non-null int64
Float32     50000 non-null float64
Float64     50000 non-null float64
String      50000 non-null object
Sentence    50000 non-null object
Category    50000 non-null category
Date        50000 non-null datetime64[ns]
DateTime    50000 non-null datetime64[ns]
Bool        50000 non-null bool
dtypes: bool(1), category(1), datetime64[ns](2), float64(2), int16(1), int64(3), int8(1), object(2)
memory usage: 5.4+ MB


## To Pickle

In [25]:
type = "Pickle"
extension = ".plk"
new_file = out_file + extension

# write to pickle
start_time = time.time()
df.to_pickle(new_file)
write_elapsed = time.time() - start_time

# reading the pickled file
start_time = time.time()
df_loaded = pd.read_pickle(new_file)
read_elapsed = time.time() - start_time
    
out = {"read": read_elapsed,
      "write": write_elapsed}

statistics(df, df_loaded, new_file, type, benchmark, out=out, orig_size=orig_size)[0]

Unnamed: 0,dtype_preservation,compression,write_time,read_time
Pickle,1.0,0.639237,0.047124,0.108429


## To Parquet

In [26]:
type = "Parquet"
extension = ".parquet.gzip"
new_file = out_file + extension
#df.to_parquet(new_file, engine="fastparquet", compression="gzip")
df.to_parquet(new_file)
df_loaded = pd.read_parquet(new_file)
#results.append()
statistics(df, df_loaded, new_file, type, benchmark, out={"read": 0, "write": 0},orig_size=orig_size)[1]

Unnamed: 0,orig,new,same
Int8,int8,int8,True
Int16,int16,int16,True
Int32,int32,int32,True
Int64,int64,int64,True
UInt8,uint8,uint8,True
Float32,float32,float32,True
Float64,float64,float64,True
String,object,object,True
Sentence,object,object,True
Category,category,category,True


## To Hdf5

In [27]:
type = "Hdf5"
extension = ".h5"
new_file = out_file + extension
df.to_hdf(new_file, key='df', format="table")
df_loaded = pd.read_hdf(new_file, key='df')
#results.append(statistics(df, df_loaded, new_file, type))
statistics(df, df_loaded, new_file, type, benchmark, out={"read": 0, "write": 0}, orig_size=orig_size)[1]

Unnamed: 0,orig,new,same
Int8,int8,int8,True
Int16,int16,int16,True
Int32,int32,int32,True
Int64,int64,int64,True
UInt8,uint8,uint8,True
Float32,float32,float32,True
Float64,float64,float64,True
String,object,object,True
Sentence,object,object,True
Category,category,category,True


## To Excel

In [28]:
type = "Excel"
extension = ".xlsx"
new_file = out_file + extension

writes_elapsed, reads_elapsed = [], []

for r in range(repetitions):
    start_time = time.time()
    df.to_excel(new_file, index=False)
    write_elapsed = time.time() - start_time
    writes_elapsed.append(write_elapsed)
    
    start_time = time.time()
    df_loaded = pd.read_excel(new_file)
    read_elapsed = time.time() - start_time
    reads_elapsed.append(read_elapsed)
    
out = {"read": sum(reads_elapsed)/len(reads_elapsed),
      "write": sum(writes_elapsed)/len(writes_elapsed)}

statistics(df, df_loaded, new_file, type, benchmark, out, orig_size=orig_size)[1]

Unnamed: 0,orig,new,same
Int8,int8,int64,False
Int16,int16,int64,False
Int32,int32,int64,False
Int64,int64,int64,True
UInt8,uint8,int64,False
Float32,float32,float64,False
Float64,float64,float64,True
String,object,object,True
Sentence,object,object,True
Category,category,object,False


In [29]:
statistics(df, df_loaded, new_file, type, benchmark, out, orig_size=orig_size)[0]

Unnamed: 0,dtype_preservation,compression,write_time,read_time
Excel,0.538462,0.903782,19.885719,50.167168


## To Feather

In [30]:
type = "Feather"
extension = ".f"
new_file = out_file + extension
start_time = time.time()
df.to_feather(new_file)
write_elapsed = time.time() - start_time
start_time = time.time()
df_loaded = pd.read_feather(new_file)
read_elapsed = time.time() - start_time

out = {"read": read_elapsed,
      "write": write_elapsed}

statistics(df, df_loaded, new_file, type, benchmark, out, orig_size=orig_size)[1]

Unnamed: 0,orig,new,same
Int8,int8,int8,True
Int16,int16,int16,True
Int32,int32,int32,True
Int64,int64,int64,True
UInt8,uint8,uint8,True
Float32,float32,float32,True
Float64,float64,float64,True
String,object,object,True
Sentence,object,object,True
Category,category,category,True


In [31]:
statistics(df, df_loaded, new_file, type, benchmark, out, orig_size=orig_size)[0]

Unnamed: 0,dtype_preservation,compression,write_time,read_time
Feather,1.0,0.653213,0.036684,0.180578


## To Sql
SQL was not integrated in the original test because the `.to_sql` function is so specific that it's hard to parametrize it like the others. Using `SQLAlchemy` you must 
* set up the DB engine
* establish the connection
* run the SQL command

In [32]:
engine = create_engine(
    'mssql+pyodbc://vaclav:VGF1H054ui@localhost/Vaclavs?driver=ODBC+Driver+13+for+SQL+server',
    isolation_level="REPEATABLE READ"
)

In [33]:
connection = engine.connect()

In [37]:
type = "SQL"

# drop the test table if already exists
sql = 'DROP TABLE [Vaclavs].[dbo].[test]'
connection.execute(sql)

writes_elapsed, reads_elapsed = [], []
start_time = time.time()
df.to_sql(name="test", con=connection)
write_elapsed = time.time() - start_time
    
start_time = time.time()
df_loaded = pd.read_sql_table(table_name="test", con=connection)
read_elapsed = time.time() - start_time


out = {"read": read_elapsed,
      "write": write_elapsed}
statistics(df, df_loaded, new_file, type, benchmark, out, orig_size)[1]

Unnamed: 0,orig,new,same
Bool,bool,bool,True
Category,category,object,False
Date,datetime64[ns],datetime64[ns],True
DateTime,datetime64[ns],datetime64[ns],True
Float32,float32,float64,False
Float64,float64,float64,True
Int16,int16,int64,False
Int32,int32,int64,False
Int64,int64,int64,True
Int8,int8,int64,False


In [38]:
# compression value is 
statistics(df, df_loaded, new_file, type, benchmark, out, orig_size)[0]

Unnamed: 0,dtype_preservation,compression,write_time,read_time
SQL,0.5,0.0,2.783496,3.131933


In [36]:
# Speed up SQL processing
# https://medium.com/analytics-vidhya/speed-up-bulk-inserts-to-sql-db-using-pandas-and-python-61707ae41990

from sqlalchemy import event
@event.listens_for(engine, "before_cursor_execute")
def receive_before_cursor_execute(
       conn, cursor, statement, params, context, executemany
        ):
            if executemany:
                cursor.fast_executemany = True

Rerun the SQL test after this hack to see that the `write_time` and `read_time` have improved

## Try on your own dataset
We will use the SEC quarterly data dump, the list of files, because it contains all types of columns we would like to persist:
* datetime - `accepted`, `filed`, `period`
* category - `form`
* float - `ein`

In [39]:
folder = r"..\Stocks\Data_Sec\2020Q1"
file_name = "sub.txt"
file = os.path.join(folder, file_name)
out_file = "out"
repetitions = 7  # how many times read/write to measure the speed
results = []

In [40]:
df = pd.read_csv(file, sep="\t")

In [41]:
df["accepted"] = pd.to_datetime(df["accepted"])
df["filed"] = pd.to_datetime(df["filed"], format="%Y%m%d")
df["period"] = pd.to_datetime(df["period"], format="%Y%m%d")
df["form"] = df["form"].astype('category')
df["ein"] = df["ein"].astype("float")
df = df[["adsh","cik","sic","ein","form","period","fy","accepted"]]
df.to_csv("benchmark.csv", index=False)
orig_size = Path("benchmark.csv").stat().st_size
df.dtypes

adsh                object
cik                  int64
sic                float64
ein                float64
form              category
period      datetime64[ns]
fy                   int64
accepted    datetime64[ns]
dtype: object

In [42]:
df.shape

(13560, 8)

In [43]:
# performance test
performance_df = performance_test(exporting_types, df)

# results
performance_df.style.format("{:.2%}")

Hdf5f failed - Cannot store a category dtype in a HDF5 dataset that uses format="fixed". Use format="table".
Feather failed - Failed to open local file 'out.f', error: The requested operation cannot be performed on a file with a user-mapped section open.



Unnamed: 0,dtype_preservation,compression,write_time,read_time
CSV,62.50%,100.00%,100.00%,100.00%
CSV zip,62.50%,19.43%,144.29%,124.48%
Pickle bz2,100.00%,16.53%,83.55%,149.15%
Pickle gzip,100.00%,17.81%,253.54%,35.10%
Pickle zip,100.00%,18.26%,37.57%,32.55%
Pickle infer,100.00%,74.43%,3.44%,13.59%
Pickle xz,100.00%,13.37%,453.18%,122.39%
Parquet via PyArrow,100.00%,25.81%,9.90%,25.16%
Parquet via fastparquet,100.00%,17.24%,40.12%,930.19%
Hdf5 table,100.00%,20579.48%,43.13%,164.58%


In [None]:
# display the graph with the results
fig = pe.bar(performance_df.T, barmode='group', text="value")
    
fig.update_traces(texttemplate='%{text:.2%}', textposition='auto')
fig.update_layout(title=f"Statistics for {dataset_size} records - performance test 7 repetititons", 
                  yaxis={"type": "log", "title": "value % (log scale)"})
fig.update_layout(title=f"Statistics for SEC Quarterly data dump of shape {df.shape}")
fig.show()

In [None]:
As you can see, the results on this dataset are similar to our test dataset