In [3]:
import pandas as pd
import time

In [5]:
# Benchmark pd.read_csv on the same file with the PyArrow parser and with the
# default parser, printing the wall-clock seconds each read takes.
CSV_PATH = "../data/nyc-parking-violations-2020.csv"
# (label, extra keyword arguments for pd.read_csv) for each timed run, in order.
read_runs = (
    ("PyArrow", {"engine": "pyarrow"}),
    ("Default", {}),
)

print("Timing reads:")
for label, read_kwargs in read_runs:
    # Unbind the frame from the previous run before timing the next one so it
    # can be freed; otherwise the second read is measured while a full extra
    # copy of the data is still held in memory.
    df = None
    print(f"\t{label} CSV read...", end="", flush=True)
    time_start = time.perf_counter()
    df = pd.read_csv(CSV_PATH, **read_kwargs)
    time_end = time.perf_counter()
    print(f"{time_end - time_start}")
# `df` ends up bound to the frame from the default-engine read, as before.
# NOTE(review): the saved output shows a warning raised by the default-engine
# read (its text is truncated) -- presumably a DtypeWarning about mixed-type
# columns; confirm, and consider explicit dtypes if it matters downstream.

Timing reads:
	PyArrow CSV read...17.497207333974075
	Default CSV read...

  df = pd.read_csv("../data/nyc-parking-violations-2020.csv")


	34.87162225000793


In [4]:
# Time how long the DataFrame takes to serialise to each on-disk format.
# NOTE(review): relies on `df` already existing in the kernel; it is not
# defined in this cell.
# Maps a file extension to the bound DataFrame writer method `df.to_<ext>`.
formats = {
    extension: getattr(df, f"to_{extension}")
    for extension in ("csv", "json", "feather")
}
print("Converting files")
for extension, write in formats.items():
    began = time.perf_counter()
    write(f"out.{extension}")  # output goes to out.<ext> in the working directory
    elapsed = time.perf_counter() - began
    print(f"\t{extension} total time: {elapsed}")

Converting files
	csv total time: 72.37753045797581
	json total time: 137.52824979199795
	feather total time: 14.405495000013616


In [6]:
# Memory footprint with PyArrow-backed dtypes: load the CSV using
# dtype_backend="pyarrow" and display the frame's total size in whole MiB.
print("PyArrow backend")
df_pa = pd.read_csv(
    "../data/nyc-parking-violations-2020.csv",
    dtype_backend="pyarrow",
)
# Sum the per-column byte counts (deep=True), then floor-divide by 2**20
# (= 1024 * 1024) to express the total in mebibytes.
total_bytes = df_pa.memory_usage(deep=True).sum()
total_bytes // 2**20


PyArrow backend


  df_pa = pd.read_csv("../data/nyc-parking-violations-2020.csv", dtype_backend="pyarrow")


4947

In [7]:
# Memory footprint with the default dtypes: load the same CSV with no
# dtype_backend argument and display the frame's total size in whole MiB.
# NOTE(review): this rebinds `df_pa`, which the PyArrow-backend cell also
# assigns, so the name no longer matches its contents after this cell runs.
print("Numpy backend")
df_pa = pd.read_csv(
    "../data/nyc-parking-violations-2020.csv",
)
# Sum the per-column byte counts (deep=True), then floor-divide by 2**20
# (= 1024 * 1024) to express the total in mebibytes.
total_bytes = df_pa.memory_usage(deep=True).sum()
total_bytes // 2**20

Numpy backend


  df_pa = pd.read_csv("../data/nyc-parking-violations-2020.csv")


16010

So the PyArrow CSV engine (`engine="pyarrow"`) reads this file about twice as fast as the default CSV engine (roughly 17.5 s vs. 34.9 s).

Loading the data with PyArrow dtypes (`dtype_backend="pyarrow"`) also takes about a third of the memory of the default NumPy-backed dtypes (4,947 MiB vs. 16,010 MiB).