In [1]:
import pandas as pd
import numpy as np
import time
import os

# Generate a large DataFrame
# The row count is defined once so the four columns cannot drift out of sync.
N_ROWS = 1_000_000
# Seeded generator: the 'value' column is identical on every fresh-kernel run,
# which keeps the benchmark input reproducible.
rng = np.random.default_rng(42)
df = pd.DataFrame({
    'id': np.arange(1, N_ROWS + 1),      # sequential ids 1..N_ROWS (arange end is exclusive)
    'name': ['User'] * N_ROWS,           # the same string repeated in every row
    'value': rng.random(N_ROWS),         # uniform floats in [0, 1)
    'date': pd.date_range('2020-01-01', periods=N_ROWS, freq='min')  # one timestamp per minute
})

# === Save to CSV ===
# Time the write with perf_counter(): it is a high-resolution clock meant for
# measuring short durations, whereas time.time() follows the system clock and
# can be adjusted while the measurement is in progress.
start = time.perf_counter()
df.to_csv('example.csv', index=False)  # index=False: the row index is not written
csv_time = time.perf_counter() - start  # elapsed seconds
# On-disk size: bytes divided by 1024*1024 (the report labels this "MB").
csv_size = os.path.getsize('example.csv') / (1024 * 1024)

# === Save to Parquet ===
# Time the write with perf_counter(): the interval here is short, so the
# high-resolution duration clock is the right tool; time.time() follows the
# system clock and can be adjusted while the measurement is in progress.
start = time.perf_counter()
df.to_parquet('example.parquet', index=False)  # index=False: the row index is not stored
parquet_time = time.perf_counter() - start  # elapsed seconds
# On-disk size: bytes divided by 1024*1024 (the report labels this "MB").
parquet_size = os.path.getsize('example.parquet') / (1024 * 1024)

# Build one report line per format (elapsed seconds and file size, both to two
# decimals), then emit them together, one per line.
csv_report = f"CSV saved in {csv_time:.2f} seconds, size: {csv_size:.2f} MB"
parquet_report = f"Parquet saved in {parquet_time:.2f} seconds, size: {parquet_size:.2f} MB"
print(csv_report, parquet_report, sep="\n")

# === Load from Parquet ===
# Read back the file written above and show its (rows, columns) shape as a
# quick check that the reload produced a frame of the expected dimensions.
parquet_path = 'example.parquet'
df_parquet = pd.read_parquet(parquet_path)
loaded_shape = df_parquet.shape
print("Loaded from Parquet:", loaded_shape)


CSV saved in 1.63 seconds, size: 49.74 MB
Parquet saved in 0.13 seconds, size: 18.94 MB
Loaded from Parquet: (1000000, 4)
