In [None]:
pip install fastparquet

Collecting fastparquet
  Downloading fastparquet-2024.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting cramjam>=2.3 (from fastparquet)
  Downloading cramjam-2.9.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading fastparquet-2024.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cramjam-2.9.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: cramjam, fastparquet
Successfully installed cramjam-2.9.1 fastparquet-2024.11.0


In [None]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG so every np.random call below yields the same
# synthetic dataset on each Restart & Run All.
np.random.seed(42)

accounts = ['ACC123456', 'ACC234567', 'ACC345678', 'ACC456789', 'ACC567890']
merchants = ['ABC Store', 'ATM', 'Employer', 'Stock Exchange', 'XYZ Electronics', 'Online Store', 'Cafe', 'Supermarket']

n_rows = 10000

# Raw column values for the synthetic transaction ledger. Kept under its own
# name so `data` only ever refers to the DataFrame built from it.
# NOTE: the np.random calls are evaluated in dict order; reordering the keys
# would change the generated values.
transaction_columns = {
    'Transaction ID': np.arange(1000000000, 1000000000 + n_rows),
    'Account ID': np.random.choice(accounts, n_rows),
    'Transaction Amount': np.round(np.random.uniform(10.00, 2000.00, n_rows), 2),
    'Transaction Type': np.random.choice(['Purchase', 'Withdrawal', 'Deposit', 'Trade'], n_rows),
    'Merchant/Counterparty': np.random.choice(merchants, n_rows),
    'Location': np.random.choice(['New York, NY', 'Los Angeles, CA', 'Chicago, IL', 'Miami, FL', 'Online'], n_rows),
    # One timestamp per hour, stored as formatted strings. Lowercase 'h' is the
    # hourly alias; uppercase 'H' is deprecated and raises a FutureWarning.
    'Date and Time': pd.date_range(start='2024-08-01', periods=n_rows, freq='h').strftime('%Y-%m-%d %H:%M').tolist()
}

data = pd.DataFrame(transaction_columns)

# Persist the frame as Parquet (without the index) for the inspection cells.
parquet_file = "financial.parquet"
data.to_parquet(parquet_file, engine='fastparquet', index=False)
data

  'Date and Time': pd.date_range(start='2024-08-01', periods=n_rows, freq='H').strftime('%Y-%m-%d %H:%M').tolist()


Unnamed: 0,Transaction ID,Account ID,Transaction Amount,Transaction Type,Merchant/Counterparty,Location,Date and Time
0,1000000000,ACC456789,86.80,Deposit,Employer,"Los Angeles, CA",2024-08-01 00:00
1,1000000001,ACC567890,1758.97,Withdrawal,ABC Store,"Chicago, IL",2024-08-01 01:00
2,1000000002,ACC345678,636.72,Purchase,XYZ Electronics,"New York, NY",2024-08-01 02:00
3,1000000003,ACC567890,266.67,Deposit,Employer,"New York, NY",2024-08-01 03:00
4,1000000004,ACC567890,942.24,Deposit,ATM,"Chicago, IL",2024-08-01 04:00
...,...,...,...,...,...,...,...
9995,1000009995,ACC234567,1770.98,Deposit,ABC Store,"Miami, FL",2025-09-21 11:00
9996,1000009996,ACC345678,1512.92,Purchase,Supermarket,"New York, NY",2025-09-21 12:00
9997,1000009997,ACC234567,653.26,Trade,Employer,"Los Angeles, CA",2025-09-21 13:00
9998,1000009998,ACC456789,1876.98,Withdrawal,Online Store,Online,2025-09-21 14:00


In [None]:
from fastparquet import ParquetFile

# Open the Parquet file written earlier; `pf` is reused by the cells below.
pf = ParquetFile(parquet_file)

# Show the schema under a heading, one item per line.
print("\nParquet Schema:", pf.schema, sep="\n")


Parquet Schema:
- schema: 
| - Transaction ID: INT64, OPTIONAL
| - Account ID: BYTE_ARRAY, UTF8, OPTIONAL
| - Transaction Amount: DOUBLE, OPTIONAL
| - Transaction Type: BYTE_ARRAY, UTF8, OPTIONAL
| - Merchant/Counterparty: BYTE_ARRAY, UTF8, OPTIONAL
| - Location: BYTE_ARRAY, UTF8, OPTIONAL
  - Date and Time: BYTE_ARRAY, UTF8, OPTIONAL


In [None]:
# Report how many row groups the file has and how many rows each one holds.
print("\nRow Groups Information:")
print(f"Number of row groups: {len(pf.row_groups)}")
# Number the groups from 1 for display.
for position, row_group in enumerate(pf.row_groups, start=1):
    print(f"Row group {position}: {row_group.num_rows} rows")


Row Groups Information:
Number of row groups: 1
Row group 1: 10000 rows


In [None]:
# Print the min, max and null-count entries of the file's column statistics.
print("\nColumn Statistics:")
column_stats = pf.statistics
# (display label, key in the statistics mapping)
for label, stat_key in (("Min", "min"), ("Max", "max"), ("Nulls", "null_count")):
    print(f"{label}={column_stats[stat_key]}")


Column Statistics:
Min={'Transaction ID': [1000000000], 'Account ID': [None], 'Transaction Amount': [10.11], 'Transaction Type': [None], 'Merchant/Counterparty': [None], 'Location': [None], 'Date and Time': [None]}
Max={'Transaction ID': [1000009999], 'Account ID': [None], 'Transaction Amount': [1999.85], 'Transaction Type': [None], 'Merchant/Counterparty': [None], 'Location': [None], 'Date and Time': [None]}
Nulls={'Transaction ID': [0], 'Account ID': [0], 'Transaction Amount': [0], 'Transaction Type': [0], 'Merchant/Counterparty': [0], 'Location': [0], 'Date and Time': [0]}


In [None]:
# Dump the raw metadata of every column chunk in every row group, prefixed by
# the first component of the column's schema path.
print("\nCompression and Encoding Details:")
all_chunks = (chunk for row_group in pf.row_groups for chunk in row_group.columns)
for chunk in all_chunks:
    print(chunk.meta_data.path_in_schema[0], chunk)


Compression and Encoding Details:
Transaction ID column_index_length: null
column_index_offset: null
crypto_metadata: null
encrypted_column_metadata: null
file_offset: 4
file_path: null
meta_data:
  bloom_filter_offset: null
  codec: 1
  data_page_offset: 4
  dictionary_page_offset: null
  encoding_stats:
  - count: 1
    encoding: 0
    page_type: 0
  encodings:
  - 0
  index_page_offset: null
  key_value_metadata: []
  num_values: 10000
  path_in_schema:
  - Transaction ID
  statistics:
    distinct_count: null
    max: b'\x0f\xf1\x9a;\x00\x00\x00\x00'
    max_value: null
    min: b'\x00\xca\x9a;\x00\x00\x00\x00'
    min_value: null
    null_count: 0
  total_compressed_size: 40069
  total_uncompressed_size: 80039
  type: 2
offset_index_length: null
offset_index_offset: null

Account ID column_index_length: null
column_index_offset: null
crypto_metadata: null
encrypted_column_metadata: null
file_offset: 40073
file_path: null
meta_data:
  bloom_filter_offset: null
  codec: 1
  data_pa

In [None]:
# Write the same frame as CSV and compare its on-disk size with the Parquet file.
import os

csv_file = "example.csv"
data.to_csv(csv_file, index=False)

# Sizes are in bytes; they are converted to KB only when printed.
csv_size, parquet_size = (os.path.getsize(path) for path in (csv_file, parquet_file))
print(f"\nFile Size Comparison:\nCSV Size: {csv_size / 1024:.2f} KB\nParquet Size: {parquet_size / 1024:.2f} KB")


File Size Comparison:
CSV Size: 757.73 KB
Parquet Size: 218.82 KB


In [None]:
# Print the file-level metadata object (`pf.fmd`) under a heading.
print("\nParquet Footer:", pf.fmd, sep="\n")


Parquet Footer:
column_orders: null
created_by: b'fastparquet-python version 2024.11.0 (build 0)'
encryption_algorithm: null
footer_signing_key_metadata: null
key_value_metadata:
- key: b'pandas'
  value: 'b''{"column_indexes": [{"field_name": null, "metadata": null, "name": null,
    "numpy_type": "object", "pandas_type": "mixed-integer"}], "columns": [{"field_name":
    "Transaction ID", "metadata": null, "name": "Transaction ID", "numpy_type": "int64",
    "pandas_type": "int64"}, {"field_name": "Account ID", "metadata": null, "name":
    "Account ID", "numpy_type": "object", "pandas_type": "unicode"}, {"field_name":
    "Transaction Amount", "metadata": null, "name": "Transaction Amount", "numpy_type":
    "float64", "pandas_type": "float64"}, {"field_name": "Transaction Type", "metadata":
    null, "name": "Transaction Type", "numpy_type": "object", "pandas_type": "unicode"},
    {"field_name": "Merchant/Counterparty", "metadata": null, "name": "Merchant/Counterparty",
    "numpy