In [1]:
from pathlib import Path

import glob
import hashlib
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [2]:
# Collect the monthly 2016 yellow-cab trip CSVs. glob returns matches in
# filesystem-dependent order, so sort them to get the same processing order
# on every run and machine.
csv_files = sorted(glob.glob('../data/yellow_tripdata_2016-*.csv'))

In [3]:
for csv_file in csv_files:
    # Derive the three Parquet output paths next to the source CSV.
    # ``with_suffix`` swaps the extension instead of slicing a hard-coded
    # three characters off the file name.
    p = Path(csv_file)
    parquet_file = p.with_suffix(".parquet")
    str_parquet_file = parquet_file.with_name(f"str_{parquet_file.name}")
    cat_parquet_file = parquet_file.with_name(f"cat_{parquet_file.name}")

    # Read in the CSV and already convert the datetime columns.
    # NOTE(review): ``infer_datetime_format`` is deprecated since pandas 2.0,
    # where strict format inference became the default; drop the argument once
    # older pandas versions no longer need to be supported.
    df = pd.read_csv(
        csv_file,
        parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"],
        index_col=False,
        infer_datetime_format=True,
    )

    # store_and_fwd_flag is actually boolean but read in as string.
    # Manually change it to have a more efficient storage. Every value that
    # is not exactly 'Y' (missing values included) becomes False.
    df['store_and_fwd_flag'] = df['store_and_fwd_flag'] == 'Y'

    # Variant 1: the plain data, stored with the default options:
    #  * a single RowGroup, no chunking
    #  * SNAPPY compression
    # NOTE(review): whether the defaults really produce a single RowGroup
    # depends on the installed pyarrow's default row-group size -- confirm.
    df.to_parquet(parquet_file, engine="pyarrow")

    # Variant 2: add a 'str' column holding the SHA-256 hex digest of each
    # row's string representation, then store the frame again.
    df['str'] = df.apply(
        lambda row: hashlib.sha256(str(row).encode()).hexdigest(),
        axis=1,
    )
    df.to_parquet(str_parquet_file, engine="pyarrow")

    # Variant 3: reduce each digest to its first three hex characters joined
    # by dashes (e.g. "a-b-c"), which leaves at most 16**3 distinct values in
    # the column, and store the frame a third time.
    df['str'] = df['str'].map(lambda digest: "-".join(digest[:3]))
    df.to_parquet(cat_parquet_file, engine="pyarrow")

In [6]:
for csv_file in csv_files:
    # For every source CSV, locate the "str_" and "cat_" Parquet variants that
    # live next to it and the CSV twins to create from them. The CSV targets
    # are the source file name with a prefix; the Parquet inputs are those
    # same names with the extension swapped.
    p = Path(csv_file)
    str_csv_file = p.with_name(f"str_{p.name}")
    cat_csv_files = p.with_name(f"cat_{p.name}")
    str_parquet_file = str_csv_file.with_suffix(".parquet")
    cat_parquet_file = cat_csv_files.with_suffix(".parquet")

    # Write the 'str' variant back out as CSV, without the index column.
    df = pd.read_parquet(str_parquet_file)
    df.to_csv(str_csv_file, index=False)

    # Same for the 'cat' variant.
    df = pd.read_parquet(cat_parquet_file)
    df.to_csv(cat_csv_files, index=False)