In [2]:
import sys, os
# Append the parent of the current working directory to the import search path.
# NOTE(review): presumably this is what makes the `src` package (imported in the
# next cell) importable when the notebook runs from a subfolder — confirm layout.
sys.path.append(os.path.abspath(".."))


In [3]:
# HW05 — Data Storage (CSV + Parquet)
import os
import pandas as pd
from src.config import load_env
# `paths` must expose `.raw` and `.processed` attributes; both are joined with
# file names below. NOTE(review): what else load_env() does (e.g. reading a
# .env file, creating the directories) is not visible here — confirm in src.config.
paths = load_env()

# 1) Load example data. The path is relative to the current working directory
#    (one level up), and the "date" column is parsed into datetimes on read.
df = pd.read_csv("../data/starter_data.csv", parse_dates=["date"])
print(df.shape, df.dtypes)

# 2) Save to CSV and Parquet
#    The CSV goes under paths.raw, the Parquet file under paths.processed.
#    NOTE(review): neither directory is created here — assumes they already exist.
csv_path = os.path.join(paths.raw, "sample_storage.csv")
parq_path = os.path.join(paths.processed, "sample_storage.parquet")
df.to_csv(csv_path, index=False)
print("Saved CSV:", csv_path)

# Parquet writing is best-effort: any failure is reported and the script
# continues, so the notebook still runs without a Parquet engine installed.
try:
    df.to_parquet(parq_path, index=False)
    print("Saved Parquet:", parq_path)
except Exception as e:
    print("Parquet save failed — install pyarrow or fastparquet. Error:", e)

# 3) Reload and validate
#    CSV stores dates as text, so "date" has to be re-parsed explicitly on read.
df_csv = pd.read_csv(csv_path, parse_dates=["date"])
print("Reload CSV:", df_csv.shape)
# The Parquet reload is also best-effort; it fails here if the save above
# failed. No parse_dates is passed; the dtype print shows what came back.
try:
    df_parq = pd.read_parquet(parq_path)
    print("Reload Parquet:", df_parq.shape)
    print("Dtype check (date):", df_parq["date"].dtype)
except Exception as e:
    print("Parquet read failed:", e)

# 4) Utility helpers
from pathlib import Path

def write_df(df: pd.DataFrame, path: "str | os.PathLike[str]"):
    """Write *df* to *path*, choosing the format from the file suffix.

    The suffix check is case-insensitive. Missing parent directories are
    created, but only after the suffix has been accepted, so a rejected
    path leaves no directories behind.

    Args:
        df: Frame to persist. The index is not written.
        path: Destination ending in ``.csv`` or ``.parquet``; may be a
            string or any ``os.PathLike`` (e.g. ``pathlib.Path``).

    Raises:
        ValueError: If the suffix is neither ``.csv`` nor ``.parquet``.
        RuntimeError: If the Parquet write fails. The message separates a
            missing engine from any other failure; the original exception
            is chained as ``__cause__``.
    """
    # Normalise first so pathlib.Path inputs work with the string checks below.
    path = os.fspath(path)
    lowered = path.lower()

    # Validate before touching the filesystem.
    if not lowered.endswith((".csv", ".parquet")):
        raise ValueError("Unknown suffix; use .csv or .parquet")

    Path(path).parent.mkdir(parents=True, exist_ok=True)

    if lowered.endswith(".csv"):
        df.to_csv(path, index=False)
        return

    try:
        df.to_parquet(path, index=False)
    except ImportError as e:
        # pandas raises ImportError when neither pyarrow nor fastparquet is usable.
        raise RuntimeError("Missing Parquet engine. Try: pip install pyarrow") from e
    except Exception as e:
        # Still RuntimeError so existing callers keep working, but do not
        # blame the engine for unrelated failures (permissions, bad dtypes, ...).
        raise RuntimeError(f"Failed to write Parquet file {path!r}: {e}") from e

def read_df(path: "str | os.PathLike[str]", **kwargs) -> pd.DataFrame:
    """Read a DataFrame from *path*, choosing the reader from the file suffix.

    The suffix check is case-insensitive.

    Args:
        path: Source ending in ``.csv`` or ``.parquet``; may be a string or
            any ``os.PathLike`` (e.g. ``pathlib.Path``).
        **kwargs: Passed through unchanged to ``pd.read_csv`` or
            ``pd.read_parquet``. For example, ``parse_dates=["date"]``
            restores datetime columns, which CSV stores as plain text.

    Returns:
        The loaded frame.

    Raises:
        ValueError: If the suffix is neither ``.csv`` nor ``.parquet``.
    """
    # Normalise first so pathlib.Path inputs work with the string checks below.
    path = os.fspath(path)
    lowered = path.lower()

    if lowered.endswith(".csv"):
        return pd.read_csv(path, **kwargs)
    if lowered.endswith(".parquet"):
        return pd.read_parquet(path, **kwargs)
    raise ValueError("Unknown suffix; use .csv or .parquet")

# demo: exercise the helpers defined above
demo_csv_path = os.path.join(paths.raw, "demo.csv")
write_df(df, demo_csv_path)
# The Parquet half is best-effort so the demo still runs when the write fails
# (e.g. no Parquet engine installed); the reason is printed instead of raised.
try:
    write_df(df, os.path.join(paths.processed, "demo.parquet"))
except Exception as e:
    print("Parquet demo skipped:", e)

# Read back the file that write_df just produced, so this checks the helper
# round trip rather than the CSV saved earlier with df.to_csv.
print("Validation OK: shapes equal?", df.shape == read_df(demo_csv_path).shape)


(90, 4) date        datetime64[ns]
category            object
value1             float64
value2             float64
dtype: object
Saved CSV: data/raw/sample_storage.csv
Saved Parquet: data/processed/sample_storage.parquet
Reload CSV: (90, 4)
Reload Parquet: (90, 4)
Dtype check (date): datetime64[ns]
Validation OK: shapes equal? True
