In [2]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Input: location of the starter CSV.
DATA_PATH = Path("data/starter_data.csv")

# Output: directory for processed files. Created up front, together with any
# missing parent directories; a no-op when it already exists.
PROC_DIR = Path("data/processed")
PROC_DIR.mkdir(exist_ok=True, parents=True)

RuntimeError: CPU dispatcher tracer already initlized

In [None]:
import time

# Input data: the floats 1.0, 2.0, ..., 1,000,000.0
arr = np.arange(1, 1_000_001, dtype=np.float64)

# --- NumPy path: square every element with one vectorized expression ---
t0 = time.perf_counter()
vec_sq = arr ** 2
vec_time = time.perf_counter() - t0

# --- Python path: square one element at a time (deliberately slow) ---
# The output buffer is allocated inside the timed region.
t0 = time.perf_counter()
loop_sq = np.empty_like(arr)
for idx in range(arr.size):
    value = arr[idx]
    loop_sq[idx] = value * value
loop_time = time.perf_counter() - t0

# Report both timings and confirm the two approaches agree.
print(f"Vectorized time: {vec_time:.6f}s")
print(f"Loop time:       {loop_time:.6f}s")
print("Results equal?   ", np.allclose(vec_sq, loop_sq))


In [None]:
from pathlib import Path
import pandas as pd

DATA_PATH = Path("data/starter_data.csv")

# Read the CSV (parsing "date" as datetimes), then convert the "category"
# column to pandas' categorical dtype in the same chain.
df = (
    pd.read_csv(DATA_PATH, parse_dates=["date"])
    .assign(category=lambda loaded: loaded["category"].astype("category"))
)

# Inspect: dtypes / non-null counts first, then the first few rows.
df.info()
df.head()


In [None]:
# Descriptive statistics for every numeric column, transposed so that each
# row is one column of `df` and each column is one statistic.
num_cols = df.select_dtypes(include="number").columns
summary = df.loc[:, num_cols].describe().transpose()
summary


In [None]:
# Per-category mean / sum / count of every numeric column.
# "category" was cast to a categorical dtype when the data was loaded, and
# grouping on a categorical key without an explicit `observed=` emits a
# FutureWarning on recent pandas (the implicit default is changing).
# Passing observed=False pins the long-standing behaviour: every declared
# category gets a row, whether or not any rows of `df` belong to it.
gb = df.groupby("category", observed=False)[list(num_cols)].agg(["mean", "sum", "count"])
gb


In [None]:
import matplotlib.pyplot as plt

# Output directory for processed files; (re)declared and created here so the
# writes below do not depend on another cell having made it.
PROC_DIR = Path("data/processed")
PROC_DIR.mkdir(parents=True, exist_ok=True)

# Save summary stats
summary_csv = PROC_DIR / "summary.csv"
summary_json = PROC_DIR / "summary.json"
summary.to_csv(summary_csv, index=True)
# `summary` is a transposed .describe(), so its index holds the name of the
# column each row of statistics belongs to. orient="records" does not write
# the index, which would leave the JSON records unidentifiable; promote the
# index to a regular "column" field before serialising.
summary.rename_axis("column").reset_index().to_json(
    summary_json, orient="records", indent=2
)

# Save groupby summary too (handy)
gb.to_csv(PROC_DIR / "groupby_summary.csv", index=True)

# Bonus plot: histogram of the first numeric column.
# An explicit figure/axes pair keeps the plot, the saved file and the close
# call all tied to the same figure instead of pyplot's implicit current one.
if len(num_cols):
    first_num = num_cols[0]
    fig, ax = plt.subplots()
    df[first_num].plot(kind="hist", bins=20, title=f"Histogram of {first_num}", ax=ax)
    ax.set_xlabel(first_num)
    fig.tight_layout()
    fig.savefig(PROC_DIR / f"hist_{first_num}.png", bbox_inches="tight")
    plt.close(fig)  # release the figure once it is on disk


In [None]:
def get_summary_stats(frame: pd.DataFrame, cat_col: str = "category", observed: bool = False):
    """
    Return (numeric_describe, groupby_stats) for the given DataFrame.

    Parameters
    ----------
    frame : pd.DataFrame
        Data to summarise. Only its numeric columns are described/aggregated.
    cat_col : str, default "category"
        Name of the column to group by.
    observed : bool, default False
        Forwarded to ``DataFrame.groupby``. Only matters when ``cat_col`` has a
        categorical dtype: ``False`` keeps a row for every declared category
        (even ones with no rows in ``frame``), ``True`` keeps only categories
        that actually occur. It is always passed explicitly so the result does
        not depend on pandas' changing implicit default and no FutureWarning
        is emitted.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        ``numeric_describe`` is ``describe()`` of the numeric columns,
        transposed (one row per column). ``groupby_stats`` holds the mean, sum
        and count of each numeric column per value of ``cat_col``.
        ``numeric_describe`` is an empty DataFrame when ``frame`` has no
        numeric columns; ``groupby_stats`` is empty in that case too, and
        also when ``cat_col`` is not a column of ``frame``.
    """
    nums = frame.select_dtypes(include="number").columns

    # Nothing numeric: neither table can be computed.
    if not len(nums):
        return pd.DataFrame(), pd.DataFrame()

    desc = frame[nums].describe().T

    # No grouping column: the describe table is still valid on its own.
    if cat_col not in frame.columns:
        return desc, pd.DataFrame()

    grouped = (
        frame.groupby(cat_col, observed=observed)[list(nums)]
        .agg(["mean", "sum", "count"])
    )
    return desc, grouped

# Run the helper on the loaded frame and preview both results; an empty
# grouped table is shown as-is rather than through .head().
desc2, grouped2 = get_summary_stats(df, cat_col="category")
(
    desc2.head(),
    grouped2 if grouped2.empty else grouped2.head(),
)
