In [0]:
from pathlib import Path
import pandas as pd

# Locate the repo root: the nearest folder (cwd first, then each ancestor)
# that contains both README.md and RUN.md.
here = Path.cwd()
repo_root = next(
    (
        folder
        for folder in (here, *here.parents)
        if all((folder / marker).exists() for marker in ("README.md", "RUN.md"))
    ),
    None,  # no folder on the way up held both marker files
)

print("cwd:", here)
print("repo_root:", repo_root)

# Everything below builds paths from repo_root, so stop here if it was not found.
if repo_root is None:
    raise FileNotFoundError("Could not locate repo root (README.md and RUN.md not found above cwd).")

# Input locations are defined before logging is configured so the log can reference them
data_dir = repo_root.joinpath("data_samples")   # keep your current choice for now
menu_path = data_dir.joinpath("menu_items.csv")
orders_path = data_dir.joinpath("order_details.csv")

# Quick diagnostics: does the folder exist, what is in it, and are both CSVs present?
data_dir_found = data_dir.exists()
print("data_dir:", data_dir)
print("data_dir exists:", data_dir_found)
if data_dir_found:
    sample_names = [entry.name for entry in data_dir.iterdir()]
    print("files in data_samples:", sample_names)
for csv_label, csv_path in (("menu_path:", menu_path), ("orders_path:", orders_path)):
    print(csv_label, csv_path, "exists:", csv_path.exists())

# ---- Part B: Logging setup (after variables exist) ----
import logging, sys
from datetime import datetime
import platform, socket

# One log file per run, stored under <repo_root>/logs
logs_dir = repo_root / "logs"
logs_dir.mkdir(parents=True, exist_ok=True)

# Minute-resolution timestamp: re-runs within the same minute append to the same file
run_ts = datetime.now().strftime("%Y%m%d_%H%M")
log_path = logs_dir / f"run_{run_ts}.log"

logger = logging.getLogger("lab_2_4")
logger.setLevel(logging.INFO)
logger.propagate = False  # keep records out of the root logger to avoid duplicate lines

# getLogger() returns the same object every time this cell is re-run, so handlers from
# the previous run are still attached. Detach AND close them: clearing the list alone
# would leave the old FileHandler's file descriptor open on every re-run.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

fmt = logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")

# Echo records in the notebook output...
console = logging.StreamHandler(sys.stdout)
console.setLevel(logging.INFO)
console.setFormatter(fmt)

# ...and persist them to the run's log file
fileh = logging.FileHandler(log_path, mode="a", encoding="utf-8")
fileh.setLevel(logging.INFO)
fileh.setFormatter(fmt)

logger.addHandler(console)
logger.addHandler(fileh)

# Run header: environment and input locations, for later troubleshooting
logger.info("Run started")
logger.info(f"repo_root={repo_root}")
logger.info(f"log_path={log_path}")
logger.info(f"Python={sys.version.split()[0]}")
logger.info(f"Platform={platform.platform()}")
logger.info(f"Host={socket.gethostname()}")
logger.info(f"data_dir={data_dir}")
logger.info(f"menu_path={menu_path}")
logger.info(f"orders_path={orders_path}")
# -----------------------------------------------

# Read both input CSVs into DataFrames (menu first, then order lines)
menu_items, order_details = map(pd.read_csv, (menu_path, orders_path))

logger.info(f"Loaded menu_items: {menu_items.shape}")
logger.info(f"Loaded order_details: {order_details.shape}")

# Last expression of the cell: a preview of the first rows of each frame
(menu_items.head(), order_details.head())


In [0]:
import os, random, json, hashlib
import numpy as np
from pathlib import Path

# seeds — a single constant drives every RNG so the values (and the log line) cannot drift apart
SEED = 0
# NOTE: PYTHONHASHSEED is read when an interpreter starts, so this assignment only
# reaches processes launched from here; hashing in the already-running kernel is unchanged.
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
logger.info(f"Reproducibility: seeds set (PYTHONHASHSEED={SEED}, random={SEED}, numpy={SEED})")

# sha256 for the two input CSVs
def sha256_file(path: "str | os.PathLike[str]", chunk_size: int = 1024 * 1024) -> str:
    """Return the hex SHA-256 digest of a file, reading it in fixed-size chunks.

    Args:
        path: File to hash; a ``pathlib.Path`` or anything ``Path()`` accepts (e.g. a str).
        chunk_size: Number of bytes read per iteration (default 1 MiB), so the file
            is never loaded into memory in one piece.

    Returns:
        Lowercase hexadecimal SHA-256 digest of the file's bytes.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
        OSError: If the file cannot be opened or read.
    """
    # read(0) returns b"" immediately, which would end the loop and silently
    # produce the digest of an empty file — reject it up front.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    h = hashlib.sha256()
    with Path(path).open("rb") as f:
        # iter(callable, sentinel): keep reading until read() returns b"" at EOF
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

# Fingerprint each input file, keyed by its bare file name
inputs = [menu_path, orders_path]
hashes = {}
for csv_file in inputs:
    hashes[csv_file.name] = sha256_file(csv_file)

for file_name, digest in hashes.items():
    logger.info(f"SHA-256 {file_name}: {digest}")

# Persist the fingerprints at the repo root as pretty-printed JSON
hash_path = repo_root / "data_hashes.json"
with hash_path.open("w", encoding="utf-8") as hash_file:
    json.dump(hashes, hash_file, indent=2)
logger.info(f"Wrote data hashes to {hash_path}")


In [0]:
%pip freeze > requirements.txt

In [0]:
from pathlib import Path

# Copy the freeze output from the notebook's working directory into the repo root
src = Path.cwd().joinpath("requirements.txt")
dst = repo_root.joinpath("requirements.txt")

src_found = src.exists()
print("wrote here:", src, "exists:", src_found)

# Without the source file there is nothing to copy
if not src_found:
    raise FileNotFoundError("requirements.txt wasn't created in the notebook CWD. If %pip ran, Databricks wrote it elsewhere.")

# Read fully before writing, so this still works when src and dst are the same file
frozen_requirements = src.read_text(encoding="utf-8")
dst.write_text(frozen_requirements, encoding="utf-8")
logger.info(f"Wrote requirements to {dst}")
print("repo requirements:", dst, "exists:", dst.exists())


In [0]:
# Record each frame's column names in the log, then echo them in the cell output
menu_cols = list(menu_items.columns)
order_cols = list(order_details.columns)

logger.info(f"menu_items columns: {menu_cols}")
logger.info(f"order_details columns: {order_cols}")

print("menu_items columns:", menu_cols)
print("order_details columns:", order_cols)


In [0]:
import pandas as pd

# clean text columns (trim) — missing values must stay missing
for df in (menu_items, order_details):
    for c in df.select_dtypes(include=["object"]).columns:
        # astype(str) alone turns NaN/None into the literal strings "nan"/"None",
        # which then pass notna() checks and surface as fake names/categories.
        # where() keeps the original value wherever it is null and only swaps in
        # the stringified, stripped text for populated cells.
        df[c] = df[c].where(df[c].isna(), df[c].astype(str).str.strip())

# Coerce identifier columns to nullable integers and price to numeric;
# values that cannot be parsed become missing instead of raising.
def _to_nullable_int(values):
    """Parse ``values`` as numbers (bad entries -> NA) and cast to nullable Int64."""
    return pd.to_numeric(values, errors="coerce").astype("Int64")

menu_items["menu_item_id"] = _to_nullable_int(menu_items["menu_item_id"])
menu_items["price"] = pd.to_numeric(menu_items["price"], errors="coerce")

for id_col in ("item_id", "order_id", "order_details_id"):
    order_details[id_col] = _to_nullable_int(order_details[id_col])

# Combine the date and time text into one datetime column; rows that cannot be parsed become NaT
date_text = order_details["order_date"].astype(str)
time_text = order_details["order_time"].astype(str)
order_details["order_ts"] = pd.to_datetime(date_text + " " + time_text, errors="coerce")

logger.info("Cleaned types and created order_ts")

# Left-join every order line to its menu entry (order lines without a match are kept).
# validate="many_to_one" makes pandas raise if a menu_item_id appears more than once.
joined = pd.merge(
    order_details,
    menu_items,
    how="left",
    left_on="item_id",
    right_on="menu_item_id",
    validate="many_to_one",
)

logger.info(f"Joined shape: {joined.shape}")

# Reduce the join to the reporting columns, then add per-line quantity and revenue
tidy_columns = ["order_id", "order_ts", "item_name", "category", "price"]
tidy = joined.loc[:, tidy_columns].assign(quantity=1)  # each row is one item in this dataset
tidy = tidy.assign(line_total=tidy["price"] * tidy["quantity"])

logger.info(f"Tidy shape: {tidy.shape}")
tidy.head()


In [0]:
from datetime import datetime
import os

# metrics
# Five best-selling items by total units
top5_items = (
    tidy.groupby("item_name")["quantity"]
    .sum()
    .sort_values(ascending=False)
    .head(5)
    .reset_index(name="total_quantity")
)

# Revenue per menu category, largest first
revenue_by_category = (
    tidy.groupby("category")["line_total"]
    .sum()
    .sort_values(ascending=False)
    .reset_index(name="revenue")
)

# Hour of day with the most items sold (rows without a timestamp are excluded)
busiest_hour = (
    tidy.dropna(subset=["order_ts"])
        # Take the hour from the rows that survived dropna(): reading it from the
        # unfiltered `tidy` only works through index alignment and produces float
        # hours (12.0) whenever NaT rows exist.
        .assign(hour=lambda d: d["order_ts"].dt.hour)
        .groupby("hour")["quantity"]
        .sum()
        .sort_values(ascending=False)
        .head(1)
        .reset_index(name="items_sold")
)

logger.info("Computed metrics: top5_items, revenue_by_category, busiest_hour")

display(top5_items)
display(revenue_by_category)
display(busiest_hour)

# save combined metrics file (simple tidy format)
# One record per metric value, shaped as metric / dimension / value
metrics_rows = [
    {"metric": "top_item_by_quantity", "dimension": item, "value": int(units)}
    for item, units in zip(top5_items["item_name"], top5_items["total_quantity"])
]
metrics_rows.extend(
    {"metric": "revenue_by_category", "dimension": cat, "value": float(amount)}
    for cat, amount in zip(revenue_by_category["category"], revenue_by_category["revenue"])
)

# busiest_hour has either one row or none (no usable timestamps)
if len(busiest_hour) == 1:
    peak_hour = str(int(busiest_hour.at[0, "hour"]))
    peak_items = int(busiest_hour.at[0, "items_sold"])
    metrics_rows.append({"metric": "busiest_hour", "dimension": peak_hour, "value": peak_items})

metrics_df = pd.DataFrame(metrics_rows)

# Timestamped CSV under /dbfs (presumably the Databricks DBFS mount — confirm for other runtimes)
out_ts = datetime.now().strftime("%Y%m%d_%H%M%S")
out_dir_dbfs = "/dbfs/FileStore/tables/etl_output"
os.makedirs(out_dir_dbfs, exist_ok=True)

metrics_file = f"metrics_{out_ts}.csv"
metrics_out_path = f"{out_dir_dbfs}/{metrics_file}"
metrics_df.to_csv(metrics_out_path, index=False)

logger.info(f"Wrote metrics CSV: {metrics_out_path}")
print("Browser path:", f"/files/tables/etl_output/{metrics_file}")

metrics_df.head(10)


In [0]:
from datetime import datetime
import pandas as pd

# make sure the DBFS folder exists
dbutils.fs.mkdirs("dbfs:/FileStore/tables/etl_output")

# build metrics_df again (in case the previous cell stopped early)
# Five best-selling items by total units
top5_items = (
    tidy.groupby("item_name")["quantity"]
    .sum()
    .sort_values(ascending=False)
    .head(5)
    .reset_index(name="total_quantity")
)

# Revenue per menu category, largest first
revenue_by_category = (
    tidy.groupby("category")["line_total"]
    .sum()
    .sort_values(ascending=False)
    .reset_index(name="revenue")
)

# Hour of day with the most items sold (rows without a timestamp are excluded)
busiest_hour = (
    tidy.dropna(subset=["order_ts"])
        # Take the hour from the rows that survived dropna(): reading it from the
        # unfiltered `tidy` only works through index alignment and produces float
        # hours (12.0) whenever NaT rows exist.
        .assign(hour=lambda d: d["order_ts"].dt.hour)
        .groupby("hour")["quantity"]
        .sum()
        .sort_values(ascending=False)
        .head(1)
        .reset_index(name="items_sold")
)

# Flatten the three metric tables into metric / dimension / value records
metrics_rows = [
    {"metric": "top_item_by_quantity", "dimension": item, "value": int(units)}
    for item, units in zip(top5_items["item_name"], top5_items["total_quantity"])
]
metrics_rows += [
    {"metric": "revenue_by_category", "dimension": cat, "value": float(amount)}
    for cat, amount in zip(revenue_by_category["category"], revenue_by_category["revenue"])
]

# busiest_hour has either one row or none (no usable timestamps)
if len(busiest_hour) == 1:
    peak_hour = str(int(busiest_hour.at[0, "hour"]))
    peak_items = int(busiest_hour.at[0, "items_sold"])
    metrics_rows.append({"metric": "busiest_hour", "dimension": peak_hour, "value": peak_items})

metrics_df = pd.DataFrame(metrics_rows)

# write locally, then copy to dbfs:/FileStore/...
out_ts = datetime.now().strftime("%Y%m%d_%H%M%S")
metrics_file = f"metrics_{out_ts}.csv"
local_tmp = f"/tmp/{metrics_file}"
dbfs_path = f"dbfs:/FileStore/tables/etl_output/{metrics_file}"

metrics_df.to_csv(local_tmp, index=False)
# "file:" marks the source as a path on the local filesystem for dbutils
dbutils.fs.cp(f"file:{local_tmp}", dbfs_path)

logger.info(f"Wrote metrics CSV to {dbfs_path}")
print("Browser path:", f"/files/tables/etl_output/{metrics_file}")

metrics_df.head(10)


In [0]:
from datetime import datetime
from pathlib import Path

# Save a timestamped copy of the metrics inside the repo, under etl_pipeline/etl_output
out_ts = datetime.now().strftime("%Y%m%d_%H%M%S")

out_dir = repo_root.joinpath("etl_pipeline", "etl_output")
out_dir.mkdir(parents=True, exist_ok=True)

metrics_repo_path = out_dir.joinpath(f"metrics_{out_ts}.csv")
metrics_df.to_csv(metrics_repo_path, index=False)

logger.info(f"Wrote metrics CSV to repo: {metrics_repo_path}")
print("metrics saved to:", metrics_repo_path)


In [0]:
# Rebuild order_ts with an explicit format instead of letting pandas infer one.
# NOTE(review): `joined` and `tidy` were built in an earlier cell from the previous
# order_ts; this assignment does not propagate to them unless the join cell is re-run.
ORDER_TS_FORMAT = "%Y-%m-%d %H:%M:%S"
order_ts_text = order_details["order_date"].astype(str) + " " + order_details["order_time"].astype(str)
order_details["order_ts"] = pd.to_datetime(
    order_ts_text,
    format=ORDER_TS_FORMAT,
    errors="coerce"
)
logger.info("Rebuilt order_ts using explicit format %Y-%m-%d %H:%M:%S")

# errors="coerce" turns every row that does not match the format into NaT without
# raising; report how many were lost so a wrong format cannot go unnoticed.
unparsed_rows = int(order_details["order_ts"].isna().sum())
if unparsed_rows:
    logger.warning(
        f"order_ts: {unparsed_rows} of {len(order_details)} rows did not match "
        f"{ORDER_TS_FORMAT} and were set to NaT"
    )


In [0]:
# Assert: the join matched at least one menu row (this does not require every row to match)
all_names_missing = joined["item_name"].isna().all()
assert not all_names_missing, "Join failed: item_name is all null"

# Assert: tidy expected columns exist
expected_cols = ["order_id", "order_ts", "item_name", "category", "price", "quantity", "line_total"]
present_cols = tidy.columns
missing = [col for col in expected_cols if col not in present_cols]
assert not missing, f"Tidy missing columns: {missing}"

# Assert: tidy and metrics not empty (at least one row each)
for frame, failure_msg in ((tidy, "Tidy output is empty"), (metrics_df, "Metrics output is empty")):
    assert frame.shape[0] > 0, failure_msg

logger.info("Assert checks passed")
print("asserts passed")
