In [1]:
%load_ext autoreload
%autoreload 2

## Download playoff play-by-play (PBP) data and extract the 1997 Finals (Bulls–Jazz)

In [2]:

import os
import re
from pathlib import Path
from io import BytesIO, TextIOWrapper
from urllib.request import urlopen
import tarfile
import csv
from typing import Union, Sequence, Optional, List

# Make the project root importable so that `config` resolves.
# Assumes the notebook sits in `notebooks/` and config.py is in the repo root;
# adjust the path below if running elsewhere.
import sys, os
repo_root = Path("..").resolve()
if str(repo_root) not in sys.path:  # guard: re-running the cell must not pile up duplicates
    sys.path.append(str(repo_root))
import config

import pandas as pd

# Resolve every location declared in config first ...
RAW_DIR = Path(config.DATA_RAW_DIR)
PROCESSED_DIR = Path(config.DATA_PROCESSED_DIR)
RAW_PLAYOFFS = Path(config.DATA_RAW_PLAYOFFS)
RAW_FINALS = Path(config.DATA_RAW_FINALS)

# ... then make sure the two data directories exist (no-op when already present)
RAW_DIR.mkdir(parents=True, exist_ok=True)
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)


In [3]:

def load_nba_data(
    path: Union[Path, str],
    seasons: Union[Sequence, int] = (1996,),
    data: Union[Sequence, str] = ("nbastats",),
    seasontype: str = "po",
    league: str = "nba",
    in_memory: bool = True,
    use_pandas: bool = True
) -> Optional[Union[List, pd.DataFrame]]:
    """
    Load NBA play-by-play archives from the public GitHub dataset:
    https://github.com/shufinskiy/nba_data

    The dataset index (``list_data.txt``) is read as ``name=url`` lines; every
    entry whose name matches a requested ``{data}[_po]_{season}`` key is
    downloaded as a ``.tar.xz`` archive.

    Parameters
    ----------
    path : directory for downloaded archives. ``~`` is expanded and the
        directory is created if missing (in every mode); archives are only
        written there when ``in_memory`` is False.
    seasons : one season start year or a sequence of them.
    data : one dataset prefix or a sequence of them (e.g. ``"nbastats"``).
    seasontype : ``"rg"`` requests ``{data}_{season}``, ``"po"`` requests
        ``{data}_po_{season}``; any other value requests both.
    league : accepted for interface compatibility; not used by this function.
    in_memory : parse archives in memory instead of writing them to ``path``.
    use_pandas : with ``in_memory``, return a DataFrame rather than raw rows.

    Returns
    -------
    * ``in_memory and use_pandas``: one DataFrame with all archives stacked and
      an extra ``__archive_name`` column (an empty DataFrame if nothing matched).
    * ``in_memory`` only: a list of CSV rows (lists of strings); each archive
      contributes its header row as well.
    * otherwise: ``None``; each archive is saved as ``path/{name}.tar.xz``.

    Raises
    ------
    RuntimeError : a download answers with a non-200 status, or the CSV member
        of an archive is not a regular file.
    KeyError : an archive does not contain ``{name}.csv``.
    """
    path = Path(path).expanduser()
    if isinstance(seasons, int):
        seasons = (seasons,)
    if isinstance(data, str):
        data = (data,)

    regular_names = [f"{d}_{s}" for d in data for s in seasons]
    # Playoff archives always use the "po" infix. The catch-all branch used to
    # interpolate the raw seasontype (e.g. "all"), which never matched anything.
    playoff_names = [f"{d}_po_{s}" for d in data for s in seasons]
    if seasontype == "rg":
        need_data = regular_names
    elif seasontype == "po":
        need_data = playoff_names
    else:
        need_data = regular_names + playoff_names
    wanted = set(need_data)

    with urlopen("https://raw.githubusercontent.com/shufinskiy/nba_data/main/list_data.txt") as resp:
        listing = resp.read().decode("utf-8")

    # Split on the FIRST "=" only so URLs carrying a query string stay intact;
    # strip() drops stray whitespace / "\r" from CRLF line endings.
    selected = []
    for line in listing.splitlines():
        name, sep, url = line.partition("=")
        if sep and name.strip() in wanted:
            selected.append((name.strip(), url.strip()))

    path.mkdir(parents=True, exist_ok=True)

    frames = []  # one DataFrame per archive, concatenated once at the end
    rows = []    # raw CSV rows when pandas is not requested

    for name, url in selected:
        with urlopen(url) as response:
            if response.status != 200:
                raise RuntimeError(f"Failed to download: {url} (HTTP {response.status})")
            content = response.read()

        if not in_memory:
            (path / f"{name}.tar.xz").write_bytes(content)
            continue

        with tarfile.open(fileobj=BytesIO(content), mode="r:xz") as tar:
            csv_name = f"{name}.csv"
            extracted = tar.extractfile(tar.getmember(csv_name))
            if extracted is None:  # member exists but is not a regular file
                raise RuntimeError(f"{csv_name} in archive {name} is not a regular file")
            with extracted:
                if use_pandas:
                    part = pd.read_csv(extracted)
                    part["__archive_name"] = name
                    frames.append(part)
                else:
                    # newline="" lets the csv module handle embedded line breaks itself
                    text = TextIOWrapper(extracted, encoding="utf-8", newline="")
                    rows.extend(csv.reader(text))

    if not in_memory:
        return None
    if not use_pandas:
        return rows
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, axis=0, ignore_index=True)

# Pull the 1996–97 playoff play-by-play into one in-memory DataFrame
# (season key 1996 stands for the 1996–97 season)
df_playoffs = load_nba_data(
    path=RAW_DIR, seasons=(1996,), data=("nbastats",), seasontype="po",
    league="nba", in_memory=True, use_pandas=True,
)

# Sanity-check the download, order rows by game / period / event number, then persist
assert "GAME_ID" in df_playoffs.columns, "GAME_ID missing in downloaded data"
df_playoffs = df_playoffs.sort_values(by=["GAME_ID","PERIOD","EVENTNUM"], ignore_index=True)
df_playoffs.to_csv(RAW_PLAYOFFS, index=False)
print(f"Saved playoffs CSV → {RAW_PLAYOFFS} (rows={len(df_playoffs):,})")

# Extract Bulls–Jazz games (the 1997 Finals)
def collect_team_abbrevs(df: pd.DataFrame) -> pd.Series:
    """
    Collect the team abbreviations that appear in each game.

    For every ``GAME_ID`` group, the team-abbreviation columns present in ``df``
    are upper-cased, stripped and kept when they consist of 2-4 letters. Only
    when a game yields no abbreviation at all are the description columns
    scanned for the literal tokens CHI / UTA (a Bulls–Jazz specific heuristic).

    City columns (``*_TEAM_CITY``) are deliberately not consulted: a short city
    name such as "Utah" would pass the 2-4 letter check, show up as a bogus
    abbreviation ("UTAH") and suppress the description fallback.

    Parameters
    ----------
    df : play-by-play rows; must contain a ``GAME_ID`` column.

    Returns
    -------
    Series named ``"teams"``, indexed by ``GAME_ID``, whose values are sets of
    abbreviation strings (possibly empty).

    Raises
    ------
    KeyError : ``df`` has no ``GAME_ID`` column.
    """
    ABBREV_COL_CANDIDATES = [
        "PLAYER1_TEAM_ABBREVIATION","PLAYER2_TEAM_ABBREVIATION","PLAYER3_TEAM_ABBREVIATION",
        "TEAM_ABBREVIATION",
    ]
    DESC_COLS = ["HOMEDESCRIPTION","VISITORDESCRIPTION","NEUTRALDESCRIPTION"]

    # Loop-invariant work hoisted out of the per-game loop
    abbrev_pattern = re.compile(r"[A-Z]{2,4}")
    abbrev_cols = [c for c in ABBREV_COL_CANDIDATES if c in df.columns]
    desc_cols = [c for c in DESC_COLS if c in df.columns]

    team_sets = {}
    for gid, g in df.groupby("GAME_ID"):
        teams = set()
        for c in abbrev_cols:
            vals = g[c].dropna().astype(str).str.upper().str.strip()
            teams.update(v for v in vals.unique() if abbrev_pattern.fullmatch(v))
        if not teams:
            # Fallback: look for the two team tokens in the free-text descriptions
            for dcol in desc_cols:
                txt = " ".join(g[dcol].dropna().astype(str).tolist()).upper()
                padded = f" {txt} "  # lets a token at either end match with spaces around it
                if " CHI " in padded or " CHI." in txt or " CHI," in txt:
                    teams.add("CHI")
                if " UTA " in padded or " UTAH" in txt or " UTA," in txt:
                    teams.add("UTA")
        team_sets[gid] = teams
    return pd.Series(team_sets, name="teams")

def finals_game_ids_1997(df: pd.DataFrame) -> List:
    """
    Return the sorted GAME_IDs of games in which both CHI and UTA appear.

    Team membership per game comes from ``collect_team_abbrevs``. The IDs are
    returned with whatever type the ``GAME_ID`` values have in ``df`` (they are
    not converted to strings).
    """
    finalists = {"CHI", "UTA"}
    teams_by_game = collect_team_abbrevs(df)
    return sorted(gid for gid, teams in teams_by_game.items() if finalists <= teams)

# Keep only rows that came from the 1996 playoff archive, then find the CHI–UTA games in it
df_9697 = df_playoffs.loc[df_playoffs["__archive_name"].str.contains("_po_1996$")].copy()
finals_gids = finals_game_ids_1997(df_9697)
assert finals_gids, "No CHI–UTA playoff games found in 1996–97"

# Persist just those games and report what was detected
df_finals = df_9697.loc[df_9697["GAME_ID"].isin(finals_gids)].copy()
df_finals.to_csv(RAW_FINALS, index=False)
unique_gids = sorted(set(finals_gids))
print(f"Saved Finals CSV → {RAW_FINALS} (games={len(unique_gids)}, rows={len(df_finals):,})")
print("Detected GAME_IDs:", unique_gids)


Saved playoffs CSV → /Users/charilaostsarouchas/Documents/Harris/04_Blueprints/agentic_ai/20250820_Bulls_Highlights_Retrieval/data/raw/pbp_1996_1997_playoffs.csv (rows=32,083)
Saved Finals CSV → /Users/charilaostsarouchas/Documents/Harris/04_Blueprints/agentic_ai/20250820_Bulls_Highlights_Retrieval/data/raw/pbp_1997_finals_chi_uta.csv (games=6, rows=2,608)
Detected GAME_IDs: [49600083, 49600084, 49600085, 49600086, 49600087, 49600088]
