In [5]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Prepare graph-friendly CSVs in pandas

In [7]:

import os
import re
from pathlib import Path
from io import BytesIO, TextIOWrapper
from urllib.request import urlopen
import tarfile
import csv
from typing import Union, Sequence, Optional, List

# fetch paths
import sys, os
sys.path.append(os.path.abspath('..'))
import config

import pandas as pd

# Import config from project root. Adjust the path below if running elsewhere.
import sys
# Assume the notebook sits in `notebooks/` and config.py is in repo root:
repo_root = Path("..").resolve()
sys.path.append(str(repo_root))
import config

# Paths from config
RAW_DIR = Path(config.DATA_RAW_DIR)
RAW_DIR.mkdir(parents=True, exist_ok=True)
PROCESSED_DIR = Path(config.DATA_PROCESSED_DIR)
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

RAW_PLAYOFFS = Path(config.DATA_RAW_PLAYOFFS)
RAW_FINALS = Path(config.DATA_RAW_FINALS)


In [8]:

df = pd.read_csv(RAW_FINALS)
df = df.sort_values(["GAME_ID","PERIOD","EVENTNUM"]).reset_index(drop=True)

def to_seconds_left(pctimestr: str):
    try:
        m, s = str(pctimestr).split(":")
        return int(m)*60 + int(s)
    except Exception:
        return None

df["SECONDS_LEFT_PERIOD"] = df["PCTIMESTRING"].apply(to_seconds_left)
df["SECONDS_LEFT_GAME"] = (4 - df["PERIOD"]).clip(lower=0)*720 + df["SECONDS_LEFT_PERIOD"]

df["SCOREMARGIN"] = pd.to_numeric(df["SCOREMARGIN"], errors="coerce")
df["SCOREMARGIN"].fillna(method="ffill", inplace=True)

df["EVENT_ID"] = df["GAME_ID"].astype(str) + "_" + df["EVENTNUM"].astype(str)

df["IS_CLUTCH"] = (df["PERIOD"] == 4) & (df["SECONDS_LEFT_PERIOD"] <= 30) & (df["SCOREMARGIN"].abs() <= 3)

df["NEXT_EVENT_ID"] = df.groupby("GAME_ID")["EVENT_ID"].shift(-1)


frames = []
for pid_col, name_col, team_col in [
    ("PLAYER1_ID","PLAYER1_NAME","PLAYER1_TEAM_ABBREVIATION"),
    ("PLAYER2_ID","PLAYER2_NAME","PLAYER2_TEAM_ABBREVIATION"),
    ("PLAYER3_ID","PLAYER3_NAME","PLAYER3_TEAM_ABBREVIATION"),
]:
    sub = df[[pid_col, name_col, team_col]].rename(columns={
        pid_col: "player_id", name_col: "name", team_col: "team"
    })
    frames.append(sub)

players = pd.concat(frames, ignore_index=True)
players = players.dropna(subset=["player_id"]).drop_duplicates()
players["player_id"] = players["player_id"].astype(str)




games = df[["GAME_ID"]].drop_duplicates().rename(columns={"GAME_ID":"game_id"})




events = df[[
    "EVENT_ID","GAME_ID","PERIOD","SECONDS_LEFT_PERIOD","SECONDS_LEFT_GAME",
    "EVENTMSGTYPE","EVENTMSGACTIONTYPE","SCORE","SCOREMARGIN",
    "HOMEDESCRIPTION","VISITORDESCRIPTION","IS_CLUTCH"
]].rename(columns={"GAME_ID":"game_id"})

# Relationships
rels_next = df[["EVENT_ID","NEXT_EVENT_ID"]].dropna()
rels_in_game = df[["EVENT_ID","GAME_ID"]].rename(columns={"GAME_ID":"game_id"})

rels_performed = []
for col in ["PLAYER1_ID","PLAYER2_ID","PLAYER3_ID"]:
    sub = df[["EVENT_ID", col]].dropna()
    sub.columns = ["event_id","player_id"]
    sub["role"] = col
    rels_performed.append(sub)
rels_performed = pd.concat(rels_performed)

# Save
players.to_csv(PROCESSED_DIR / "players.csv", index=False)
games.to_csv(PROCESSED_DIR / "games.csv", index=False)
events.to_csv(PROCESSED_DIR / "events.csv", index=False)
rels_next.to_csv(PROCESSED_DIR / "rels_next.csv", index=False)
rels_in_game.rename(columns={"EVENT_ID":"event_id"}, inplace=True)
rels_in_game.to_csv(PROCESSED_DIR / "rels_in_game.csv", index=False)
rels_performed.to_csv(PROCESSED_DIR / "rels_performed.csv", index=False)

print("Processed CSVs saved to:", PROCESSED_DIR)


Processed CSVs saved to: /Users/charilaostsarouchas/Documents/Harris/04_Blueprints/agentic_ai/20250820_Bulls_Highlights_Retrieval/data/processed


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["SCOREMARGIN"].fillna(method="ffill", inplace=True)
  df["SCOREMARGIN"].fillna(method="ffill", inplace=True)
