
# Bulls GraphRAG — End-to-End (Playoffs 1996–97 → Graph → Queries)

This notebook performs an **end-to-end pipeline**:

1. **Download** 1996–97 playoff play-by-play data and extract the **1997 Finals (Bulls–Jazz)**.
2. **Prepare** graph-friendly CSVs in pandas (events, players, games, and relationships).
3. **Load** the graph into **Neo4j** via the Python driver with **minimal Cypher**.
4. **Query** a few clutch moments (e.g., Steve Kerr’s dagger) for sanity checking.



In [33]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [34]:

import os
import re
from pathlib import Path
from io import BytesIO, TextIOWrapper
from urllib.request import urlopen
import tarfile
import csv
from typing import Union, Sequence, Optional, List

import pandas as pd

# Import config from project root. Adjust the path below if running elsewhere.
import sys
# Assume the notebook sits in `notebooks/` and config.py is in repo root:
repo_root = Path("..").resolve()
sys.path.append(str(repo_root))
import config

# Paths from config
RAW_DIR = Path(config.DATA_RAW_DIR)
RAW_DIR.mkdir(parents=True, exist_ok=True)
PROCESSED_DIR = Path(config.DATA_PROCESSED_DIR)
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

RAW_PLAYOFFS = Path(config.DATA_RAW_PLAYOFFS)
RAW_FINALS = Path(config.DATA_RAW_FINALS)

NEO4J_URI = config.NEO4J_URI
NEO4J_USER = config.NEO4J_USER
NEO4J_PASSWORD = config.NEO4J_PASSWORD

RAW_DIR, PROCESSED_DIR, RAW_PLAYOFFS, RAW_FINALS, NEO4J_URI


(PosixPath('/Users/charilaostsarouchas/Documents/Harris/04_Blueprints/agentic_ai/20250820_Bulls_Highlights_Retrieval/data/raw'),
 PosixPath('/Users/charilaostsarouchas/Documents/Harris/04_Blueprints/agentic_ai/20250820_Bulls_Highlights_Retrieval/data/processed'),
 PosixPath('/Users/charilaostsarouchas/Documents/Harris/04_Blueprints/agentic_ai/20250820_Bulls_Highlights_Retrieval/data/raw/pbp_1996_1997_playoffs.csv'),
 PosixPath('/Users/charilaostsarouchas/Documents/Harris/04_Blueprints/agentic_ai/20250820_Bulls_Highlights_Retrieval/data/raw/pbp_1997_finals_chi_uta.csv'),
 'bolt://localhost:7687')

## Load into Neo4j (minimal Cypher via Python driver)

In [35]:

from neo4j import GraphDatabase

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
driver.verify_connectivity()
print("Connected to Neo4j ✅")

CONSTRAINTS = [
    "CREATE CONSTRAINT player_id IF NOT EXISTS FOR (p:Player) REQUIRE p.player_id IS UNIQUE",
    "CREATE CONSTRAINT game_id   IF NOT EXISTS FOR (g:Game)   REQUIRE g.game_id IS UNIQUE",
    "CREATE CONSTRAINT event_id  IF NOT EXISTS FOR (e:Event)  REQUIRE e.event_id IS UNIQUE",
]
with driver.session() as session:
    for c in CONSTRAINTS:
        session.run(c)
print("Constraints ensured ✅")

# Load CSVs
#players = pd.read_csv(PROCESSED_DIR / "players.csv")
players = pd.read_csv(PROCESSED_DIR / "players.csv").dropna()
games   = pd.read_csv(PROCESSED_DIR / "games.csv")
events  = pd.read_csv(PROCESSED_DIR / "events.csv")
rels_next      = pd.read_csv(PROCESSED_DIR / "rels_next.csv")
rels_in_game   = pd.read_csv(PROCESSED_DIR / "rels_in_game.csv")
rels_performed = pd.read_csv(PROCESSED_DIR / "rels_performed.csv")

events.rename(columns={"EVENT_ID":"event_id"}, inplace=True)
rels_next.rename(columns={"EVENT_ID":"event_id"}, inplace=True)

def iter_batches(df: pd.DataFrame, size: int = 10000):
    n = len(df)
    if n == 0:
        yield df
        return
    for i in range(0, n, size):
        yield df.iloc[i:i+size]

# Players
CYPHER_PLAYERS = """
UNWIND $rows AS row
MERGE (p:Player {player_id: toString(row.player_id)})
  ON CREATE SET p.name = row.name
  ON MATCH  SET p.name = coalesce(p.name, row.name)
"""
with driver.session() as session:
    for batch in iter_batches(players[["player_id","name"]]):
        session.run(CYPHER_PLAYERS, rows=batch.to_dict("records"))
print("Players ingested ✅")

# Games
CYPHER_GAMES = """
UNWIND $rows AS row
MERGE (g:Game {game_id: toString(row.game_id)})
"""
with driver.session() as session:
    for batch in iter_batches(games[["game_id"]]):
        session.run(CYPHER_GAMES, rows=batch.to_dict("records"))
print("Games ingested ✅")

# Events
event_cols = [
    "event_id","game_id","PERIOD","SECONDS_LEFT_PERIOD","SECONDS_LEFT_GAME",
    "EVENTMSGTYPE","EVENTMSGACTIONTYPE","SCORE","SCOREMARGIN",
    "HOMEDESCRIPTION","VISITORDESCRIPTION","IS_CLUTCH"
]
events_for_db = events[event_cols].copy()

CYPHER_EVENTS = """
UNWIND $rows AS row
MERGE (e:Event {event_id: row.event_id})
ON CREATE SET e.period = row.PERIOD,
              e.seconds_left_period = row.SECONDS_LEFT_PERIOD,
              e.seconds_left_game   = row.SECONDS_LEFT_GAME,
              e.event_type          = row.EVENTMSGTYPE,
              e.event_action        = row.EVENTMSGACTIONTYPE,
              e.score               = row.SCORE,
              e.score_margin        = row.SCOREMARGIN,
              e.home_desc           = row.HOMEDESCRIPTION,
              e.visit_desc          = row.VISITORDESCRIPTION,
              e.is_clutch           = row.IS_CLUTCH
ON MATCH  SET e.period = coalesce(e.period, row.PERIOD),
              e.seconds_left_period = coalesce(e.seconds_left_period, row.SECONDS_LEFT_PERIOD),
              e.seconds_left_game   = coalesce(e.seconds_left_game, row.SECONDS_LEFT_GAME),
              e.event_type          = coalesce(e.event_type, row.EVENTMSGTYPE),
              e.event_action        = coalesce(e.event_action, row.EVENTMSGACTIONTYPE),
              e.score               = coalesce(e.score, row.SCORE),
              e.score_margin        = coalesce(e.score_margin, row.SCOREMARGIN),
              e.home_desc           = coalesce(e.home_desc, row.HOMEDESCRIPTION),
              e.visit_desc          = coalesce(e.visit_desc, row.VISITORDESCRIPTION),
              e.is_clutch           = coalesce(e.is_clutch, row.IS_CLUTCH)
"""
with driver.session() as session:
    for batch in iter_batches(events_for_db, 10000):
        session.run(CYPHER_EVENTS, rows=batch.to_dict("records"))
print("Events ingested ✅")

# IN_GAME
rels_in_game = rels_in_game[["event_id","game_id"]].copy()
CYPHER_IN_GAME = """
UNWIND $rows AS row
MATCH (e:Event {event_id: row.event_id})
MATCH (g:Game  {game_id: toString(row.game_id)})
MERGE (e)-[:IN_GAME]->(g)
"""
with driver.session() as session:
    for batch in iter_batches(rels_in_game, 20000):
        session.run(CYPHER_IN_GAME, rows=batch.to_dict("records"))
print("IN_GAME relationships ingested ✅")

# NEXT
rels_next_clean = rels_next.dropna(subset=["NEXT_EVENT_ID"]).copy()
rels_next_clean["NEXT_EVENT_ID"] = rels_next_clean["NEXT_EVENT_ID"].astype(str)
CYPHER_NEXT = """
UNWIND $rows AS row
MATCH (e1:Event {event_id: row.event_id})
MATCH (e2:Event {event_id: row.NEXT_EVENT_ID})
MERGE (e1)-[:NEXT]->(e2)
"""
with driver.session() as session:
    for batch in iter_batches(rels_next_clean[["event_id","NEXT_EVENT_ID"]], 20000):
        session.run(CYPHER_NEXT, rows=batch.to_dict("records"))
print("NEXT relationships ingested ✅")

# PERFORMED
rels_performed["player_id"] = rels_performed["player_id"].astype(str)
rels_performed["event_id"]  = rels_performed["event_id"].astype(str)
rels_performed["role"] = rels_performed["role"].astype(str)
CYPHER_PERFORMED = """
UNWIND $rows AS row
MATCH (p:Player {player_id: row.player_id})
MATCH (e:Event  {event_id: row.event_id})
MERGE (p)-[:PERFORMED {role: row.role}]->(e)
"""
with driver.session() as session:
    for batch in iter_batches(rels_performed[["player_id","event_id","role"]], 20000):
        session.run(CYPHER_PERFORMED, rows=batch.to_dict("records"))
print("PERFORMED relationships ingested ✅")


Connected to Neo4j ✅
Constraints ensured ✅
Players ingested ✅
Games ingested ✅
Events ingested ✅
IN_GAME relationships ingested ✅
NEXT relationships ingested ✅
PERFORMED relationships ingested ✅


In [36]:
# Re-load to be safe
import pandas as pd, os, config
from neo4j import GraphDatabase

rels_performed = pd.read_csv(os.path.join(config.DATA_PROCESSED_DIR, "rels_performed.csv")).dropna()
rels_performed["player_id"] = rels_performed["player_id"].astype(str)
rels_performed["event_id"]  = rels_performed["event_id"].astype(str)
rels_performed["role"]      = rels_performed["role"].astype(str)

print("Will ingest PERFORMED rows:", len(rels_performed))

driver = GraphDatabase.driver(config.NEO4J_URI, auth=(config.NEO4J_USER, config.NEO4J_PASSWORD))

def iter_batches(df, size=20000):
    n = len(df)
    for i in range(0, n, size):
        yield df.iloc[i:i+size]

# IMPORTANT: Use MERGE with properties so shooter/assister become separate edges if needed
CYPHER_PERFORMED = """
UNWIND $rows AS row
MATCH (p:Player {player_id: row.player_id})
MATCH (e:Event  {event_id: row.event_id})
MERGE (p)-[r:PERFORMED {role: row.role}]->(e)
"""

with driver.session() as session:
    for batch in iter_batches(rels_performed[["player_id","event_id","role"]]):
        session.run(CYPHER_PERFORMED, rows=batch.to_dict("records"))


driver.close()

Will ingest PERFORMED rows: 7824


## Sanity checks & example queries

In [37]:

# Node counts
with driver.session() as session:
    counts = session.run("""
    MATCH (g:Game)   WITH count(g) AS g
    MATCH (p:Player) WITH g, count(p) AS p
    MATCH (e:Event)  RETURN g AS games, p AS players, count(e) AS events
    """).data()[0]
counts


  with driver.session() as session:


{'games': 6, 'players': 22, 'events': 2608}

In [38]:

# Steve Kerr clutch made shots (EVENTMSGTYPE=1)
query = """
MATCH (p:Player {name: 'Steve Kerr'})-[:PERFORMED]->(e:Event {is_clutch: true})
MATCH (e)-[:IN_GAME]->(g:Game)
WHERE e.event_type = 1
RETURN g.game_id AS game, e.period AS period, e.seconds_left_period AS sec_left,
       e.score AS score, e.score_margin AS margin, e.home_desc AS home_desc, e.visit_desc AS visit_desc
ORDER BY game, period DESC, sec_left
LIMIT 10
"""
with driver.session() as session:
    rows = session.run(query).data()
rows


  with driver.session() as session:


[{'game': '49600088',
  'period': 4,
  'sec_left': 5,
  'score': '86 - 88',
  'margin': 2.0,
  'home_desc': "Kerr 14' Jump Shot (9 PTS) (Jordan 4 AST)",
  'visit_desc': nan}]

In [39]:

# Non-Jordan clutch makes assisted by Jordan
query = """
MATCH (e:Event {is_clutch: true, event_type: 1})-[:IN_GAME]->(g:Game)
MATCH (mj:Player {name: 'Michael Jordan'})-[:PERFORMED {role:'PLAYER2_ID'}]->(e)
MATCH (scorer:Player)-[:PERFORMED {role:'PLAYER1_ID'}]->(e)
WHERE scorer.name <> 'Michael Jordan'
RETURN g.game_id AS game, scorer.name AS scorer, mj.name AS assister,
       e.period AS period, e.seconds_left_period AS sec_left,
       e.score AS score, e.score_margin AS margin, e.home_desc AS home_desc, e.visit_desc AS visit_desc
ORDER BY game, sec_left
LIMIT 10
"""
with driver.session() as session:
    rows = session.run(query).data()
rows


  with driver.session() as session:


[{'game': '49600088',
  'scorer': 'Steve Kerr',
  'assister': 'Michael Jordan',
  'period': 4,
  'sec_left': 5,
  'score': '86 - 88',
  'margin': 2.0,
  'home_desc': "Kerr 14' Jump Shot (9 PTS) (Jordan 4 AST)",
  'visit_desc': nan}]