In [12]:
import requests

In [31]:
# Exploratory single request against the pbpstats possessions endpoint.
url = "https://api.pbpstats.com/get-possessions/nba"
params = {
    "0Exactly1OnFloor": "1629027", # format index(Exactly|GreaterThan|LessThan)number(OnFloor|OffFloor|PlayedInGame|DidNotPlayInGame|Started|CameOffBench)
    "TeamId": "1610612744",
    "Season": "2024-25",
    "SeasonType": "Regular Season",
    "OffDef": "Offense", # Offense or Defense
    "StartType": "All", # see below for possible values for StartType
    "Leverage": "VeryHigh"
}
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, params=params, timeout=30)
# Surface HTTP errors directly instead of as a JSON-decoding or KeyError failure below.
response.raise_for_status()
response_json = response.json()
possessions = response_json["possessions"]
player_stats = response_json["player_results"]
team_stats = response_json["team_results"]


In [112]:
import logging
import time
from datetime import datetime, timedelta
from typing import Any, Dict, List

import requests

def get_possessions(
    team_id:   str,
    offdef: str,
) -> List[Dict[str, Any]]:
    """
    Fetch possessions for one team from the pbpstats `get-possessions/nba` endpoint.

    The query is fixed to Season "2024-25", SeasonType "Regular Season",
    StartType "All" and Leverage "VeryHigh"; only the team and the side vary.

    Args:
        team_id: Team id sent as the `TeamId` query parameter.
        offdef: should be "Offense" or "Defense"; sent as the `OffDef` parameter.

    Returns:
        The list stored under the "possessions" key of the JSON response
        (empty if the key is missing). An empty list is also returned when the
        body is not valid JSON, when the status is neither 200 nor a retryable
        429/503, or when every attempt was answered with 429/503.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    url = "https://api.pbpstats.com/get-possessions/nba"
    params = {
        "TeamId":     team_id,
        "Season":     "2024-25",
        "SeasonType": "Regular Season",
        "StartType":  "All",
        "Leverage":   "VeryHigh",
        "OffDef": offdef
    }
    headers = {
        "Accept":     "application/json, text/plain, */*",
        "Origin":     "https://www.pbpstats.com",
        "Referer":    "https://www.pbpstats.com/",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    }
    log = logging.getLogger(__name__)
    max_attempts = 3
    timeout_seconds = 30

    for attempt in range(max_attempts):
        # Without a timeout a stalled connection would block the notebook indefinitely.
        resp = requests.get(url, params=params, headers=headers, timeout=timeout_seconds)
        status = resp.status_code

        if status == 200:
            try:
                data = resp.json()
            except ValueError:
                # invalid or empty JSON
                return []
            return data.get("possessions", [])

        if status in (429, 503):
            # rate limited or server busy: exponential backoff (1s, 2s, ...),
            # but do not sleep after the final attempt since nothing follows it.
            if attempt < max_attempts - 1:
                time.sleep(2 ** attempt)
            continue

        # unexpected status: report it and give up on this request
        log.warning(
            "pbpstats returned HTTP %s for TeamId=%s OffDef=%s; returning no possessions",
            status, team_id, offdef,
        )
        return []

    # if we exhausted retries
    log.warning(
        "pbpstats still rate limited after %s attempts for TeamId=%s OffDef=%s",
        max_attempts, team_id, offdef,
    )
    return []


def fetch_full_season(
    team_id:   str,
    start:     str = "2024-10-19",
    end:       str = "2025-04-16"
) -> Dict[str, List[Dict[str, Any]]]:
    """
    Fetch the Offense and Defense possessions that `get_possessions`
    returns for `team_id`.

    `get_possessions` takes only `(team_id, offdef)` and has no date filter:
    each call already queries the whole 2024-25 regular season. The earlier
    day-by-day loop passed four arguments to it (a TypeError) and, with the
    two-argument signature, would have appended the same result once per day.
    Each side is therefore fetched exactly once.

    Args:
        team_id: Team id forwarded to `get_possessions`.
        start: First day of the window, "YYYY-MM-DD". Validated only; it is
            not used to filter the results.
        end: Last day of the window, "YYYY-MM-DD". Validated only.

    Returns:
        {"Offense": [...], "Defense": [...]}. Both lists are empty when
        `start` is after `end`, as they were with the original loop.

    Raises:
        ValueError: if `start` or `end` is not in "YYYY-MM-DD" format.
    """
    # Keep parsing the dates so malformed input still fails loudly.
    start_dt = datetime.strptime(start, "%Y-%m-%d").date()
    end_dt = datetime.strptime(end, "%Y-%m-%d").date()

    # An empty window never triggered a request in the loop-based version.
    if start_dt > end_dt:
        return {
            "Offense": [],
            "Defense": []
        }

    offense_all: List[Dict[str, Any]] = list(get_possessions(team_id, "Offense"))
    defense_all: List[Dict[str, Any]] = list(get_possessions(team_id, "Defense"))

    return {
        "Offense": offense_all,
        "Defense": defense_all
    }

In [113]:
gsw = "1610612744"
rockets = "1610612745"

# One result list per (team, side) pair, in the order:
# gsw Offense, gsw Defense, rockets Offense, rockets Defense.
possessions = [
    get_possessions(team, side)
    for team in (gsw, rockets)
    for side in ("Offense", "Defense")
]

In [120]:
# Flatten the per-request lists in `possessions` into a single list of possession records.
pos = [record for batch in possessions for record in batch]

In [121]:
import re
from typing import List, Dict, Any, Tuple
import pandas as pd
import numpy as np

# ——————————————————————————————————————————————————————————————
# 1) Regex‑based normalization of possession “Events” strings
# ——————————————————————————————————————————————————————————————
# Every player-name group below uses the class [\w\.\- ]: word characters,
# dots, hyphens and spaces. A name containing any other character (for
# example an apostrophe) will not be matched by these patterns.

# Shot attempt: "<MAKE|MISS> <player> <distance>' <shot type>",
# e.g. "MISS Curry 25' 3PT Step Back Jump Shot". The MAKE/MISS prefix and the
# distance followed by an apostrophe are both required.
SHOT_RE = re.compile(
    r'^(?P<outcome>MAKE|MISS)\s+'
    r'(?P<player>[\w\.\- ]+)\s+'
    r"(?P<distance>\d+)'\s+"
    r'(?P<shot_type>.+)$'
)
# Player rebound with two counters: "<player> REBOUND (Off:<n> Def:<n>)".
REB_RE = re.compile(
    r'^(?P<player>[\w\.\- ]+)\s+REBOUND\s+'
    r'\(Off:(?P<off>\d+)\s+Def:(?P<def>\d+)\)$'
)
# Any line ending in "Rebound" (case-insensitive); everything before it is
# captured as `team`. Because of IGNORECASE this also matches "... REBOUND"
# lines that lack the "(Off:.. Def:..)" suffix required by REB_RE.
GEN_REB_RE = re.compile(r'^(?P<team>.+)\s+Rebound$', re.IGNORECASE)
# Block: "<blocker> BLOCK (<n> BLK): <rest>"; `rest` is the remainder of the line.
BLOCK_RE = re.compile(
    r'^(?P<blocker>[\w\.\- ]+)\s+BLOCK\s+\((?P<count>\d+)\s+BLK\):\s*(?P<rest>.+)$'
)
# Steal: "<stealer> STEAL (<n> STL): <rest>"; `rest` is the remainder of the line.
STEAL_RE = re.compile(
    r'^(?P<stealer>[\w\.\- ]+)\s+STEAL\s+\((?P<count>\d+)\s+STL\):\s*(?P<rest>.+)$'
)
# Free throw: optional "MAKE "/"MISS " prefix, then
# "<player> Free Throw <number> of <total>", optionally followed by "(<pts> PTS)".
FT_RE = re.compile(
    r'^(?:(?P<outcome>MAKE|MISS)\s+)?'
    r'(?P<player>[\w\.\- ]+)\s+Free Throw\s+'
    r'(?P<number>\d+)\s+of\s+(?P<total>\d+)'
    r'(?:\s+\((?P<pts>\d+)\s+PTS\))?$'
)
# Foul (case-insensitive): "<player> <X>.FOUL" where <X> is at most one
# letter, optionally followed by a parenthesised suffix captured as `refs`.
# NOTE(review): the meaning of the `refs` text is not visible here — confirm
# against real event strings.
FOUL_RE = re.compile(
    r'^(?P<player>[\w\.\- ]+)\s+'
    r'(?P<foul_type>[A-Z]?\.FOUL)'
    r'(?:\s+\((?P<refs>.+)\))?$', 
    re.IGNORECASE
)

def _parse_event_line(line: str) -> List[Dict[str, Any]]:
    """Parse one stripped, non-empty event line into one or two event dicts.

    Patterns are tried in a fixed order (block, steal, free throw, shot,
    player rebound, generic rebound, foul); the first match wins and anything
    unmatched becomes a single UNKNOWN event carrying the raw text.
    """
    # BLOCK + nested shot: the text after the colon describes the blocked attempt.
    m = BLOCK_RE.match(line)
    if m:
        g = m.groupdict()
        events = [{'type':'BLOCK','player':g['blocker'],'count':int(g['count'])}]
        nested = g['rest']
        m2 = SHOT_RE.match(nested)
        if m2:
            ng = m2.groupdict()
            events.append({
                'type':'SHOT','player':ng['player'],
                'distance':int(ng['distance']),
                'shot_type':ng['shot_type'],'outcome':ng['outcome']
            })
        else:
            events.append({'type':'UNKNOWN','description':nested})
        return events

    # STEAL + nested turnover: the remainder is kept verbatim as the turnover text.
    m = STEAL_RE.match(line)
    if m:
        g = m.groupdict()
        return [
            {'type':'STEAL','player':g['stealer'],'count':int(g['count'])},
            {'type':'TURNOVER','description':g['rest']},
        ]

    # Free Throw: a line without a MAKE/MISS prefix is recorded as a MAKE.
    m = FT_RE.match(line)
    if m:
        g = m.groupdict()
        return [{
            'type':'FREE_THROW','player':g['player'],
            'outcome':g.get('outcome') or 'MAKE',
            'number':int(g['number']),'total':int(g['total']),
            'points':int(g['pts']) if g.get('pts') else None
        }]

    # Shot
    m = SHOT_RE.match(line)
    if m:
        g = m.groupdict()
        return [{
            'type':'SHOT','player':g['player'],
            'distance':int(g['distance']),
            'shot_type':g['shot_type'],
            'outcome':g['outcome']
        }]

    # Rebound credited to a player, with the two counters from the line
    m = REB_RE.match(line)
    if m:
        g = m.groupdict()
        return [{
            'type':'REBOUND','player':g['player'],
            'offensive':int(g['off']),'defensive':int(g['def'])
        }]

    # Rebound line without counters; the leading text is stored as `team`
    m = GEN_REB_RE.match(line)
    if m:
        return [{'type':'REBOUND','team':m.group('team').strip()}]

    # Foul
    m = FOUL_RE.match(line)
    if m:
        g = m.groupdict()
        return [{
            'type':'FOUL','player':g['player'],
            'foul_type':g['foul_type'],'refs':g.get('refs')
        }]

    # fallback
    return [{'type':'UNKNOWN','description':line}]


def normalize_possessions(raw_poss: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Convert raw possession dicts into records with structured events.

    Each output record copies every key of the input except 'Events' and
    'VideoUrls', then adds:
      * 'PossessionId'  - 1-based position in `raw_poss`
      * 'StartTimeSec' / 'EndTimeSec' - 'StartTime' / 'EndTime' ("MM:SS") in seconds
      * 'Events'        - list of event dicts parsed line by line from the
                          'Events' string (blank lines are skipped)
      * 'VideoUrls'     - the input entries that have a truthy 'url'

    A missing or None 'Events' / 'VideoUrls' value is treated as empty.

    Raises:
        KeyError: if a possession lacks 'StartTime' or 'EndTime'.
        ValueError: if a time is not of the form "<int>:<int>".
    """
    def to_seconds(ts: str) -> int:
        mm, ss = map(int, ts.split(':'))
        return 60*mm + ss

    normalized = []
    for pid, poss in enumerate(raw_poss, start=1):
        rec = {k:v for k,v in poss.items() if k not in ('Events','VideoUrls')}
        rec['PossessionId'] = pid
        rec['StartTimeSec'] = to_seconds(poss['StartTime'])
        rec['EndTimeSec']   = to_seconds(poss['EndTime'])

        evs: List[Dict[str, Any]] = []
        # `or ''` also covers an explicit None, which .get's default would not.
        for line in (poss.get('Events') or '').splitlines():
            line = line.strip()
            if line:
                evs.extend(_parse_event_line(line))

        rec['Events']    = evs
        rec['VideoUrls'] = [v for v in (poss.get('VideoUrls') or []) if v.get('url')]
        normalized.append(rec)
    return normalized

# ——————————————————————————————————————————————————————————————
# 2) Flatten into one‑event‑per‑row DataFrame
# ——————————————————————————————————————————————————————————————
def build_event_df(normed: List[Dict[str,Any]]) -> pd.DataFrame:
    """Flatten normalized possessions into a DataFrame with one row per event.

    Args:
        normed: Records as produced by `normalize_possessions`; each must have
            'GameId', 'PossessionId', 'Events', 'StartTimeSec', 'EndTimeSec',
            'Period' and 'StartScoreDifferential'.

    Returns:
        A DataFrame with the columns listed in `columns` below, always in that
        order. Empty input (or possessions with no events) yields an empty
        frame that still carries those columns, so downstream column lookups
        do not fail with KeyError. A possession with no events contributes no
        rows.
    """
    columns = [
        'GameId', 'PossessionId', 'EventIdx', 'EventType', 'Player',
        'Distance', 'ShotType', 'Outcome', 'OffCount', 'DefCount',
        'StartTimeSec', 'EndTimeSec', 'Period', 'ScoreDiff',
    ]
    rows = [
        {
            'GameId':        p['GameId'],
            'PossessionId':  p['PossessionId'],
            'EventIdx':      idx,  # 1-based position within the possession
            'EventType':     ev.get('type','UNKNOWN'),
            # team-level events carry 'team' instead of 'player'
            'Player':        ev.get('player', ev.get('team','UNKNOWN')),
            'Distance':      ev.get('distance', 0),
            'ShotType':      ev.get('shot_type',''),
            'Outcome':       ev.get('outcome',''),
            # for events without 'offensive' (e.g. BLOCK/STEAL) the event's
            # 'count' is stored here
            'OffCount':      ev.get('offensive', ev.get('count', 0)),
            'DefCount':      ev.get('defensive', 0),
            'StartTimeSec':  p['StartTimeSec'],
            'EndTimeSec':    p['EndTimeSec'],
            'Period':        p['Period'],
            'ScoreDiff':     p['StartScoreDifferential'],
        }
        for p in normed
        for idx, ev in enumerate(p['Events'], start=1)
    ]
    # Passing `columns` pins the schema even when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

# ——————————————————————————————————————————————————————————————
# 3) Build vocabularies for embeddings
# ——————————————————————————————————————————————————————————————
def make_vocabs(df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str,Any]]:
    """Build id<->value vocabularies and add a 'DistanceBin' column.

    Args:
        df: Event frame with at least the columns 'EventType', 'Player',
            'ShotType', 'Outcome', 'Distance', 'OffCount' and 'DefCount'.

    Returns:
        (frame, vocabs) where `frame` is a copy of `df` with an added
        'DistanceBin' column (the input is left untouched) and `vocabs` maps
        a column name to {'id2v': ..., 'v2id': ...}:
          * categorical columns: ids start at 1 and follow the sorted string
            values, so id 0 is never assigned;
          * 'DistanceBin': ids 0-6, labelled by the lower edge of each bin
            ("0", "10", ..., "60");
          * 'OffCount' / 'DefCount': identity mapping over 0..max(15, largest
            observed count), so every count in the frame has an id.
    """
    # Work on a copy: adding 'DistanceBin' to the caller's frame is a hidden
    # side effect that makes re-running notebook cells non-idempotent.
    df = df.copy()
    vocabs: Dict[str,Any] = {}

    def mk(col):
        vals = df[col].astype(str).unique().tolist()
        id2v = {i+1:v for i,v in enumerate(sorted(vals))}
        v2id = {v:i    for i,v in id2v.items()}
        return id2v, v2id

    for col in ('EventType','Player','ShotType','Outcome'):
        id2v,v2id = mk(col)
        vocabs[col] = {'id2v':id2v,'v2id':v2id}

    # distance bins: edges 0,10,...,60,inf. pd.cut intervals are right-closed,
    # and include_lowest puts a distance of exactly 0 into the first bin.
    bins = list(range(0,61,10)) + [np.inf]
    df['DistanceBin'] = pd.cut(df['Distance'], bins=bins, labels=False, include_lowest=True)
    id2v = {i:str(bins[i]) for i in range(len(bins)-1)}
    v2id = {str(bins[i]):i for i in range(len(bins)-1)}
    vocabs['DistanceBin'] = {'id2v':id2v,'v2id':v2id}

    # count identity: at least 0–15, widened when the data holds larger counts
    min_upper = 15
    for c in ('OffCount','DefCount'):
        observed_max = df[c].max() if len(df) else None
        if observed_max is None or pd.isna(observed_max):
            upper = min_upper
        else:
            upper = max(min_upper, int(observed_max))
        id2v = {i:i for i in range(upper + 1)}
        v2id = id2v.copy()
        vocabs[c] = {'id2v':id2v,'v2id':v2id}

    return df, vocabs

# ——————————————————————————————————————————————————————————————
# 4) Encode to integer ID columns
# ——————————————————————————————————————————————————————————————
def encode_df(df: pd.DataFrame, vocabs: Dict[str,Any]) -> pd.DataFrame:
    """Return a copy of `df` with the integer `*_id` feature columns appended.

    Categorical columns are translated through the matching 'v2id' mapping in
    `vocabs` (values absent from the mapping become NaN); the remaining id
    columns are plain copies of their source column. The input frame is not
    modified.
    """
    encoded = df.copy()

    # (new column, source column) pairs looked up in the vocabularies
    vocab_backed = (
        ('event_type_id', 'EventType'),
        ('player_id',     'Player'),
        ('shot_type_id',  'ShotType'),
        ('outcome_id',    'Outcome'),
    )
    for target, source in vocab_backed:
        encoded[target] = encoded[source].map(vocabs[source]['v2id'])

    # (new column, source column) pairs copied through unchanged:
    # binned/count features first, then temporal/context features
    copied_as_is = (
        ('distance_bin_id', 'DistanceBin'),
        ('off_count_id',    'OffCount'),
        ('def_count_id',    'DefCount'),
        ('period_id',       'Period'),
        ('start_time_id',   'StartTimeSec'),
        ('end_time_id',     'EndTimeSec'),
        ('score_diff_id',   'ScoreDiff'),
    )
    for target, source in copied_as_is:
        encoded[target] = encoded[source]

    return encoded

# ——————————————————————————————————————————————————————————————
# 5) Top‑level pipeline function
# ——————————————————————————————————————————————————————————————
def preprocess_possessions_for_transformer(
    raw_possessions: List[Dict[str,Any]]
) -> Tuple[pd.DataFrame, Dict[str,Any]]:
    """Run the full pipeline: normalize -> flatten -> build vocabs -> encode.

    Returns:
        A 2-tuple `(encoded_frame, vocabs)`: the one-row-per-event DataFrame
        including every `*_id` column, and the vocabularies used to build it.
    """
    events = build_event_df(normalize_possessions(raw_possessions))
    events_binned, vocabs = make_vocabs(events)
    encoded = encode_df(events_binned, vocabs)
    return encoded, vocabs


In [122]:
# preprocess_possessions_for_transformer returns a (DataFrame, vocabs) tuple,
# so despite its name `df` is that tuple: df[0] is the encoded frame and
# df[1] holds the vocabularies.
df = preprocess_possessions_for_transformer(pos)


In [129]:
# `df` is a (DataFrame, vocabs) tuple; element 0 is the encoded event DataFrame.
df[0]

Unnamed: 0,GameId,PossessionId,EventIdx,EventType,Player,Distance,ShotType,Outcome,OffCount,DefCount,...,player_id,shot_type_id,outcome_id,distance_bin_id,off_count_id,def_count_id,period_id,start_time_id,end_time_id,score_diff_id
0,0022401204,1,1,BLOCK,Smith Jr.,0,,,2,0,...,59,1,1,0,2,0,4,3,0,-1
1,0022401204,1,2,SHOT,Podziemski,25,3PT Jump Shot,MISS,0,0,...,50,2,3,2,0,0,4,3,0,-1
2,0022401204,1,3,REBOUND,Thompson,0,,,2,4,...,60,1,1,0,2,4,4,3,0,-1
3,0022401204,2,1,SHOT,Curry,25,3PT Step Back Jump Shot,MISS,0,0,...,17,6,3,2,0,0,4,27,5,1
4,0022401204,2,2,REBOUND,Payton II,0,,,3,4,...,49,1,1,0,3,4,4,27,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242,0022400068,101,1,UNKNOWN,UNKNOWN,0,,,0,0,...,63,1,1,0,0,0,4,61,40,0
243,0022400037,102,1,SHOT,Edwards,4,Driving Floating Bank Jump Shot,MISS,0,0,...,22,9,3,0,0,0,4,12,0,0
244,0022400037,102,2,REBOUND,Sengun,0,,,3,5,...,58,1,1,0,3,5,4,12,0,0
245,0022400037,103,1,UNKNOWN,UNKNOWN,0,,,0,0,...,63,1,1,0,0,0,4,38,28,0


In [135]:
import pandas as pd

def parquet_to_jsonl_all_columns(df, jsonl_path: str):
    """Write every column of `df` to `jsonl_path` as JSON Lines and print a summary.

    Despite the name, nothing is read from parquet: `df` is an in-memory
    DataFrame. One JSON object is written per row.
    """
    n_records, n_columns = len(df), len(df.columns)

    # orient='records' + lines=True emits one JSON object per line
    df.to_json(jsonl_path, orient='records', lines=True)

    print(f"Wrote {n_records} records with {n_columns} columns to {jsonl_path}")


# The helper defined in this cell is `parquet_to_jsonl_all_columns`; the name
# `df_to_jsonl` is not defined anywhere and raises NameError on a fresh kernel.
# `df` is a (DataFrame, vocabs) tuple, so df[0] is the encoded frame.
parquet_to_jsonl_all_columns(df[0], "/Users/abhi/Desktop/jsonltraining.jsonl")


Wrote 247 /Users/abhi/Desktop/jsonltraining.jsonl
