In [1]:
import logging
import os
import pandas as pd

from datetime import datetime, timedelta
from nba_api.stats.static.teams import get_teams
from nba_api.stats.endpoints import playbyplayv2, leaguegamefinder
from sqlalchemy import create_engine

from nba_betting_ai.consts import proj_paths
from nba_betting_ai.data.ingest import scrape_everything


In [2]:
logging.basicConfig(level=logging.INFO)

logger = logging.getLogger()
proj_paths.logs.mkdir(exist_ok=True)
file_handler = logging.FileHandler(proj_paths.logs / 'ingest.log')
logger.addHandler(file_handler)

postgres_user = os.environ.get('POSTGRES_USER')
postgres_password = os.environ.get('POSTGRES_PASSWORD')
postgres_host = os.environ.get('POSTGRES_HOST')
postgres_port = os.environ.get('POSTGRES_PORT')
postgres_db = os.environ.get('POSTGRES_DB')

postgres_conn = f'postgresql://{postgres_user}:{postgres_password}@{postgres_host}:{postgres_port}/{postgres_db}'
engine = create_engine(postgres_conn)

In [1]:
import asyncio
import json
import time
from playwright.async_api import async_playwright

async def get_nba_headers():
    """
    Creates a browser session to obtain valid headers for NBA stats API requests.
    Returns a dictionary of headers that can be used for subsequent API calls.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        context = await browser.new_context(
            user_agent=(
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/120.0.0.0 Safari/537.36'
            )
        )
        
        # Create new page and navigate to NBA stats
        page = await context.new_page()
        await page.goto('https://nba.com/stats')
        
        # Wait for the page to load
        await asyncio.sleep(3)
        
        # Extract the New Relic script URL
        new_relic_script_url = await page.evaluate('''() => {
            const scripts = document.getElementsByTagName('script');
            for (const script of scripts) {
                if (script.src && script.src.includes('newrelic-prod.js')) {
                    return script.src;
                }
            }
            return null;
        }''')
        
        if not new_relic_script_url:
            print("New Relic script not found!")
            await browser.close()
            return None

        # Navigate to the script URL and extract its content
        await page.goto(new_relic_script_url)
        script_content = await page.evaluate('''() => {
            return document.body.innerText;
        }''')

        # Extract the configuration object from the script content
        config_start = script_content.find('NREUM.loader_config={') + len('NREUM.loader_config=')
        config_end = script_content.find(';', config_start)
        loader_config_raw = script_content[config_start:config_end]
        
        # Parse the loader config
        loader_config_fixed = loader_config_raw.replace('{', '{"').replace(',', ',"').replace(':', '":')
        try:
            loader_config = json.loads(loader_config_fixed)
        except Exception as e:
            print(f"Failed to parse loader config: {e}")
            loader_config = {}
            
        # Build headers
        headers = {
            'Accept': '*/*',
            'Accept-Language': 'en-US,en;q=0.9',
            'Connection': 'keep-alive',
            'Origin': 'https://nba.com/stats',
            'Referer': 'https://nba.com/stats',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-site',
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/120.0.0.0 Safari/537.36'
            ),
        }
        
        if 'X-NewRelic-ID' in loader_config:
            headers['X-NewRelic-ID'] = loader_config['X-NewRelic-ID']
        
        await browser.close()
        return headers, loader_config

# In a Jupyter cell, just do:
headers = await get_nba_headers()
print(headers)


({'Accept': '*/*', 'Accept-Language': 'en-US,en;q=0.9', 'Connection': 'keep-alive', 'Origin': 'https://nba.com/stats', 'Referer': 'https://nba.com/stats', 'Sec-Fetch-Dest': 'empty', 'Sec-Fetch-Mode': 'cors', 'Sec-Fetch-Site': 'same-site', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}, {'accountID': '2890198', 'trustKey': '2890199', 'agentID': '927622108', 'licenseKey': 'NRJS-93744526e47188ec9f0', 'applicationID': '927622108'})


In [4]:
%timeit

scrape_everything(engine=engine, season='2022-23', start_date=None, end_date=None, headers=None)

INFO:nba_betting_ai.data.ingest:Ingesting teams.
INFO:nba_betting_ai.data.ingest:Sleeping for 0.940699517640071 sec.
INFO:nba_betting_ai.data.ingest:Ingesting games.
INFO:nba_betting_ai.data.ingest:Ingesting new gameflows.
INFO:nba_betting_ai.data.ingest:Sleeping for 0.8044333291329769 sec.
INFO:nba_betting_ai.data.ingest:Ingesting gameflow for game 0022300542 (1/1993).
INFO:nba_betting_ai.data.ingest:Ingested gameflow for game 0022300542.
INFO:nba_betting_ai.data.ingest:Sleeping for 1.0189850327148788 sec.
INFO:nba_betting_ai.data.ingest:Ingesting gameflow for game 0022300353 (2/1993).
INFO:nba_betting_ai.data.ingest:Ingested gameflow for game 0022300353.
INFO:nba_betting_ai.data.ingest:Sleeping for 1.129465060180838 sec.
INFO:nba_betting_ai.data.ingest:Ingesting gameflow for game 0022300045 (3/1993).
INFO:nba_betting_ai.data.ingest:Ingested gameflow for game 0022300045.
INFO:nba_betting_ai.data.ingest:Sleeping for 1.1702820696083536 sec.
INFO:nba_betting_ai.data.ingest:Ingesting game

KeyboardInterrupt: 

In [10]:
def check_date_format(date: str):
    try:
        datetime.strptime(date, '%m/%d/%Y')
    except ValueError:
        raise ValueError("Incorrect date format, should be MM/DD/YYYY")

def scrape_games_between(start_date: str, end_date: str, timeout: int = 60, headers = None):
    check_date_format(start_date)
    check_date_format(end_date)
    gamefinder = leaguegamefinder.LeagueGameFinder(
        date_from_nullable=start_date,
        date_to_nullable=end_date,
        team_id_nullable=None,
        league_id_nullable='00',  # NBA games only
        timeout=timeout,
        headers=headers
    )
    games_dict = gamefinder.get_normalized_dict()
    games_df = pd.DataFrame(games_dict['LeagueGameFinderResults'])
    #raw_cols = ['SEASON_ID', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL']
    #games_df = games_df[raw_cols]
    return games_df


end_date = datetime.now()
start_date = end_date - timedelta(days=365)

# Convert dates to string format required by the API
start_date_str = start_date

In [14]:
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:118.0) Gecko/20100101 Firefox/118.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.5",
    "Connection": "keep-alive",
}

In [6]:
start_date_str = datetime.now().strftime('%m/%d/%Y')
end_date_str = end_date.strftime('%m/%d/%Y')

games_df = scrape_games_between(start_date_str, end_date_str, headers=headers)
games_df

NameError: name 'datetime' is not defined

In [13]:
[
    col.lower()
    for col in games_df.columns
]

['season_id',
 'team_id',
 'team_abbreviation',
 'team_name',
 'game_id',
 'game_date',
 'matchup',
 'wl',
 'min',
 'pts',
 'fgm',
 'fga',
 'fg_pct',
 'fg3m',
 'fg3a',
 'fg3_pct',
 'ftm',
 'fta',
 'ft_pct',
 'oreb',
 'dreb',
 'reb',
 'ast',
 'stl',
 'blk',
 'tov',
 'pf',
 'plus_minus']

In [75]:
games_df.iloc[0]

SEASON_ID                            22024
TEAM_ID                         1610612760
TEAM_ABBREVIATION                      OKC
TEAM_NAME            Oklahoma City Thunder
GAME_ID                         0022400473
GAME_DATE                       2025-01-03
MATCHUP                        OKC vs. NYK
WL                                       W
MIN                                    242
PTS                                    117
FGM                                     43
FGA                                     84
FG_PCT                               0.512
FG3M                                    14
FG3A                                    27
FG3_PCT                              0.519
FTM                                     17
FTA                                     21
FT_PCT                                0.81
OREB                                     7
DREB                                    34
REB                                     41
AST                                     27
STL        

In [77]:
games_formated_df.memory_usage()

Index          132
SEASON_ID    22336
GAME_ID      22336
GAME_DATE    22336
RESULT       22336
HOME_TEAM    22336
AWAY_TEAM    22336
dtype: int64

In [76]:
def format_games_df(games_df):
    games_df['RESULT'] = games_df['WL'].map({'W': 1, 'L': 0})
    columns = ['SEASON_ID', 'GAME_ID', 'GAME_DATE', 'RESULT']
    teams = games_df['MATCHUP'].str.split(expand=True)[[0, 2]].rename(columns={0: 'HOME_TEAM', 2: 'AWAY_TEAM'})
    games_df = pd.concat([games_df[columns], teams], axis=1)
    return games_df

games_formated_df = format_games_df(games_df)


#mask_home = games_df['MATCHUP'].str.contains(' vs. ')
# games_df = games_df[mask_home]
games_formated_df

Unnamed: 0,SEASON_ID,GAME_ID,GAME_DATE,RESULT,HOME_TEAM,AWAY_TEAM
0,22024,0022400473,2025-01-03,1.0,OKC,NYK
1,22024,0022400469,2025-01-03,0.0,CHA,DET
2,22024,0022400470,2025-01-03,0.0,TOR,ORL
3,22024,0022400472,2025-01-03,0.0,WAS,NOP
4,22024,0022400475,2025-01-03,0.0,DEN,SAS
...,...,...,...,...,...,...
2787,22023,0022300491,2024-01-05,1.0,MEM,LAL
2788,22023,0022300483,2024-01-05,0.0,PHI,NYK
2789,22023,0022300479,2024-01-05,0.0,UTA,BOS
2790,22023,0022300485,2024-01-05,0.0,HOU,MIN


In [60]:
games_formated_df.sort_values(by='GAME_DATE', ascending=True)

Unnamed: 0,SEASON_ID,GAME_ID,GAME_DATE,RESULT,HOME_TEAM,AWAY_TEAM
2791,22023,0022300484,2024-01-05,1.0,CHI,CHA
2764,22023,0022300481,2024-01-05,0.0,OKC,BKN
2765,22023,0022300492,2024-01-05,1.0,SAC,TOR
2766,22023,0022300481,2024-01-05,1.0,BKN,OKC
2767,22023,0022300486,2024-01-05,1.0,LAC,NOP
...,...,...,...,...,...,...
15,22024,0022400474,2025-01-03,0.0,DAL,CLE
16,22024,0022400476,2025-01-03,,MEM,SAC
17,22024,0022400477,2025-01-03,,ATL,LAL
9,22024,0022400473,2025-01-03,0.0,NYK,OKC


In [19]:
def scrape_gameflow(game_id: str, timeout: int = 60) -> pd.DataFrame:
    play_by_play = playbyplayv2.PlayByPlayV2(game_id=game_id, timeout=timeout)
    df_plays = pd.DataFrame(play_by_play.get_normalized_dict()['PlayByPlay'])
    scored_mask = ~df_plays['SCORE'].isna()
    df_plays = df_plays[scored_mask]
    scores = df_plays['SCORE'].str.split(' - ', expand=True)
    scores.columns = ['HOME_SCORE', 'AWAY_SCORE']
    scores = scores.astype(int)
    scores.insert(0, 'GAME_ID', game_id)
    # scores['DIFF'] = scores['HOME_SCORE'] - scores['AWAY_SCORE']
    period_length = 12 * 60
    total_periods = df_plays['PERIOD'].max()
    time_remaining_period = df_plays['PCTIMESTRING'].str.split(':', expand=True).astype(int)
    time_remaining_period = time_remaining_period[0] * 60 + time_remaining_period[1]
    time_remaining = (total_periods - df_plays['PERIOD']) * period_length + time_remaining_period
    scores['TIME_REMAINING'] = time_remaining
    return scores

game_id = '0022300442'
gameflow_df = scrape_gameflow(game_id)
gameflow_df

Unnamed: 0,GAME_ID,HOME_SCORE,AWAY_SCORE,TIME_REMAINING
4,0022300442,0,2,2839
17,0022300442,0,4,2758
18,0022300442,2,4,2747
19,0022300442,2,6,2735
22,0022300442,2,9,2713
...,...,...,...,...
456,0022300442,88,104,68
458,0022300442,88,105,68
459,0022300442,90,105,53
462,0022300442,92,105,15


In [20]:
gameflow_df

Unnamed: 0,GAME_ID,HOME_SCORE,AWAY_SCORE,TIME_REMAINING
4,0022300442,0,2,2839
17,0022300442,0,4,2758
18,0022300442,2,4,2747
19,0022300442,2,6,2735
22,0022300442,2,9,2713
...,...,...,...,...
456,0022300442,88,104,68
458,0022300442,88,105,68
459,0022300442,90,105,53
462,0022300442,92,105,15


In [14]:
games_df

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,22024,1610612755,PHI,Philadelphia 76ers,0022400478,2025-01-04,PHI @ BKN,W,241,123,...,0.800,8,42,50,31,11,2,17,17,29.0
1,22024,1610612750,MIN,Minnesota Timberwolves,0022400479,2025-01-04,MIN @ DET,L,239,105,...,0.808,7,26,33,20,7,4,17,21,-14.0
2,22024,1610612756,PHX,Phoenix Suns,0022400480,2025-01-04,PHX @ IND,L,242,108,...,0.952,5,31,36,30,3,7,11,14,-18.0
3,22024,1610612743,DEN,Denver Nuggets,0022400484,2025-01-04,DEN @ SAS,W,265,122,...,0.783,14,44,58,32,11,7,7,17,0.0
4,22024,1610612765,DET,Detroit Pistons,0022400479,2025-01-04,DET vs. MIN,W,240,119,...,0.800,13,34,47,26,12,2,12,22,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2777,22023,1610612764,WAS,Washington Wizards,0022300494,2024-01-06,WAS vs. NYK,L,240,105,...,0.600,12,33,45,33,5,3,20,20,-16.0
2778,22023,1610612752,NYK,New York Knicks,0022300494,2024-01-06,NYK @ WAS,W,240,121,...,0.875,11,34,45,27,9,2,14,16,16.0
2779,22023,1610612755,PHI,Philadelphia 76ers,0022300495,2024-01-06,PHI vs. UTA,L,239,109,...,0.783,15,19,34,30,12,6,6,20,-11.0
2780,22023,1610612762,UTA,Utah Jazz,0022300495,2024-01-06,UTA @ PHI,W,240,120,...,0.833,9,39,48,30,4,4,23,18,11.0


In [22]:
games_df.TEAM_ABBREVIATION

2       NOP
3       OKC
4       DAL
6       TOR
9       DET
       ... 
2778    HOU
2781    NOP
2782    DAL
2786    CLE
2787    BOS
Name: TEAM_ABBREVIATION, Length: 1387, dtype: object

In [30]:
from nba_api.stats.endpoints import leaguestandings

standings = leaguestandings.LeagueStandings(
    league_id='00',  # '00' typically represents the NBA
    season='2023-24'  # Replace with your target season
)

In [31]:
standings = leaguestandings.LeagueStandings(
    league_id='00',
    season='2023-24'
)

<nba_api.stats.endpoints.leaguestandings.LeagueStandings at 0x7cfad61d14f0>

In [32]:
standings_dfs = standings.get_data_frames()

In [39]:
standings_dfs[0].columns

Index(['LeagueID', 'SeasonID', 'TeamID', 'TeamCity', 'TeamName', 'Conference',
       'ConferenceRecord', 'PlayoffRank', 'ClinchIndicator', 'Division',
       'DivisionRecord', 'DivisionRank', 'WINS', 'LOSSES', 'WinPCT',
       'LeagueRank', 'Record', 'HOME', 'ROAD', 'L10', 'Last10Home',
       'Last10Road', 'OT', 'ThreePTSOrLess', 'TenPTSOrMore', 'LongHomeStreak',
       'strLongHomeStreak', 'LongRoadStreak', 'strLongRoadStreak',
       'LongWinStreak', 'LongLossStreak', 'CurrentHomeStreak',
       'strCurrentHomeStreak', 'CurrentRoadStreak', 'strCurrentRoadStreak',
       'CurrentStreak', 'strCurrentStreak', 'ConferenceGamesBack',
       'DivisionGamesBack', 'ClinchedConferenceTitle', 'ClinchedDivisionTitle',
       'ClinchedPlayoffBirth', 'EliminatedConference', 'EliminatedDivision',
       'AheadAtHalf', 'BehindAtHalf', 'TiedAtHalf', 'AheadAtThird',
       'BehindAtThird', 'TiedAtThird', 'Score100PTS', 'OppScore100PTS',
       'OppOver500', 'LeadInFGPCT', 'LeadInReb', 'FewerTurnove

In [24]:
games_df[['TEAM_ID', 'MATCHUP']]

Unnamed: 0,TEAM_ID,MATCHUP
0,1610612747,LAL vs. CLE
1,1610612746,LAC @ SAS
2,1610612760,OKC vs. MIN
3,1610612754,IND vs. MIL
4,1610612761,TOR @ BOS
...,...,...
2781,1610612738,BOS @ OKC
2782,1610612759,SAS @ MEM
2783,1610612744,GSW vs. ORL
2784,1610612766,CHA @ SAC
