In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Display the result of every top-level expression in a cell, not just the last one.
InteractiveShell.ast_node_interactivity = "all"

## PrizePicks Scraper

The code below scrapes the PrizePicks site and retrieves a list of all the Popular Picks. We use the Popular Picks because these are what the site's users bet on the most, so we make the reasonable assumption that these picks are the easiest to bet on (predict).

In [None]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import undetected_chromedriver as uc
import time
import pandas as pd

# Scrape the NBA "Popular" projections from the PrizePicks web app into dfProps.
#
# Every navigation step is best-effort: a failure is reported and the script
# carries on, so dfProps may be empty or may hold rows from a different category.
# The browser is always closed at the end, whether or not scraping succeeded.

# Set up Chrome options
chrome_options = uc.ChromeOptions()
prefs = {"profile.default_content_setting_values.geolocation": 1}  # Allow location
chrome_options.add_experimental_option("prefs", prefs)
driver = uc.Chrome(options=chrome_options)

ppPlayers = []
popular_selected = False  # becomes True only once the "Popular" tab was actually clicked

try:
    # Open the website
    driver.get("https://app.prizepicks.com/")
    time.sleep(5)  # fixed pause to let the app finish its initial load

    # Close the popup if it appears
    try:
        close_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CLASS_NAME, "close"))
        )
        close_button.click()
        print("Popup closed successfully.")
    except Exception:
        print("No popup found or already closed.")

    # Click on NBA category
    try:
        nba_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'league-old')]//img[@alt='NBA']"))
        )
        nba_button.click()
        print("NBA button clicked successfully.")
    except Exception as e:
        print("Could not find or click the NBA button:", e)

    time.sleep(5)

    # Click on the "Popular 🔥" category
    try:
        popular_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'stat') and contains(text(), 'Popular')]"))
        )
        popular_button.click()
        popular_selected = True
        print("Clicked 'Popular 🔥' category.")
    except Exception as e:
        print("Could not find or click 'Popular 🔥' category:", e)

    # Extract player projections from whichever category is currently displayed
    try:
        projectionsPP = WebDriverWait(driver, 5).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "li.border-soFresh-130"))
        )
        for projections in projectionsPP:
            names = projections.find_element(By.ID, "test-player-name").text
            value = projections.find_element(By.CLASS_NAME, "heading-md").text  # This might be different
            proptype = projections.find_element(By.CLASS_NAME, "break-words").text  # Adjust as needed

            players = {
                'Name': names,
                'Value': value,
                # NOTE(review): .text is rendered text, so stripping "<wbr>" is presumably a no-op safeguard
                'Prop': proptype.replace("<wbr>", "")
            }
            ppPlayers.append(players)
    except Exception as e:
        print("Could not extract player projections:", e)
finally:
    # Release the Chrome process even if a step above raised unexpectedly.
    driver.quit()

# Convert results to a DataFrame
dfProps = pd.DataFrame(ppPlayers)

# dfProps.to_csv('prizepicks_popular.csv')

if popular_selected:
    print("These are all of the props offered in the 'Popular 🔥' section.", '\n')
else:
    # Do not label the rows as "Popular" when that tab was never opened.
    print("WARNING: the 'Popular 🔥' category was not selected; these props come from the category that was showing.", '\n')
dfProps


Popup closed successfully.
NBA button clicked successfully.
Could not find or click 'Popular 🔥' category: Message: 
Stacktrace:
0   undetected_chromedriver             0x00000001046a0bc8 undetected_chromedriver + 5766088
1   undetected_chromedriver             0x00000001046987ea undetected_chromedriver + 5732330
2   undetected_chromedriver             0x0000000104186680 undetected_chromedriver + 415360
3   undetected_chromedriver             0x00000001041d7d7d undetected_chromedriver + 748925
4   undetected_chromedriver             0x00000001041d7f21 undetected_chromedriver + 749345
5   undetected_chromedriver             0x0000000104228184 undetected_chromedriver + 1077636
6   undetected_chromedriver             0x00000001041fe07d undetected_chromedriver + 905341
7   undetected_chromedriver             0x00000001042253e7 undetected_chromedriver + 1065959
8   undetected_chromedriver             0x00000001041fde23 undetected_chromedriver + 904739
9   undetected_chromedriver             

Unnamed: 0,Name,Value,Prop
0,Derrick White,10.5,Points
1,Derrick White,21.5,Points
2,Jaylen Brown,17.5,Points
3,Jaylen Brown,29.5,Points
4,Jayson Tatum,19.5,Points
5,Jayson Tatum,34.5,Points
6,Jrue Holiday,5.5,Points
7,Jrue Holiday,14.5,Points
8,Kristaps Porzingis,26.5,Points
9,Kyrie Irving,19.5,Points


## Data Gathering

We use the NBA API to gather historical data on the current top 50 players in the NBA. We collect data from the current season and the two prior seasons to establish a long-term trend and to ensure that our model has enough training data.

In [None]:
import pandas as pd
from nba_api.stats.endpoints import LeagueDashPlayerStats, PlayerGameLog
import time

def get_top_players(season, num_players=50):
    """Return the `num_players` rows with the highest PTS for `season`.

    Stats come from get_all_players_stats(season). When that yields nothing,
    a message is printed and an empty DataFrame is returned.

    NOTE(review): ranking uses the PTS column exactly as returned by the stats
    endpoint; whether that is a season total or a per-game average depends on
    how the endpoint is queried — confirm before describing this as "per game".
    """
    season_stats = get_all_players_stats(season)
    if season_stats.empty:
        print(f"No player data found for season {season}.")
        return pd.DataFrame()

    # Highest PTS first, then keep only the leading `num_players` rows.
    top_players = season_stats.sort_values(by="PTS", ascending=False).head(num_players)
    print(f"Found {len(top_players)} players for season {season}.")
    return top_players

def get_all_players_stats(season):
    """Fetch league-wide player stats for `season`, reduced to id, name and PTS.

    Returns the first data frame of the LeagueDashPlayerStats response limited
    to PLAYER_ID, PLAYER_NAME and PTS. Any exception is reported with print and
    an empty DataFrame is returned instead.
    """
    wanted_columns = ["PLAYER_ID", "PLAYER_NAME", "PTS"]
    try:
        response_frames = LeagueDashPlayerStats(season=season).get_data_frames()
        return response_frames[0][wanted_columns]
    except Exception as e:
        print(f"Error fetching player stats for season {season}: {e}")
        return pd.DataFrame()

def fetch_player_logs(player_id, player_name, seasons):
    """Fetch one player's game logs for each season and stack them.

    Args:
        player_id: id passed to PlayerGameLog and written to the PLAYER_ID column.
        player_name: name written to the PLAYER_NAME column and used in messages.
        seasons: iterable of season strings, one request per entry.

    Returns:
        A single DataFrame with all successfully fetched logs (plus SEASON,
        PLAYER_ID and PLAYER_NAME columns), or an empty DataFrame when nothing
        was fetched. A season whose request fails is reported and skipped.
    """
    all_games = []
    for season in seasons:
        try:
            logs = PlayerGameLog(player_id=player_id, season=season).get_data_frames()[0]
            logs["SEASON"] = season  # Add season column
            logs["PLAYER_ID"] = player_id
            logs["PLAYER_NAME"] = player_name  # Add player name
            all_games.append(logs)
        except Exception as e:
            print(f"Error fetching logs for player {player_name} ({player_id}) in {season}: {e}")
        finally:
            # Pause after every request, including failed ones, so an error is
            # not followed immediately by the next request.
            time.sleep(1)
    return pd.concat(all_games, ignore_index=True) if all_games else pd.DataFrame()

def main():
    """Collect game logs for the latest season's top players and save them to CSV.

    Players are chosen with get_top_players for the first entry of `seasons`;
    logs are then fetched for every listed season and written to
    "nba_top50_player_logs.csv". Nothing is written if no logs were retrieved.
    """
    seasons = ["2024-25", "2023-24", "2022-23"]  # newest first; seasons[0] drives player selection

    top_players = get_top_players(seasons[0])
    if top_players.empty:
        print("No top players found. Exiting program.")
        return

    all_player_logs = []
    for player_id, player_name in zip(top_players["PLAYER_ID"], top_players["PLAYER_NAME"]):
        print(f"Fetching data for {player_name} (ID: {player_id})...")
        player_logs = fetch_player_logs(player_id, player_name, seasons)
        if player_logs.empty:
            continue  # nothing came back for this player
        all_player_logs.append(player_logs)

    if not all_player_logs:
        print("No game logs were retrieved.")
        return

    final_df = pd.concat(all_player_logs, ignore_index=True)
    print(f"Data collection complete. Shape: {final_df.shape}")
    final_df.to_csv("nba_top50_player_logs.csv", index=False)
    print(final_df.head())  # quick check that player names were attached


if __name__ == "__main__":
    main()


Found 50 players for season 2024-25.
Fetching data for Shai Gilgeous-Alexander (ID: 1628983)...
Fetching data for Nikola Jokić (ID: 203999)...
Fetching data for Anthony Edwards (ID: 1630162)...
Fetching data for Jayson Tatum (ID: 1628369)...
Fetching data for Jalen Brunson (ID: 1628973)...


KeyboardInterrupt: 

In [None]:
import pandas as pd
from nba_api.stats.endpoints import PlayerGameLog
import time

# Incrementally extend the saved game-log CSV with games played since each
# player's most recent recorded game.

# Load existing dataset
file_path = "nba_top50_player_logs.csv"
current_season = "2024-25"  # season queried for new games

try:
    existing_data = pd.read_csv(file_path)
    existing_data["GAME_DATE"] = pd.to_datetime(existing_data["GAME_DATE"])  # Convert to datetime
    print("Existing data loaded successfully.")
except FileNotFoundError:
    print("No existing file found. Exiting...")
    # Stop this cell here so the code below never runs without existing_data.
    # NOTE(review): exit() in a notebook is aimed at the kernel session rather
    # than at this cell — confirm that raising matches the intended behaviour.
    raise SystemExit(1)

# Find the most recent game date in the dataset
last_game_date = existing_data["GAME_DATE"].max()
print(f"Last recorded game date: {last_game_date.date()}")

# Per-player cutoff: a player whose fetch failed on an earlier run is behind the
# global maximum and would otherwise lose the games in between for good.
last_date_by_player = existing_data.groupby("PLAYER_ID")["GAME_DATE"].max()

# Get unique players in dataset
players = existing_data[["PLAYER_ID", "PLAYER_NAME"]].drop_duplicates()

# Fetch only new game logs after each player's last recorded date
new_game_logs = []
for _, row in players.iterrows():
    player_id = row["PLAYER_ID"]
    player_name = row["PLAYER_NAME"]

    try:
        logs = PlayerGameLog(player_id=player_id, season=current_season).get_data_frames()[0]
        # The stored API rows look like "FEB 01, 2025"; an explicit format avoids
        # the per-element fallback parsing (and its warning) on every request.
        logs["GAME_DATE"] = pd.to_datetime(logs["GAME_DATE"], format="%b %d, %Y")
        logs["SEASON"] = current_season  # match the columns written by the initial collection
        logs["PLAYER_ID"] = player_id
        logs["PLAYER_NAME"] = player_name

        # Keep only games played after this player's last recorded date
        new_games = logs[logs["GAME_DATE"] > last_date_by_player.loc[player_id]]
        if not new_games.empty:
            new_game_logs.append(new_games)
            print(f"Added {len(new_games)} new games for {player_name}")

    except Exception as e:
        print(f"Error fetching logs for {player_name} (ID: {player_id}): {e}")
    finally:
        time.sleep(1)  # Prevent API rate limits, also after a failed request

# Append new games to the existing dataset and save
if new_game_logs:
    updated_data = pd.concat([existing_data] + new_game_logs, ignore_index=True)
    updated_data.to_csv(file_path, index=False)
    print(f"Updated dataset saved. New shape: {updated_data.shape}")
else:
    print("No new games to add.")


Existing data loaded successfully.
Last recorded game date: 2025-02-08


  logs["GAME_DATE"] = pd.to_datetime(logs["GAME_DATE"])  # Convert to datetime
  logs["GAME_DATE"] = pd.to_datetime(logs["GAME_DATE"])  # Convert to datetime
  logs["GAME_DATE"] = pd.to_datetime(logs["GAME_DATE"])  # Convert to datetime
  logs["GAME_DATE"] = pd.to_datetime(logs["GAME_DATE"])  # Convert to datetime
  logs["GAME_DATE"] = pd.to_datetime(logs["GAME_DATE"])  # Convert to datetime
  logs["GAME_DATE"] = pd.to_datetime(logs["GAME_DATE"])  # Convert to datetime
  logs["GAME_DATE"] = pd.to_datetime(logs["GAME_DATE"])  # Convert to datetime
  logs["GAME_DATE"] = pd.to_datetime(logs["GAME_DATE"])  # Convert to datetime
  logs["GAME_DATE"] = pd.to_datetime(logs["GAME_DATE"])  # Convert to datetime
  logs["GAME_DATE"] = pd.to_datetime(logs["GAME_DATE"])  # Convert to datetime
  logs["GAME_DATE"] = pd.to_datetime(logs["GAME_DATE"])  # Convert to datetime
  logs["GAME_DATE"] = pd.to_datetime(logs["GAME_DATE"])  # Convert to datetime
  logs["GAME_DATE"] = pd.to_datetime(logs["GAME_DATE

  logs["GAME_DATE"] = pd.to_datetime(logs["GAME_DATE"])  # Convert to datetime
  logs["GAME_DATE"] = pd.to_datetime(logs["GAME_DATE"])  # Convert to datetime
  logs["GAME_DATE"] = pd.to_datetime(logs["GAME_DATE"])  # Convert to datetime
  logs["GAME_DATE"] = pd.to_datetime(logs["GAME_DATE"])  # Convert to datetime
  logs["GAME_DATE"] = pd.to_datetime(logs["GAME_DATE"])  # Convert to datetime
  logs["GAME_DATE"] = pd.to_datetime(logs["GAME_DATE"])  # Convert to datetime
  logs["GAME_DATE"] = pd.to_datetime(logs["GAME_DATE"])  # Convert to datetime
  logs["GAME_DATE"] = pd.to_datetime(logs["GAME_DATE"])  # Convert to datetime
  logs["GAME_DATE"] = pd.to_datetime(logs["GAME_DATE"])  # Convert to datetime
  logs["GAME_DATE"] = pd.to_datetime(logs["GAME_DATE"])  # Convert to datetime
  logs["GAME_DATE"] = pd.to_datetime(logs["GAME_DATE"])  # Convert to datetime
  logs["GAME_DATE"] = pd.to_datetime(logs["GAME_DATE"])  # Convert to datetime
  logs["GAME_DATE"] = pd.to_datetime(logs["GAME_DATE

  logs["GAME_DATE"] = pd.to_datetime(logs["GAME_DATE"])  # Convert to datetime
  logs["GAME_DATE"] = pd.to_datetime(logs["GAME_DATE"])  # Convert to datetime


No new games to add.


In [None]:
# Load the saved game logs and preview the first five rows.
nba_data = pd.read_csv("nba_top50_player_logs.csv")
nba_data.head(n=5)

Unnamed: 0,SEASON_ID,Player_ID,Game_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,FGA,FG_PCT,...,STL,BLK,TOV,PF,PTS,PLUS_MINUS,VIDEO_AVAILABLE,SEASON,PLAYER_ID,PLAYER_NAME
0,22024,1628983,22400705,2025-02-03,OKC vs. MIL,W,22,15,19,0.789,...,0,0,3,1,34,34,1,2024-25,1628983,Shai Gilgeous-Alexander
1,22024,1628983,22400691,2025-02-01,OKC vs. SAC,W,30,10,20,0.5,...,0,2,4,2,29,17,1,2024-25,1628983,Shai Gilgeous-Alexander
2,22024,1628983,22400673,2025-01-29,OKC @ GSW,L,39,16,29,0.552,...,1,0,3,4,52,1,1,2024-25,1628983,Shai Gilgeous-Alexander
3,22024,1628983,22400647,2025-01-26,OKC @ POR,W,37,12,25,0.48,...,3,0,2,1,35,-4,1,2024-25,1628983,Shai Gilgeous-Alexander
4,22024,1628983,22400625,2025-01-23,OKC vs. DAL,L,40,12,25,0.48,...,0,0,3,4,31,-7,1,2024-25,1628983,Shai Gilgeous-Alexander


In [None]:
# Count missing values in each column.
nba_data.isnull().sum()

SEASON_ID          0
Player_ID          0
Game_ID            0
GAME_DATE          0
MATCHUP            0
WL                 0
MIN                0
FGM                0
FGA                0
FG_PCT             0
FG3M               0
FG3A               0
FG3_PCT            0
FTM                0
FTA                0
FT_PCT             0
OREB               0
DREB               0
REB                0
AST                0
STL                0
BLK                0
TOV                0
PF                 0
PTS                0
PLUS_MINUS         0
VIDEO_AVAILABLE    0
SEASON             0
PLAYER_ID          0
PLAYER_NAME        0
dtype: int64

In [None]:
# Spot-check one player's rows (case-insensitive match on the name).
nba_data.loc[nba_data["PLAYER_NAME"].str.contains("lebron", case=False)]

Unnamed: 0,SEASON_ID,Player_ID,Game_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,FGA,FG_PCT,...,STL,BLK,TOV,PF,PTS,PLUS_MINUS,VIDEO_AVAILABLE,SEASON,PLAYER_ID,PLAYER_NAME
2788,22024,2544,22400692,"FEB 01, 2025",LAL @ NYK,W,37,14,26,0.538,...,0,0,5,3,33,12,1,2024-25,2544,LeBron James
2789,22024,2544,22400674,"JAN 30, 2025",LAL @ WAS,W,27,9,19,0.474,...,0,0,0,1,24,29,1,2024-25,2544,LeBron James
2790,22024,2544,22400660,"JAN 28, 2025",LAL @ PHI,L,33,10,16,0.625,...,1,0,8,1,31,-11,1,2024-25,2544,LeBron James
2791,22024,2544,22400648,"JAN 27, 2025",LAL @ CHA,W,36,9,17,0.529,...,0,0,5,0,22,0,1,2024-25,2544,LeBron James
2792,22024,2544,22400644,"JAN 25, 2025",LAL @ GSW,W,35,12,25,0.480,...,1,0,3,0,25,11,1,2024-25,2544,LeBron James
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2953,22022,2544,22200076,"OCT 28, 2022",LAL @ MIN,L,37,10,24,0.417,...,4,0,3,2,28,-13,1,2022-23,2544,LeBron James
2954,22022,2544,22200064,"OCT 26, 2022",LAL @ DEN,L,35,8,21,0.381,...,1,0,8,1,19,-7,1,2022-23,2544,LeBron James
2955,22022,2544,22200037,"OCT 23, 2022",LAL vs. POR,L,38,12,22,0.545,...,2,2,4,3,31,2,1,2022-23,2544,LeBron James
2956,22022,2544,22200016,"OCT 20, 2022",LAL vs. LAC,L,37,7,17,0.412,...,1,2,2,3,20,-1,1,2022-23,2544,LeBron James


In [None]:
nba_data.columns
# Lower-case every column name. The frame carries both 'Player_ID' and
# 'PLAYER_ID' (the same id in the rows shown), which collapse to one name once
# lower-cased; keep only the first occurrence so nba_data['player_id'] is a
# single column rather than a two-column frame.
nba_data.columns = nba_data.columns.str.lower()
nba_data = nba_data.loc[:, ~nba_data.columns.duplicated()]

Index(['SEASON_ID', 'Player_ID', 'Game_ID', 'GAME_DATE', 'MATCHUP', 'WL',
       'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA',
       'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
       'PTS', 'PLUS_MINUS', 'VIDEO_AVAILABLE', 'SEASON', 'PLAYER_ID',
       'PLAYER_NAME'],
      dtype='object')

In [None]:
# Feature Engineering
# team: first token of the matchup string; home_away: 'away' when the second
# token is '@', otherwise 'home'; plus two combined scoring totals.
nba_data = nba_data.assign(
    team=nba_data['matchup'].str.split().str[0],
    home_away=nba_data['matchup'].map(
        lambda matchup: 'home' if matchup.split()[1] != '@' else 'away'
    ),
    points_reb=nba_data['pts'] + nba_data['reb'],
    points_reb_assists=nba_data['pts'] + nba_data['reb'] + nba_data['ast'],
)

In [None]:
# Parse the game_date column into datetimes.
nba_data['game_date'] = pd.to_datetime(nba_data['game_date'])

  nba_data.game_date = pd.to_datetime(nba_data.game_date)


In [None]:
# One-hot encode the nominal columns; drop_first leaves a single 0/1 indicator
# per encoded column.
cat_columns = ['wl', 'home_away']
nba_data = pd.get_dummies(
    data=nba_data,
    columns=cat_columns,
    drop_first=True,
    dtype=int,
)


In [None]:
# Give the indicator columns shorter names, then preview the frame.
nba_data = nba_data.rename(
    columns={
        'wl_W': 'W',
        'home_away_home': 'home',
    }
)
nba_data.head(n=5)

Unnamed: 0,season_id,player_id,game_id,game_date,matchup,min,fgm,fga,fg_pct,fg3m,...,plus_minus,video_available,season,player_id.1,player_name,team,points_reb,points_reb_assists,W,home
0,22024,1628983,22400705,2025-02-03,OKC vs. MIL,22,15,19,0.789,2,...,34,1,2024-25,1628983,Shai Gilgeous-Alexander,OKC,37,43,1,1
1,22024,1628983,22400691,2025-02-01,OKC vs. SAC,30,10,20,0.5,1,...,17,1,2024-25,1628983,Shai Gilgeous-Alexander,OKC,35,44,1,1
2,22024,1628983,22400673,2025-01-29,OKC @ GSW,39,16,29,0.552,2,...,1,1,2024-25,1628983,Shai Gilgeous-Alexander,OKC,55,59,0,0
3,22024,1628983,22400647,2025-01-26,OKC @ POR,37,12,25,0.48,2,...,-4,1,2024-25,1628983,Shai Gilgeous-Alexander,OKC,40,41,1,0
4,22024,1628983,22400625,2025-01-23,OKC vs. DAL,40,12,25,0.48,1,...,-7,1,2024-25,1628983,Shai Gilgeous-Alexander,OKC,32,39,0,1


In [None]:
# Display the current column index of nba_data.
nba_data.columns

Index(['season_id', 'player_id', 'game_id', 'game_date', 'matchup', 'min',
       'fgm', 'fga', 'fg_pct', 'fg3m', 'fg3a', 'fg3_pct', 'ftm', 'fta',
       'ft_pct', 'oreb', 'dreb', 'reb', 'ast', 'stl', 'blk', 'tov', 'pf',
       'pts', 'plus_minus', 'video_available', 'season', 'player_id',
       'player_name', 'team', 'points_reb', 'points_reb_assists', 'W', 'home'],
      dtype='object')