In [1]:
# This notebook trains a linear regression model that estimates how many playoff wins a team will have, based on its Vegas regular-season win total and its Vegas odds of making the playoffs, winning its conference (i.e. making the finals), and winning the finals.

In [2]:
import io
import re
from datetime import datetime, timedelta
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup, Comment
from sklearn.linear_model import LinearRegression
from tqdm import tqdm

In [3]:
# SportsOddsHistory pages; callers append "&y=<season>" to select a season.
SPORTSODDSHISTORY_BASE_URL = "https://www.sportsoddshistory.com"
FINALS_URL = f"{SPORTSODDSHISTORY_BASE_URL}/nba-main/?sa=nba&a=finals&o=t"
EAST_URL = f"{SPORTSODDSHISTORY_BASE_URL}/nba-main/?sa=nba&a=east&o=t"
WEST_URL = f"{SPORTSODDSHISTORY_BASE_URL}/nba-main/?sa=nba&a=west&o=t"
PLAYOFFS_URL = f"{SPORTSODDSHISTORY_BASE_URL}/nba-win/?sa=nba&t=post&o=s"
REGULAR_SEASON_URL = f"{SPORTSODDSHISTORY_BASE_URL}/nba-win/?sa=nba&t=win&o=t"

# Local cache locations. The "{}" placeholder is filled with a season string
# such as "2014-2015".
PLAYOFF_RECORDS_PATH = "bbref_playoff_records/{}.csv"
ODDS_PATH = "sportsoddshistory"


# First calendar year handed to pd.date_range when iterating over seasons.
start_year = "2015"


In [4]:
def get_start_date(url):
    """Return the "As of" date shown on a page as a short "<Mon> <day>" label.

    The page at ``url`` is fetched and the first ``<p>`` whose text contains
    "As of" is located. The remaining text is assumed to look like
    "<Month name> <day>, <YYYY>" (NOTE(review): inferred from the slicing
    below -- confirm against the live page). The result, e.g. "Oct 22", is
    used by callers to find the matching date column in the odds tables.

    Args:
        url: Address of the page to scrape.

    Returns:
        The first three characters of the month followed by the day number,
        separated by a single space.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If no "As of" paragraph is present on the page.
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # Name the parser explicitly so results do not depend on which optional
    # parser happens to be installed.
    soup = BeautifulSoup(response.content, "html.parser")
    # `string=` replaces the deprecated `text=` keyword; the pattern is the
    # literal "As of" (the old "As of*" made the final "f" optional).
    as_of_tag = soup.find("p", string=re.compile("As of"))
    if as_of_tag is None:
        raise ValueError(f"No 'As of' paragraph found at {url}")
    start_str = as_of_tag.text.replace("As of", "").strip()
    month_abbr = start_str[0:3]
    # The two characters before ", YYYY". For a single-digit day this slice
    # includes the preceding space, so strip it to get "Oct 2", not "Oct  2".
    day = start_str[-8:-6].strip()
    return f"{month_abbr} {day}"

In [16]:
# Preview of the relabelled title-odds table for one season.
# `season` is set explicitly so this cell runs on a fresh kernel; it used to
# rely on the value left behind by the season loop further down.
# NOTE(review): "2024-2025" is presumably the season shown in the saved output -- confirm.
season = "2024-2025"
start_date = get_start_date(f"{REGULAR_SEASON_URL}&y={season}")
(
    pd.read_html(f"{FINALS_URL}&y={season}")[0]
    .rename(columns={start_date: "Opening Night"})
    .add_prefix("Title Odds as of ")
    .rename(columns={"Title Odds as of Team": "Team"})
)

  start_str = soup.body.find('p', text=re.compile("As of*")).text


Unnamed: 0_level_0,Team,"Title Odds as of Preseason, as of...","Title Odds as of Preseason, as of...","Title Odds as of Preseason, as of...","Title Odds as of Preseason, as of...","Title Odds as of Preseason, as of...","Title Odds as of Regular season, as of...","Title Odds as of Regular season, as of...","Title Odds as of Regular season, as of...","Title Odds as of Regular season, as of...","Title Odds as of Regular season, as of...","Title Odds as of Regular season, as of...","Title Odds as of Regular season, as of...","Title Odds as of Playoffs, prior to...","Title Odds as of Playoffs, prior to...","Title Odds as of Playoffs, prior to...","Title Odds as of Playoffs, prior to...",Title Odds as of Result
Unnamed: 0_level_1,Team,Title Odds as of Jun 17,Title Odds as of Jun 28,Title Odds as of Jul 3,Title Odds as of Oct 1,Title Odds as of Opening Night,Title Odds as of Nov 1,Title Odds as of Dec 1,Title Odds as of Jan 1,Title Odds as of Feb 1,Title Odds as of All-Star Break,Title Odds as of Mar 1,Title Odds as of Apr 1,Title Odds as of Round 1,Title Odds as of Conf Semis,Title Odds as of Conf Finals,Title Odds as of Finals,Title Odds as of Result
0,Atlanta Hawks,10000,15000,50000,50000,50000,50000,50000,12500,25000,100000,100000,100000.0,,,,,
1,Boston Celtics,310,300,325,325,310,300,240,225,225,225,210,190.0,200.0,195.0,,,
2,Brooklyn Nets,50000,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000,,,,,,
3,Charlotte Hornets,50000,100000,100000,100000,100000,100000,75000,100000,100000,100000,100000,,,,,,
4,Chicago Bulls,15000,50000,50000,100000,100000,100000,100000,100000,100000,100000,100000,100000.0,,,,,
5,Cleveland Cavaliers,5000,5000,5000,3500,3500,3000,1200,1300,800,800,600,500.0,550.0,500.0,,,
6,Dallas Mavericks,950,1000,1100,1100,1200,1100,1000,1400,2000,4000,6600,200000.0,,,,,
7,Denver Nuggets,750,750,800,1100,1100,1400,1400,1600,1300,1400,1200,1400.0,3500.0,4000.0,,,
8,Detroit Pistons,50000,100000,100000,100000,100000,100000,75000,100000,50000,50000,25000,25000.0,35000.0,,,,
9,Golden State Warriors,2500,3000,4000,4000,4000,3500,1800,2000,6600,4000,1400,1600.0,1400.0,2200.0,,,


In [5]:
def download_sportsodds_history(season):
    """Download one season's SportsOddsHistory tables and cache them as CSV.

    Four files are written to ``<ODDS_PATH>/<season>/``:
    ``regular_season_wins.csv``, ``make_playoffs.csv``,
    ``win_championship.csv`` and ``win_conference.csv``.

    In the title and conference tables, the column labelled with the date
    returned by ``get_start_date`` is renamed to "Opening Night", and every
    column except "Team" is prefixed with "Title Odds as of " or
    "Conf Odds as of " respectively.

    Args:
        season: Season string appended to each URL as ``&y=<season>``,
            e.g. "2014-2015".
    """
    start_date = get_start_date(f"{REGULAR_SEASON_URL}&y={season}")

    def label_odds(odds_df, prefix):
        # Shared relabelling for the title and conference tables.
        return (
            odds_df
            .rename(columns={start_date: "Opening Night"})
            .add_prefix(prefix)
            .rename(columns={f"{prefix}Team": "Team"})
        )

    regular_season_df = pd.read_html(f"{REGULAR_SEASON_URL}&y={season}", header=0)[0]
    # The playoff page's table of interest is the second one parsed (index 1).
    playoff_odds_df = pd.read_html(f"{PLAYOFFS_URL}&y={season}")[1]
    title_odds_df = label_odds(
        pd.read_html(f"{FINALS_URL}&y={season}")[0],
        "Title Odds as of ",
    )
    # Conference odds are published per conference; stack East on top of West.
    conf_odds_df = label_odds(
        pd.concat([
            pd.read_html(f"{EAST_URL}&y={season}")[0],
            pd.read_html(f"{WEST_URL}&y={season}")[0],
        ]),
        "Conf Odds as of ",
    )

    # Write under ODDS_PATH (the same location the loader reads from) and make
    # sure the season directory exists even when called on its own.
    season_dir = Path(ODDS_PATH) / season
    season_dir.mkdir(parents=True, exist_ok=True)
    regular_season_df.to_csv(season_dir / "regular_season_wins.csv", index=False)
    playoff_odds_df.to_csv(season_dir / "make_playoffs.csv", index=False)
    title_odds_df.to_csv(season_dir / "win_championship.csv", index=False)
    conf_odds_df.to_csv(season_dir / "win_conference.csv", index=False)

In [6]:
def download_bbref(season):
    """Download a season's playoff standings from Basketball Reference as CSV.

    The standings page for the season's ending year is fetched, the HTML
    comment containing the "expanded_standings" table is extracted, and the
    parsed table is written to ``PLAYOFF_RECORDS_PATH.format(season)``.

    Args:
        season: Season string such as "2014-2015"; the part after the last
            "-" is used as the year in the URL.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page holds no "expanded_standings" comment.
    """
    year = season.split("-")[-1]
    response = requests.get(
        f"https://www.basketball-reference.com/playoffs/NBA_{year}_standings.html",
        timeout=30,
    )
    response.raise_for_status()
    # Name the parser explicitly so results do not depend on which optional
    # parser happens to be installed.
    soup = BeautifulSoup(response.text, "html.parser")
    # bbref hides data in a comment to avoid scraping
    table = soup.find(string=lambda text: isinstance(text, Comment) and "expanded_standings" in text)
    if table is None:
        raise ValueError(f"No 'expanded_standings' table found for season {season}")

    output_path = Path(PLAYOFF_RECORDS_PATH.format(season))
    output_path.parent.mkdir(parents=True, exist_ok=True)
    pd.read_html(io.StringIO(table), header=1)[0].to_csv(output_path, index=False)


In [21]:
dfs = []
# One iteration per year-end date from start_year up to one year from now; each
# date's year is treated as the year in which a season ends.
for d in tqdm(pd.date_range(start_year, datetime.now() + timedelta(days=365), freq="YE")):
    year = int(d.year)
    season = f"{year - 1}-{year}"

    odds_folder = Path(ODDS_PATH) / season
    regular_season_file = odds_folder / "regular_season_wins.csv"
    playoff_odds_file = odds_folder / "make_playoffs.csv"
    title_odds_file = odds_folder / "win_championship.csv"
    conf_odds_file = odds_folder / "win_conference.csv"
    odds_files = [regular_season_file, playoff_odds_file, title_odds_file, conf_odds_file]
    # Check the files themselves rather than the folder: the folder is created
    # before downloading, so an interrupted download would otherwise never be
    # retried and the reads below would fail.
    if not all(odds_file.exists() for odds_file in odds_files):
        print("SportsOddsHistory data not found locally, downloading...")
        odds_folder.mkdir(exist_ok=True, parents=True)
        download_sportsodds_history(season)

    bbref_file = Path(PLAYOFF_RECORDS_PATH.format(season))
    if not bbref_file.exists():
        print("Basketball Reference data not found locally, downloading...")
        bbref_file.parent.mkdir(exist_ok=True, parents=True)
        download_bbref(season)

    regular_season_df = pd.read_csv(regular_season_file)
    playoff_odds_df = pd.read_csv(playoff_odds_file)
    # header=1 skips the first header row of these two files and uses the
    # second row as the column names.
    title_odds_df = pd.read_csv(title_odds_file, header=1)
    conf_odds_df = pd.read_csv(conf_odds_file, header=1)

    record_df = pd.read_csv(bbref_file)[["Team", "Overall"]]

    merged_df = (
        regular_season_df
        .merge(playoff_odds_df, on="Team")
        .merge(title_odds_df, on="Team")
        .merge(conf_odds_df, on="Team")
        # Outer join: teams without a playoff record keep a row with a missing
        # "Overall" value.
        .merge(record_df, how='outer', on="Team")
    )
    merged_df["season"] = season
    dfs.append(merged_df)
# ignore_index gives every row a unique label instead of repeating each
# season's 0..n-1 index.
df = pd.concat(dfs, ignore_index=True)
    
        

100%|██████████| 11/11 [00:00<00:00, 142.26it/s]


In [22]:
# Peek at the first rows of the combined multi-season frame.
df.head(n=5)

Unnamed: 0,Team,Win Total,Over Odds,Under Odds,Game number bet settled,Actual Wins,Result_x,Conference,Make Odds,Miss Odds,...,Title Odds as of Sep 28,Title Odds as of Oct 2,Conf Odds as of Jun 14,Conf Odds as of Jun 24,Conf Odds as of Jul 12,Conf Odds as of Sep 27,Conf Odds as of Sep 28,Conf Odds as of Oct 2,Title Odds as of Jun 28,Conf Odds as of Jun 28
0,Atlanta Hawks,42.5,-125,-105,Game 53,60,Over,Eastern,-125.0,-105.0,...,,,,,,,,,,
1,Boston Celtics,27.0,-125,-105,Game 64,40,Over,Eastern,600.0,-1000.0,...,,,,,,,,,,
2,Brooklyn Nets,41.5,-115,-115,Game 76,38,Under,Eastern,-180.0,150.0,...,,,,,,,,,,
3,Charlotte Hornets,45.0,-115,-115,Game 67,33,Under,Eastern,-155.0,125.0,...,,,,,,,,,,
4,Chicago Bulls,55.5,-115,-115,Game 67,50,Under,Eastern,-2000.0,1000.0,...,,,,,,,,,,


In [23]:
def american_to_implied(american):
    """Convert American (moneyline) odds into an implied probability.

    Negative odds give ``american / (american - 100)``; all other values give
    ``100 / (american + 100)``.
    """
    is_negative = american < 0
    return american / (american - 100) if is_negative else 100 / (american + 100)

In [24]:
# Work on a consolidated copy: the merged frame is very wide, and the saved
# output showed a warning on the last line of this cell (presumably pandas'
# "DataFrame is highly fragmented" PerformanceWarning) when columns were added
# to it one at a time.
df = df.copy()

# Futures odds snapshot used for the title and conference features.
odds_snapshot = "Jan 1"

df["make_implied"] = df["Make Odds"].apply(american_to_implied)
df["miss_implied"] = df["Miss Odds"].apply(american_to_implied)
df["title_implied"] = df[f"Title Odds as of {odds_snapshot}"].apply(american_to_implied)
df["conf_implied"] = df[f"Conf Odds as of {odds_snapshot}"].apply(american_to_implied)
# "Overall" is split on "-" and the first part is taken as the win count;
# rows with no playoff record count as zero wins.
df["playoff_wins"] = df['Overall'].str.split("-").str[0].astype(float).fillna(0)

  df["playoff_wins"] = df['Overall'].str.split("-").str[0].astype(float).fillna(0)


In [25]:
# Work on a consolidated copy: the saved output showed a warning on every line
# of this cell (presumably pandas' "DataFrame is highly fragmented"
# PerformanceWarning) when columns were added to the wide frame one at a time.
df = df.copy()

# Rescale the implied probabilities so each market sums to 1:
# make vs. miss for each team, title odds across a season, and conference
# odds across each season/conference pair.
df["make_adj"] = df["make_implied"] / (df["make_implied"] + df["miss_implied"])
df["title_adj"] = df["title_implied"] / df.groupby('season')["title_implied"].transform('sum')
df["conf_adj"] = df["conf_implied"] / df.groupby(['season', 'Conference'])["conf_implied"].transform('sum')

  df["make_adj"] = df["make_implied"] / (df["make_implied"] + df["miss_implied"])
  df["title_adj"] = df["title_implied"] / df.groupby('season')["title_implied"].transform('sum')
  df["conf_adj"] = df["conf_implied"] / df.groupby(['season', 'Conference'])["conf_implied"].transform('sum')


In [26]:
# Keep only the identifiers, the model inputs and the target.
df = df.loc[
    :,
    [
        "season",
        "Team",
        "Conference",
        "Win Total",
        "make_adj",
        "conf_adj",
        "title_adj",
        "playoff_wins",
    ],
]
# Show every row that still has at least one missing value.
df.loc[df.isna().any(axis="columns")]

Unnamed: 0,season,Team,Conference,Win Total,make_adj,conf_adj,title_adj,playoff_wins
11,2014-2015,Indiana Pacers,Eastern,33.5,,0.013219,0.007309,0.0
17,2014-2015,Minnesota Timberwolves,Western,29.0,0.087137,,0.001473,0.0
22,2014-2015,Philadelphia 76ers,Eastern,16.0,0.046322,,,0.0


In [27]:
# Discard any row with a missing value in the selected columns.
df = df.dropna(axis="index", how="any")

In [30]:
# LinearRegression is imported in the notebook's import cell at the top.

# Model inputs (win total plus the three normalised probabilities) and target.
X = df[["Win Total", "make_adj", "conf_adj", "title_adj"]]
y = df["playoff_wins"]

# positive=True constrains every coefficient to be non-negative;
# fit_intercept=False fits the model without an intercept term.
lin_reg = LinearRegression(positive=True, fit_intercept=False).fit(X, y)

In [31]:
# Print each fitted coefficient beside its feature name, then the intercept.
for idx, col_name in enumerate(X.columns):
    coeff = lin_reg.coef_[idx]
    print(f"{col_name:10} {coeff:.4f}")

print(f"Intercept {lin_reg.intercept_:.4f}")

Win Total  0.0000
make_adj   2.7828
conf_adj   19.7734
title_adj  0.0000
Intercept 0.0000


In [32]:
# Same non-negative fit on the same data, this time with an intercept term.
lin_reg_intercept = LinearRegression(fit_intercept=True, positive=True)
lin_reg_intercept = lin_reg_intercept.fit(X, y)

In [33]:
# Report each model's score on the data it was fitted to.
for label, model in (
    ("Linear regression (no intercept)", lin_reg),
    ("Linear regression", lin_reg_intercept),
):
    print(f"{label}: {model.score(X, y):.4f}")

Linear regression (no intercept): 0.4516
Linear regression: 0.4519
