In [74]:
import pandas as pd
import numpy as np

In [75]:
# Read the complete betting-spreads table from disk.
df = pd.read_csv("nba_data/nba_betting_spread.csv")
# Keep only the rows whose `book_name` is exactly 'BetOnline', then persist
# that subset (index omitted) so later cells can start from it.
is_bet_online = df["book_name"].eq("BetOnline")
df_betOnline = df.loc[is_bet_online]
df_betOnline.to_csv("nba_data/nba_betting_spread_betOnline.csv", index=False)

In [76]:
# Reload the BetOnline-only spreads file.
df = pd.read_csv("nba_data/nba_betting_spread_betOnline.csv")
# Bookmaker identifier and price columns are removed; every other column is
# kept as-is. A missing column raises KeyError.
dropped_columns = ["book_name", "book_id", "price1", "price2"]
df_clean = df.drop(dropped_columns, axis="columns")
# Write the slimmed-down spreads table without the index.
df_clean.to_csv("nba_data/nba_betting_spread_cleaned.csv", index=False)

In [77]:
# Read the full games table.
df = pd.read_csv("nba_data/nba_games_all.csv")
# Exclude every row whose `season_year` equals 2006 and save the remainder
# (index omitted).
not_2006 = df["season_year"].ne(2006)
df_filtered = df.loc[not_2006]
df_filtered.to_csv("nba_data/nba_games_all_filtered.csv", index=False)

In [78]:
# Reload the games table that already excludes season_year 2006.
df = pd.read_csv("nba_data/nba_games_all_filtered.csv")
# Columns removed before the merge with spreads. Note that this list includes
# both `season_year` and `season`, so no season identifier from this list
# survives into the output file.
columns_to_drop = [
    "game_date", "matchup", "w", "l", "season_year", "min",
    "fgm", "fga", "fg_pct",
    "fg3m", "fg3a", "fg3_pct",
    "ftm", "fta", "ft_pct",
    "oreb", "dreb", "reb",
    "ast", "stl", "blk", "tov", "pf", "pts",
    "season",
]
df_cleaned = df.drop(columns_to_drop, axis="columns")
# Persist the reduced games table without the index.
df_cleaned.to_csv("nba_data/nba_games_final.csv", index=False)

In [79]:
# Inputs: the reduced games table and the cleaned BetOnline spreads table.
games_df = pd.read_csv("nba_data/nba_games_final.csv")
spreads_df = pd.read_csv("nba_data/nba_betting_spread_cleaned.csv")
# Only the join key and the two spread columns are carried over.
spread_columns = spreads_df[["game_id", "spread1", "spread2"]]
# Left join: every games row is kept; games without a spread get NaN.
# NOTE(review): the join key is `game_id` alone, so every games row sharing a
# game_id receives the same spread1/spread2. If the games table holds one row
# per team per game, spread1 may not describe the team on that row -- confirm
# against the spreads schema before relying on per-team spread logic.
merged_df = pd.merge(games_df, spread_columns, how="left", on="game_id")
merged_df.to_csv("nba_data/factors_and_spreads.csv", index=False)

In [80]:
# Reload the merged games + spreads table.
df = pd.read_csv("nba_data/factors_and_spreads.csv")
# Keep rows with a strictly positive spread1 (rows with NaN spread1 are
# dropped too, since NaN > 0 is False).
# NOTE(review): the variable names say "favorites", but this notebook treats
# spread1 > 0 as the underdog side; the names are kept unchanged here.
has_positive_spread = df["spread1"].gt(0)
df_favorites = df.loc[has_positive_spread]
# One row per game_id: the first row encountered for each game is retained.
df_favorites_unique = df_favorites.drop_duplicates("game_id")
# Overwrite the merged file with the filtered, de-duplicated rows.
df_favorites_unique.to_csv("nba_data/factors_and_spreads.csv", index=False)

In [81]:
# Reload the working dataset.
df = pd.read_csv("nba_data/factors_and_spreads.csv")
# Discard rows that have no `w_pct` value.
has_w_pct = df["w_pct"].notna()
df_cleaned = df.loc[has_w_pct]
# Overwrite the working dataset in place on disk (index omitted).
df_cleaned.to_csv("nba_data/factors_and_spreads.csv", index=False)

In [82]:
# Reload the working dataset.
df = pd.read_csv("nba_data/factors_and_spreads.csv")
# A row counts as an upset when its spread1 is positive AND its `wl` value is
# the string "W". The result is stored as a boolean `is_upset` column.
positive_spread = df["spread1"].gt(0)
won_game = df["wl"].eq("W")
df = df.assign(is_upset=positive_spread & won_game)
# Overwrite the working dataset with the new column included.
df.to_csv("nba_data/factors_and_spreads.csv", index=False)

In [83]:
# Reload the working dataset.
df = pd.read_csv("nba_data/factors_and_spreads.csv")
# Exclude rows whose `season_type` is exactly "Pre Season".
not_preseason = df["season_type"].ne("Pre Season")
df = df.loc[not_preseason]
# Overwrite the working dataset on disk (index omitted).
df.to_csv("nba_data/factors_and_spreads.csv", index=False)

In [84]:
# Playoff rows carry a win percentage that restarts with the playoffs, so this
# cell overwrites each Playoffs row's w_pct with a regular-season value for the
# same team.
df = pd.read_csv("nba_data/factors_and_spreads.csv")

regular_rows = df["season_type"].eq("Regular Season")
playoff_rows = df["season_type"].eq("Playoffs")

# For every team_id, take the last non-null w_pct found among its
# "Regular Season" rows (in file order).
# NOTE(review): the grouping key is team_id only, with no season component, so
# a team gets a single value applied to all of its Playoffs rows -- confirm
# this is intended if the file spans several seasons.
regular_season_wpct = df.loc[regular_rows].groupby("team_id")["w_pct"].last()

# Look up that value for every row, then assign it to the Playoffs rows only
# (alignment is by row index). Teams with no Regular Season row map to NaN.
wpct_by_team = df["team_id"].map(regular_season_wpct)
df.loc[playoff_rows, "w_pct"] = wpct_by_team

# Overwrite the working dataset on disk (index omitted).
df.to_csv("nba_data/factors_and_spreads.csv", index=False)

In [85]:
# Reload the working dataset.
df = pd.read_csv("nba_data/factors_and_spreads.csv")

# The mean of the boolean `is_upset` column is the share of rows flagged as
# upsets; it is reported as a percentage with four decimals.
upset_rate = df["is_upset"].mean()
print("Overall Upset Rate: {:.4%}".format(upset_rate))

Overall Upset Rate: 49.1002%


In [86]:
# Upset rate split by whether the row's team played at home.
df = pd.read_csv("nba_data/factors_and_spreads.csv")
# `is_home` is stored as the strings 't' / 'f'. Comparing against 't' yields a
# real boolean column in one step: 't' -> True, anything else (including 'f'
# and missing values) -> False. This matches the previous map + fillna(False)
# result without the object-dtype downcasting deprecation warning.
df['is_home'] = df['is_home'].eq('t')
# Per home/away group: number of upsets (sum of booleans) and number of games.
upset_summary = df.groupby('is_home').agg(
    upsets=('is_upset', 'sum'),
    games=('is_upset', 'count')
)
# Proportion of games in each group that were upsets.
upset_summary['prop'] = upset_summary['upsets'] / upset_summary['games']
# Print the summary table
print(upset_summary)
# Narrative lines are generated from the table so they can never disagree with
# it when the underlying data changes. Groups appear in index order
# (False = away first, then True = home).
venue_phrases = {False: 'away', True: 'at home'}
for home_flag, prop in upset_summary['prop'].items():
    venue = venue_phrases[bool(home_flag)]
    print(f'An underdog team playing {venue} won {prop:.2%} of the time')

         upsets  games      prop
is_home                         
False      1325   4698  0.282035
True       3204   4526  0.707910
An underdog team playing away won 28.20% of the time
An underdog team playing at home won 70.79% of the time


In [87]:
# Upset rate by win-percentage quartile, overall and split by home/away.
df = pd.read_csv("nba_data/factors_and_spreads.csv")
# `is_home` is stored as the strings 't' / 'f'. Comparing against 't' yields a
# boolean column: 't' -> True, anything else (including missing) -> False.
df['is_home'] = df['is_home'].eq('t')
# Bin rows into four equal-sized groups (quartiles) of `w_pct`.
df["w_pct_bin"] = pd.qcut(df["w_pct"], q=4, labels=["Low", "Mid-Low", "Mid-High", "High"])
# Combined "<Home|Away> <bin>" label used as a single grouping key.
df['home_w_pct_bin'] = df['is_home'].map({True: 'Home', False: 'Away'}) + " " + df['w_pct_bin'].astype(str)

# Wording used in the narrative lines for each quartile label and venue.
bin_phrases = {
    "Low": "bottom 25th percentile",
    "Mid-Low": "25-50th percentile",
    "Mid-High": "50-75th percentile",
    "High": "top 25th percentile",
}
venue_phrases = {"Away": "Away", "Home": "at Home"}

# --- Upsets by win-percentage quartile ---------------------------------------
w_pct_upset_summary = df.groupby("w_pct_bin", observed=False).agg(
    upsets=("is_upset", "sum"),
    games=("is_upset", "count")
)
w_pct_upset_summary["prop"] = w_pct_upset_summary["upsets"] / w_pct_upset_summary["games"]
print(w_pct_upset_summary)
# Statements are derived from the table (in categorical order Low -> High) so
# the text always matches the numbers printed above it.
for bin_label, prop in w_pct_upset_summary["prop"].items():
    print(f'Teams with a win percentage in the {bin_phrases[bin_label]} win as an underdog {prop:.2%} of the time.')

# --- Upsets by venue and win-percentage quartile -----------------------------
upset_summary = df.groupby('home_w_pct_bin').agg(
    upsets=('is_upset', 'sum'),  # Total upsets
    games=('is_upset', 'count')  # Total games
)
upset_summary['prop'] = upset_summary['upsets'] / upset_summary['games']
# Reset index so the combined label is an ordinary column in the printout.
upset_summary = upset_summary.reset_index()
print(upset_summary)
# Previously these sentences were typed by hand and two of them quoted the
# wrong figure (Away/bottom and Home/bottom); generating them from the table
# rows keeps every sentence consistent with its row. Rows are in the table's
# (alphabetical) label order.
for combined_label, prop in zip(upset_summary['home_w_pct_bin'], upset_summary['prop']):
    venue, bin_label = combined_label.split(" ", 1)
    # Rows whose w_pct was missing get the bin label 'nan'; describe them
    # literally instead of failing.
    bin_phrase = bin_phrases.get(bin_label, f"'{bin_label}' group")
    print(f'Teams playing {venue_phrases[venue]} with a win percentage in the {bin_phrase} win as an underdog {prop:.2%} of the time.')

           upsets  games      prop
w_pct_bin                         
Low           627   2345  0.267377
Mid-Low      1024   2272  0.450704
Mid-High     1289   2308  0.558492
High         1589   2299  0.691170
Teams with a win percentage in the bottom 25th percentile win as an underdog 26.74% of the time.
Teams with a win percentage in the 25-50th percentile win as an underdog 45.07% of the time.
Teams with a win percentage in the 50-75th percentile win as an underdog 55.85% of the time.
Teams with a win percentage in the top 25th percentile win as an underdog 69.12% of the time.
  home_w_pct_bin  upsets  games      prop
0      Away High     297    705  0.421277
1       Away Low     315   1728  0.182292
2  Away Mid-High     331    986  0.335700
3   Away Mid-Low     382   1279  0.298671
4      Home High    1292   1594  0.810540
5       Home Low     312    617  0.505673
6  Home Mid-High     958   1322  0.724660
7   Home Mid-Low     642    993  0.646526
Teams playing Away with a win perce