In [1]:
import pandas as pd
from functools import reduce
import matplotlib.pyplot as plt

In [2]:
# Load the season tracking sheet (spreadsheet columns A through T only).
file_path = '../data/nba_points_2024_2025.xlsx'
df = pd.read_excel(file_path, usecols="A:T")

# Keep only games that have been played and tracked. The explicit .copy()
# makes df an independent frame, so the column assignments below do not
# raise SettingWithCopyWarning by writing into a filtered view of the raw sheet.
df = df[df['Game Count'] == 1].copy()
# Remove stray whitespace from the team columns so the equality filters used
# later ('Home' == team / 'Away' == team) match consistently.
df['Home'] = df['Home'].str.strip()
df['Away'] = df['Away'].str.strip()
# Feature engineering: 1/0 flags for which side scored more points.
# Both flags are 0 when the two point totals are equal.
df['Home Win'] = (df['Home Points'] > df['Away Points']).astype(int)
df['Away Win'] = (df['Away Points'] > df['Home Points']).astype(int)


In [20]:
# Headline summary: game volume, average totals, over/under split and date span.
print('Games Tracked: ', round(df['Game Count'].sum()))

# Plain averages, reported to two decimals.
for label, column in (('Average Open Total: ', 'Open Total'),
                      ('Average Actual Total: ', 'Actual Total')):
    print(label, round(df[column].mean(), 2))

# The mean of each indicator column, scaled to a percentage.
for label, column in (('Over Percent: ', 'Over'),
                      ('Under Percent: ', 'Under')):
    print(label, round(df[column].mean() * 100, 2))

# ISO-formatted date strings sort chronologically, so min/max on the strings
# give the first and last tracked dates.
tracked_dates = df.loc[df['Game Count'] == 1, 'Date'].dt.strftime('%Y-%m-%d')
print('First Date Tracked: ', tracked_dates.min())
print('Last Date Tracked: ', tracked_dates.max())

Games Tracked:  915
Average Open Total:  226.56
Average Actual Total:  227.1
Over Percent:  52.57
Under Percent:  47.32
First Date Tracked:  2024-10-22
Last Date Tracked:  2025-03-04


In [21]:
# Every team code that appears in either the Home or the Away column
teams = pd.concat([df['Home'], df['Away']]).unique()

# One record per team, collected here and turned into a DataFrame at the end
stats = []

for team in teams:
    hosted = df['Home'] == team
    visited = df['Away'] == team

    # 'Over' values for the team's home games, away games, and both combined
    home_overs = df.loc[hosted, 'Over']
    away_overs = df.loc[visited, 'Over']
    all_overs = df.loc[hosted | visited, 'Over']

    # The mean of the 'Over' column, scaled to a percentage.
    # The venue splits fall back to None when the team has no such games.
    record = {'Team': team, 'Overall Over %': all_overs.mean() * 100}
    record['Home Over %'] = None if home_overs.empty else home_overs.mean() * 100
    record['Away Over %'] = None if away_overs.empty else away_overs.mean() * 100
    stats.append(record)

# Season-long over rates, one row per team
stats_df = pd.DataFrame(stats)

In [32]:
# Leaderboard of the teams whose games went Over most often
print('Teams with the Most Overs:')
print('-' * 50)
# A quarter of 30 (presumably the number of teams); round(7.5) evaluates to 8.
top_quarter = round(30*.25)
highest_first = stats_df.sort_values('Overall Over %', ascending=False)
print(highest_first.head(top_quarter))

Teams with the Most Overs:
--------------------------------------------------
   Team  Overall Over %  Home Over %  Away Over %
23  MEM       70.000000    64.516129    75.862069
17  CLE       61.666667    62.500000    60.714286
5   ATL       61.290323    68.965517    54.545455
14  DEN       60.655738    68.965517    53.125000
9   UTA       60.655738    56.250000    65.517241
7   NOP       58.333333    56.666667    60.000000
18  NYK       58.333333    56.250000    60.714286
3   PHI       57.377049    53.125000    62.068966


In [33]:
# Teams with the LOWEST overall over percentage, i.e. the most Under-leaning
print('Teams with the Most Unders:')
print('-' * 50)
# A quarter of 30 (presumably the number of teams); round(7.5) evaluates to 8.
bottom_quarter = round(30*.25)
lowest_first = stats_df.sort_values('Overall Over %', ascending=True)
print(lowest_first.head(bottom_quarter))

Teams with the Most Unders:
--------------------------------------------------
   Team  Overall Over %  Home Over %  Away Over %
20  CHA       36.666667    32.258065    41.379310
16  ORL       41.538462    33.333333    50.000000
27  BKN       43.333333    39.285714    46.875000
29  GSW       43.548387    36.666667    50.000000
0   BOS       44.262295    50.000000    38.709677
11  LAC       45.901639    44.827586    46.875000
24  SAS       48.333333    50.000000    46.666667
13  DAL       48.387097    37.500000    60.000000


In [49]:
# Ensure the 'Date' column is datetime (leaves an already-datetime column unchanged)
df['Date'] = pd.to_datetime(df['Date'])

# Sort chronologically once instead of re-sorting three subsets per team.
# A stable sort keeps same-day games in their original relative order, so the
# per-team slices taken below are deterministic.
games_by_date = df.sort_values('Date', kind='mergesort')

# Get all unique team names from both Home and Away columns.
# Taken from df (not the sorted copy) so the team order, and therefore the
# row index of stats10_df, stays the same as before.
teams = pd.concat([df['Home'], df['Away']]).unique()

# List to hold one record per team
stats = []

for team in teams:
    is_home = games_by_date['Home'] == team
    is_away = games_by_date['Away'] == team

    # Ten most recent games overall, at home, and on the road. The venue
    # splits are each the last ten at that venue, so they are not a partition
    # of the overall last ten.
    team_games_last10 = games_by_date[is_home | is_away].tail(10)
    home_games_last10 = games_by_date[is_home].tail(10)
    away_games_last10 = games_by_date[is_away].tail(10)

    # Over % for each subset: mean of the 'Over' column as a percentage,
    # or None when the subset has no games.
    overall_over_pct = team_games_last10['Over'].mean() * 100 if not team_games_last10.empty else None
    home_over_pct = home_games_last10['Over'].mean() * 100 if not home_games_last10.empty else None
    away_over_pct = away_games_last10['Over'].mean() * 100 if not away_games_last10.empty else None

    stats.append({
        'Team': team,
        'Overall Over % Last10': overall_over_pct,
        'Home Over % Last10': home_over_pct,
        'Away Over % Last10': away_over_pct,
    })

# Convert the list of records into a DataFrame, one row per team
stats10_df = pd.DataFrame(stats)

In [55]:
# Teams ranked by over rate across their ten most recent games; show the top 8
print('Teams with the most Overs in their last 10 games:')
print('-'*70)
recent_ranking = stats10_df.sort_values(by='Overall Over % Last10', ascending=False)
print(recent_ranking.head(8))

Teams with the most Overs in their last 10 games:
----------------------------------------------------------------------
   Team  Overall Over % Last10  Home Over % Last10  Away Over % Last10
28  OKC                   80.0                80.0                60.0
17  CLE                   80.0                70.0                60.0
9   UTA                   80.0                80.0                60.0
11  LAC                   70.0                60.0                70.0
5   ATL                   70.0                60.0                70.0
10  POR                   60.0                50.0                50.0
22  CHI                   60.0                70.0                40.0
21  MIN                   60.0                50.0                70.0


In [60]:
# Per-crew-chief tallies: number of games in each group, plus the sums of the
# 'Over' and 'Under' columns. The built-in 'sum' aggregation replaces the
# Python lambda, which computed the same thing one group at a time.
crew_stats = df.groupby('Crew Chief').agg(
    games_count=('Crew Chief', 'size'),  # total number of games for each crew chief
    over_count=('Over', 'sum'),
    under_count=('Under', 'sum'),
).reset_index()

# Over and under counts as a percentage of each chief's games. The two need
# not add up to 100: some rows count as neither (presumably pushes - verify).
crew_stats = crew_stats.assign(
    over_percentage=(crew_stats['over_count'] / crew_stats['games_count']) * 100,
    under_percentage=(crew_stats['under_count'] / crew_stats['games_count']) * 100,
)

In [62]:
# Crew chiefs with at least 15 games, lowest over percentage first (top 7 rows)
report_columns = ['Crew Chief', 'games_count', 'over_percentage', 'under_percentage']
has_enough_games = crew_stats['games_count'] >= 15
under_leaning = crew_stats.loc[has_enough_games, report_columns].sort_values(by='over_percentage')
print(under_leaning.head(7))

             Crew Chief  games_count  over_percentage  under_percentage
9           Jacyn Goble           15        26.666667         73.333333
7             Ed Malloy           44        38.636364         61.363636
11       James Williams           47        40.425532         59.574468
6         David Guthrie           19        42.105263         57.894737
8   Gediminas Petraitis           23        43.478261         56.521739
26        Tony Brothers           37        45.945946         54.054054
1          Bill Kennedy           26        46.153846         53.846154


In [63]:
# Crew chiefs with at least 15 games, highest over percentage first (top 7 rows)
report_columns = ['Crew Chief', 'games_count', 'over_percentage', 'under_percentage']
has_enough_games = crew_stats['games_count'] >= 15
over_leaning = crew_stats.loc[has_enough_games, report_columns].sort_values(
    by='over_percentage', ascending=False
)
print(over_leaning.head(7))

      Crew Chief  games_count  over_percentage  under_percentage
28    Tyler Ford           50        66.000000         34.000000
24  Scott Foster           40        65.000000         35.000000
13    Josh Tiven           49        63.265306         36.734694
18    Marc Davis           45        60.000000         37.777778
19  Mark Lindsay           37        56.756757         43.243243
3    Brian Forte           30        56.666667         43.333333
25   Sean Wright           34        55.882353         44.117647


In [65]:
# List the matchups on the most recent date present in the tracked data
latest_date = df['Date'].max()
played_on_latest = df['Date'] == latest_date
print(df.loc[played_on_latest, ['Date', 'Weekday', 'Away', 'Home']])

          Date  Weekday Away Home
913 2025-03-04  Tuesday  TOR  ORL
914 2025-03-04  Tuesday  HOU  IND
915 2025-03-04  Tuesday  GSW  NYK
916 2025-03-04  Tuesday  MIL  ATL
917 2025-03-04  Tuesday  CLE  CHI
918 2025-03-04  Tuesday  PHI  MIN
919 2025-03-04  Tuesday  BKN  SAS
920 2025-03-04  Tuesday  LAC  PHX
921 2025-03-04  Tuesday  NOP  LAL
