In [140]:
import pandas as pd
import sqlite3

#using entire database
con = sqlite3.connect("data/nba.sqlite")
games = pd.read_sql_query("SELECT * FROM game", con)
active_players = pd.read_sql_query("SELECT full_name FROM player WHERE is_active=TRUE", con)


# games data excluding all-star games (2019-2023)

games_recent_5yrs = pd.read_sql_query("""
	SELECT * FROM game 
	WHERE season_type NOT IN ('All Star', 'All-Star') 
	AND game_date BETWEEN '2019-01-01 00:00:00' AND '2023-12-31 23:59:59'
    ORDER BY game_date DESC
""", con)

print(games_recent_5yrs.head(5))

  season_id team_id_home team_abbreviation_home  team_name_home     game_id  \
0     42022   1610612743                    DEN  Denver Nuggets  0042200405   
1     42022   1610612748                    MIA      Miami Heat  0042200404   
2     42022   1610612748                    MIA      Miami Heat  0042200403   
3     42022   1610612743                    DEN  Denver Nuggets  0042200402   
4     42022   1610612743                    DEN  Denver Nuggets  0042200401   

             game_date matchup_home wl_home  min  fgm_home  ...  reb_away  \
0  2023-06-12 00:00:00  DEN vs. MIA       W  240      38.0  ...      44.0   
1  2023-06-09 00:00:00  MIA vs. DEN       L  240      35.0  ...      34.0   
2  2023-06-07 00:00:00  MIA vs. DEN       L  240      34.0  ...      58.0   
3  2023-06-04 00:00:00  DEN vs. MIA       L  240      39.0  ...      31.0   
4  2023-06-01 00:00:00  DEN vs. MIA       W  240      40.0  ...      43.0   

   ast_away  stl_away  blk_away  tov_away  pf_away  pts_away  

In [141]:
# mean, standard deviation of scores for each team
score_stats_home = games_recent_5yrs.groupby(['team_id_home','team_name_home'])['pts_home'].agg(['mean', 'std', 'count'])
score_stats_away = games_recent_5yrs.groupby(['team_id_away','team_name_away'])['pts_away'].agg(['mean', 'std', 'count'])

score_stats_home.index.names = ['team_id', 'team_name']
score_stats_away.index.names = ['team_id', 'team_name']

combined_stats = pd.concat([score_stats_home, score_stats_away])
score_stats_all = combined_stats.groupby(['team_id', 'team_name']).mean()
score_stats_all['total_games'] = combined_stats.groupby(['team_id', 'team_name'])['count'].sum()
score_stats_all = score_stats_all[score_stats_all['total_games'] >= 100]


# teams with highest average
print("Teams with highest average scores:")
print(score_stats_all.sort_values(by='mean', ascending=False).head(25))

Teams with highest average scores:
                                         mean        std  count  total_games
team_id    team_name                                                        
1610612749 Milwaukee Bucks         116.435716  12.726194  217.0          434
1610612762 Utah Jazz               114.124126  11.353901  196.5          393
1610612737 Atlanta Hawks           114.082051  12.425118  195.0          390
1610612750 Minnesota Timberwolves  114.046378  11.995903  184.0          368
1610612758 Sacramento Kings        113.760321  12.404773  186.5          373
1610612744 Golden State Warriors   113.553368  12.502250  207.5          415
1610612751 Brooklyn Nets           113.466575  12.269783  195.5          391
1610612740 New Orleans Pelicans    113.429579  11.357125  184.5          369
1610612763 Memphis Grizzlies       113.219401  12.564816  196.5          393
1610612756 Phoenix Suns            113.174049  11.591735  205.5          411
1610612743 Denver Nuggets          112.98

In [142]:
# teams with consistent scoring
print("Most consistent scoring teams (lowest standard deviation):")
print(score_stats_all.sort_values(by='std').head(25))

Most consistent scoring teams (lowest standard deviation):
                                         mean        std  count  total_games
team_id    team_name                                                        
1610612762 Utah Jazz               114.124126  11.353901  196.5          393
1610612739 Cleveland Cavaliers     107.244895  11.354891  182.0          364
1610612740 New Orleans Pelicans    113.429579  11.357125  184.5          369
1610612756 Phoenix Suns            113.174049  11.591735  205.5          411
1610612748 Miami Heat              108.997146  11.596273  217.5          435
1610612761 Toronto Raptors         111.561036  11.654833  203.0          406
1610612743 Denver Nuggets          112.986301  11.774490  219.0          438
1610612753 Orlando Magic           107.049138  11.822853  189.0          378
1610612750 Minnesota Timberwolves  114.046378  11.995903  184.0          368
1610612765 Detroit Pistons         106.855801  11.997323  183.0          366
1610612747 Los An

#Intro: A "good" game isn't just about who wins. However, when it comes to which team to bet our money on, their points certainly matter. Our goal is to get an idea of which team is liekly to win the playoffs by reviewing team stats over the past 5 years. We'll begin by discussing which teams have consistently scored the most points on average in all season types.

#Rising: The data seems to point to the Milwaukee Bucks having the highest average points per game. (Visual: bar chart all teams). However, it's important to note that averages can include outliers – games with very low percentage points and very high. (Visual: distribution or maybe scatter of all games?). Furthermore, we can see the number of games varies between teams which will affect their average.

#Climax: Keeping this in mind we will continue on to look at the standard deviation (the squared variance of data from the mean) to get a better picture of what these averages are saying. (Visual: box plot). Notice how the Milwaukee Bucks and ranked signifigantly lower when taking this into consideration.

#Falling: This tells us that while a team may average a high level of points per game, it doesn't mean they can do so consistently.

#Resolution: By using both the mean and the standard deviation, we can see the Utah Jazz and Minnesota Timberwolves are most impressive with average points of 114 and stds under 12.

In [143]:
# average points per season
season_avg = pd.concat([games_recent_5yrs['pts_home'], games_recent_5yrs['pts_away']]).mean()

home_games = games_recent_5yrs[['season_id', 'team_id_home', 'team_name_home', 'pts_home']].rename(columns={
    'team_id_home': 'team_id',
    'team_name_home': 'team_name',
    'pts_home': 'points'
})
away_games = games_recent_5yrs[['season_id', 'team_id_away', 'team_name_away', 'pts_away']].rename(columns={
    'team_id_away': 'team_id',
    'team_name_away': 'team_name',
    'pts_away': 'points'
})
all_games = pd.concat([home_games, away_games])

In [144]:
# average points per team per season
team_season_avg = all_games.groupby(['season_id', 'team_id', 'team_name'])['points'].mean().reset_index()

# below average seasons
team_season_avg['league_avg'] = season_avg
team_season_avg['below_average'] = team_season_avg['points'] < team_season_avg['league_avg']
team_season_avg = team_season_avg.sort_values(['team_id', 'season_id'])
print(team_season_avg.head(10))

    season_id     team_id       team_name      points  league_avg  \
60      12022       15019  Adelaide 36ers  116.000000  111.924325   
0       12020  1610612737   Atlanta Hawks  112.750000  111.924325   
30      12021  1610612737   Atlanta Hawks  103.250000  111.924325   
61      12022  1610612737   Atlanta Hawks  112.750000  111.924325   
92      22018  1610612737   Atlanta Hawks  116.391304  111.924325   
122     22019  1610612737   Atlanta Hawks  111.761194  111.924325   
152     22020  1610612737   Atlanta Hawks  113.694444  111.924325   
182     22021  1610612737   Atlanta Hawks  113.939024  111.924325   
212     22022  1610612737   Atlanta Hawks  118.426829  111.924325   
274     42020  1610612737   Atlanta Hawks  106.277778  111.924325   

     below_average  
60           False  
0            False  
30            True  
61           False  
92           False  
122           True  
152          False  
182          False  
212          False  
274           True  


In [145]:
# consecutive streaks
def consecutive_streak(group):
   current_streak = 0
   max_streak = 0
   
   for below_avg in group:
       if below_avg:
           current_streak += 1
           max_streak = max(max_streak, current_streak)
       else:
           current_streak = 0
           
   return max_streak

team_streaks = team_season_avg.groupby('team_name')['below_average'].apply(consecutive_streak)
team_streaks = team_streaks.sort_values(ascending=False)

print("Teams with most consecutive below-average seasons:")
print(team_streaks.head(10))


team_streaks = team_streaks.sort_values(ascending=True)

print("Teams with least consecutive below-average seasons:")
print(team_streaks.head(10))

Teams with most consecutive below-average seasons:
team_name
Orlando Magic          9
Detroit Pistons        9
Miami Heat             7
Cleveland Cavaliers    7
Charlotte Hornets      5
Houston Rockets        5
San Antonio Spurs      4
Chicago Bulls          4
New York Knicks        4
Los Angeles Lakers     4
Name: below_average, dtype: int64
Teams with least consecutive below-average seasons:
team_name
Adelaide 36ers               0
Ra'anana Maccabi Ra'anana    1
Golden State Warriors        1
New Orleans Pelicans         1
Sacramento Kings             1
Utah Jazz                    2
Boston Celtics               2
Brooklyn Nets                2
Dallas Mavericks             2
Phoenix Suns                 2
Name: below_average, dtype: int64


#Intro: We know which teams are scoring the best, but which teams are scoring the worst? It's not unheard of to have a disspointing start of the playing year and then make it to the playoffs. To consider these instances, we're going to be looking at which teams have consistently underperformed. To do this we'll be calculating the average performace and seeing which teams have consecutively not met expectations.

#Rising: The magic number across all game types is 111.9243, about 112 points for the entire league. (Visual: distribution chart). From here, our goal is to see which teams have the most seasons scoring under that average, consecutively. 

#Climax: It's clear that the teams Orlando Magic and Detroit Pistons have struggled compared to their competitors, both with 9 consecutive seasons of underporformace. (Visual: bar chart all teams)

#Falling: There are many reasons a team can do poorly from poor coaching, a poor player line-up, or even less financial support compared to others. However, when we take the reverse of our data and look for which teams had the least consecutive underperforming seasons... (Visual: bar chart all teams) we can see that there are many teams that can turn it around within the 4 possible seasons types. The Adelaide 36ers has actually never underperformed the league.

#Resolution: While there may be legitimate reasons a team may not reach league standards, when considered how reliable a team will be for our money we have to consider which ones can bounce back. There are 5 teams who have only one or less "bad" seasons. History implies these teams can be trusted to shake off inevitable losses.

In [151]:
print(games['season_type'].unique())

['Regular Season' 'Playoffs' 'All-Star' 'All Star' 'Pre Season']


In [191]:
# playoff games
playoff_games = games_recent_5yrs[games_recent_5yrs['season_type']=='Playoffs']

# playoff appearances (seasons)
playoff_home_seasons = playoff_games[['team_name_home', 'season_id']].rename(
   columns={'team_name_home': 'team_name'}).drop_duplicates()
playoff_away_seasons = playoff_games[['team_name_away', 'season_id']].rename(
   columns={'team_name_away': 'team_name'}).drop_duplicates()
all_playoff_seasons = pd.concat([playoff_home_seasons, playoff_away_seasons]).drop_duplicates()
playoff_appearances = all_playoff_seasons.groupby('team_name').size()

# total playoff games (rounds)
playoff_home_games = playoff_games[['team_name_home', 'game_id']].rename(
   columns={'team_name_home': 'team_name'})
playoff_away_games = playoff_games[['team_name_away', 'game_id']].rename(
   columns={'team_name_away': 'team_name'})
all_playoff_games = pd.concat([playoff_home_games, playoff_away_games])
playoff_games_count = all_playoff_games.groupby('team_name').size()

# playoff appearances, for both seasons & games
playoff_all = pd.DataFrame({
    'team_name': playoff_appearances.index,
    'seasons': playoff_appearances.values,
    'total_games': playoff_games_count.values
})
    
# success rate

playoff_all['avg_games_played'] = (playoff_all['total_games'] / playoff_all['seasons'])

print("\nPlayoff appearances and games (2019-2023):")
print(playoff_all[['team_name', 'seasons', 'total_games', 'avg_games_played']].sort_values(by=['seasons', 'avg_games_played'], ascending=False))


Playoff appearances and games (2019-2023):
                 team_name  seasons  total_games  avg_games_played
1           Boston Celtics        5           75         15.000000
6           Denver Nuggets        5           68         13.600000
15         Milwaukee Bucks        5           65         13.000000
21      Philadelphia 76ers        5           51         10.200000
2            Brooklyn Nets        5           29          5.800000
14              Miami Heat        4           66         16.500000
11             LA Clippers        4           43         10.750000
27               Utah Jazz        4           29          7.250000
8    Golden State Warriors        3           57         19.000000
22            Phoenix Suns        3           46         15.333333
12      Los Angeles Lakers        3           43         14.333333
26         Toronto Raptors        3           41         13.666667
5         Dallas Mavericks        3           31         10.333333
0            Atlan

#Intro: The goal of all successful teams is to win the playoffs. The sad reality is, in every NBA season some never get the chance to play. We've been looking at team performace over all seasons: regular, playoffs, all-star, and pre-season. Now we're going to look at which teams have had the most appearances in just the playoffs. 

#Rising: Our data is tracking the number of seasons a team has made it to the playoffs and how many games they participated in per playoff season. This answers two important questions: Who gets the chance to win the playoffs and who has gotten the furthest.

#Climax: From our data we can see five teams are tied for the most playoff appreances. (Visual: bar chart all teams). However, when looking at the total games played by those five, the Celtics are furthest ahead with 75 total games. (Visual: bar chart, number of games).

#Falling: It's obvious that a team with only 4 or 3 playoff seasons can still make it in the upcoming year. When we consider how well players do when they make it to the finals, 5 teams stand out: Warriors, Heat, Sun, and Celtics – All averaging more than 15 games in the playoffs. (Visual: bubble chart x=total games, y=avg games playes, bubble size=number of seasons)

#Conclusion: A team who can make it to the playoffs is certainly a capable team in the league. However, it's important that a team can actually win once they're there. The Celtics are a great choice for making it to the finals, but if the Golden State Warriors also make it, history shows they may outperform their rivals.

In [192]:
#Question 4: Which teams have shown the greatest improvement over the past five years?
# extract year from game_date
games_recent_5yrs['game_year'] = pd.to_datetime(games_recent_5yrs['game_date']).dt.year

# limit to regular season
regular_games = games_recent_5yrs[games_recent_5yrs['season_type']=='Regular Season']

# home and away game data 
reg_home_games = regular_games[['team_name_home', 'game_year', 'pts_home']].rename(
    columns={'team_name_home': 'team_name', 'pts_home': 'points'})
reg_away_games = regular_games[['team_name_away', 'game_year', 'pts_away']].rename(
    columns={'team_name_away': 'team_name', 'pts_away': 'points'})
reg_all_games = pd.concat([reg_home_games, reg_away_games])

# average points for each team by year
team_year_avg = reg_all_games.groupby(['team_name', 'game_year'])['points'].mean().reset_index()

# improvement
improvement_list = []

for team in team_year_avg['team_name'].unique():
    team_data = team_year_avg[team_year_avg['team_name'] == team].sort_values('game_year')
    
    total_improvement = team_data['points'].diff().sum() 
    
    improvement_list.append({
        'team_name_home': team, 
        'total_improvement': total_improvement
    })

team_improvement = pd.DataFrame(improvement_list)
team_improvement = team_improvement.sort_values(by='total_improvement', ascending=False)

print("Teams with the greatest improvement over the past five years:")
print(team_improvement.head(20))


Teams with the greatest improvement over the past five years:
            team_name_home  total_improvement
19         New York Knicks          14.695726
25        Sacramento Kings          12.650000
14       Memphis Grizzlies          10.526330
0            Atlanta Hawks           9.028261
11          Indiana Pacers           8.326020
9    Golden State Warriors           7.655134
5      Cleveland Cavaliers           7.627350
13      Los Angeles Lakers           6.813824
20   Oklahoma City Thunder           6.298844
21           Orlando Magic           5.524613
7           Denver Nuggets           5.377463
4            Chicago Bulls           5.149422
1           Boston Celtics           4.875494
28               Utah Jazz           4.740310
22      Philadelphia 76ers           4.458629
6         Dallas Mavericks           4.343460
3        Charlotte Hornets           4.276423
17  Minnesota Timberwolves           4.053968
12             LA Clippers           2.143659
15              Mi

#Intro: Our previous data has discussed the 'best' teams historically. We also want to take a look at what teams are showing the most improvement who may surprise us in the upcoming playoffs.

#Rising: When looking at the data year after year, it is easy to see that teams we've expected to do poorly have increased their stats. (Visual: dual axis chart showing improvement vs playoff attendance?) To uncover who's done the best, we've taken their average points per year and calculated whos shown the most improvement.

#Climax: From our chart we can see the New York Knicks have increased their points at the highest rate. They're followed by The Sacramento Kings and Memphis Grizzlies. (Visual: line chart with top 5 teams?)

#Falling: What this data is aiming to show is a teams ability to strategize and improve. By only considering teams who have historically been successful, we aren't seeing a full picture of the league.

#Conclusion: When we join our stats together, we can see that our past questions revealed the Golden State Warriors do very well when in the finals and haven't had many consecutively bad seasons. This questions shows us they are also improving compared to their other high-ranking peers. However, to continue searching for a full picture, we hope to answer one final question.

In [208]:
import pandas as pd
import sqlite3 
con = sqlite3.connect("data/nba.sqlite")
games_recent_5yrs = pd.read_sql_query("""
	SELECT * FROM game 
	WHERE season_type NOT IN ('All Star', 'All-Star') 
	AND game_date BETWEEN '2019-01-01 00:00:00' AND '2023-12-31 23:59:59'
    ORDER BY game_date DESC
""", con)

regular_games = games_recent_5yrs[games_recent_5yrs['season_type']=='Regular Season']

# home games stats
home_defense = regular_games[['team_name_home', 'blk_home', 'stl_home', 'wl_home', 'oreb_home', 'fga_home', 'tov_home', 'fta_home', 'pts_home']].rename(
   columns={'team_name_home': 'team_name', 
           'blk_home': 'blocks',
           'stl_home': 'steals',
           'wl_home': 'win_loss',
           'oreb_home': 'off_rebounds',
           'fga_home': 'attempt_fg',
           'tov_home': 'turnovers',
           'fta_home': 'attempt_freethrow',
           'pts_home': 'total_points'})

# away games stats
away_defense = regular_games[['team_name_away', 'blk_away', 'stl_away', 'wl_away', 'oreb_away', 'fga_away', 'tov_away', 'fta_away', 'pts_away']].rename(
   columns={'team_name_away': 'team_name',
           'blk_away': 'blocks', 
           'stl_away': 'steals',
           'wl_away': 'win_loss',
           'oreb_away': 'off_rebounds',
           'fga_away': 'attempt_fg',
           'tov_away': 'turnovers',
           'fta_away': 'attempt_freethrow',
           'pts_away': 'total_points'})

# defense metrics
all_defense = pd.concat([home_defense, away_defense])
defense_stats = all_defense.groupby('team_name').agg(
    avg_blocks=('blocks', 'mean'),
    avg_steals=('steals', 'mean'),
    avg_rebound=('off_rebounds', 'mean'),
    avg_fieldgoal=('attempt_fg', 'mean'),
    avg_turnover=('turnovers', 'mean'),
    avg_freethrow=('attempt_freethrow', 'mean'),
    avg_pts=('total_points', 'mean'),
).reset_index()

# win rate 
wins = all_defense[all_defense['win_loss'] == 'W'].groupby('team_name').size()
total_games = all_defense.groupby('team_name').size()
win_rates = (wins / total_games).reset_index(name='win_rate')

# defense rating
defense_analysis = pd.merge(defense_stats, win_rates, on='team_name')
defense_analysis['defense_rating'] = (defense_analysis['avg_pts'] / (.96 * defense_analysis['avg_fieldgoal'] + defense_analysis['avg_turnover'] + .44 * defense_analysis['avg_freethrow'] - defense_analysis['avg_rebound'])) *100
defense_sorted = defense_analysis.sort_values(
   by=['win_rate', 'defense_rating'], 
   ascending=[False, True]
)

print("Teams with the best defensive metrics and their win consistency:")
print(defense_sorted)

Teams with the best defensive metrics and their win consistency:
                 team_name  avg_blocks  avg_steals  avg_rebound  \
16         Milwaukee Bucks    4.957865    7.359551    10.143258   
22      Philadelphia 76ers    5.336158    8.050847     9.644068   
7           Denver Nuggets    4.296919    7.627451    10.316527   
1           Boston Celtics    5.485876    7.502825    10.228814   
28               Utah Jazz    5.056657    6.603399    10.600567   
12             LA Clippers    4.564972    7.211864     9.723164   
23            Phoenix Suns    4.628895    7.824363     9.977337   
15              Miami Heat    3.823034    7.676966     9.241573   
27         Toronto Raptors    5.036932    8.914773    11.062500   
2            Brooklyn Nets    5.235795    6.821023     9.633523   
14       Memphis Grizzlies    5.619718    8.653521    11.701408   
6         Dallas Mavericks    4.182073    6.263305     9.235294   
9    Golden State Warriors    4.713043    8.043478     9.594203 

#Intro: We'd like to see which teams are showing the most technical capabilites and how it affects their win-rate. If a team can average more blocks or steals, will that lead to more wins historically?

#Rising: Using the games in the regular season, we tracked an array of metrics such as, but not limited to, blocks, steals, and attempted freethrows. We then filtered by win-rate and defensive ratings to see if there were patterns – do winning teams have stronger stats?

#Climax: The data shows a low a correlation between technical ability and win-rate. (Visual: scatter plots for main metrics, excluding defensive rating?) For example, the Pistons have the lowest win-rate in the league but actually have one of the highest attempted turnovers and second lowest defense rating. What's especially interesting the defense rating, the metric that most closely follows the win-rate. It's results actually show that a better (in this case, lower) rating is associated with teams who win less. (Visual: Scatter plot with x=defensive rating and y=win-rate)

#Rising: These results may lead to conclusion of highly defensive teams performing better. However, this singly question couldn't wish to answer which specific team is most likely to do well all-around.

#Climax: The goal is to combine the 5 queries we've shown to find which team is most probably to excell in the Playoffs. We've learned that overall stats don't necessarily lead to more winning games. It may be of some use to consider a team's defense rating in final decision-making, so long as congruently approached by other metrics.

In [198]:
print(games.columns)

Index(['season_id', 'team_id_home', 'team_abbreviation_home', 'team_name_home',
       'game_id', 'game_date', 'matchup_home', 'wl_home', 'min', 'fgm_home',
       'fga_home', 'fg_pct_home', 'fg3m_home', 'fg3a_home', 'fg3_pct_home',
       'ftm_home', 'fta_home', 'ft_pct_home', 'oreb_home', 'dreb_home',
       'reb_home', 'ast_home', 'stl_home', 'blk_home', 'tov_home', 'pf_home',
       'pts_home', 'plus_minus_home', 'video_available_home', 'team_id_away',
       'team_abbreviation_away', 'team_name_away', 'matchup_away', 'wl_away',
       'fgm_away', 'fga_away', 'fg_pct_away', 'fg3m_away', 'fg3a_away',
       'fg3_pct_away', 'ftm_away', 'fta_away', 'ft_pct_away', 'oreb_away',
       'dreb_away', 'reb_away', 'ast_away', 'stl_away', 'blk_away', 'tov_away',
       'pf_away', 'pts_away', 'plus_minus_away', 'video_available_away',
       'season_type'],
      dtype='object')
