# Issue 5 - descriptive statistics

## Question 1: Which teams have most consistently had the highest scores?

In [2]:
import pandas as pd
import sqlite3

con = sqlite3.connect("./data/nba.sqlite")
games = pd.read_sql_query("SELECT * FROM game", con)

games_recent_5yrs = pd.read_sql_query("""
    SELECT * FROM game 
    WHERE season_type NOT IN ('All Star', 'All-Star') 
    AND game_date BETWEEN '2019-01-01 00:00:00' AND '2023-12-31 23:59:59'
    ORDER BY game_date DESC
""", con)

games_recent_5yrs['game_year'] = pd.to_datetime(games_recent_5yrs['game_date']).dt.year

score_stats_home = games_recent_5yrs.groupby(['team_id_home','team_name_home', 'game_year'])['pts_home'].agg(['mean', 'std', 'count'])
score_stats_away = games_recent_5yrs.groupby(['team_id_away','team_name_away', 'game_year'])['pts_away'].agg(['mean', 'std', 'count'])

score_stats_home.index.names = ['team_id', 'team_name', 'game_year']
score_stats_away.index.names = ['team_id', 'team_name', 'game_year']

combined_stats = pd.concat([score_stats_home, score_stats_away])
score_stats_all = combined_stats.groupby(['team_id', 'team_name','game_year']).mean()
score_stats_all['total_games'] = combined_stats.groupby(['team_id', 'team_name'])['count'].sum()
score_stats_all = score_stats_all[score_stats_all['total_games'] >= 100]

# teams with highest average
print("Teams with highest average scores:")
print(score_stats_all.sort_values(by='mean', ascending=False).head(10)) 

Teams with highest average scores:
                                                   mean        std  count  \
team_id    team_name              game_year                                 
1610612758 Sacramento Kings       2023       121.351190  14.097170   27.5   
1610612737 Atlanta Hawks          2023       120.935556  10.581633   26.0   
1610612749 Milwaukee Bucks        2023       119.807692  14.019533   26.0   
1610612760 Oklahoma City Thunder  2023       119.475379  11.428609   23.0   
1610612744 Golden State Warriors  2023       119.241379  12.043391   29.0   
1610612749 Milwaukee Bucks        2019       118.377764  11.766527   48.5   
1610612750 Minnesota Timberwolves 2022       117.711762  12.590458   47.5   
1610612740 New Orleans Pelicans   2020       117.321429  11.167296   22.5   
1610612742 Dallas Mavericks       2020       117.234127  10.458553   27.5   
1610612754 Indiana Pacers         2023       117.063241  12.116996   22.5   

                                        

## Question 2 - Which team has had below average for the most consecutive seasons?


In [3]:
# Calculate the overall average points
total_avg_points = pd.read_sql_query(
    "SELECT AVG(pts_home + pts_away) AS Average_Points FROM game", con
).iloc[0, 0]  # Retrieves the calculated average

# Query teams with below-average points for consecutive seasons
query = f"""
SELECT 
    team_name_home AS Team_Name,
    team_id_home AS Team_ID,
    COUNT(season_id) AS Seasons_Below_Average
FROM (
    SELECT 
        season_id,
        team_id_home,
        team_name_home,
        AVG(pts_home + pts_away) AS Season_Avg_Points
    FROM game
    GROUP BY season_id, team_id_home
    HAVING AVG(pts_home + pts_away) < {total_avg_points}
)
GROUP BY team_name_home, team_id_home
ORDER BY Seasons_Below_Average DESC
"""

# Execute and display results
below_average = pd.read_sql_query(query, con)
print(below_average)

                        Team_Name     Team_ID  Seasons_Below_Average
0                 New York Knicks  1610612752                     74
1                  Boston Celtics  1610612738                     70
2                   Chicago Bulls  1610612741                     61
3                 Detroit Pistons  1610612765                     59
4                   Atlanta Hawks  1610612737                     58
..                            ...         ...                    ...
78        Roma Virtus Lottomatica       12307                      1
79                Toronto Huskies  1610610035                      1
80  Unicaja Malaga Unicaja Malaga       12311                      1
81                 Waterloo Hawks  1610610037                      1
82        West NBA All Stars West  1610616834                      1

[83 rows x 3 columns]


## Question 3 - Which team has consistently made it to the playoffs?


In [4]:
# Calculate the overall average points
total_avg_points = pd.read_sql_query(
    "SELECT AVG(pts_home + pts_away) AS Average_Points FROM game", con
).iloc[0, 0]  # Retrieves the calculated average

# Query teams with below-average points for consecutive seasons
query = f"""
SELECT 
    team_name_home AS Team_Name,
    team_id_home AS Team_ID,
    COUNT(season_id) AS Seasons_In_Playoff
FROM (
    SELECT 
        season_id,
        team_id_home,
        team_name_home
    FROM game WHERE season_type = 'Playoffs'
    GROUP BY season_id, team_id_home
)
GROUP BY team_name_home, team_id_home
ORDER BY Seasons_In_Playoff DESC
"""

# Execute and display results
below_average = pd.read_sql_query(query, con)
print(below_average)

                    Team_Name     Team_ID  Seasons_In_Playoff
0              Boston Celtics  1610612738                  54
1          Los Angeles Lakers  1610612747                  45
2             New York Knicks  1610612752                  38
3          Philadelphia 76ers  1610612755                  35
4           San Antonio Spurs  1610612759                  34
5               Atlanta Hawks  1610612737                  33
6               Chicago Bulls  1610612741                  33
7             Milwaukee Bucks  1610612749                  33
8      Portland Trail Blazers  1610612757                  33
9             Houston Rockets  1610612745                  31
10               Phoenix Suns  1610612756                  28
11             Denver Nuggets  1610612743                  27
12            Detroit Pistons  1610612765                  27
13                  Utah Jazz  1610612762                  27
14           Dallas Mavericks  1610612742                  22
15      

## Question 4 - Which teams have shown the greatest improvement over the past five years?


In [8]:
import pandas as pd
import sqlite3

con = sqlite3.connect("./data/nba.sqlite")
games = pd.read_sql_query("SELECT * FROM game", con)

games_recent_5yrs = pd.read_sql_query("""
    SELECT * FROM game 
    WHERE season_type NOT IN ('All Star', 'All-Star') 
    AND game_date BETWEEN '2019-01-01 00:00:00' AND '2023-12-31 23:59:59'
    ORDER BY game_date DESC
""", con)

games_recent_5yrs['game_year'] = pd.to_datetime(games_recent_5yrs['game_date']).dt.year

score_stats_home = games_recent_5yrs.groupby(['team_id_home','team_name_home', 'game_year'])['pts_home'].agg(['mean', 'std', 'count'])
score_stats_away = games_recent_5yrs.groupby(['team_id_away','team_name_away', 'game_year'])['pts_away'].agg(['mean', 'std', 'count'])

score_stats_home.index.names = ['team_id', 'team_name', 'game_year']
score_stats_away.index.names = ['team_id', 'team_name', 'game_year']


combined_stats = pd.concat([score_stats_home, score_stats_away])
score_stats_all = combined_stats.groupby(['team_id', 'team_name','game_year']).mean()

score_stats_all = score_stats_all.sort_values(by=['team_name', 'game_year'], ascending=[True, True])

score_stats_all['Difference'] = score_stats_all.groupby('team_name')['mean'].diff()


# teams with highest average
print("Teams highest year over year improvement over past 5 years: ")
print(score_stats_all.sort_values(by='Difference', ascending=False).head(10)) 

season_avg = pd.concat([games_recent_5yrs['pts_home'], games_recent_5yrs['pts_away']]).mean()


Teams highest year over year improvement over past 5 years: 
                                                   mean        std  count  \
team_id    team_name              game_year                                 
1610612760 Oklahoma City Thunder  2023       119.475379  11.428609   23.0   
                                  2022       110.795960  13.150085   44.5   
1610612763 Memphis Grizzlies      2020       114.344545  10.651583   23.5   
1610612766 Charlotte Hornets      2021       110.602733  12.377413   54.0   
1610612758 Sacramento Kings       2023       121.351190  14.097170   27.5   
1610612737 Atlanta Hawks          2023       120.935556  10.581633   26.0   
1610612750 Minnesota Timberwolves 2022       117.711762  12.590458   47.5   
1610612746 LA Clippers            2023       116.750000  15.681210   24.5   
1610612744 Golden State Warriors  2023       119.241379  12.043391   29.0   
1610612742 Dallas Mavericks       2020       117.234127  10.458553   27.5   

              

## Question 5 - Which teams have demonstrated the best defensive metrics (e.g., blocks and steals), and how has this impacted their winning consistency?


In [19]:
import pandas as pd
import sqlite3

con = sqlite3.connect("./data/nba.sqlite")
games = pd.read_sql_query("SELECT * FROM game", con)

games_recent_5yrs = pd.read_sql_query("""
    SELECT * FROM game 
    WHERE season_type NOT IN ('All Star', 'All-Star') 
    AND game_date BETWEEN '2019-01-01 00:00:00' AND '2023-12-31 23:59:59'
    ORDER BY game_date DESC
""", con)

games_recent_5yrs['game_year'] = pd.to_datetime(games_recent_5yrs['game_date']).dt.year

games_recent_5yrs['win_home'] = games_recent_5yrs['wl_home'].map({'W': 1, 'L': 0})
games_recent_5yrs['win_away'] = games_recent_5yrs['wl_away'].map({'W': 1, 'L': 0})

# Prepare home stats
home_stats = games_recent_5yrs.groupby(['team_id_home', 'team_name_home', 'game_year'])[['blk_home', 'stl_home', 'win_home']].sum().reset_index()
home_stats.columns = ['team_id', 'team_name', 'game_year', 'total_blocks', 'total_steals', 'total_wins']

# Prepare away stats
away_stats = games_recent_5yrs.groupby(['team_id_away', 'team_name_away', 'game_year'])[['blk_away', 'stl_away', 'win_away']].sum().reset_index()
away_stats.columns = ['team_id', 'team_name', 'game_year', 'total_blocks', 'total_steals', 'total_wins']

# Combine home and away stats
all_stats = pd.concat([home_stats, away_stats])

# Group by team and year to sum up home and away stats
final_stats = all_stats.groupby(['team_id', 'team_name', 'game_year']).sum().reset_index()
final_stats.set_index(['team_id', 'team_name', 'game_year'], inplace=True)

print("Teams defensive metrics: ")
print(final_stats.sort_values(by='total_wins', ascending=False).head(10)) 


Teams defensive metrics: 
                                            total_blocks  total_steals  \
team_id    team_name             game_year                               
1610612756 Phoenix Suns          2021              538.0         973.0   
1610612749 Milwaukee Bucks       2021              594.0        1041.0   
1610612762 Utah Jazz             2021              609.0         794.0   
1610612751 Brooklyn Nets         2021              611.0         787.0   
1610612738 Boston Celtics        2022              628.0         731.0   
1610612749 Milwaukee Bucks       2019              589.0         744.0   
1610612755 Philadelphia 76ers    2021              718.0        1044.0   
1610612746 LA Clippers           2021              540.0         906.0   
1610612761 Toronto Raptors       2019              518.0         853.0   
1610612744 Golden State Warriors 2021              513.0         934.0   

                                            total_wins  
team_id    team_name        

# Issue 6 - Storyboard 

## Question 1: Which teams have most consistently had the highest scores?

### Intro - For our first question, we will be trying to find which team scores the most in their games


In [25]:
import pandas as pd
import sqlite3

con = sqlite3.connect("./data/nba.sqlite")
games = pd.read_sql_query("SELECT * FROM game", con)

games_recent_5yrs = pd.read_sql_query("""
    SELECT * FROM game 
    WHERE season_type NOT IN ('All Star', 'All-Star') 
    AND game_date BETWEEN '2019-01-01 00:00:00' AND '2023-12-31 23:59:59'
    ORDER BY game_date DESC
""", con)

games_recent_5yrs['game_year'] = pd.to_datetime(games_recent_5yrs['game_date']).dt.year

score_stats_home = games_recent_5yrs.groupby(['team_id_home','team_name_home'])['pts_home'].agg(['sum', 'std', 'count'])
score_stats_away = games_recent_5yrs.groupby(['team_id_away','team_name_away'])['pts_away'].agg(['sum', 'std', 'count'])

score_stats_home.index.names = ['team_id', 'team_name']
score_stats_away.index.names = ['team_id', 'team_name']

combined_stats = pd.concat([score_stats_home, score_stats_away])

# teams with highest average
print("Teams with highest total scores:")
print(combined_stats.sort_values(by='sum', ascending=False).head(10)) 

Teams with highest total scores:
                                      sum        std  count
team_id    team_name                                       
1610612749 Milwaukee Bucks        25377.0  12.399917    216
1610612738 Boston Celtics         25220.0  13.193480    221
1610612743 Denver Nuggets         25175.0  12.521091    219
1610612749 Milwaukee Bucks        25154.0  13.052470    218
1610612738 Boston Celtics         24315.0  11.821290    218
1610612743 Denver Nuggets         24313.0  11.027889    219
1610612744 Golden State Warriors  24155.0  12.170417    210
1610612756 Phoenix Suns           23936.0  11.046690    209
1610612748 Miami Heat             23768.0  11.756334    215
1610612746 LA Clippers            23703.0  14.457101    208


### Rising action: We want to see how consistent teams are by season

In [32]:
import pandas as pd
import sqlite3

con = sqlite3.connect("./data/nba.sqlite")
games = pd.read_sql_query("SELECT * FROM game", con)

games_recent_5yrs = pd.read_sql_query("""
    SELECT * FROM game 
    WHERE season_type NOT IN ('All Star', 'All-Star') 
    AND game_date BETWEEN '2019-01-01 00:00:00' AND '2023-12-31 23:59:59'
    ORDER BY game_date DESC
""", con)

games_recent_5yrs['game_year'] = pd.to_datetime(games_recent_5yrs['game_date']).dt.year

score_stats_home = games_recent_5yrs.groupby('team_name_home')['pts_home'].mean()
score_stats_away = games_recent_5yrs.groupby('team_name_away')['pts_away'].mean()

combined_stats = pd.concat([score_stats_home, score_stats_away]).groupby(level=0).mean()

print("Teams with highest average total scores per game:")
print(combined_stats.sort_values(ascending=False).head(10))

Teams with highest average total scores per game:
Milwaukee Bucks           116.435716
Adelaide 36ers            116.000000
Utah Jazz                 114.124126
Atlanta Hawks             114.082051
Minnesota Timberwolves    114.046378
Sacramento Kings          113.760321
Golden State Warriors     113.553368
Brooklyn Nets             113.466575
New Orleans Pelicans      113.429579
Memphis Grizzlies         113.219401
dtype: float64


### Climax: we want to see how consistent they are by game

### Falling action:  we want to see the standard deviation per season to see how consistent they are. Lower standard deviation will show consistency

### Conclusion: Once we look at the falling action data, we can properly discern the standard deviation that we see fit.


## Question 2 - Which team has had below average for the most consecutive seasons?


### Intro: What is considered average? 

### Rising action: Each season has its own average

### Climax: Teams number of seasons below season average

In [33]:
import pandas as pd
import sqlite3

con = sqlite3.connect("./data/nba.sqlite")
games = pd.read_sql_query("SELECT * FROM game", con)

games_recent_5yrs = pd.read_sql_query("""
    SELECT * FROM game 
    WHERE season_type NOT IN ('All Star', 'All-Star') 
    AND game_date BETWEEN '2019-01-01 00:00:00' AND '2023-12-31 23:59:59'
    ORDER BY game_date DESC
""", con)

games_recent_5yrs['game_year'] = pd.to_datetime(games_recent_5yrs['game_date']).dt.year

score_stats_home = games_recent_5yrs.groupby('team_name_home')['pts_home'].mean()
score_stats_away = games_recent_5yrs.groupby('team_name_away')['pts_away'].mean()

combined_stats = pd.concat([score_stats_home, score_stats_away]).groupby(level=0).mean()

# Calculate league average score per game
league_avg_score = combined_stats.mean()

# Filter teams with below average scores
below_avg_teams = combined_stats[combined_stats < league_avg_score]

print("Teams with below average scores per game:")
print(below_avg_teams)


Teams with below average scores per game:
Charlotte Hornets            109.504817
Chicago Bulls                110.506483
Cleveland Cavaliers          107.244895
Detroit Pistons              106.855801
Miami Heat                   108.997146
New York Knicks              107.797772
Oklahoma City Thunder        110.057584
Orlando Magic                107.049138
Ra'anana Maccabi Ra'anana     87.666667
dtype: float64


### Falling action: the team with the longest streak of being above average

### Conclusion: The team that has the longest streak of being below average

## Question 3 - Which team has consistently made it to the playoffs?


### Introduction: Number of playoff appearances by team

### Rising action: Number of playoff appearances each season by team

### Climax: Teams with the longest streak of making playoffs

### Falling action: Teams that has the longest streak of not making it to playoffs

### Climax: The team that consistently makes it to playoffs (average of seasons played and season playoff)

## Question 4 - Which teams have shown the greatest improvement over the past five years?


### Introduction: Average win percentage by season league total (for now)

### Rising action: average win percentage of team by season 

### Climax: Greatest positive change of win percentage per season by team

### Falling action: Greatest negative change of win percentage per season by team

### Conclusion: Average win percentage change over past 5 years per team 

## Question 5 - Which teams have demonstrated the best defensive metrics (e.g., blocks and steals), and how has this impacted their winning consistency?


# Introduction:  Average blocks and steals per game by season

### rising action: average blocks and steals per game by team 

### Climax:  Teams with best blocks and steals (show win percentage)

### Falling action: Worst teams when it comes to blocks and steals (show win percentage)

### Conclusion: Teams with highest win percentage has these blocks and steals 