In [5]:
#issue 3

# Pick any NBA team that was active during the most recent season. 
# Compute their average score for both home and away games in the most 
# recent season the data includes.

import pandas as pd
import sqlite3

con = sqlite3.connect("data/nba.sqlite")
games = pd.read_sql_query("SELECT * FROM game LIMIT 100", con)
active_players = pd.read_sql_query("SELECT full_name FROM player WHERE is_active=TRUE", con)

# Pick any NBA team that was active during the most recent season. 
# Atlanta      Hawks 

# print(games)
query = """
SELECT * FROM team_history
ORDER BY year_active_till DESC
LIMIT 10;
"""
teams = pd.read_sql_query(query, con)
# print(teams)

max_season_query = """
SELECT * FROM game
ORDER BY season_id DESC
"""
max_season_id = pd.read_sql_query(max_season_query, con)
# print(max_season_id) # 42022


avg_score_query = """
WITH RecentSeason AS (
    SELECT * FROM game
    WHERE season_id = (SELECT "42022" FROM game)
    AND (team_abbreviation_home = 'ATL' OR team_name_away = 'ATL')
)
SELECT 
    AVG(pts_home) AS avg_home_score, 
    AVG(pts_away) AS avg_away_score
FROM RecentSeason;
"""

avg_score = pd.read_sql_query(avg_score_query, con)
print(avg_score)


con.close()


   avg_home_score  avg_away_score
0      123.666667      126.333333


In [26]:
#issue 5 

import pandas as pd
import sqlite3

con = sqlite3.connect("data/nba.sqlite")
games = pd.read_sql_query("SELECT * FROM game LIMIT 100", con)
active_players = pd.read_sql_query("SELECT full_name FROM player WHERE is_active=TRUE", con)

query = """
SELECT COUNT(DISTINCT season_id) AS total_seasons FROM game;
"""
session_count = pd.read_sql_query(query, con)
print(session_count)


# all_season_ids_query = """
# SELECT season_id FROM game;
# """
# all_season_ids = pd.read_sql_query(all_season_ids_query, con)
# print("All season IDs:")
# print(all_season_ids)

# distinct_season_ids_query = """
# SELECT DISTINCT season_id FROM game;
# """
# distinct_season_ids = pd.read_sql_query(distinct_season_ids_query, con)
# print("Distinct season IDs:")
# print(distinct_season_ids)

# con.close()

   total_seasons
0            225


In [28]:
# issue 5 -1
# Which teams have consistently had the highest scores the most? 

query = """
SELECT team, AVG(score) AS avg_score
FROM (
    SELECT team_abbreviation_home AS team, pts_home AS score FROM game
    UNION ALL
    SELECT team_abbreviation_away AS team, pts_away AS score FROM game
) AS TeamScores
GROUP BY team
ORDER BY avg_score DESC
LIMIT 10;
"""

high_score_teams = pd.read_sql_query(query, con)
print("Which teams have consistently had the highest scores the most? :")
print(high_score_teams)


Which teams have consistently had the highest scores the most? :
  team   avg_score
0  GNS  170.200000
1  LBN  166.200000
2  DRT  153.333333
3  STP  145.000000
4  WST  127.101695
5  EST  127.059322
6  ADL  116.000000
7  SDR  115.459350
8  BLT  113.124819
9  CIN  112.749446


In [16]:
#issue 5 -2
# Which team has had below average for the most consecutive seasons? 

query = """
WITH SeasonAverages AS (
    SELECT season_id, AVG(pts_home) AS avg_home_score, AVG(pts_away) AS avg_away_score
    FROM game
    GROUP BY season_id
),
TeamScores AS (
    SELECT 
        g.season_id,
        g.team_abbreviation_home AS team,
        AVG(g.pts_home) AS team_avg_score
    FROM game g
    GROUP BY g.season_id, g.team_abbreviation_home
    UNION ALL
    SELECT 
        g.season_id,
        g.team_abbreviation_away AS team,
        AVG(g.pts_away) AS team_avg_score
    FROM game g
    GROUP BY g.season_id, g.team_abbreviation_away
),
BelowAverageSeasons AS (
    SELECT 
        t.season_id,
        t.team,
        t.team_avg_score,
        sa.avg_home_score,
        sa.avg_away_score,
        CASE 
            WHEN t.team_avg_score < ((sa.avg_home_score + sa.avg_away_score) / 2) THEN 1 
            ELSE 0 
        END AS below_average
    FROM TeamScores t
    JOIN SeasonAverages sa ON t.season_id = sa.season_id
),
ConsecutiveBelowAverage AS (
    SELECT 
        team,
        season_id,
        below_average,
        ROW_NUMBER() OVER (PARTITION BY team ORDER BY season_id) - 
        ROW_NUMBER() OVER (PARTITION BY team, below_average ORDER BY season_id) AS streak_group
    FROM BelowAverageSeasons
    WHERE below_average = 1
),
StreakLengths AS (
    SELECT 
        team, 
        COUNT(season_id) AS streak_length
    FROM ConsecutiveBelowAverage
    GROUP BY team, streak_group
)
SELECT team, MAX(streak_length) AS longest_below_avg_streak
FROM StreakLengths
GROUP BY team
ORDER BY longest_below_avg_streak DESC
LIMIT 1;
"""

longest = pd.read_sql_query(query, con)
print("Which team has had below average for the most consecutive seasons? :")
print(longest)


Team with the longest below-average scoring streak:
  team  longest_below_avg_streak
0  NYK                       155


In [23]:
#issue 5 -3
# Which team has consistently made it to the playoffs? 

import pandas as pd
import sqlite3

con = sqlite3.connect("data/nba.sqlite")
games = pd.read_sql_query("SELECT * FROM game LIMIT 100", con)
# active_players = pd.read_sql_query("SELECT full_name FROM player WHERE is_active=TRUE", con)

three_point_win_loss_query = """
WITH TeamThreePointStats AS (
    SELECT 
        game_id,
        team_abbreviation_home AS team,
        (CAST(fg3m_home AS FLOAT) / NULLIF(fgm_home, 0)) AS three_point_pct,
        CASE WHEN pts_home > pts_away THEN 1 ELSE 0 END AS win
    FROM game
    UNION ALL
    SELECT 
        game_id,
        team_abbreviation_away AS team,
        (CAST(fg3m_away AS FLOAT) / NULLIF(fgm_away, 0)) AS three_point_pct,
        CASE WHEN pts_away > pts_home THEN 1 ELSE 0 END AS win
    FROM game
)
SELECT 
    AVG(CASE WHEN win = 1 THEN three_point_pct END) AS avg_three_point_pct_win,
    AVG(CASE WHEN win = 0 THEN three_point_pct END) AS avg_three_point_pct_lose
FROM TeamThreePointStats;
"""

three_point_win_loss_data = pd.read_sql_query(three_point_win_loss_query, con)
print("Average three-point percentage for wins and losses:")
print(three_point_win_loss_data)



Average three-point percentage for wins and losses:
   avg_three_point_pct_win  avg_three_point_pct_lose
0                 0.153541                  0.144109


In [31]:
#issue 5 -4
# Does a higher proportion of three-point field goals increase the likelihood of winning?

import pandas as pd
import sqlite3

# Connect to the SQLite database
con = sqlite3.connect("data/nba.sqlite")

query = """
WITH TeamThreePointStats AS (
    SELECT 
        game_id,
        team_abbreviation_home AS team,
        (CAST(fg3m_home AS FLOAT) / NULLIF(fgm_home, 0)) AS three_point_pct,
        CASE WHEN pts_home > pts_away THEN 1 ELSE 0 END AS win
    FROM game
    UNION ALL
    SELECT 
        game_id,
        team_abbreviation_away AS team,
        (CAST(fg3m_away AS FLOAT) / NULLIF(fgm_away, 0)) AS three_point_pct,
        CASE WHEN pts_away > pts_home THEN 1 ELSE 0 END AS win
    FROM game
)
SELECT 
    AVG(CASE WHEN win = 1 THEN three_point_pct END) AS avg_three_point_pct_win,
    AVG(CASE WHEN win = 0 THEN three_point_pct END) AS avg_three_point_pct_lose
FROM TeamThreePointStats;
"""

win_loss_result = pd.read_sql_query(query, con)
con.close()

print("Average three-point percentage for wins and losses:")
print(win_loss_result)

avg_win = win_loss_result['avg_three_point_pct_win'][0]
avg_lose = win_loss_result['avg_three_point_pct_lose'][0]

print("\n")
if avg_win > avg_lose:
    print(f"Higher three-point percentage correlates with winning")
    print(f"Average three-point percentage for wins: {avg_win:.2%}")
    print(f"Average three-point percentage for losses: {avg_lose:.2%}")
else:
    print(f"Higher three-point percentage does not correlate with winning")
    print(f"Average three-point percentage for wins: {avg_win:.2%}")
    print(f"Average three-point percentage for losses: {avg_lose:.2%}")


Average three-point percentage for wins and losses:
   avg_three_point_pct_win  avg_three_point_pct_lose
0                 0.153541                  0.144109


Higher three-point percentage correlates with winning
Average three-point percentage for wins: 15.35%
Average three-point percentage for losses: 14.41%


In [33]:
# issue 5 -5
# How does the frequency of turnovers in a game affect the likelihood of winning? Simply put, do more turnovers help or hinder?

import pandas as pd
import sqlite3

con = sqlite3.connect("data/nba.sqlite")

query = """
WITH TeamTurnovers AS (
    SELECT 
        game_id,
        team_abbreviation_home AS team,
        tov_home AS turnovers,
        CASE WHEN pts_home > pts_away THEN 1 ELSE 0 END AS win
    FROM game
    UNION ALL
    SELECT 
        game_id,
        team_abbreviation_away AS team,
        tov_away AS turnovers,
        CASE WHEN pts_away > pts_home THEN 1 ELSE 0 END AS win
    FROM game
)
SELECT 
    AVG(CASE WHEN win = 1 THEN turnovers END) AS avg_turnovers_win,
    AVG(CASE WHEN win = 0 THEN turnovers END) AS avg_turnovers_lose
FROM TeamTurnovers;
"""

win_loss_result = pd.read_sql_query(query, con)
con.close()

print("Average turnovers for wins and losses:")
print(win_loss_result)

avg_win = win_loss_result['avg_turnovers_win'][0]
avg_loss = win_loss_result['avg_turnovers_lose'][0]

print("\n")


if avg_win < avg_loss:
    print(f"More turnovers hinder winning")
    print(f"Average turnovers for wins: {avg_win:.2f}")
    print(f"Average turnovers for losses: {avg_loss:.2f}")
else:
    print(f"Turnovers may not hinder winning")
    print(f"Average turnovers for wins: {avg_win:.2f}")
    print(f"Average turnovers for losses: {avg_loss:.2f}")


Average turnovers for wins and losses:
   avg_turnovers_win  avg_turnovers_lose
0          14.519133           15.463436


More turnovers hinder winning
Average turnovers for wins: 14.52
Average turnovers for losses: 15.46
