In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

In [3]:
# Download the league-wide 2024 per-game stats page.
allplayer_url = 'https://www.basketball-reference.com/leagues/NBA_2024_per_game.html'
allplayer_pergame_res = requests.get(allplayer_url)
# Stop here with a clear HTTP error on a non-2xx response instead of handing
# an error page to the parsing cell.
allplayer_pergame_res.raise_for_status()

In [5]:
# Parse the downloaded page with lxml and select the per-game table by its id.
allplayer_soup = BeautifulSoup(allplayer_pergame_res.content, 'lxml')
allplayer_pergame = allplayer_soup.find('table', id='per_game_stats')

In [6]:
# Output column -> `data-stat` attribute of the <td> that holds the value.
# Dict order is the column order of the resulting CSV.
# NOTE(review): FGA, 2PA and FTA are collected but there is no 3PA column —
# confirm whether that omission is intentional.
PER_GAME_COLUMNS = {
    'Name': 'name_display',
    'Age': 'age',
    'Team': 'team_name_abbr',
    'Position': 'pos',
    'Games': 'games',
    'MPG': 'mp_per_g',
    'FG': 'fg_per_g',
    'FGA': 'fga_per_g',
    'FG%': 'fg_pct',
    '3P': 'fg3_per_g',
    '3P%': 'fg3_pct',
    '2P': 'fg2_per_g',
    '2PA': 'fg2a_per_g',
    '2P%': 'fg2_pct',
    'eFG%': 'efg_pct',
    'FT': 'ft_per_g',
    'FTA': 'fta_per_g',
    'FT%': 'ft_pct',
    'ORB': 'orb_per_g',
    'DRB': 'drb_per_g',
    'TRB': 'trb_per_g',
    'AST': 'ast_per_g',
    'STL': 'stl_per_g',
    'BLK': 'blk_per_g',
    'TOV': 'tov_per_g',
    'PF': 'pf_per_g',
    'PTS': 'pts_per_g',
}

# One dict of raw cell texts per player row; the first <tr> is skipped.
allplayer_records = []
for row in allplayer_pergame.find_all('tr')[1:]:
    # A row without the player-name cell is not a player row. Skipping it
    # avoids an AttributeError from calling .text on None.
    if row.find('td', {'data-stat': PER_GAME_COLUMNS['Name']}) is None:
        continue
    allplayer_records.append({
        column: row.find('td', {'data-stat': stat}).text
        for column, stat in PER_GAME_COLUMNS.items()
    })

# Values stay as the strings scraped from the page; the default integer index
# is written as the first CSV column.
allplayer_stats = pd.DataFrame(allplayer_records)
allplayer_stats.to_csv('allplayer_stats.csv')

In [3]:
# team abbreviations
# Lowercase team nickname -> three-letter team code. The codes are the values
# substituted into the per-team page URLs; insertion order is preserved.
teams_dict = dict((
    ('hawks', 'ATL'), ('celtics', 'BOS'), ('nets', 'BRK'),
    ('hornets', 'CHO'), ('bulls', 'CHI'), ('cavaliers', 'CLE'),
    ('mavericks', 'DAL'), ('nuggets', 'DEN'), ('pistons', 'DET'),
    ('warriors', 'GSW'), ('rockets', 'HOU'), ('pacers', 'IND'),
    ('clippers', 'LAC'), ('lakers', 'LAL'), ('grizzlies', 'MEM'),
    ('heat', 'MIA'), ('bucks', 'MIL'), ('timberwolves', 'MIN'),
    ('pelicans', 'NOP'), ('knicks', 'NYK'), ('thunder', 'OKC'),
    ('magic', 'ORL'), ('76ers', 'PHI'), ('suns', 'PHO'),
    ('trailblazers', 'POR'), ('kings', 'SAC'), ('spurs', 'SAS'),
    ('raptors', 'TOR'), ('jazz', 'UTA'), ('wizards', 'WAS'),
))

In [8]:
# 2 man lineup stats
# Scrapes each team's 2-man lineup table and writes all teams to
# 'alllineup_stats.csv' (one header row, no index column).
team_urls = {
    abbr: f'https://www.basketball-reference.com/teams/{abbr}/2024/lineups' for abbr in teams_dict.values()
}

# Output column -> `data-stat` attribute of the <td> that holds the value.
# Dict order is the CSV column order (after 'Lineup' and 'Team').
# Fix: '3PA' used to be read from 'diff_fg3_pct', so the column actually held
# 3P% values. It now reads 'diff_fg3a', and the percentage gets its own '3P%'
# column, matching the FG/FGA/FG% and FT/FTA/FT% groups.
LINEUP_COLUMNS = {
    'MP': 'mp',
    'PTS': 'diff_pts',
    'FG': 'diff_fg',
    'FGA': 'diff_fga',
    'FG%': 'diff_fg_pct',
    '3P': 'diff_fg3',
    '3PA': 'diff_fg3a',
    '3P%': 'diff_fg3_pct',
    'eFG%': 'diff_efg_pct',
    'FT': 'diff_ft',
    'FTA': 'diff_fta',
    'FT%': 'diff_ft_pct',
    'DRB': 'diff_drb',
    'DRB%': 'diff_drb_pct',
    'ORB': 'diff_orb',
    'ORB%': 'diff_orb_pct',
    'TRB': 'diff_trb',
    'TRB%': 'diff_trb_pct',
    'AST': 'diff_ast',
    'STL': 'diff_stl',
    'BLK': 'diff_blk',
    'TOV': 'diff_tov',
    'PF': 'diff_pf',
}

lineup_records = []
# Iterating items() keeps each URL paired with its own abbreviation, so no
# parallel index counter is needed.
for abbr, team_url in team_urls.items():
    lineups_res = requests.get(team_url)
    # Surface HTTP errors directly rather than as a parse failure below.
    lineups_res.raise_for_status()
    lineups_soup = BeautifulSoup(lineups_res.content, 'lxml')
    # Comment markers inside the wrapper div are stripped before re-parsing so
    # that any markup held in an HTML comment becomes visible to the parser.
    lineups_str = str(lineups_soup.find('div', attrs = {'id': "all_lineups_2-man_"})).replace("<!--", "").replace("-->", "")
    lineups = BeautifulSoup(lineups_str, 'lxml').find(name = 'table', attrs = {'id': 'lineups_2-man_'})
    if lineups is None:
        raise ValueError(f'2-man lineup table not found for {abbr} ({team_url})')

    # The first two <tr> elements are skipped, as before.
    for row in lineups.find_all('tr')[2:]:
        lineup_cell = row.find('td')
        if lineup_cell is None:
            # Row has no data cells at all (header-only row); nothing to record.
            continue
        duo = {'Lineup': lineup_cell.text, 'Team': abbr}
        for column, stat in LINEUP_COLUMNS.items():
            duo[column] = row.find('td', {'data-stat': stat}).text
        lineup_records.append(duo)

# Build the frame once and write it in a single call instead of appending to
# the file team by team.
lineups_stats = pd.DataFrame(lineup_records)
lineups_stats.to_csv('alllineup_stats.csv', index=False)

In [4]:
# all player stats per 100 possessions
# Scrapes each team's per-100-possessions table and writes all teams to
# 'player_100stats.csv' (one header row, no index column).
team_urls = {
    abbr: f'https://www.basketball-reference.com/teams/{abbr}/2024.html' for abbr in teams_dict.values()
}

# Output column -> `data-stat` attribute of the <td> that holds the value.
# Dict order is the CSV column order (after 'Name' and 'Team').
PER_POSS_COLUMNS = {
    'Age': 'age',
    'G': 'g',
    'GS': 'gs',
    'MP': 'mp',
    'FG': 'fg_per_poss',
    'FGA': 'fga_per_poss',
    'FG%': 'fg_pct',
    '3P': 'fg3_per_poss',
    '3PA': 'fg3a_per_poss',
    '3P%': 'fg3_pct',
    '2P': 'fg2_per_poss',
    '2PA': 'fg2a_per_poss',
    '2P%': 'fg2_pct',
    'FT': 'ft_per_poss',
    'FTA': 'fta_per_poss',
    'FT%': 'ft_pct',
    'ORB': 'orb_per_poss',
    'DRB': 'drb_per_poss',
    'TRB': 'trb_per_poss',
    'AST': 'ast_per_poss',
    'STL': 'stl_per_poss',
    'BLK': 'blk_per_poss',
    'TOV': 'tov_per_poss',
    'PF': 'pf_per_poss',
    'PTS': 'pts_per_poss',
    'ORTG': 'off_rtg',
    'DRTG': 'def_rtg',
}

player_records = []
# Iterating items() keeps each URL paired with its own abbreviation, so no
# parallel index counter is needed.
for abbr, team_url in team_urls.items():
    players_res = requests.get(team_url)
    print(players_res)  # progress: one status line per team
    # Surface HTTP errors directly rather than as a parse failure below.
    players_res.raise_for_status()
    players_soup = BeautifulSoup(players_res.content, 'lxml')
    # Comment markers inside the wrapper div are stripped before re-parsing so
    # that any markup held in an HTML comment becomes visible to the parser.
    players_str = str(players_soup.find('div', attrs = {'id': "all_per_poss-playoffs_per_poss"})).replace("<!--", "").replace("-->", "")
    players = BeautifulSoup(players_str, 'lxml').find(name = 'table', attrs = {'id': 'per_poss'})
    if players is None:
        raise ValueError(f'per-100-possessions table not found for {abbr} ({team_url})')

    # The first <tr> is skipped, as before.
    for row in players.find_all('tr')[1:]:
        name_cell = row.find('td', {'data-stat': 'player'})
        if name_cell is None:
            # Not a player row; skipping avoids AttributeError on None.text.
            continue
        player = {'Name': name_cell.text, 'Team': abbr}
        for column, stat in PER_POSS_COLUMNS.items():
            player[column] = row.find('td', {'data-stat': stat}).text
        player_records.append(player)

# Build the frame once and write it in a single call instead of appending to
# the file team by team.
players_stats = pd.DataFrame(player_records)
players_stats.to_csv('player_100stats.csv', index=False)

<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>


In [11]:
# team and opponent stats
# Downloads every team page and stores its 'team_and_opponent' <table> element
# in teams_data, keyed by team abbreviation, for the CSV-writing cell.
team_urls = {
    abbr: f'https://www.basketball-reference.com/teams/{abbr}/2024.html' for abbr in teams_dict.values()
}

teams_data = {}

# Iterating items() keeps each URL paired with its own abbreviation, so no
# parallel index counter is needed.
for abbr, team_url in team_urls.items():
    team_res = requests.get(team_url)
    # Surface HTTP errors directly rather than as a parse failure below.
    team_res.raise_for_status()
    team_soup = BeautifulSoup(team_res.content, 'lxml')
    # Comment markers inside the wrapper div are stripped before re-parsing so
    # that any markup held in an HTML comment becomes visible to the parser.
    team_str = str(team_soup.find('div', attrs = {'id': "all_team_and_opponent"})).replace("<!--", "").replace("-->", "")
    team = BeautifulSoup(team_str, 'lxml').find(name = 'table', attrs = {'id': 'team_and_opponent'})
    if team is None:
        # Fail at fetch time instead of storing None and crashing in a later cell.
        raise ValueError(f'team_and_opponent table not found for {abbr} ({team_url})')

    teams_data[abbr] = team

In [None]:
# team and opponent stats
# Dump every table held in teams_data into 'team_and_opponent.csv'. The first
# table creates the file and writes the header line; each later table is
# appended without one.
# NOTE(review): only the dict values are iterated, so the team abbreviation
# keys are not written and rows in the CSV are not labeled by team.
for position, table in enumerate(teams_data.values()):
    # One list of stripped cell texts (header and data cells alike) per <tr>.
    table_rows = [
        [cell.text.strip() for cell in tr.find_all(['th','td'])]
        for tr in table.find_all('tr')
    ]
    is_first = position == 0
    pd.DataFrame(table_rows).to_csv(
        'team_and_opponent.csv',
        mode='w' if is_first else 'a',
        header=is_first,
        index=False,
    )

In [15]:
# team misc stats
# Downloads every team page and stores its 'team_misc' <table> element in
# teams_data, keyed by team abbreviation, for the CSV-writing cell.
team_urls = {
    abbr: f'https://www.basketball-reference.com/teams/{abbr}/2024.html' for abbr in teams_dict.values()
}

teams_data = {}

# Iterating items() keeps each URL paired with its own abbreviation, so no
# parallel index counter is needed.
for abbr, team_url in team_urls.items():
    team_res = requests.get(team_url)
    print(team_res)  # progress: one status line per team
    # Surface HTTP errors directly rather than as a parse failure below.
    team_res.raise_for_status()
    team_soup = BeautifulSoup(team_res.content, 'lxml')
    # Comment markers inside the wrapper div are stripped before re-parsing so
    # that any markup held in an HTML comment becomes visible to the parser.
    team_str = str(team_soup.find('div', attrs = {'id': "all_team_misc"})).replace("<!--", "").replace("-->", "")
    team = BeautifulSoup(team_str, 'lxml').find(name = 'table', attrs = {'id': 'team_misc'})
    if team is None:
        # Fail at fetch time instead of storing None and crashing in a later cell.
        raise ValueError(f'team_misc table not found for {abbr} ({team_url})')

    teams_data[abbr] = team

<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>


In [16]:
# team misc stats
# Dump every table held in teams_data into 'team_misc.csv'. The first table
# creates the file and writes the header line; each later table is appended
# without one.
# NOTE(review): only the dict values are iterated, so the team abbreviation
# keys are not written and rows in the CSV are not labeled by team.
for position, table in enumerate(teams_data.values()):
    # One list of stripped cell texts (header and data cells alike) per <tr>.
    table_rows = [
        [cell.text.strip() for cell in tr.find_all(['th','td'])]
        for tr in table.find_all('tr')
    ]
    is_first = position == 0
    pd.DataFrame(table_rows).to_csv(
        'team_misc.csv',
        mode='w' if is_first else 'a',
        header=is_first,
        index=False,
    )

In [18]:
# shooting stats
# Downloads every team page and stores its 'shooting' <table> element in
# teams_data, keyed by team abbreviation, for the CSV-writing cell.
team_urls = {
    abbr: f'https://www.basketball-reference.com/teams/{abbr}/2024.html' for abbr in teams_dict.values()
}

teams_data = {}

# Iterating items() keeps each URL paired with its own abbreviation, so no
# parallel index counter is needed.
for abbr, team_url in team_urls.items():
    team_res = requests.get(team_url)
    print(team_res)  # progress: one status line per team
    # Surface HTTP errors directly rather than as a parse failure below.
    team_res.raise_for_status()
    team_soup = BeautifulSoup(team_res.content, 'lxml')
    # Comment markers inside the wrapper div are stripped before re-parsing so
    # that any markup held in an HTML comment becomes visible to the parser.
    team_str = str(team_soup.find('div', attrs = {'id': "all_shooting-playoffs_shooting"})).replace("<!--", "").replace("-->", "")
    team = BeautifulSoup(team_str, 'lxml').find(name = 'table', attrs = {'id': 'shooting'})
    if team is None:
        # Fail at fetch time instead of storing None and crashing in a later cell.
        raise ValueError(f'shooting table not found for {abbr} ({team_url})')

    teams_data[abbr] = team

<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>


In [21]:
# shooting stats
# Dump every table held in teams_data into 'shooting_stats.csv'. The first
# table creates the file and writes the header line; each later table is
# appended without one.
# NOTE(review): only the dict values are iterated, so the team abbreviation
# keys are not written and rows in the CSV are not labeled by team.
for position, table in enumerate(teams_data.values()):
    # Skip the first <tr>; keep the raw text of each <td> in the remaining
    # rows. A row with no <td> cells contributes an empty list.
    table_rows = [
        [cell.text for cell in tr.find_all('td')]
        for tr in table.find_all('tr')[1:]
    ]
    is_first = position == 0
    pd.DataFrame(table_rows).to_csv(
        'shooting_stats.csv',
        mode='w' if is_first else 'a',
        header=is_first,
        index=False,
    )