In [91]:
# Scrape configuration: the season identifier that appears in the league
# summary URL (NBA_<year>.html) and the site root used for relative links.
year = 2019
base_url = "https://www.basketball-reference.com"

# League summary page for the configured season, built from the site root.
url = f"{base_url}/leagues/NBA_{year}.html"

# 1. Obtaining data for 'Teams' 

## - create directory to store downloaded files, obtain data for each team, and save to a csv file

In [92]:
import requests
import os

# Fetch the league summary page for the configured season.  The timeout keeps
# the cell from hanging indefinitely, and raise_for_status() prevents an HTTP
# error page (e.g. 404/429) from being cached as if it were real data.
data = requests.get(url, timeout=30)
data.raise_for_status()

# Create the cache directory.  exist_ok=True makes the cell safe to re-run:
# an already-existing directory is not an error, while genuine failures
# (e.g. missing permissions) raise instead of printing a misleading message.
dir_name = 'data'
os.makedirs(dir_name, exist_ok=True)

# Save the raw HTML so later cells parse the local copy instead of downloading
# the page again.  Characters cp437 cannot represent are dropped
# (errors='ignore'); readers of this file must decode it as cp437 as well.
with open(f"{dir_name}/{year}.html", "w+", encoding="cp437", errors='ignore') as f:
    f.write(data.text)

In [93]:
from bs4 import BeautifulSoup

In [94]:
# Load the cached league page into a string.  The file is written with
# encoding="cp437", so it has to be decoded with the same codec; relying on
# the platform default encoding can raise UnicodeDecodeError or silently
# produce mojibake for non-ASCII characters.
with open(f"data/{year}.html", encoding="cp437", errors='ignore') as f:
    page = f.read()

In [95]:
# Parse the cached HTML with Python's built-in parser so the page can be
# queried by tag and id in the following cells.
soup = BeautifulSoup(markup=page, features="html.parser")

In [96]:
# Collect the relative URL of every team page from the two conference
# standings tables, then download each team page into data/teams/.

# Locate the standings tables.  Fail with a clear message if they are missing,
# which happens e.g. when `soup` no longer holds the league summary page
# because a later cell has rebound it.
eastern_conf_table = soup.find(id="confs_standings_E")
western_conf_table = soup.find(id="confs_standings_W")
if eastern_conf_table is None or western_conf_table is None:
    raise ValueError(
        'Conference standings tables not found; `soup` must hold the parsed '
        'league summary page when this cell runs'
    )

# Relative links to the team pages: eastern conference first, then western.
nba_team_pages = [
    a['href']
    for table in (eastern_conf_table, western_conf_table)
    for a in table.find_all('a')
]

# The site's own team abbreviation is the third path component of each link
# (some abbreviations differ from the commonly used ones).
team_abbrs = [href.split('/')[2] for href in nba_team_pages]

# Directory for the cached team pages.  The parent is spelled out instead of
# being derived from whatever `dir_name` currently holds, so re-running this
# cell always targets data/teams (never data/teams/teams or csv_files/teams).
parent_dir = 'data'
dir_name = os.path.join(parent_dir, 'teams')
os.makedirs(dir_name, exist_ok=True)

# Download every team page once and cache it as <abbr>.html so later cells
# can parse the local copies instead of hitting the site again.
for team_page, abbr_name in zip(nba_team_pages, team_abbrs):
    data = requests.get(base_url + team_page, timeout=30)
    # Do not cache HTTP error pages (e.g. rate-limit responses) as team data.
    data.raise_for_status()

    with open(f"{dir_name}/{abbr_name}.html", "w+", encoding="cp437", errors='ignore') as f:
        f.write(data.text)

# at this step, you will have the html files of all the individual teams

In [222]:
# Put the team abbreviations in alphabetical order for the cells below.
team_abbrs = sorted(team_abbrs)

In [225]:
import pandas as pd
import re

def _parse_win_loss(record_text):
    """Return ``(wins, losses, record)`` as strings from the record paragraph.

    Spaces and newlines are removed, everything after the first comma is
    discarded, and the part after the colon is taken as the ``W-L`` record
    (assumes text shaped like ``"Record: 58-24, ..."``).
    """
    compact = record_text.replace(" ", "").replace("\n", "")
    record = compact.split(",")[0].split(":")[1]
    parts = record.split("-")
    return parts[0], parts[1], record


def _parse_arena(paragraph_texts, marker='Attendance'):
    """Return the arena name from the first paragraph that contains ``marker``.

    The arena is the text between the last colon preceding ``marker`` and
    ``marker`` itself.  Returns ``None`` when no paragraph contains ``marker``
    instead of failing, so one incomplete page does not abort the whole run.
    """
    for text in paragraph_texts:
        before, found, _ = text.partition(marker)
        if found:
            return before.replace("\n", "").split(":")[-1].strip()
    return None


# Build one row per team from the cached team pages:
# name, site abbreviation, arena, wins, losses and the combined W-L record.
team_columns = ['Name', 'Abbreviated Name', 'Arena', 'Wins', 'Losses', 'W-L']
team_rows = []

for team in team_abbrs:
    # Decode with the same codec the download cell used when writing the file.
    with open(f'data/teams/{team}.html', encoding="cp437", errors='ignore') as f:
        page = f.read()

    # initialize soup object to parse html data
    soup = BeautifulSoup(page, "html.parser")

    # Look the header block up once; both the name and the summary live in it.
    info = soup.find(id="info")
    summary = None
    if info is not None:
        summary = info.select_one('div[data-template="Partials/Teams/Summary"]')
    if summary is None:
        raise ValueError(f'Team summary block not found in data/teams/{team}.html')

    # team name: second <span> of the header block (read without removing it
    # from the tree, so the parsed document is left intact)
    team_name = info.find_all('span')[1].text

    # wins/losses come from the first paragraph of the summary block
    paragraphs = summary.find_all('p')
    wins, losses, wins_and_losses = _parse_win_loss(paragraphs[0].text)

    # location i.e. arena: taken from the paragraph that mentions attendance
    location = _parse_arena(p.text for p in paragraphs)

    team_rows.append([team_name, team, location, wins, losses, wins_and_losses])

# Create the frame in one step (rather than growing it row by row) and keep
# the 1-based row labels so the saved CSV keeps the same index column.
teams_df = pd.DataFrame(
    team_rows,
    columns=team_columns,
    index=pd.RangeIndex(1, len(team_rows) + 1),
)
    

In [229]:
# Write the team summary table to csv_files/teams.csv.
# exist_ok=True makes the cell re-runnable: an existing directory is not an
# error (previously it printed a misleading "cannot be created" message),
# while real failures such as missing permissions still raise.
dir_name = 'csv_files'
os.makedirs(dir_name, exist_ok=True)

# Build the output path from dir_name so the directory that was just created
# and the directory that is written to cannot drift apart.
teams_df.to_csv(os.path.join(dir_name, 'teams.csv'))

Directory "csv_files" cannot be created


## 2. Obtain team stats data for each team, filter data, convert to dataframe, and save to csv file

In [262]:
# Empty frame that is meant to hold one row of shooting percentages and
# per-game statistics for each team.
stats_columns = [
    'Name',
    'Field Goal %',
    'Three-Point Field Goal %',
    'Free Throw %',
    'Turnovers Per Game',
    'Offensive Rebounds Per Game',
    'Defensive Rebounds Per Game',
    'Assists Per Game',
    'Steals Per Game',
    'Blocks Per Game',
    'Points Per Game',
]
teams_stats_df = pd.DataFrame(columns=stats_columns)

# Work in progress: only the first team's cached page is loaded and parsed;
# the loop stops right away and no statistics are extracted yet.
for idx, team in enumerate(team_abbrs, start=1):
    team_file = f'data/teams/{team}.html'
    with open(team_file, encoding="cp437", errors='ignore') as f:
        page = f.read()

    # initialize soup object to parse html data
    soup = BeautifulSoup(page, "html.parser")

    # need to use Selenium to scrape dynamic content

    break

4
