In [1]:
# Import libraries
import pandas as pd
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
import requests
from bs4 import BeautifulSoup
import time

In [23]:
def overall_season(year):
    """Scrape the divisional standings of one NBA season from basketball-reference.com.

    Parameters
    ----------
    year : int or str
        Season identifier inserted into the URL
        (``https://www.basketball-reference.com/leagues/NBA_<year>.html``).

    Returns
    -------
    pandas.DataFrame
        One row per team, Eastern conference first, with a fresh 0..n-1 index and
        the columns ``Year``, ``Conference``, ``Team Name``, ``Team Abbreviation``,
        ``Wins``, ``Losses``, ``Win/Loss Percentage``, ``Games Back``,
        ``Points Per Game``, ``Opponent Points Per Game`` and
        ``Simple Rating System``. ``Year`` holds ``year`` as passed in; the
        statistics are the raw cell text (strings).

    Raises
    ------
    requests.HTTPError
        If the page request returns an error status (e.g. rate limiting).
    ValueError
        If a conference standings table cannot be found in the page.
    """
    # Set up our url to webscrape from using the year
    overall_season_url = "https://www.basketball-reference.com/leagues/NBA_" + str(year) + ".html"
    # Utilize requests and BeautifulSoup libraries
    res = requests.get(overall_season_url)
    # Fail loudly on an HTTP error instead of parsing an error page
    res.raise_for_status()
    soup = BeautifulSoup(res.content, "lxml")

    # Conference name -> suffix of the standings table id on the page
    conferences = {
        "Eastern" : "E",
        "Western" : "W"
    }
    team_basics = ["Year", "Conference", "Team Name", "Team Abbreviation"]
    standings_cols = ['W', 'L', 'W/L%', 'GB', 'PS/G', 'PA/G', 'SRS']

    # Collect one frame per conference and concatenate once at the end
    # (DataFrame.append was removed in pandas 2.0).
    conference_frames = []
    for conference, symbol in conferences.items():
        # determine which table we want on the webpage
        table_title = "divs_standings_" + symbol
        table = soup.find("table", {"id" : table_title})
        if table is None:
            raise ValueError(
                "Standings table '" + table_title + "' not found at " + overall_season_url
            )

        # Team identity: the row header links to /teams/<ABBR>/<year>.html
        full_conference = []
        for body in table.find_all("tbody"):
            for cell in body.find_all("th", {"class" : "left"}):
                link = cell.find("a")
                team_abbreviation = link.attrs["href"].split("/")[2]
                full_conference.append(
                    dict(zip(team_basics, [year, conference, link.text, team_abbreviation]))
                )
        full_conference_df = pd.DataFrame(full_conference)

        # Column headers: "GB" is not matched by the class filter, so it is added
        # by hand and then moved to its on-page position (4th column).
        team_cols = []
        for header in table.find_all("thead"):
            for cell in header.find_all("th", {"class" : "poptip right"}):
                team_cols.append(cell.text)
            team_cols.append("GB")
            order = [0, 1, 2, 6, 3, 4, 5]
            team_cols = [team_cols[i] for i in order]

        # One dict of statistics per team row
        team_standings_stats = []
        for body in table.find_all("tbody"):
            for row in body.find_all("tr", {"class" : "full_table"}):
                cells = [stat.text for stat in row.find_all("td")]
                team_standings_stats.append(dict(zip(team_cols, cells)))
        team_standings_df = pd.DataFrame(team_standings_stats).loc[:, standings_cols]

        # Identity and statistics line up row-by-row, so join them side by side
        conference_frames.append(
            pd.concat([full_conference_df, team_standings_df], axis = 1)
        )

    teams = pd.concat(conference_frames, ignore_index = True)

    return teams.rename(columns = {
        "W"    : "Wins",
        "L"    : "Losses",
        "W/L%" : "Win/Loss Percentage",
        "GB"   : "Games Back",
        "PS/G" : "Points Per Game",
        "PA/G" : "Opponent Points Per Game",
        "SRS"  : "Simple Rating System"
    })

### Season Schedule Dataframe
This function scrapes basketball-reference.com to get the full schedule of any team for any season. It collects the information that another function later uses to fetch the data for each game. The input parameters are the team's abbreviation, whether the user wants the regular season or the playoffs, and the season dataframe produced by the previous function.

In [29]:
def season_schedule(team_abbreviation, point_in_season, dataframe):
    """Scrape one team's schedule and results for a season from basketball-reference.com.

    Parameters
    ----------
    team_abbreviation : str
        Team abbreviation as it appears in ``dataframe["Team Abbreviation"]``.
    point_in_season : str
        ``"Regular Season"`` or ``"Playoffs"`` (case-insensitive).
    dataframe : pandas.DataFrame
        Season frame providing the ``Team Name``, ``Team Abbreviation`` and
        ``Year`` columns (as returned by ``overall_season``). The season is taken
        from the first row of ``Year``.

    Returns
    -------
    pandas.DataFrame
        One row per game with the columns ``Team``, ``Team Abbreviation``,
        ``Season``, ``Time Period``, ``Game``, ``Date``, ``Opponent``,
        ``Opponent Abbreviation``, ``Location``, ``Result``, ``Overtime``,
        ``Wins`` and ``Losses``. The frame is empty (same columns) when the page
        has no table for the requested stage. ``Opponent Abbreviation`` is
        ``None`` when the opponent's name is not present in ``dataframe``.

    Raises
    ------
    ValueError
        If ``point_in_season`` is not a known stage or ``team_abbreviation`` is
        not in ``dataframe``. Both are checked before any network access.
    requests.HTTPError
        If the page request returns an error status.
    """
    output_cols = ["Team", "Team Abbreviation", "Season", "Time Period", "Game", "Date",
                   "Opponent", "Opponent Abbreviation",
                   "Location", "Result", "Overtime", "Wins", "Losses"]

    # Specify the table to pull from - either regular season or playoffs
    table_ids = {"regular season" : "games", "playoffs" : "games_playoffs"}
    stage_key = point_in_season.lower()
    if stage_key not in table_ids:
        raise ValueError(
            "point_in_season must be 'Regular Season' or 'Playoffs', got " + repr(point_in_season)
        )

    # Lookup tables between team names and abbreviations
    name_by_abbr = dict(zip(dataframe["Team Abbreviation"], dataframe["Team Name"]))
    abbr_by_name = dict(zip(dataframe["Team Name"], dataframe["Team Abbreviation"]))
    if team_abbreviation not in name_by_abbr:
        raise ValueError(
            "Unknown team abbreviation " + repr(team_abbreviation) + " for this season"
        )
    team = name_by_abbr[team_abbreviation]
    season = dataframe["Year"].iloc[0] # what year this season was

    # Set up our url to webscrape from using the team's abbreviation and year
    season_url = ("https://www.basketball-reference.com/teams/" + team_abbreviation
                  + "/" + str(season) + "_games.html")
    # Utilize requests and BeautifulSoup libraries
    res = requests.get(season_url)
    res.raise_for_status()
    soup = BeautifulSoup(res.content, "lxml")

    table = soup.find("table", {"id" : table_ids[stage_key]})
    if table is None:
        # e.g. playoffs requested for a team with no playoff table on the page
        return pd.DataFrame(columns = output_cols)

    stage = point_in_season.title() # regular season or playoff

    def _cell_text(row, stat):
        """Return the text of the <td> carrying the given data-stat attribute."""
        return row.find("td", {"data-stat" : stat}).text

    # One record per game; keys are written out explicitly so a change in the
    # page's header markup cannot silently shift values into the wrong column.
    game_review_list = []
    for body in table.find_all("tbody"):
        # iterate through each row - excluding rows with classes, which have no relevant data
        for row in body.find_all("tr", {"class" : None}):
            opponent = _cell_text(row, "opp_name")
            game_review_list.append({
                "Date" : row.find("td", {"data-stat" : "date_game"}).attrs["csk"],
                "Opponent" : opponent,
                "Game" : row.find("th", {"data-stat" : "g"}).text, # game number in the season
                "Location" : _cell_text(row, "game_location"),
                "Result" : _cell_text(row, "game_result"),
                "Overtime" : _cell_text(row, "overtimes"),
                "Wins" : _cell_text(row, "wins"),
                "Losses" : _cell_text(row, "losses"),
                "Time Period" : stage,
                # None rather than a stale value when the opponent is unknown
                "Opponent Abbreviation" : abbr_by_name.get(opponent),
            })

    scraped_cols = ["Date", "Opponent", "Game", "Location", "Result", "Overtime",
                    "Wins", "Losses", "Time Period", "Opponent Abbreviation"]
    game_review_df = pd.DataFrame(game_review_list, columns = scraped_cols)

    game_review_df["Team"] = team
    game_review_df["Team Abbreviation"] = team_abbreviation
    game_review_df["Season"] = season
    # data from game location needs to be converted to human readable "Home" and "Away"
    game_review_df["Location"] = game_review_df["Location"].map({"" : "Home", "@" : "Away"})

    # reorder the dataframe and return it
    return game_review_df.loc[:, output_cols]

### Game Data Function
This function scrapes the box score of a single game for one team, from any game in any season. It collects the basic and advanced stats for every player, together with the team totals, and combines them into one dataframe. It requires the URL of the game on basketball-reference.com, the team's abbreviation, the date of the game, and the schedule dataframe produced by the previous function.

In [73]:
def get_game_stats(url, team_abbreviation, date, dataframe):
    """Scrape one team's basic and advanced box score for a single game.

    Parameters
    ----------
    url : str
        Address of the box-score page to download.
    team_abbreviation : str
        Abbreviation used to build the table ids
        ``box-<abbr>-game-basic`` and ``box-<abbr>-game-advanced``.
    date : str
        Game date; compared for equality against ``dataframe["Date"]``.
    dataframe : pandas.DataFrame
        Schedule frame. The columns read here are ``Season``, ``Date``,
        ``Overtime``, ``Opponent Abbreviation``, ``Opponent``, ``Location``,
        ``Result``, ``Team`` and ``Time Period``. Rows are accessed by the
        labels ``0 .. len-1``.

    Returns
    -------
    pandas.DataFrame
        Game context columns, the basic stats and the advanced stats joined
        side by side: the player rows followed by one ``"Team Totals"`` row.
        Statistic values are the scraped cell text.

    Notes
    -----
    If no row of ``dataframe`` matches ``date``, the ``Team``, ``Location``,
    ``Overtime`` and ``Result`` columns are never created and the column
    selection further down would presumably fail - verify against caller.
    """
    # Utilize requests and BeautifulSoup libraries
    res = requests.get(url)
    soup = BeautifulSoup(res.content, "lxml")

    # Identify our two tables, basic and advanced data
    table = soup.find("table", {"id" : "box-" + team_abbreviation + "-game-basic"})
    table2 = soup.find("table", {"id" : "box-" + team_abbreviation + "-game-advanced"})

    # create a dataframe of our players
    # start with an empty list
    players_list = []
    # iterate through the basic stats table
#     for body in table.find_all("tbody"):
    # searches the whole table (not only <tbody>) for <th class="left"> cells
    for row in table.find_all("th", {"class" : "left"}):
        players_list.append(row.text) # add each player's name
    # create a dataframe with one column that is just all the players
    players_df = pd.DataFrame(players_list, columns = ["Player"])
    # add some columns to this dataframe
    players_df["Team Abbreviation"] = team_abbreviation # add team abbreviation


    players_df["Season"] = dataframe["Season"][0] # Add the season
    players_df["Date"] = date # add the date of the game

    # Find the schedule row(s) for this date and copy the game context from them.
    # The schedule is written from one team's point of view, so when we are
    # scraping the opponent's box score the location and result are flipped.
    for i in range(0, dataframe.shape[0]):
        if date == dataframe["Date"][i]:
            players_df["Overtime"] = dataframe["Overtime"][i]

            if team_abbreviation == dataframe["Opponent Abbreviation"][i]:
                # we are looking at the schedule team's opponent
                players_df["Team"] = dataframe["Opponent"][i]

                if dataframe["Location"][i] == "Home":
                    players_df["Location"] = "Away"
                else:
                    players_df["Location"] = "Home"

                if dataframe["Result"][i] == "W":
                    players_df["Result"] = "L"
                else:
                    players_df["Result"] = "W"

            else:
                # we are looking at the schedule team itself
                players_df["Team"] = dataframe["Team"][0]
                players_df["Location"] = dataframe["Location"][i]
                players_df["Result"] = dataframe["Result"][i]

    players_df["Time Period"] = dataframe["Time Period"][0] # add column for regular season or playoffs


    # reorder the dataframe to a more human readable version
    players_df = players_df.loc[: , ["Season", "Time Period", "Date", "Team", "Team Abbreviation", 
                                     "Location", "Overtime", "Result", "Player"]]

    # create another dataframe of the game stats to be combined with our other dataframe
    # Identify our columns from the basic table
    # create an empty list
    stats_cols = []
    # iterate through all our cells that are column headers
    for row in table.find_all("th", {"class" : "poptip center"}):
        stats_cols.append(row.text) # add column headers to the list

    # combine our table body into a list of zipped dictionaries
    # add empty list
    stats_box_list = []
    # iterate through the body of the table
    for body in table.find_all("tbody"):

        for row in body.find_all("tr"):
            # a row with fewer <td> cells than headers yields a shorter dict,
            # which becomes NaN-padded in the dataframe below
            stats = dict(zip(stats_cols,[stat.text for stat in row.find_all("td")]))
            stats_box_list.append(stats)


    # Create a dataframe of the first table; dropna() removes every row that
    # has any missing value, then the index is renumbered from 0
    stats_df = pd.DataFrame(stats_box_list).dropna().reset_index(drop = True)

    # NOTE(review): this join is positional (index labels) - it assumes the n-th
    # name in players_df belongs to the n-th surviving stats row; confirm
    # against the page layout.
    players_df_basic_stats = pd.concat([players_df, stats_df], axis = 1)
    # drop the name rows that did not receive any stats (no "MP" value)
    players_df_basic_stats.drop(index = (players_df_basic_stats.loc[players_df_basic_stats["MP"].isna(),:].index),
                                inplace = True)


    # Build a single "Team Totals" row carrying the same game context as the players
    team_df = pd.DataFrame(["Team Totals"], columns = ["Player"])
        # add some columns to this dataframe

    team_df["Team Abbreviation"] = team_abbreviation # add team abbreviation
    team_df["Season"] = players_df["Season"][0]
    team_df["Time Period"] = players_df["Time Period"][0]
    team_df["Date"] = players_df["Date"][0]
    team_df["Team"] = players_df["Team"][0]
    team_df["Location"] = players_df["Location"][0]
    team_df["Overtime"] = players_df["Overtime"][0]
    team_df["Result"] = players_df["Result"][0]

    # reorder the dataframe to a more human readable version
    team_df = team_df.loc[:, ["Season","Time Period", "Date", "Team", "Team Abbreviation", 
                              "Location", "Overtime", "Result", "Player"]]

    team_stats_box_list = []
    # add in the bottom row of team overall data (taken from the table footer)
    for foot in table.find_all("tfoot"):
        for row in foot.find_all("tr"):
            team_stats = dict(zip(stats_cols,[stat.text for stat in row.find_all("td")]))
            team_stats_box_list.append(team_stats)

    team_stats_df = pd.DataFrame(team_stats_box_list)

    team_df_basic_stats = pd.concat([team_df, team_stats_df], axis = 1)

    # stack the player rows on top of the team totals row and renumber the index
    basic_stats_table = pd.concat([players_df_basic_stats, team_df_basic_stats], axis = 0)
    basic_stats_table.reset_index(inplace = True, drop = True)


    # Collect our column names from the advanced table; headers are gathered in
    # three passes, one per CSS class combination
    adv_stats_cols = []
    for row in table2.find_all("th", {"class" : "poptip center"}):
        adv_stats_cols.append(row.text)

    # second pass: headers with the "poptip sort_default_asc center" classes

    for row in table2.find_all("th", {"class" : "poptip sort_default_asc center"}):
        adv_stats_cols.append(row.text)

    # third pass: headers with the "poptip right" classes
    for row in table2.find_all("th", {"class" : "poptip right"}):
        adv_stats_cols.append(row.text)

    # Rearrange the collected header names by position, then discard the final
    # one. NOTE(review): the indices are hard-coded for a specific number and
    # order of scraped headers - presumably restoring the on-page column
    # order; confirm against the page.
    order = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 14, 11, 12, 15, 16, 13]
    adv_stats_cols = [adv_stats_cols[i] for i in order]
    adv_stats_cols = adv_stats_cols[:-1]

    # Collect our stats from the advanced table and create a zipped dictionary
    # (this reuses the name stats_box_list; the basic-table list is no longer needed)
    stats_box_list = []
    for body in table2.find_all("tbody"):
        for row in body.find_all("tr"):
            stats = dict(zip(adv_stats_cols,[stat.text for stat in row.find_all("td")]))
            stats_box_list.append(stats)

    # the footer (team totals) row is appended after the player rows
    for foot in table2.find_all("tfoot"):
        for row in foot.find_all("tr"):
            stats = dict(zip(adv_stats_cols,[stat.text for stat in row.find_all("td")]))
            stats_box_list.append(stats)

    # create a dataframe of the advanced stats; rows with any missing value are
    # dropped and the "MP" column is removed because the basic table already has it
    advanced_stats_df = pd.DataFrame(stats_box_list).dropna().reset_index(drop = True).drop(columns = "MP")

    # Join the basic table (players + team totals) with the advanced stats,
    # side by side, matching rows on their index labels
    box_stats_df = pd.concat([basic_stats_table, advanced_stats_df], axis = 1)


        # If you want to add in BPM values for the team - uncomment the lines below
#     box_stats_df["BPM2"] = box_stats_df["BPM"]

#     BPM_values = list(box_stats_df.loc[box_stats_df["Player"] != "Team Totals", :]["BPM"])

#     BPM_floats = []
#     for i in BPM_values:
#          BPM_floats.append(float(i))
#     team_avg_bpm = round(sum(BPM_floats)/len(BPM_floats), 2)

#     box_stats_df.loc[box_stats_df["Player"] == "Team Totals", "BPM"] = team_avg_bpm

#     box_stats_df.loc[box_stats_df["Player"] == "Team Totals", "BPM2"] = sum(BPM_floats)    

    return box_stats_df

### All Seasons' Stats Function

In [187]:
def season_stats(list_of_years, stages, dedupe_games = False):
    """Scrape the box scores of every game for the given seasons and stages.

    For each year the season standings are scraped, then each team's schedule,
    then the box score of both teams for every game on that schedule.

    Parameters
    ----------
    list_of_years : iterable
        Seasons to scrape; each item is passed to ``overall_season``.
    stages : iterable of str
        Stages to scrape, e.g. ``["Regular Season"]``; each item is passed to
        ``season_schedule``.
    dedupe_games : bool, default False
        Every game is listed on both teams' schedules, so by default each box
        score page is downloaded twice and its rows appear twice in the result.
        Set to ``True`` to fetch each box-score URL only once.

    Returns
    -------
    pandas.DataFrame
        All box-score rows stacked together with a fresh 0..n-1 index and the
        ``Date`` column converted to datetime. An empty frame is returned when
        nothing was scraped.

    Side effects
    ------------
    Prints a progress message every 20th game, sleeps 2 seconds after each
    fetched game, and writes everything collected so far to
    ``../data/all_teams_csv`` after a team's 82nd game.
    """
    base_url = "https://www.basketball-reference.com/boxscores/"

    # Accumulate per-game frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every game.
    game_frames = []
    seen_urls = set()
    for year in list_of_years:
        season_df = overall_season(year)
        for stage in stages:
            for team_abbr in season_df["Team Abbreviation"]:
                schedule_df = season_schedule(team_abbr, stage, season_df)

                for i in range(len(schedule_df.index)):
                    game_date = schedule_df["Date"][i]
                    team = schedule_df["Team Abbreviation"][i]
                    opponent = schedule_df["Opponent Abbreviation"][i]
                    # the box-score URL is keyed by the home team's abbreviation
                    if schedule_df["Location"][i] == "Home":
                        city = team
                    else:
                        city = opponent

                    game_url = base_url + game_date.replace("-", "") + "0" + city + ".html"

                    fetched = not (dedupe_games and game_url in seen_urls)
                    if fetched:
                        seen_urls.add(game_url)
                        # this team's rows first, then the opponent's
                        for side in (team, opponent):
                            game_frames.append(
                                get_game_stats(game_url, side, game_date, schedule_df)
                            )

                    if i % 20 == 0 and i !=0:
                        message = (
                            f'Collecting statistics from the {i +1}st game of the {year} season for '
                            f'the {schedule_df["Team"][i]} when they played the {schedule_df["Opponent"][i]} on '
                            f'{schedule_df["Date"][i]} where they got the {schedule_df["Result"][i]}.'
                        )
                        print (message)
                    # checkpoint after the last game of an 82-game schedule
                    if i == 81 and game_frames:
                        pd.concat(game_frames).to_csv("../data/all_teams_csv", index = False)
                    if fetched:
                        time.sleep(2) # be polite to the server between games

    if not game_frames:
        # nothing scraped (e.g. empty list_of_years): there is no 'Date' column to convert
        return pd.DataFrame()

    final_df = pd.concat(game_frames)
    final_df['Date'] = pd.to_datetime(final_df['Date'])
    return final_df.reset_index(drop = True)

In [188]:
# Scrape every regular-season box score for the 2018 and 2019 seasons.
# NOTE: this downloads each game page over the network and pauses 2 seconds
# per game, so the cell takes a long time to run.
nba_2018_2019_seasons = season_stats([2018, 2019], ["Regular Season"])
# Preview the first 100 rows of the combined dataframe
nba_2018_2019_seasons.head(100)

Collecting statistics from the 21st game of the 2018 season for the Toronto Raptors when they played the Indiana Pacers on 2017-12-01 where they got the W.
Collecting statistics from the 41st game of the 2018 season for the Toronto Raptors when they played the Golden State Warriors on 2018-01-13 where they got the L.
Collecting statistics from the 61st game of the 2018 season for the Toronto Raptors when they played the Washington Wizards on 2018-03-02 where they got the W.
Collecting statistics from the 81st game of the 2018 season for the Toronto Raptors when they played the Detroit Pistons on 2018-04-09 where they got the W.
Collecting statistics from the 21st game of the 2018 season for the Boston Celtics when they played the Indiana Pacers on 2017-11-25 where they got the W.
Collecting statistics from the 41st game of the 2018 season for the Boston Celtics when they played the Cleveland Cavaliers on 2018-01-03 where they got the W.
Collecting statistics from the 61st game of the 2

Collecting statistics from the 41st game of the 2018 season for the Orlando Magic when they played the Dallas Mavericks on 2018-01-09 where they got the L.
Collecting statistics from the 61st game of the 2018 season for the Orlando Magic when they played the Toronto Raptors on 2018-02-28 where they got the L.
Collecting statistics from the 81st game of the 2018 season for the Orlando Magic when they played the Milwaukee Bucks on 2018-04-09 where they got the L.
Collecting statistics from the 21st game of the 2018 season for the Atlanta Hawks when they played the Cleveland Cavaliers on 2017-11-30 where they got the L.
Collecting statistics from the 41st game of the 2018 season for the Atlanta Hawks when they played the Denver Nuggets on 2018-01-10 where they got the W.
Collecting statistics from the 61st game of the 2018 season for the Atlanta Hawks when they played the Los Angeles Lakers on 2018-02-26 where they got the L.
Collecting statistics from the 81st game of the 2018 season for

Collecting statistics from the 21st game of the 2018 season for the Sacramento Kings when they played the Milwaukee Bucks on 2017-11-28 where they got the L.
Collecting statistics from the 41st game of the 2018 season for the Sacramento Kings when they played the Los Angeles Clippers on 2018-01-11 where they got the L.
Collecting statistics from the 61st game of the 2018 season for the Sacramento Kings when they played the Portland Trail Blazers on 2018-02-27 where they got the L.
Collecting statistics from the 81st game of the 2018 season for the Sacramento Kings when they played the San Antonio Spurs on 2018-04-09 where they got the L.
Collecting statistics from the 21st game of the 2018 season for the Dallas Mavericks when they played the San Antonio Spurs on 2017-11-27 where they got the L.
Collecting statistics from the 41st game of the 2018 season for the Dallas Mavericks when they played the New York Knicks on 2018-01-07 where they got the L.
Collecting statistics from the 61st 

Collecting statistics from the 21st game of the 2019 season for the Miami Heat when they played the New Orleans Pelicans on 2018-11-30 where they got the W.
Collecting statistics from the 41st game of the 2019 season for the Miami Heat when they played the Memphis Grizzlies on 2019-01-12 where they got the W.
Collecting statistics from the 61st game of the 2019 season for the Miami Heat when they played the Houston Rockets on 2019-02-28 where they got the L.
Collecting statistics from the 81st game of the 2019 season for the Miami Heat when they played the Philadelphia 76ers on 2019-04-09 where they got the W.
Collecting statistics from the 21st game of the 2019 season for the Washington Wizards when they played the New Orleans Pelicans on 2018-11-28 where they got the L.
Collecting statistics from the 41st game of the 2019 season for the Washington Wizards when they played the Philadelphia 76ers on 2019-01-08 where they got the L.
Collecting statistics from the 61st game of the 2019 s

Collecting statistics from the 21st game of the 2019 season for the Los Angeles Clippers when they played the Sacramento Kings on 2018-11-29 where they got the W.
Collecting statistics from the 41st game of the 2019 season for the Los Angeles Clippers when they played the Denver Nuggets on 2019-01-10 where they got the L.
Collecting statistics from the 61st game of the 2019 season for the Los Angeles Clippers when they played the Denver Nuggets on 2019-02-24 where they got the L.
Collecting statistics from the 81st game of the 2019 season for the Los Angeles Clippers when they played the Golden State Warriors on 2019-04-07 where they got the L.
Collecting statistics from the 21st game of the 2019 season for the Sacramento Kings when they played the Los Angeles Clippers on 2018-11-29 where they got the L.
Collecting statistics from the 41st game of the 2019 season for the Sacramento Kings when they played the Phoenix Suns on 2019-01-08 where they got the L.
Collecting statistics from th

Unnamed: 0,Season,Time Period,Date,Team,Team Abbreviation,Location,Overtime,Result,Player,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,TS%,eFG%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,ORtg,DRtg,BPM
0,2018,Regular Season,2017-10-19,Toronto Raptors,TOR,Home,,W,DeMar DeRozan,32:03,2,9,0.222,0,0,,7,8,0.875,1,5,6,5,2,1,5,1,11,0.439,0.222,0.0,0.889,3.6,16.3,10.2,20.8,3.0,2.6,28.5,23.4,86.0,97.0,-5.3
1,2018,Regular Season,2017-10-19,Toronto Raptors,TOR,Home,,W,Kyle Lowry,31:20,4,7,0.571,1,3,0.333,3,3,1.0,0,4,4,9,1,0,4,1,12,0.721,0.643,0.429,0.429,0.0,13.3,7.0,41.9,1.5,0.0,32.5,16.8,121.0,103.0,2.3
2,2018,Regular Season,2017-10-19,Toronto Raptors,TOR,Home,,W,Jonas Valančiūnas,30:30,9,17,0.529,0,1,0.0,5,6,0.833,8,7,15,2,2,0,1,4,23,0.586,0.529,0.059,0.353,30.0,23.9,26.8,12.7,3.2,0.0,4.8,28.9,136.0,96.0,4.3
3,2018,Regular Season,2017-10-19,Toronto Raptors,TOR,Home,,W,Norman Powell,24:35,5,11,0.455,3,6,0.5,2,2,1.0,0,4,4,1,1,0,2,2,15,0.631,0.591,0.545,0.182,0.0,17.0,8.9,6.7,2.0,0.0,14.4,24.1,111.0,101.0,0.0
4,2018,Regular Season,2017-10-19,Toronto Raptors,TOR,Home,,W,Serge Ibaka,24:14,3,8,0.375,2,4,0.5,0,0,,1,3,4,0,0,2,0,2,8,0.5,0.5,0.5,0.0,4.7,12.9,9.0,0.0,0.0,7.0,0.0,14.1,113.0,101.0,-2.0
5,2018,Regular Season,2017-10-19,Toronto Raptors,TOR,Home,,W,Delon Wright,23:22,4,6,0.667,0,1,0.0,5,5,1.0,1,1,2,5,1,0,0,0,13,0.793,0.667,0.167,0.833,4.9,4.5,4.7,33.4,2.1,0.0,0.0,15.0,185.0,104.0,11.9
6,2018,Regular Season,2017-10-19,Toronto Raptors,TOR,Home,,W,C.J. Miles,20:01,7,12,0.583,6,9,0.667,2,2,1.0,0,5,5,0,0,0,0,4,22,0.854,0.833,0.75,0.167,0.0,26.1,13.6,0.0,0.0,0.0,0.0,27.5,165.0,102.0,15.1
7,2018,Regular Season,2017-10-19,Toronto Raptors,TOR,Home,,W,Jakob Poeltl,17:30,1,2,0.5,0,0,,0,0,,0,5,5,0,1,0,3,3,2,0.5,0.5,0.0,0.0,0.0,29.8,15.6,0.0,2.7,0.0,60.0,12.2,34.0,95.0,-12.0
8,2018,Regular Season,2017-10-19,Toronto Raptors,TOR,Home,,W,OG Anunoby,17:14,3,6,0.5,1,3,0.333,2,2,1.0,2,1,3,2,0,0,0,1,9,0.654,0.583,0.5,0.333,13.3,6.1,9.5,18.2,0.0,0.0,0.0,17.1,158.0,108.0,5.5
9,2018,Regular Season,2017-10-19,Toronto Raptors,TOR,Home,,W,Fred VanVleet,12:59,1,5,0.2,0,2,0.0,0,0,,0,1,1,2,1,1,2,1,2,0.2,0.2,0.4,0.0,0.0,8.0,4.2,20.9,3.7,6.5,28.6,23.0,49.0,95.0,-11.2
