In [60]:
# Import libaries
import pandas as pd
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
import requests
from bs4 import BeautifulSoup
import time

In [73]:
team_names = {
    "Philadelphia 76ers"    : "PHI",
    "Boston Celtics"        : "BOS",
    "New York Knicks"       : "NYK",
    "Washington Bullets"    : "WSB",
    "Miami Heat"            : "MIA",
    "New Jersey Nets"       : "NJN",
    "Detroit Pistons"       : "DET",
    "Chicago Bulls"         : "CHI",
    "Milwaukee Bucks"       : "MIL",
    "Cleveland Cavaliers"   : "CLE",
    "Indiana Pacers"        : "IND",
    "Atlanta Hawks"         : "ATL",
    "Orlando Magic"         : "ORL",
    "San Antonio Spurs"     : "SAS",
    "Utah Jazz"             : "UTA",
    "Dallas Mavericks"      : "DAL",
    "Denver Nuggets"        : "DEN",
    "Houston Rockets"       : "HOU",
    "Minnesota Timberwolves": "MIN",
    "Charlotte Hornets"     : "CHH",
    "Los Angeles Lakers"    : "LAL",
    "Portland Trail Blazers": "POR",
    "Phoenix Suns"          : "PHO",
    "Seattle SuperSonics"   : "SEA",
    "Golden State Warriors" : "GSW",
    "Los Angeles Clippers"  : "LAC",
    "Sacramento Kings"      : "SAC",
    "Toronto Raptors"       : "TOR",
    "Vancouver Grizzlies"   : "VAN",
    "Kansas City Kings"     : "KCK",
    "Washington Wizards"    : "WAS"
}

### Season Schedule Dataframe

In [62]:
def season_schedule(team_acronym, year, point_in_season):
    season_url = "https://www.basketball-reference.com/teams/" + team_acronym + "/" + str(year) + "_games.html"
    res = requests.get(season_url)
    soup = BeautifulSoup(res.content, "lxml")
    
    if point_in_season.lower() == "regular season":
        game_type = "games"
    elif point_in_season.lower() == "playoffs":
        game_type = "games_playoffs"
    
    table = soup.find("table", {"id" : game_type})

    games_cols = []
    for header in table.find_all("thead"):
        for row in header.find_all("th", {"class" : "poptip sort_default_asc left"}):
            games_cols.append(row.text)
    games_cols.append("Game")
    games_cols.append("Location")
    games_cols.append("Result")
    games_cols.append("Wins")
    games_cols.append("Losses")
    games_cols.append("Time Period")
    
    games_cols

    game_review_list = []
    for body in table.find_all("tbody"):
        for row in body.find_all("tr", {"class" : None}):
            game = row.find("th", {"data-stat" : "g"}).text
            date = (row.find("td", {"data-stat" : "date_game"}).attrs["csk"])
            opponent = row.find("td", {"data-stat" : "opp_name"}).text
            location = row.find("td", {"data-stat" : "game_location"}).text
            result = row.find("td", {"data-stat" : "game_result"}).text
            wins = row.find("td", {"data-stat" : "wins"}).text
            losses = row.find("td", {"data-stat" : "losses"}).text
            stage = point_in_season.title()
            game_review = dict(zip(games_cols,[date, opponent, game, location, result, wins, losses, stage]))
            game_review_list.append(game_review)
    game_review_df = pd.DataFrame(game_review_list)
    game_review_df["Team Acronym"] = [team_names[i] for i in game_review_df["Opponent"]]
    game_review_df["Season"] = year
    game_review_df["Location"] = game_review_df["Location"].map({"" : "Home", "@" : "Away"})
    game_review_df = game_review_df.loc[:, ["Season", "Time Period", "Game", "Date", "Team Acronym", "Opponent", "Location", "Result", "Wins", "Losses"]]
    return game_review_df

### Game Data Function

In [81]:
def get_game_stats(url, team_acronym, season, date, game, location, point_in_season):
    res = requests.get(url)
    soup = BeautifulSoup(res.content, "lxml")

    # Identify our two tables
    table = soup.find("table", {"id" : "box-" + team_acronym + "-game-basic"})
    table2 = soup.find("table", {"id" : "box-" + team_acronym + "-game-advanced"})

    # create a dataframe of our players
    players_list = []
    for row in table.find_all("th", {"class" : "left"}):
        players_list.append(row.text)
    players_df = pd.DataFrame(players_list, columns = ["Player"])
    players_df["Team_Abbr"] = team_acronym
    for full_name, abbreviation in team_names.items():
        if abbreviation == team_acronym:
            players_df["Team"] = full_name
    players_df["Season"] = season
    players_df["Date"] = date
    players_df["Game"] = game
    if team_acronym == "CHI":
        players_df["Location"] = location
    else:
        if location == "Home":
            players_df["Location"] = "Away"
        else:
            players_df["Location"] = "Home"
    players_df["Time Period"] = point_in_season.title()
    players_df = players_df.loc[:, ["Season","Time Period", "Game", "Date", "Team_Abbr", "Team", "Location", "Player"]]
    
    
    # Identify our columns from table 1
    stats_cols = []
    for row in table.find_all("th", {"class" : "poptip center"}):
        stats_cols.append(row.text)

    # combine our table body into a list of zipped dictionaries
    stats_box_list = []
    for body in table.find_all("tbody"):
        for row in body.find_all("tr"):
            stats = dict(zip(stats_cols,[stat.text for stat in row.find_all("td")]))
            stats_box_list.append(stats)

    # add in the bottom row of team overall data
    for foot in table.find_all("tfoot"):
        for row in foot.find_all("tr"):
            stats = dict(zip(stats_cols,[stat.text for stat in row.find_all("td")]))
            stats_box_list.append(stats)

    # Create a dataframe of the first table
    stats_df = pd.DataFrame(stats_box_list).dropna().reset_index(drop = True)

    # Collect our column names
    adv_stats_cols = []
    for row in table2.find_all("th", {"class" : "poptip center"}):
        adv_stats_cols.append(row.text)

    # collect our advanced stats from 

    for row in table2.find_all("th", {"class" : "poptip sort_default_asc center"}):
        adv_stats_cols.append(row.text)

    for row in table2.find_all("th", {"class" : "poptip right"}):
        adv_stats_cols.append(row.text)

    order = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 14, 11, 12, 15, 16, 13]
    adv_stats_cols = [adv_stats_cols[i] for i in order]
    adv_stats_cols = adv_stats_cols[:-1]

    # Collect our stats from the advaced table and create a zipped dictionary
    stats_box_list = []
    for body in table2.find_all("tbody"):
        for row in body.find_all("tr"):
            stats = dict(zip(adv_stats_cols,[stat.text for stat in row.find_all("td")]))
            stats_box_list.append(stats)

    for foot in table2.find_all("tfoot"):
        for row in foot.find_all("tr"):
            stats = dict(zip(adv_stats_cols,[stat.text for stat in row.find_all("td")]))
            stats_box_list.append(stats)

    # create a dataframe of the advanced stats that drops the repeated minutes played row
    advanced_stats_df = pd.DataFrame(stats_box_list).dropna().reset_index(drop = True).drop(columns = "MP")

    # Concatenate all 3 data sets into one
    box_stats_df = pd.concat([players_df, advanced_stats_df, stats_df], axis = 1)
    
    return box_stats_df

### All Seasons' Stats Function

In [82]:
def season_stats(team_acronym, list_of_years):
    base_url = "https://www.basketball-reference.com/boxscores/"
    
    stages = ["regular season", "playoffs"]
    
    final_df = pd.DataFrame()
    for year in list_of_years:
        for stage in stages:
            schedule_df = season_schedule(team_acronym, year, stage)

            city = ""
            for i in range(len(schedule_df.index)):
                date = schedule_df["Date"][i].replace("-", "")
                if schedule_df["Location"][i] == "Home":
                    city = "CHI"
                else:
                    city = schedule_df["Team Acronym"][i]

                game_url = base_url + date + "0" + city + ".html"

                game_df = pd.concat([get_game_stats(game_url,
                                                    "CHI", 
                                                    schedule_df["Season"][i], 
                                                    schedule_df["Date"][i], 
                                                    schedule_df["Game"][i], 
                                                    schedule_df["Location"][i],
                                                    schedule_df["Time Period"][i]),
                                     get_game_stats(game_url, 
                                                    schedule_df["Team Acronym"][i], 
                                                    schedule_df["Season"][i], 
                                                    schedule_df["Date"][i], 
                                                    schedule_df["Game"][i], 
                                                    schedule_df["Location"][i],
                                                    schedule_df["Time Period"][i])])

                final_df = final_df.append(game_df)
                if i % 10 == 0:
                    print (f"Collecting statistics from game {schedule_df['Game'][i]} of the {year} {stage.title()} against the {schedule_df['Opponent'][i]}.")
                time.sleep(5)
    final_df['Date'] = pd.to_datetime(final_df['Date'])
    final_df.reset_index(drop = True, inplace = True)
    
    return final_df

In [89]:
mj_seasons = season_stats(team_names["Chicago Bulls"], [(i + 1985) for i in range(14)])
mj_seasons.shape

Collecting statistics from game 1 of the 1985 Regular Season against the Washington Bullets.
Collecting statistics from game 11 of the 1985 Regular Season against the Philadelphia 76ers.
Collecting statistics from game 21 of the 1985 Regular Season against the New York Knicks.
Collecting statistics from game 31 of the 1985 Regular Season against the Atlanta Hawks.
Collecting statistics from game 41 of the 1985 Regular Season against the Indiana Pacers.
Collecting statistics from game 51 of the 1985 Regular Season against the Indiana Pacers.
Collecting statistics from game 61 of the 1985 Regular Season against the Boston Celtics.
Collecting statistics from game 71 of the 1985 Regular Season against the Dallas Mavericks.
Collecting statistics from game 81 of the 1985 Regular Season against the Atlanta Hawks.
Collecting statistics from game 1 of the 1985 Playoffs against the Milwaukee Bucks.
Collecting statistics from game 1 of the 1986 Regular Season against the Cleveland Cavaliers.
Coll

Collecting statistics from game 61 of the 1993 Regular Season against the Charlotte Hornets.
Collecting statistics from game 71 of the 1993 Regular Season against the Boston Celtics.
Collecting statistics from game 81 of the 1993 Regular Season against the Charlotte Hornets.
Collecting statistics from game 1 of the 1993 Playoffs against the Atlanta Hawks.
Collecting statistics from game 11 of the 1993 Playoffs against the New York Knicks.
Collecting statistics from game 1 of the 1994 Regular Season against the Charlotte Hornets.
Collecting statistics from game 11 of the 1994 Regular Season against the San Antonio Spurs.
Collecting statistics from game 21 of the 1994 Regular Season against the San Antonio Spurs.
Collecting statistics from game 31 of the 1994 Regular Season against the Dallas Mavericks.
Collecting statistics from game 41 of the 1994 Regular Season against the Milwaukee Bucks.
Collecting statistics from game 51 of the 1994 Regular Season against the Charlotte Hornets.
Col

(30694, 42)

In [90]:
mj_seasons.tail(50)

Unnamed: 0,Season,Time Period,Game,Date,Team_Abbr,Team,Location,Player,TS%,eFG%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,ORtg,DRtg,BPM,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
30644,1998,Playoffs,20,1998-06-12,CHI,Chicago Bulls,Home,Toni Kukoč,0.933,1.0,0.462,0.538,2.9,18.0,9.6,0.0,1.4,1.8,11.1,20.2,153.0,106.0,18.2,42:54,11.0,13.0,0.846,4.0,6.0,0.667,4.0,7.0,0.571,1.0,5.0,6.0,0.0,1.0,1.0,2.0,2.0,30.0
30645,1998,Playoffs,20,1998-06-12,CHI,Chicago Bulls,Home,Ron Harper,0.25,0.25,0.167,0.0,0.0,10.0,4.4,5.6,1.9,2.5,14.3,10.9,57.0,106.0,-4.9,30:50,1.0,6.0,0.167,1.0,1.0,1.0,0.0,0.0,,0.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,3.0
30646,1998,Playoffs,20,1998-06-12,CHI,Chicago Bulls,Home,Luc Longley,0.375,0.375,0.0,0.0,26.8,13.5,20.9,9.2,0.0,6.7,27.3,23.0,76.0,106.0,-6.0,23:32,3.0,8.0,0.375,0.0,0.0,,0.0,0.0,,5.0,2.0,7.0,1.0,0.0,2.0,3.0,4.0,6.0
30647,1998,Playoffs,20,1998-06-12,CHI,Chicago Bulls,Home,Dennis Rodman,1.0,1.0,0.0,0.0,0.0,19.4,8.6,0.0,2.5,0.0,0.0,2.0,200.0,104.0,-2.5,24:03,1.0,1.0,1.0,0.0,0.0,,0.0,0.0,,0.0,3.0,3.0,0.0,1.0,0.0,0.0,5.0,2.0
30648,1998,Playoffs,20,1998-06-12,CHI,Chicago Bulls,Home,Steve Kerr,0.75,0.75,0.75,0.0,0.0,0.0,0.0,22.5,0.0,0.0,0.0,10.7,166.0,115.0,7.5,18:19,2.0,4.0,0.5,2.0,3.0,0.667,0.0,0.0,,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,6.0
30649,1998,Playoffs,20,1998-06-12,CHI,Chicago Bulls,Home,Scott Burrell,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,66.7,14.4,0.0,90.0,-9.4,10:09,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,2.0,0.0,2.0,1.0,0.0
30650,1998,Playoffs,20,1998-06-12,CHI,Chicago Bulls,Home,Jud Buechler,,,,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,115.0,-6.0,0:24,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30651,1998,Playoffs,20,1998-06-12,CHI,Chicago Bulls,Home,Bill Wennington,0.483,0.433,0.267,0.267,33.3,64.5,47.1,65.5,13.7,8.1,16.0,100.0,101.1,103.6,,240,29.0,75.0,0.387,7.0,20.0,0.35,16.0,20.0,0.8,13.0,20.0,33.0,19.0,11.0,5.0,16.0,25.0,81.0
30652,1998,Playoffs,20,1998-06-12,CHI,Chicago Bulls,Home,Dickey Simpkins,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
30653,1998,Playoffs,20,1998-06-12,CHI,Chicago Bulls,Home,Randy Brown,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [322]:
test_df = pd.DataFrame()
test_df = test_df.append(pd.concat([get_game_stats(test_url, 
                                         "CHI", 
                                         bulls_1990_schedule["Season"][0], 
                                         bulls_1990_schedule["Date"][0], 
                                         bulls_1990_schedule["Game"][0], 
                                         bulls_1990_schedule["Location"][0]),
                         get_game_stats(test_url, 
                                        bulls_1990_schedule["Team Acronym"][0], 
                                        bulls_1990_schedule["Season"][0], 
                                        bulls_1990_schedule["Date"][0],
                                        bulls_1990_schedule["Game"][0], 
                                        bulls_1990_schedule["Location"][0])]))
test_df

Unnamed: 0,Season,Game,Date,Team_Abbr,Team,Location,Players,TS%,eFG%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,ORtg,DRtg,BPM,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1990,1,1989-11-03,CHI,Chicago Bulls,Home,Scottie Pippen,0.369,0.354,0.167,0.167,2.2,4.8,3.5,20.4,1.9,1.3,16.3,25.8,78.0,111.0,-8.0,51:00,8,24,0.333,1,4,0.25,2,4,0.5,1,2,3,7,2,1,5,4,19
1,1990,1,1989-11-03,CHI,Chicago Bulls,Home,Michael Jordan,0.702,0.629,0.065,0.548,7.2,28.8,17.5,30.0,3.1,1.4,0.0,35.0,154.0,103.0,22.5,47:00,19,31,0.613,1,2,0.5,15,17,0.882,3,11,14,6,3,1,0,3,54
2,1990,1,1989-11-03,CHI,Chicago Bulls,Home,Horace Grant,0.799,0.75,0.0,0.75,5.0,13.7,9.2,9.6,1.1,3.0,22.0,13.0,130.0,109.0,1.4,45:00,6,8,0.75,0,0,,5,6,0.833,2,5,7,3,1,2,3,5,17
3,1990,1,1989-11-03,CHI,Chicago Bulls,Home,Bill Cartwright,0.485,0.429,0.0,0.571,16.1,26.4,21.0,6.9,1.2,1.6,5.4,18.9,116.0,107.0,-1.9,42:00,6,14,0.429,0,0,,5,8,0.625,6,9,15,2,1,1,1,5,17
4,1990,1,1989-11-03,CHI,Chicago Bulls,Home,John Paxson,0.25,0.25,0.0,0.0,3.3,0.0,1.7,22.0,1.4,0.0,0.0,5.0,115.0,114.0,-4.1,34:00,1,4,0.25,0,0,,0,0,,1,0,1,6,1,0,0,2,2
5,1990,1,1989-11-03,CHI,Chicago Bulls,Home,Stacey King,0.5,0.5,0.0,0.0,0.0,0.0,0.0,8.7,0.0,0.0,33.3,8.6,76.0,116.0,-11.8,15:00,0,2,0.0,0,0,,5,8,0.625,1,2,3,0,1,0,2,3,5
6,1990,1,1989-11-03,CHI,Chicago Bulls,Home,B.J. Armstrong,0.453,0.0,0.0,4.0,7.5,16.4,11.8,0.0,3.2,0.0,26.6,21.4,79.0,105.0,-12.2,15:00,1,2,0.5,0,0,,0,0,,0,0,0,1,0,0,1,2,2
7,1990,1,1989-11-03,CHI,Chicago Bulls,Home,Craig Hodges,0.571,0.571,0.714,0.0,0.0,10.3,4.9,0.0,4.0,0.0,0.0,24.9,116.0,106.0,4.8,12:00,3,7,0.429,2,5,0.4,0,0,,0,1,1,0,1,0,0,1,8
8,1990,1,1989-11-03,CHI,Chicago Bulls,Home,Will Perdue,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.2,0.0,0.0,0.0,10.7,84.0,117.0,7.8,4:00,0,1,0.0,0,0,,0,0,,0,0,0,2,0,0,0,0,0
9,1990,1,1989-11-03,CHI,Chicago Bulls,Home,Team Totals,0.554,0.495,0.118,0.462,29.8,69.8,48.9,61.4,9.2,6.3,9.7,100.0,113.6,109.1,,265,44,93,0.473,4,11,0.364,32,43,0.744,14,30,44,27,10,5,12,25,124


In [88]:
for i in range(14):
    print (i + 1985)

1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
