In [2]:
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

import requests
import pandas as pd
import re

In [3]:
dates = pd.date_range(start = "2023-10-24", end = datetime.today() - timedelta(days=1))

## Using ESPN

In [None]:
base_link = "https://www.espn.com"

In [None]:
base_link = "https://www.espn.com"
headers = requests.utils.default_headers()
headers.update({
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
})

box_scores = pd.DataFrame(columns = ["MIN", "FG", "3PT", "FT", "OREB", "DREB", "REB", "AST", "STL", "BLK", "TO", "PF", "PLUS_MINUS", "PTS", "PLAYER_NAME"])

for date in dates:
	date = date.strftime("%Y%m%d")
	link = base_link + "/nba/scoreboard/_/date/" + date

	page = requests.get(link, headers=headers)
	soup = BeautifulSoup(page.content, "html.parser")
	games = soup.find_all("section", class_ = "Scoreboard")
	for game in games:
		box_score_link = game.find_all("a", class_ = "Button--anchorLink", href = True)[1]['href']
		req = requests.get(base_link + box_score_link, headers=headers)
		soup = BeautifulSoup(req.content, "html.parser")	
		
		all_players = []
		all_statlines = []
		teams = []

		team_names = soup.find_all("div", class_ = "BoxscoreItem__TeamName")
		for team in team_names:
			teams.append(team.text.strip())
			

		# Get all the players
		players = soup.find("div", class_ = "Boxscore__ResponsiveWrapper").find_all("div", class_ = "ResponsiveTable")
		for i, row in enumerate(players):
			columns = row.find("table", class_ = "Table").tbody.find_all("td")
			for player in columns[:-2]:
				player = player.text.strip()
				if player != "starters" and player != "bench":
					all_players.append(player)

		# Get all the statlines
		statlines = soup.find("div", class_ = "Boxscore__ResponsiveWrapper").find_all("div", class_ = "Table__Scroller")
		for i, row in enumerate(statlines):
			rows = row.find("table", class_ = "Table").tbody.find_all("tr")
			for i, row in enumerate(rows):
				if i != 0 and i != 6:
					td = row.find_all('td')
					data = [tr.text.strip() for tr in td if tr.text.strip()]
					data.append(date)
					if '-' not in data[0] and '%' not in data[0]:
						all_statlines.append(data)
					if "DNP-" in data[0]:
						all_statlines.append([0] * 15)
		
		for i, player in enumerate(all_players):
			all_statlines[i].append(player)
		
		df = pd.DataFrame(all_statlines, columns = ["MIN", "FG", "3PT", "FT", "OREB", "DREB", "REB", "AST", "STL", "BLK", "TO", "PF", "PLUS_MINUS", "PTS", "DATE", "PLAYER_NAME"])
		box_scores = pd.concat([box_scores, df])

## Using Basketball Reference

In [4]:
base_link = "https://www.basketball-reference.com"

In [31]:
columns = ["PLAYER", "MP", "FG", "FGA", "FG%", "3P", "3PA", "3P%", "FT", "FTA", "FT%", "ORB", "DRB", "TRB", "AST", "STL", "BLK", "TOV", "PF", "PTS", "+/-",
"TS%", "eFG%", "3PAr", "FTr", "ORB%", "DRB%", "TRB%", "AST%", "STL%", "BLK%", "TOV%", "USG%", "ORtg", "DRtg", "BPM"
]

box_scores = pd.DataFrame(columns = columns)

for date in dates:
	link = base_link + "/boxscores/?month={0}&day={1}&year={2}".format(date.month, date.day, date.year)
	
	page = requests.get(link)
	soup = BeautifulSoup(page.content, "html.parser")

	# Get all games for the day
	games = soup.find("div", class_ = "game_summaries").find_all("table", class_= "teams")
	# Loop through all games to get boxscore link
	for game in games:
		# Dict to store player: stats for appending advanced stats to basic stats
		player_stats 	= {}
		all_players 	= []
		all_statlines 	= []
		teams 			= []

		boxscore_link = game.find("td", class_ = "gamelink").find("a", href = True)['href']

		# Access boxscore
		page = requests.get(base_link + boxscore_link)
		soup = BeautifulSoup(page.content, "html.parser")

		# Get only the tables corresponding to the "Basic Game" tab
		basic_tables = soup.find_all("table", class_ = "sortable", id = re.compile('game-basic'))
		advanced_tables = soup.find_all("table", class_ = "sortable", id = re.compile('game-advanced'))

		print(len(basic_tables))
		print(len(advanced_tables))

		for table in basic_tables:
			rows = table.tbody.find_all("tr")
			for i, row in enumerate(rows):
				if row != []:
					# Skip Reserves and Team Totals rows
					if 'class' in row.attrs:
							continue
					else:
						# Collect row data
						player = row.find("th").text.strip()
						row_data = row.find_all("td")
						cell_values = [player, *[cell.text.strip() for cell in row_data]]
						# Do not want to collect data for players who did not play
						if cell_values[1] == 'Did Not Play':
							continue
						player_stats[player] = cell_values

		for table in advanced_tables:
			rows = table.tbody.find_all("tr")
			for i, row in enumerate(rows):
				if row != []:
					# Skip Reserves and Team Totals rows
					if 'class' in row.attrs:
							continue
					else:
						# Collect row data
						player = row.find("th").text.strip()
						row_data = row.find_all("td")
						cell_values = [cell.text.strip() for cell in row_data]
						# Do not want to collect data for players who did not play
						if cell_values[0] == 'Did Not Play' or not player_stats[player]:
							continue
						
						# We extend the stats to the back of the existing player on that page
						player_stats[player].extend(cell_values[1:])
		
		for value in player_stats.values():
			all_statlines.append(value)

		df = pd.DataFrame(all_statlines, columns = columns)
		box_scores = pd.concat([box_scores, df])
		break
	break
		
		

2
2
adding player D'Angelo Russell
adding player Anthony Davis
adding player Austin Reaves
adding player Taurean Prince
adding player LeBron James
adding player Gabe Vincent
adding player Cam Reddish
adding player Christian Wood
adding player Rui Hachimura
adding player Jaxson Hayes
adding player Max Christie
adding player Maxwell Lewis
adding player Nikola Jokić
adding player Kentavious Caldwell-Pope
adding player Aaron Gordon
adding player Jamal Murray
adding player Michael Porter Jr.
adding player Reggie Jackson
adding player Christian Braun
adding player Zeke Nnaji
adding player Peyton Watson
adding player Jalen Pickett
adding player Collin Gillespie
adding player Braxton Key
Extending Player D'Angelo Russell
Extending Player Anthony Davis
Extending Player Austin Reaves
Extending Player Taurean Prince
Extending Player LeBron James
Extending Player Gabe Vincent
Extending Player Cam Reddish
Extending Player Christian Wood
Extending Player Rui Hachimura
Extending Player Jaxson Hayes
E

In [28]:
all_statlines

[["D'Angelo Russell",
  '36:11',
  '4',
  '12',
  '.333',
  '2',
  '5',
  '.400',
  '1',
  '2',
  '.500',
  '0',
  '4',
  '4',
  '7',
  '1',
  '0',
  '3',
  '3',
  '11',
  '+1',
  '.427',
  '.417',
  '.417',
  '.167',
  '0.0',
  '13.3',
  '6.2',
  '26.0',
  '1.4',
  '0.0',
  '18.9',
  '19.2',
  '91',
  '125',
  '-7.5'],
 ['Anthony Davis',
  '34:09',
  '6',
  '17',
  '.353',
  '1',
  '2',
  '.500',
  '4',
  '4',
  '1.000',
  '1',
  '7',
  '8',
  '4',
  '0',
  '2',
  '2',
  '3',
  '17',
  '-17',
  '.453',
  '.382',
  '.118',
  '.235',
  '3.1',
  '24.6',
  '13.1',
  '17.3',
  '0.0',
  '4.9',
  '9.6',
  '26.6',
  '100',
  '122',
  '-3.0'],
 ['Austin Reaves',
  '31:20',
  '4',
  '11',
  '.364',
  '1',
  '2',
  '.500',
  '5',
  '7',
  '.714',
  '4',
  '4',
  '8',
  '4',
  '2',
  '0',
  '2',
  '2',
  '14',
  '-14',
  '.497',
  '.409',
  '.182',
  '.636',
  '13.3',
  '15.3',
  '14.3',
  '17.6',
  '3.2',
  '0.0',
  '12.4',
  '22.4',
  '113',
  '121',
  '0.8'],
 ['Taurean Prince',
  '29:53',
  '

## Merge

In [6]:
box_scores

Unnamed: 0,PLAYER,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,+/-
0,D'Angelo Russell,36:11,4,12,0.333,2,5,0.4,1,2,...,0,4,4,7,1,0,3,3,11,1
1,Anthony Davis,34:09,6,17,0.353,1,2,0.5,4,4,...,1,7,8,4,0,2,2,3,17,-17
2,Austin Reaves,31:20,4,11,0.364,1,2,0.5,5,7,...,4,4,8,4,2,0,2,2,14,-14
3,Taurean Prince,29:53,6,8,0.75,4,6,0.667,2,2,...,1,2,3,1,0,1,1,0,18,-14
4,LeBron James,29:00,10,16,0.625,1,4,0.25,0,1,...,1,7,8,5,1,0,0,1,21,7
5,Gabe Vincent,22:18,3,8,0.375,0,4,0.0,0,0,...,1,0,1,2,1,0,2,3,6,-17
6,Cam Reddish,17:38,2,4,0.5,1,2,0.5,2,2,...,2,2,4,0,0,1,0,2,7,7
7,Christian Wood,15:28,3,4,0.75,0,1,0.0,1,2,...,1,3,4,0,0,0,1,1,7,2
8,Rui Hachimura,14:39,3,10,0.3,0,3,0.0,0,0,...,2,1,3,0,0,0,0,2,6,-8
9,Jaxson Hayes,6:54,0,0,,0,0,,0,0,...,0,1,1,0,0,0,0,1,0,-7


In [None]:
X = box_scores[["MIN"]]
y = box_scores["PTS"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state=8)

In [None]:
params = {"n_estimators": 500,
			"max_depth": 4,
			"min_samples_split": 5,
			"learning_rate": 0.01,
			"loss": "squared_error"}

In [None]:
reg = GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)

In [None]:
y_pred = reg.predict(X_test)

In [None]:
compare = pd.DataFrame(list(y_test), columns=["Actual Points"])
compare['Predicted Points'] = y_pred.round()
compare['Minutes'] = list(X_test['MIN'])

In [None]:
compare