In [21]:
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

import requests
import pandas as pd

In [4]:
dates = pd.date_range(start = "2023-10-24", end = datetime.today() - timedelta(days=1))

In [5]:
base_link = "https://www.espn.com"

In [6]:
headers = requests.utils.default_headers()
headers.update({
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
})

In [7]:
box_scores = pd.DataFrame(columns = ["MIN", "FG", "3PT", "FT", "OREB", "DREB", "REB", "AST", "STL", "BLK", "TO", "PF", "PLUS_MINUS", "PTS", "PLAYER_NAME"])

In [8]:
for date in dates:
	date = date.strftime("%Y%m%d")
	link = base_link + "/nba/scoreboard/_/date/" + date

	page = requests.get(link, headers=headers)
	soup = BeautifulSoup(page.content, "html.parser")
	games = soup.find_all("section", class_ = "Scoreboard")
	for game in games:
		box_score_link = game.find_all("a", class_ = "Button--anchorLink", href = True)[1]['href']
		req = requests.get(base_link + box_score_link, headers=headers)
		soup = BeautifulSoup(req.content, "html.parser")	
		
		all_players = []
		all_statlines = []
		teams = []

		team_names = soup.find_all("div", class_ = "BoxscoreItem__TeamName")
		for team in team_names:
			teams.append(team.text.strip())
			

		# Get all the players
		players = soup.find("div", class_ = "Boxscore__ResponsiveWrapper").find_all("div", class_ = "ResponsiveTable")
		for i, row in enumerate(players):
			columns = row.find("table", class_ = "Table").tbody.find_all("td")
			for player in columns[:-2]:
				player = player.text.strip()
				if player != "starters" and player != "bench":
					all_players.append(player)

		# Get all the statlines
		statlines = soup.find("div", class_ = "Boxscore__ResponsiveWrapper").find_all("div", class_ = "Table__Scroller")
		for i, row in enumerate(statlines):
			rows = row.find("table", class_ = "Table").tbody.find_all("tr")
			for i, row in enumerate(rows):
				if i != 0 and i != 6:
					td = row.find_all('td')
					data = [tr.text.strip() for tr in td if tr.text.strip()]
					data.append(date)
					if '-' not in data[0] and '%' not in data[0]:
						all_statlines.append(data)
					if "DNP-" in data[0]:
						all_statlines.append([0] * 15)
		
		for i, player in enumerate(all_players):
			all_statlines[i].append(player)
		
		df = pd.DataFrame(all_statlines, columns = ["MIN", "FG", "3PT", "FT", "OREB", "DREB", "REB", "AST", "STL", "BLK", "TO", "PF", "PLUS_MINUS", "PTS", "DATE", "PLAYER_NAME"])
		box_scores = pd.concat([box_scores, df])

In [9]:
box_scores

Unnamed: 0,MIN,FG,3PT,FT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PLUS_MINUS,PTS,PLAYER_NAME,DATE
0,34,6-17,1-2,4-4,1,7,8,4,0,2,2,3,-17,17,A. Davis PF,20231024
1,30,6-8,4-6,2-2,1,2,3,1,0,1,1,0,-14,18,T. Prince PF,20231024
2,29,10-16,1-4,0-1,1,7,8,5,1,0,0,1,+7,21,L. James SF,20231024
3,36,4-12,2-5,1-2,0,4,4,7,1,0,3,3,+1,11,D. Russell PG,20231024
4,31,4-11,1-2,5-7,4,4,8,4,2,0,2,2,-14,14,A. Reaves SG,20231024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22,14,2-4,0-2,0-0,0,2,2,0,1,0,0,3,+1,4,C. Duarte SG,20231029
23,0,0,0,0,0,0,0,0,0,0,0,0,0,0,J. Slawson F,0
24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,A. Len C,0
25,0,0,0,0,0,0,0,0,0,0,0,0,0,0,C. Jones G,0


In [18]:
X = box_scores[["MIN"]]
y = box_scores["PTS"]

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state=8)

In [23]:
params = {"n_estimators": 500,
			"max_depth": 4,
			"min_samples_split": 5,
			"learning_rate": 0.01,
			"loss": "squared_error"}

In [24]:
reg = GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)

In [25]:
y_pred = reg.predict(X_test)

In [53]:
compare = pd.DataFrame(list(y_test), columns=["Actual Points"])
compare['Predicted Points'] = y_pred.round()
compare['Minutes'] = list(X_test['MIN'])

In [54]:
compare

Unnamed: 0,Actual Points,Predicted Points,Minutes
0,12,6.0,17
1,6,23.0,37
2,0,0.0,0
3,12,6.0,21
4,0,1.0,6
...,...,...,...
222,20,11.0,26
223,36,10.0,25
224,0,4.0,14
225,14,15.0,31
