In [1]:
# https://medium.com/@HeeebsInc/using-machine-learning-to-predict-daily-fantasy-basketball-scores-part-i-811de3c54a98

In [1]:
# needed libraries
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

# URL to scrape
url = "https://www.basketball-reference.com/playoffs/"

# collect HTML data
html = urlopen(url)
        
# create beautiful soup object from HTML
soup = BeautifulSoup(html, features="lxml")

# use getText()to extract the headers into a list
headers = [th.getText() for th in soup.findAll('tr', limit=2)[1].findAll('th')]

# get rows from table
rows = soup.findAll('tr')[2:]
rows_data = [[td.getText() for td in rows[i].findAll('td')]
                    for i in range(len(rows))]

# if you print row_data here you'll see an empty row
# so, remove the empty row
rows_data.pop(20)
# for simplicity subset the data for only 39 seasons
rows_data = rows_data[0:38]

# we're missing a column for years
# add the years into rows_data
last_year = 2020
for i in range(0, len(rows_data)):
    rows_data[i].insert(0, last_year)
    last_year -=1
    
# create the dataframe
nba_finals = pd.DataFrame(rows_data, columns = headers)
# export dataframe to a CSV 
nba_finals.to_csv("nba_finals_history.csv", index=False)

# Web Scraper

In [None]:
# https://github.com/klaudius1D/ShootersShoot/tree/main

In [1]:
import pandas as pd
import requests
import os
from bs4 import BeautifulSoup

In [3]:
# Set Playoff Years from 2020 to 2023
years = list(range(2020, 2024))

# {} used to vary dates in loop
url_start = "https://www.basketball-reference.com/playoffs/NBA_{}_per_game.html"

# Replace brackets with specific years based on list
for year in years:
    url = url_start.format(year)
    data = requests.get(url)

    # Create HTML files for each year
    with open("topPicksHTML/Playoffs-{}_topPicks.html".format(year), "wb") as f:
        f.write(data.content)

In [5]:
# using a loop, parse each html for each year in years list
dfs = []
for year in years:
    url = url_start.format(year)
    data = requests.get(url)
    soup = BeautifulSoup(data.content, "html.parser")
    topPicks_table = soup.find("table", {"id": "per_game_stats"})
    topPicks_df = pd.read_html(str(topPicks_table))[0]
    topPicks_df = topPicks_df[topPicks_df.Rk != "Rk"]
    topPicks_df["Year"] = year

    dfs.append(topPicks_df)

    # Save all contents into one csv files into new folder called topPicksCSV
    topPicks_df.to_csv("topPicksCSV/Playoffs-{}_topPicks.csv".format(year), index=False)

In [6]:
# combine all dataframes into one
topPicks_df = pd.concat(dfs)
topPicks_df.to_csv("topPicksCSV/Playoffs-Master_topPicks.csv", index=False)

In [7]:
# check contents of dataframe, ensure all years are included
topPicks_df.head()
topPicks_df.tail()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
222,213,Ziaire Williams,SF,21,MEM,4,0,3.0,0.5,1.8,...,0.0,0.5,0.5,0.5,0.0,0.0,0.5,0.3,1.3,2023
223,214,Trae Young,PG,24,ATL,6,6,38.3,10.0,24.8,...,0.8,2.8,3.7,10.2,1.7,0.7,4.0,1.8,29.2,2023
224,215,Omer Yurtseven,C,24,MIA,8,0,2.0,0.3,0.9,...,0.4,0.3,0.6,0.1,0.0,0.1,0.1,0.3,0.5,2023
225,216,Cody Zeller,C,30,MIA,21,0,8.3,1.0,1.7,...,0.5,1.8,2.3,0.3,0.1,0.2,0.6,1.3,2.2,2023
226,217,Ivica Zubac,C,25,LAC,5,5,26.0,3.4,6.0,...,3.2,6.4,9.6,0.6,0.6,0.2,2.2,1.6,9.2,2023


# Data Cleaning

In [1]:
import pandas as pd
import numpy as np

# create variables for scoring settings
global pointsScored, totalRebounds, assists, steals, blockedShots, turnovers
pointsScored = 1.0
totalRebounds = 1.2
assists = 1.5
steals = 3.0
blockedShots = 3.0
turnovers = -1.0

In [2]:
# load Master_topPicks into dataframe
df = pd.read_csv("topPicksCSV/Playoffs-Master_topPicks.csv")

# keep only the columns we need
stats_to_keep = [
    "Player",
    "Pos",
    "Tm",
    "G",
    "MP",
    "TRB",
    "AST",
    "STL",
    "BLK",
    "TOV",
    "PTS",
    "Year",
]
df = df[stats_to_keep]

# capture each player occurences
# df = df.groupby(["Player"]).sum().reset_index()

# if a player has played for multiple years, average their stats across those years
numeric_cols = df.select_dtypes(include=["float64", "int64"]).columns
df = df.groupby(["Player"])[numeric_cols].mean().reset_index()

# drop year column
df = df.drop("Year", axis=1)

In [3]:
# create calculated column for fantasy points
df["FantasyPoints"] = (
    df["PTS"] * pointsScored
    + df["TRB"] * totalRebounds
    + df["AST"] * assists
    + df["STL"] * steals
    + df["BLK"] * blockedShots
    + df["TOV"] * turnovers)

#round to the nearest decimal
df = df.round(1)

In [4]:
# write to csv
df.to_csv("topPicksCSV/Playoffs-Master-Combined_topPicks.csv", index=False)

df

Unnamed: 0,Player,G,MP,TRB,AST,STL,BLK,TOV,PTS,FantasyPoints
0,Aaron Gordon,11.7,32.5,6.2,2.4,0.5,0.7,1.3,12.7,26.1
1,Aaron Holiday,3.7,8.4,0.6,1.7,0.5,0.1,0.7,3.8,8.0
2,Aaron Nesmith,10.0,9.2,1.8,0.2,0.2,0.2,0.2,2.0,5.4
3,Abdel Nader,4.0,7.0,1.0,0.0,0.2,0.4,0.0,1.0,3.8
4,Al Horford,15.7,32.9,7.9,2.9,0.7,1.4,0.9,8.6,28.0
...,...,...,...,...,...,...,...,...,...,...
455,Yogi Ferrell,9.0,1.7,0.3,0.3,0.0,0.0,0.0,0.4,1.2
456,Yuta Watanabe,2.5,3.8,0.5,0.0,0.0,0.0,0.6,2.0,2.0
457,Zach LaVine,4.0,38.3,5.3,6.0,0.8,0.3,3.0,19.3,35.0
458,Zeke Nnaji,4.0,3.5,0.3,0.1,0.1,0.0,0.1,1.6,2.5
