# Extra Feature Scraping

The purpose of this notebook is to scrape more features, including:
- Team Salaries
- Player Salaries
- Playoff Histories

In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.ticker as mtick
import sqlite3
import seaborn as sns
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import requests   
import shutil      
import datetime
from scipy.stats import norm
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
import os
import winsound


In [2]:
os.chdir('C:\\Users\\Travis\\OneDrive\\Data Science\\Personal_Projects\\Sports\\NBA_Prediction_V3_1')

In [3]:
options = np.arange(2000,2022,1)

## Team Salaries

In [4]:
urlz = []
for i in options:
    url = 'https://hoopshype.com/salaries/' + str(i) + '-' + str(i+1) + '/'
    urlz.append(url)
urlz

['https://hoopshype.com/salaries/2000-2001/',
 'https://hoopshype.com/salaries/2001-2002/',
 'https://hoopshype.com/salaries/2002-2003/',
 'https://hoopshype.com/salaries/2003-2004/',
 'https://hoopshype.com/salaries/2004-2005/',
 'https://hoopshype.com/salaries/2005-2006/',
 'https://hoopshype.com/salaries/2006-2007/',
 'https://hoopshype.com/salaries/2007-2008/',
 'https://hoopshype.com/salaries/2008-2009/',
 'https://hoopshype.com/salaries/2009-2010/',
 'https://hoopshype.com/salaries/2010-2011/',
 'https://hoopshype.com/salaries/2011-2012/',
 'https://hoopshype.com/salaries/2012-2013/',
 'https://hoopshype.com/salaries/2013-2014/',
 'https://hoopshype.com/salaries/2014-2015/',
 'https://hoopshype.com/salaries/2015-2016/',
 'https://hoopshype.com/salaries/2016-2017/',
 'https://hoopshype.com/salaries/2017-2018/',
 'https://hoopshype.com/salaries/2018-2019/',
 'https://hoopshype.com/salaries/2019-2020/',
 'https://hoopshype.com/salaries/2020-2021/',
 'https://hoopshype.com/salaries/2

In [22]:
# if team_contracts not a folder, make it
if not os.path.exists('data/team/team_contracts'):
    os.makedirs('data/team/team_contracts')

In [23]:
for u in urlz:
    df = pd.read_html(u)[0]
    df.to_csv('data/team/team_contracts/' + str(u[-10:-1]) + '.csv')

## Player Salaries

In [27]:
# if player_contracts not a folder, make it
if not os.path.exists('data/player/contracts'):
    os.makedirs('data/player/contracts')

In [28]:
player_urls = []
for i in options:
    url = 'https://hoopshype.com/salaries/players/' + str(i) + '-' + str(i+1) + '/'
    player_urls.append(url)

In [29]:
for u in player_urls:
    df = pd.read_html(u)[0]
    df.to_csv('data/player/contracts/' + str(u[-10:-1]) + '.csv')

## Playoff History

In [6]:
playoff_history = 'https://www.basketball-reference.com/playoffs/series.html'
df = pd.read_html(playoff_history)[0]
df.to_csv('data/team/playoff_history.excel')
df

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Winner,Winner,Unnamed: 7_level_0,Loser,Loser,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0
Unnamed: 0_level_1,Yr,Lg,Series,Unnamed: 3_level_1,Unnamed: 4_level_1,Team,W,Unnamed: 7_level_1,Team,W,Unnamed: 10_level_1,Favorite,Underdog
0,2022,NBA,Eastern Conf First Round,"Apr 17 - Apr 27, 2022",,Milwaukee Bucks (3),4,,Chicago Bulls (6),1,,MIL (-1100),CHI (+700)
1,2022,NBA,Western Conf First Round,"Apr 16 - Apr 29, 2022",,Memphis Grizzlies (2),4,,Minnesota Timberwolves (7),2,,MEM (-320),MIN (+260)
2,2022,NBA,Eastern Conf First Round,"Apr 17 - Apr 25, 2022",,Boston Celtics (2),4,,Brooklyn Nets (7),0,,BOS (-130),BRK (+110)
3,2022,NBA,Western Conf First Round,"Apr 16 - Apr 27, 2022",,Golden State Warriors (3),4,,Denver Nuggets (6),1,,GSW (-275),DEN (+225)
4,2022,NBA,Western Conf First Round,"Apr 16 - Apr 28, 2022",,Dallas Mavericks (4),4,,Utah Jazz (5),2,,UTA (-300),DAL (+245)
...,...,...,...,...,...,...,...,...,...,...,...,...,...
998,1947,BAA,Quarterfinals,"Apr 2 - Apr 9, 1947",,New York Knicks (3),2,,Cleveland Rebels (3),1,,,
999,1947,BAA,Quarterfinals,"Apr 2 - Apr 6, 1947",,Philadelphia Warriors (2),2,,St. Louis Bombers (2),1,,,
1000,1947,BAA,Semifinals,"Apr 2 - Apr 13, 1947",,Chicago Stags (1),4,,Washington Capitols (1),2,,,
1001,1947,BAA,Semifinals,"Apr 12 - Apr 14, 1947",,Philadelphia Warriors (2),2,,New York Knicks (3),0,,,


In [7]:
winners = df[('Winner', 'Team')]
yr = df[('Unnamed: 0_level_0', 'Yr')]
round = df[('Unnamed: 2_level_0', 'Series')]
losers = df[('Loser', 'Team')]
favorite = df[('Unnamed: 11_level_0', 'Favorite')]
underdog = df[('Unnamed: 12_level_0', 'Underdog')]

new_df = pd.concat([yr, round, winners, losers, favorite, underdog], axis=1)
new_df

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 2_level_0,Winner,Loser,Unnamed: 11_level_0,Unnamed: 12_level_0
Unnamed: 0_level_1,Yr,Series,Team,Team,Favorite,Underdog
0,2022,Eastern Conf First Round,Milwaukee Bucks (3),Chicago Bulls (6),MIL (-1100),CHI (+700)
1,2022,Western Conf First Round,Memphis Grizzlies (2),Minnesota Timberwolves (7),MEM (-320),MIN (+260)
2,2022,Eastern Conf First Round,Boston Celtics (2),Brooklyn Nets (7),BOS (-130),BRK (+110)
3,2022,Western Conf First Round,Golden State Warriors (3),Denver Nuggets (6),GSW (-275),DEN (+225)
4,2022,Western Conf First Round,Dallas Mavericks (4),Utah Jazz (5),UTA (-300),DAL (+245)
...,...,...,...,...,...,...
998,1947,Quarterfinals,New York Knicks (3),Cleveland Rebels (3),,
999,1947,Quarterfinals,Philadelphia Warriors (2),St. Louis Bombers (2),,
1000,1947,Semifinals,Chicago Stags (1),Washington Capitols (1),,
1001,1947,Semifinals,Philadelphia Warriors (2),New York Knicks (3),,


In [8]:
new_df = new_df.dropna()
new_df

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 2_level_0,Winner,Loser,Unnamed: 11_level_0,Unnamed: 12_level_0
Unnamed: 0_level_1,Yr,Series,Team,Team,Favorite,Underdog
0,2022,Eastern Conf First Round,Milwaukee Bucks (3),Chicago Bulls (6),MIL (-1100),CHI (+700)
1,2022,Western Conf First Round,Memphis Grizzlies (2),Minnesota Timberwolves (7),MEM (-320),MIN (+260)
2,2022,Eastern Conf First Round,Boston Celtics (2),Brooklyn Nets (7),BOS (-130),BRK (+110)
3,2022,Western Conf First Round,Golden State Warriors (3),Denver Nuggets (6),GSW (-275),DEN (+225)
4,2022,Western Conf First Round,Dallas Mavericks (4),Utah Jazz (5),UTA (-300),DAL (+245)
...,...,...,...,...,...,...
893,Yr,Series,Team,Team,Favorite,Underdog
911,Yr,Series,Team,Team,Favorite,Underdog
933,Yr,Series,Team,Team,Favorite,Underdog
951,Yr,Series,Team,Team,Favorite,Underdog


In [9]:
new_df.to_csv('data/team/playoffs_history_cleaned.csv')

In [10]:
finals = new_df[new_df[('Unnamed: 2_level_0', 'Series')] == 'Finals']
finals

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 2_level_0,Winner,Loser,Unnamed: 11_level_0,Unnamed: 12_level_0
Unnamed: 0_level_1,Yr,Series,Team,Team,Favorite,Underdog
30,2021,Finals,Milwaukee Bucks (3),Phoenix Suns (2),PHO (-180),MIL (+160)
46,2020,Finals,Los Angeles Lakers (1),Miami Heat (5),LAL (-350),MIA (+290)
62,2019,Finals,Toronto Raptors (2),Golden State Warriors (1),GSW (-270),TOR (+230)
78,2018,Finals,Golden State Warriors (2),Cleveland Cavaliers (4),GSW (-1075),CLE (+688)
94,2017,Finals,Golden State Warriors (1),Cleveland Cavaliers (2),GSW (-300),CLE (+250)
110,2016,Finals,Cleveland Cavaliers (1),Golden State Warriors (1),GSW (-220),CLE (+180)
126,2015,Finals,Golden State Warriors (1),Cleveland Cavaliers (2),GSW (-220),CLE (+190)
142,2014,Finals,San Antonio Spurs (1),Miami Heat (2),SAS (-155),MIA (+135)
158,2013,Finals,Miami Heat (1),San Antonio Spurs (2),MIA (-220),SAS (+180)
174,2012,Finals,Miami Heat (2),Oklahoma City Thunder (2),OKC (-175),MIA (+155)


In [11]:
finals.to_csv('data/team/playoffs_history_finals.csv')

In [12]:
fin_df = pd.read_csv('data/team/playoffs_history_finals.csv', skiprows = [0])

In [13]:
fin_df = fin_df.rename(columns={'Team': 'Winning_team', 'Team.1' : 'Losing_team'})
fin_df

Unnamed: 0.1,Unnamed: 0,Yr,Series,Winning_team,Losing_team,Favorite,Underdog
0,30,2021,Finals,Milwaukee Bucks (3),Phoenix Suns (2),PHO (-180),MIL (+160)
1,46,2020,Finals,Los Angeles Lakers (1),Miami Heat (5),LAL (-350),MIA (+290)
2,62,2019,Finals,Toronto Raptors (2),Golden State Warriors (1),GSW (-270),TOR (+230)
3,78,2018,Finals,Golden State Warriors (2),Cleveland Cavaliers (4),GSW (-1075),CLE (+688)
4,94,2017,Finals,Golden State Warriors (1),Cleveland Cavaliers (2),GSW (-300),CLE (+250)
5,110,2016,Finals,Cleveland Cavaliers (1),Golden State Warriors (1),GSW (-220),CLE (+180)
6,126,2015,Finals,Golden State Warriors (1),Cleveland Cavaliers (2),GSW (-220),CLE (+190)
7,142,2014,Finals,San Antonio Spurs (1),Miami Heat (2),SAS (-155),MIA (+135)
8,158,2013,Finals,Miami Heat (1),San Antonio Spurs (2),MIA (-220),SAS (+180)
9,174,2012,Finals,Miami Heat (2),Oklahoma City Thunder (2),OKC (-175),MIA (+155)


In [14]:
fin_df = fin_df.drop(columns = ['Unnamed: 0'])

In [15]:
# split Favorite column by ' '
fin_df['Favorite'] = fin_df['Favorite'].str.split(' ').str[0]
fin_df['Underdog'] = fin_df['Underdog'].str.split(' ').str[0] 

In [16]:
fin_df['Winning_team'] = fin_df['Winning_team'].str.split('(').str[0]
fin_df['Winning_team'] = fin_df['Winning_team'].str[:-1]
fin_df['Losing_team'] = fin_df['Losing_team'].str.split('(').str[0]
fin_df['Losing_team'] = fin_df['Losing_team'].str[:-1]
fin_df

Unnamed: 0,Yr,Series,Winning_team,Losing_team,Favorite,Underdog
0,2021,Finals,Milwaukee Bucks,Phoenix Suns,PHO,MIL
1,2020,Finals,Los Angeles Lakers,Miami Heat,LAL,MIA
2,2019,Finals,Toronto Raptors,Golden State Warriors,GSW,TOR
3,2018,Finals,Golden State Warriors,Cleveland Cavaliers,GSW,CLE
4,2017,Finals,Golden State Warriors,Cleveland Cavaliers,GSW,CLE
5,2016,Finals,Cleveland Cavaliers,Golden State Warriors,GSW,CLE
6,2015,Finals,Golden State Warriors,Cleveland Cavaliers,GSW,CLE
7,2014,Finals,San Antonio Spurs,Miami Heat,SAS,MIA
8,2013,Finals,Miami Heat,San Antonio Spurs,MIA,SAS
9,2012,Finals,Miami Heat,Oklahoma City Thunder,OKC,MIA


In [17]:
def teamname_to_abb(teamname):
    if teamname == 'New York Knicks':
        return 'NYK'
    elif teamname == 'Los Angeles Lakers':
        return 'LAL'
    elif teamname == 'Boston Celtics':
        return 'BOS'
    elif teamname == 'Philadelphia 76ers':
        return 'PHI'
    elif teamname == 'Toronto Raptors':
        return 'TOR'
    elif teamname == 'Chicago Bulls':
        return 'CHI'
    elif teamname == 'Oklahoma City Thunder':
        return 'OKC'
    elif teamname == 'Charlotte Hornets':
        return 'CHO'
    elif teamname == 'Detroit Pistons':
        return 'DET'
    elif teamname == 'Indiana Pacers':
        return 'IND'
    elif teamname == 'Milwaukee Bucks':
        return 'MIL'
    elif teamname == 'Washington Wizards':
        return 'WAS'
    elif teamname == 'Orlando Magic':
        return 'ORL'
    elif teamname == 'Miami Heat':
        return 'MIA'
    elif teamname == 'New Orleans Pelicans':
        return 'NOP'
    elif teamname == 'Atlanta Hawks':
        return 'ATL'
    elif teamname == 'Charlotte Bobcats':
        return 'CHA'
    elif teamname == 'Cleveland Cavaliers':
        return 'CLE'
    elif teamname == 'Dallas Mavericks':
        return 'DAL'
    elif teamname == 'Denver Nuggets':
        return 'DEN'
    elif teamname == 'Golden State Warriors':
        return 'GSW'
    elif teamname == 'Houston Rockets':
        return 'HOU'
    elif teamname == 'Memphis Grizzlies':
        return 'MEM'
    elif teamname == 'Minnesota Timberwolves':
        return 'MIN'
    elif teamname == 'New Jersey Nets':
        return 'NJN'
    elif teamname == 'Oklahoma City Thunder':
        return 'OKC'
    elif teamname == 'Portland Trail Blazers':
        return 'POR'

In [18]:
fin_df['Winner_abv'] = fin_df.apply(lambda row: teamname_to_abb(row['Winning_team']), axis=1)
fin_df['Loser_abv'] = fin_df.apply(lambda row: teamname_to_abb(row['Losing_team']), axis=1)
fin_df

Unnamed: 0,Yr,Series,Winning_team,Losing_team,Favorite,Underdog,Winner_abv,Loser_abv
0,2021,Finals,Milwaukee Bucks,Phoenix Suns,PHO,MIL,MIL,
1,2020,Finals,Los Angeles Lakers,Miami Heat,LAL,MIA,LAL,MIA
2,2019,Finals,Toronto Raptors,Golden State Warriors,GSW,TOR,TOR,GSW
3,2018,Finals,Golden State Warriors,Cleveland Cavaliers,GSW,CLE,GSW,CLE
4,2017,Finals,Golden State Warriors,Cleveland Cavaliers,GSW,CLE,GSW,CLE
5,2016,Finals,Cleveland Cavaliers,Golden State Warriors,GSW,CLE,CLE,GSW
6,2015,Finals,Golden State Warriors,Cleveland Cavaliers,GSW,CLE,GSW,CLE
7,2014,Finals,San Antonio Spurs,Miami Heat,SAS,MIA,,MIA
8,2013,Finals,Miami Heat,San Antonio Spurs,MIA,SAS,MIA,
9,2012,Finals,Miami Heat,Oklahoma City Thunder,OKC,MIA,MIA,OKC


In [19]:
fin_df.to_csv('data/team/cleaned_finals_results.csv')

In [20]:
fin_df['predicted?'] = np.where(fin_df['Winner_abv'] == fin_df['Favorite'], 1, 0)


In [21]:
fin_df

Unnamed: 0,Yr,Series,Winning_team,Losing_team,Favorite,Underdog,Winner_abv,Loser_abv,predicted?
0,2021,Finals,Milwaukee Bucks,Phoenix Suns,PHO,MIL,MIL,,0
1,2020,Finals,Los Angeles Lakers,Miami Heat,LAL,MIA,LAL,MIA,1
2,2019,Finals,Toronto Raptors,Golden State Warriors,GSW,TOR,TOR,GSW,0
3,2018,Finals,Golden State Warriors,Cleveland Cavaliers,GSW,CLE,GSW,CLE,1
4,2017,Finals,Golden State Warriors,Cleveland Cavaliers,GSW,CLE,GSW,CLE,1
5,2016,Finals,Cleveland Cavaliers,Golden State Warriors,GSW,CLE,CLE,GSW,0
6,2015,Finals,Golden State Warriors,Cleveland Cavaliers,GSW,CLE,GSW,CLE,1
7,2014,Finals,San Antonio Spurs,Miami Heat,SAS,MIA,,MIA,0
8,2013,Finals,Miami Heat,San Antonio Spurs,MIA,SAS,MIA,,1
9,2012,Finals,Miami Heat,Oklahoma City Thunder,OKC,MIA,MIA,OKC,0


#### Note: Since 2003, the favorite (of the two finalists) has only won the finals 7/19 times. Interesting.