# NBA Stats Dataset

### Web Scraping from Basketball Reference

In [145]:
import re
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Open a Chrome session, load the 2020 per-game page and capture the rendered HTML.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    driver.get("https://www.basketball-reference.com/leagues/NBA_2020_per_game.html")
    html = driver.page_source
finally:
    # quit() ends the whole WebDriver session (browser and driver process) even if
    # the page load fails; close() only closes the current window.
    driver.quit()

# Parse every <table> found in the page. The HTML is wrapped in StringIO because
# passing a literal HTML string to read_html is deprecated in recent pandas.
tables = pd.read_html(StringIO(html))

# NOTE(review): 18 is a positional index into the parsed tables, presumably the
# per-game stats table; it will break if the page layout changes. Confirm.
df = tables[18]

### Cleaning and Optimizing the data

In [140]:
# Remove the header rows repeated inside the table body: those rows hold the
# literal column name ('GS') in the GS column. A plain != comparison also copes
# with missing or non-string values, where ~Series.str.match() would raise.
df = df.loc[df['GS'] != 'GS']

# Keep one row per player: the first one listed.
# NOTE(review): this relies on the season-total line being listed first for
# players who were traded; confirm against the source table.
df = df.drop_duplicates(subset=['Player'], keep='first')

# Keep only the columns used downstream (the index is left as-is, not reset)
df = df[['Player', 'FG%', '3P%', 'FT%', 'PTS', 'AST', 'TRB', 'BLK', 'STL']]

# Replace NaN values with zero
df = df.fillna(0)

# Player names as text, every statistic as float
stat_columns = ['FG%', '3P%', 'FT%', 'TRB', 'AST', 'STL', 'BLK', 'PTS']
df = df.astype({'Player': str, **{col: float for col in stat_columns}})

# Convert the shooting fractions (0-1) to percentages (0-100)
percentage_columns = ['FG%', '3P%', 'FT%']
df[percentage_columns] = df[percentage_columns] * 100

# header expects a bool (or a list of aliases); the string 'true' only worked
# because any non-empty string is truthy.
df.to_csv(r'nba_2020_stats.csv', index=False, encoding='utf-8', header=True)

# df_stats is another name for the same DataFrame object (no copy is made), so
# columns added to df afterwards are also visible through df_stats.
df_stats = df
df_stats

Unnamed: 0,Player,FG%,3P%,FT%,PTS,AST,TRB,BLK,STL
0,Steven Adams,59.2,33.3,58.2,10.9,2.3,9.3,1.1,0.8
1,Bam Adebayo,55.7,14.3,69.1,15.9,5.1,10.2,1.3,1.1
2,LaMarcus Aldridge,49.3,38.9,82.7,18.9,2.4,7.4,1.6,0.7
3,Kyle Alexander,50.0,0.0,0.0,1.0,0.0,1.5,0.0,0.0
4,Nickeil Alexander-Walker,36.8,34.6,67.6,5.7,1.9,1.8,0.2,0.4
...,...,...,...,...,...,...,...,...,...
672,Trae Young,43.7,36.1,86.0,29.6,9.3,4.3,0.1,1.1
673,Cody Zeller,52.4,24.0,68.2,11.1,1.5,7.1,0.4,0.7
674,Tyler Zeller,25.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0
675,Ante Žižić,56.9,0.0,73.7,4.4,0.3,3.0,0.2,0.3


# NBA All-Star List Dataset

### Web Scraping from Basketball Reference

In [146]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Open a Chrome session, load the 2020 All-Star page and capture the rendered HTML.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    driver.get("https://www.basketball-reference.com/allstar/NBA_2020.html")
    html = driver.page_source
finally:
    # quit() ends the whole WebDriver session (browser and driver process) even if
    # the page load fails; close() only closes the current window.
    driver.quit()

# Parse every <table> found in the page. The HTML is wrapped in StringIO because
# passing a literal HTML string to read_html is deprecated in recent pandas.
tables1 = pd.read_html(StringIO(html))

### Cleaning and Optimizing the data

In [148]:
# Put the two roster tables side by side, aligned on their row index.
# NOTE(review): 19 and 20 are positional indexes into the parsed tables,
# presumably the two team box scores; confirm if the page layout changes.
result = pd.concat([tables1[19], tables1[20]], axis=1, join="inner")

# The tables have a two-level column header; drop the outer level
result = result.droplevel(0, axis=1)

# Both tables contribute a 'Starters' column, so this keeps two columns of names
result = result[['Starters']]

# Drop the rows that are section labels rather than players, then any row with
# a missing name in either column
label_rows = result.isin(["Reserves", "Team Totals"]).any(axis=1)
result = result[~label_rows].dropna()

# Renumber the rows 0..n-1 and name the two roster columns.
# NOTE(review): the displayed output lists Rudy Gobert and Donovan Mitchell under
# 'East', so these labels may not describe the actual teams. Confirm before
# relying on them; only the combined player list is used downstream.
result = result.reset_index(drop=True)
result.columns = ["West", "East"]

# header expects a bool (or a list of aliases), not the string 'true'
result.to_csv(r'nba_2020_all_stars.csv', index=False, encoding='utf-8', header=True)

# Create the NBA 2020 All-Star list: second roster column first, then the first
east = result['East'].values.tolist()
west = result['West'].values.tolist()
players = east + west

players

['Giannis Antetokounmpo',
 'Kemba Walker',
 'Joel Embiid',
 'Pascal Siakam',
 'Trae Young',
 'Kyle Lowry',
 'Khris Middleton',
 'Rudy Gobert',
 'Donovan Mitchell',
 'Jimmy Butler',
 'Bam Adebayo',
 'Brandon Ingram',
 'Kawhi Leonard',
 'Anthony Davis',
 'LeBron James',
 'James Harden',
 'Luka Dončić',
 'Ben Simmons',
 'Russell Westbrook',
 'Chris Paul',
 'Devin Booker',
 'Domantas Sabonis',
 'Jayson Tatum',
 'Nikola Jokić']

In [141]:
# Build one alternation of the All-Star names. re.escape keeps any regex
# metacharacter in a name literal. Case-insensitivity is requested through
# case=False instead of a '(?i)' flag inside every alternative, which Python 3.11+
# rejects ("global flags not at the start of the expression").
pattern = '|'.join(re.escape(name) for name in players)

if players:
    # 1 when the player name contains any All-Star name (case-insensitive), else 0
    is_all_star = df.Player.str.contains(pattern, case=False, regex=True, na=False)
    df['All-Star'] = is_all_star.astype(int)
else:
    # An empty pattern would match every row, so flag nobody instead
    df['All-Star'] = 0

# The column is added to the existing df object in place (no new frame is created)
df

Unnamed: 0,Player,FG%,3P%,FT%,PTS,AST,TRB,BLK,STL,All-Star
0,Steven Adams,59.2,33.3,58.2,10.9,2.3,9.3,1.1,0.8,0
1,Bam Adebayo,55.7,14.3,69.1,15.9,5.1,10.2,1.3,1.1,1
2,LaMarcus Aldridge,49.3,38.9,82.7,18.9,2.4,7.4,1.6,0.7,0
3,Kyle Alexander,50.0,0.0,0.0,1.0,0.0,1.5,0.0,0.0,0
4,Nickeil Alexander-Walker,36.8,34.6,67.6,5.7,1.9,1.8,0.2,0.4,0
...,...,...,...,...,...,...,...,...,...,...
672,Trae Young,43.7,36.1,86.0,29.6,9.3,4.3,0.1,1.1,1
673,Cody Zeller,52.4,24.0,68.2,11.1,1.5,7.1,0.4,0.7,0
674,Tyler Zeller,25.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0
675,Ante Žižić,56.9,0.0,73.7,4.4,0.3,3.0,0.2,0.3,0


# NBA Salaries Dataset

In [142]:
# Import the salaries data and keep only the rows whose season is 2020
df_salaries = pd.read_csv('nba-salaries.csv')
df_salaries = df_salaries.loc[df_salaries['season'] == 2020]

# Drop the columns that are not needed (the index is left as-is, not reset)
df_salaries = df_salaries.drop(columns=['rank', 'position', 'team', 'season'])

# Change column data types
df_salaries = df_salaries.astype({"name": str, "salary": int})

# The CSV is written before the rename below, so it keeps the original
# 'name' / 'salary' column names. header expects a bool, not the string 'true'.
df_salaries.to_csv(r'nba_2020_salaries.csv', index=False, encoding='utf-8', header=True)

# Rename the columns so 'Player' matches the key used in the stats table
df_salaries = df_salaries.rename(columns={'name': 'Player', 'salary': 'Salary'})

df_salaries

Unnamed: 0,Player,Salary
8928,Stephen Curry,40231758
8929,Chris Paul,38506482
8930,Russell Westbrook,38506482
8931,John Wall,38199000
8932,Kevin Durant,38199000
...,...,...
9451,Jimmer Fredette,208509
9452,Jontay Porter,197933
9453,Anthony Tolliver,183115
9454,Tyler Johnson,183115


### Merge the datasets

In [143]:
# Inner join: keep only players present in both the stats and the salary tables.
# The join key is stated explicitly instead of relying on pandas' default of
# joining on every column name the two frames happen to share.
df = pd.merge(df_stats, df_salaries, how='inner', on='Player').sort_values(by='Player')

df


Unnamed: 0,Player,FG%,3P%,FT%,PTS,AST,TRB,BLK,STL,All-Star,Salary
141,Aaron Gordon,43.7,30.8,67.4,14.4,3.7,7.7,0.6,0.8,0,19863636
174,Aaron Holiday,41.4,39.4,85.1,9.5,3.4,2.4,0.2,0.8,0,2329200
283,Abdel Nader,46.8,37.5,77.3,6.3,0.7,1.8,0.4,0.4,0,1618520
347,Admiral Schofield,38.0,31.1,66.7,3.0,0.5,1.4,0.1,0.2,0,1000000
180,Al Horford,45.0,35.0,76.3,11.9,4.0,6.8,0.9,0.8,0,28000000
...,...,...,...,...,...,...,...,...,...,...,...
122,Yogi Ferrell,42.0,30.4,85.7,4.4,1.4,1.0,0.1,0.4,0,3150000
84,Zach Collins,47.1,36.8,75.0,7.0,1.5,6.3,0.5,0.5,0,4240200
228,Zach LaVine,45.0,38.0,80.2,25.5,4.2,4.8,0.5,1.5,0,19500000
360,Zhaire Smith,27.3,0.0,50.0,1.1,0.3,0.3,0.0,0.4,0,3058800


In [144]:
# Export the merged stats / salary / All-Star table to a CSV file.
# header expects a bool (or a list of aliases); the string 'true' only worked
# because any non-empty string is truthy.
df.to_csv(r'nba_stats&salaries&all_star_status.csv', index=False, encoding='utf-8', header=True)