In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import re
import requests

In [2]:
start = 2020
stop = 2021

# Let's use a for loop to secure the data from 2004 to today

In [3]:
for year in range(start,stop,1):
    site = 'https://www.basketball-reference.com/leagues/NBA_{}.html'
    tableID = 'team-stats-per_game'
    
    comm = re.compile("<!--|-->")
    url= site.format(year)
    file = "{}.csv"
    
    html = requests.get(url).text
    cleaned_soup = BeautifulSoup(re.sub("<!--|-->","", str(html)),'lxml')
    
    tableStats = cleaned_soup.find('table', {'id':tableID})
    
    headers = [th.getText() for th in tableStats.findAll('tr')[0].findAll('th')]
    headers = headers[1:]
    rows = tableStats.findAll('tr')[1:]
    stats = [[td.getText() for td in rows[i].findAll('td')]
                for i in range(len(rows))]
        
    stats = pd.DataFrame(stats, columns = headers)
    
    for col in stats:
        if col != 'Team':
            stats[col] = stats[col].rank()
            
    stats["Playoffs"] = ["1" if "*"  in ele else "0" for ele in stats["Team"]]
    
    if year == start:
        overallstats = stats
    if year != start:
        overallstats = overallstats.append(stats, ignore_index=True)

# So, now let's look at what stats we scraped:

In [4]:
overallstats

Unnamed: 0,Team,G,MP,FG,FGA,FG%,3P,3PA,3P%,2P,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Playoffs
0,Dallas Mavericks,31.0,22.0,22.0,22.0,15.5,29.0,30.0,22.0,4.0,...,12.5,28.0,28.0,18.0,4.0,15.5,3.5,7.0,29.0,0
1,Milwaukee Bucks*,26.5,3.5,31.0,29.0,29.0,26.0,28.0,13.0,24.5,...,26.5,31.0,31.0,25.0,10.5,29.0,22.0,5.5,31.0,1
2,Houston Rockets*,17.5,13.0,20.0,25.0,9.0,30.0,31.0,9.0,2.0,...,5.0,14.5,16.0,2.0,28.5,20.0,12.5,24.5,30.0,1
3,Portland Trail Blazers,30.0,8.0,25.0,26.0,19.0,21.0,16.0,29.0,20.0,...,3.5,20.5,22.0,1.0,3.0,30.0,5.0,23.0,25.0,0
4,Los Angeles Clippers*,17.5,8.0,21.0,18.5,20.0,18.0,15.0,24.0,18.5,...,15.5,29.0,29.0,10.0,8.0,15.5,17.0,27.0,28.0,1
5,New Orleans Pelicans,17.5,23.0,28.0,31.0,18.0,27.5,26.0,25.5,15.0,...,19.0,24.0,27.0,29.0,15.5,18.0,30.0,18.0,27.0,0
6,Washington Wizards,17.5,8.0,24.0,27.5,15.5,16.5,14.0,25.5,23.0,...,1.5,2.0,1.5,20.5,22.0,7.0,9.0,30.0,26.0,0
7,Phoenix Suns,26.5,8.0,17.5,7.5,21.0,10.0,10.0,15.0,22.0,...,29.0,8.5,9.0,31.0,19.5,2.0,20.0,29.0,20.0,0
8,Memphis Grizzlies,26.5,3.5,30.0,27.5,23.0,6.0,7.0,7.5,30.0,...,6.5,27.0,25.0,30.0,21.0,26.0,24.0,19.0,18.0,0
9,Miami Heat*,26.5,31.0,5.0,1.0,25.0,25.0,22.0,31.0,3.0,...,22.0,25.0,13.0,26.5,12.5,9.0,18.5,13.0,17.0,1


# Now let's clean up the data for just what stats matter

In [5]:
overallstats = overallstats[~overallstats['Team'].str.contains("League Average")]
overallstats = overallstats[['3P%','2P%','DRB','AST','STL','BLK','TOV','PF','Playoffs']]

In [6]:
overallstats

Unnamed: 0,3P%,2P%,DRB,AST,STL,BLK,TOV,PF,Playoffs
0,22.0,28.0,28.0,18.0,4.0,15.5,3.5,7.0,0
1,13.0,31.0,31.0,25.0,10.5,29.0,22.0,5.5,1
2,9.0,29.0,14.5,2.0,28.5,20.0,12.5,24.5,1
3,29.0,7.0,20.5,1.0,3.0,30.0,5.0,23.0,0
4,24.0,13.0,29.0,10.0,8.0,15.5,17.0,27.0,1
5,25.5,17.0,24.0,29.0,15.5,18.0,30.0,18.0,0
6,25.5,5.5,2.0,20.5,22.0,7.0,9.0,30.0,0
7,15.0,21.0,8.5,31.0,19.5,2.0,20.0,29.0,0
8,7.5,24.5,27.0,30.0,21.0,26.0,24.0,19.0,0
9,31.0,24.5,25.0,26.5,12.5,9.0,18.5,13.0,1


In [7]:
overallstats.to_csv(f"statsfor{start}to{stop}.csv",index=False)