In [19]:
# From Ben Kite - https://github.com/BenKite/football_data/blob/master/profootballReferenceScrape.py

import os
import re

import pandas
import requests, bs4

## Provides a list of the html tables that can be found at the url
## provided.  The order in the list returned should reflect the order
## that the tables appear.  On pro-football-reference.com, these names
## usually indicate what information they contain.
def findTables(url):
    """Return the ids of the html tables found in the page's content div.

    The page at ``url`` is downloaded and the html comment markers
    ("<!--" and "-->") are stripped so that tables wrapped in comments are
    visible to the parser.  Inside ``<div id="content">`` every ``div``
    whose id starts with "all" is inspected, and the id of the first table
    in it that carries an id is collected.

    Parameters
    ----------
    url : str
        Address of the page to inspect.

    Returns
    -------
    list of str
        Table ids in document order.  Empty when the page has no content
        div or none of the inspected divs holds a table with an id.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    ## A timeout keeps a stalled connection from hanging the notebook.
    res = requests.get(url, timeout = 30)
    res.raise_for_status()
    comm = re.compile("<!--|-->")
    soup = bs4.BeautifulSoup(comm.sub("", res.text), 'html.parser')
    content = soup.find('div', id = "content")
    if content is None:
        return []
    ids = []
    for div in content.find_all("div", id = re.compile("^all")):
        ## Read the id attribute directly rather than slicing the tag's
        ## string form, which breaks when "id" is not the last attribute.
        for table in div.find_all("table"):
            table_id = table.get("id")
            if table_id:
                ids.append(table_id)
                break
    return ids
## For example:
## findTables("http://www.pro-football-reference.com/boxscores/201702050atl.htm")


## Pulls a table (indicated by tableID, which can be identified with
## "findTables") from the specified url. The header option determines
## if the function should try to determine the column names and put
## them in the returned data frame. The default for header is True.
## If you get an index error for data_header, try specifying header =
## False. I will include a generated error message for that soon.
def pullTable(url, tableID, header = True):
    """Download one html table and return it as a pandas DataFrame.

    Every ``tr`` of the table becomes one row made of the text of its
    ``th``/``td`` cells.  When ``header`` is true, the cells of the first
    row of the table's ``thead`` are used as column names, and rows whose
    first cell repeats the first column name (header rows repeated inside
    the table body) are dropped.

    Parameters
    ----------
    url : str
        Address of the page holding the table.
    tableID : str
        Value of the table's ``id`` attribute (see ``findTables``).
    header : bool, default True
        Whether to take column names from the table header.

    Returns
    -------
    pandas.DataFrame
        The table contents as text, with a fresh 0..n-1 index.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    IndexError
        If no table with ``tableID`` exists, or if ``header`` is true and
        the first header row has fewer cells than the data has columns.
        In the latter case calling again with ``header = False`` works.
    """
    ## A timeout keeps a stalled connection from hanging the notebook.
    res = requests.get(url, timeout = 30)
    res.raise_for_status()
    ## Work around comments
    comm = re.compile("<!--|-->")
    soup = bs4.BeautifulSoup(comm.sub("", res.text), 'html.parser')
    table = soup.find('table', id = tableID)
    if table is None:
        raise IndexError("No table with id '%s' was found at %s"
                         % (tableID, url))
    game_data = [[cell.getText() for cell in row.find_all(['th', 'td'])]
                 for row in table.find_all('tr')]
    data = pandas.DataFrame(game_data)
    if header:
        thead = table.find('thead')
        header_rows = thead.find_all("tr") if thead is not None else []
        header_cells = header_rows[0].find_all("th") if header_rows else []
        n_columns = len(data.columns)
        ## IndexError is kept as the error type so that callers which
        ## already fall back to header = False keep working.
        if len(header_cells) < n_columns:
            raise IndexError(
                "Table '%s' at %s has %d header cells for %d data columns; "
                "try header = False"
                % (tableID, url, len(header_cells), n_columns))
        column_names = [header_cells[i].getText() for i in range(n_columns)]
        data.columns = column_names
        if column_names:
            ## Positional lookup: a label lookup would return several
            ## columns when the first column name is duplicated.
            data = data.loc[data.iloc[:, 0] != column_names[0]]
    data = data.reset_index(drop = True)
    return data


In [20]:
# List the ids of the tables available on the 2002 season page.
findTables("https://www.pro-football-reference.com/years/2002/")

['AFC',
 'NFC',
 'playoff_results',
 'team_stats',
 'passing',
 'rushing',
 'returns',
 'kicking',
 'team_scoring',
 'team_conversions',
 'drives']

In [23]:
# Save the conference standings and playoff tables of every season as
# <year>/<table id>.csv.
conf_table_names = ['AFC', 'NFC', 'playoff_results']
YEAR_START = 2002
YEAR_END = 2021  # exclusive: the last season scraped is YEAR_END - 1
for year in range(YEAR_START, YEAR_END):
    url = "https://www.pro-football-reference.com/years/" + str(year)
    # to_csv does not create missing folders, so make the season folder first.
    os.makedirs(str(year), exist_ok = True)
    for conf_entr in conf_table_names:
        # The 2020 playoff table is skipped (presumably it was not available
        # when this was written -- TODO confirm).  "continue" keeps the skip
        # correct even if the table list is reordered.
        if (year == 2020 and conf_entr == 'playoff_results'):
            continue
        dest = str(year) + "/" + conf_entr
        data = pullTable(url, conf_entr)
        data.to_csv(dest + ".csv")
  

In [26]:
# Print the table ids found on each per-statistic page of the 2002 season,
# one list per page, in the order given below.
stat_page_urls = (
    "https://www.pro-football-reference.com/years/2002/passing.htm",
    "https://www.pro-football-reference.com/years/2002/rushing.htm",
    "https://www.pro-football-reference.com/years/2002/receiving.htm",
    "https://www.pro-football-reference.com/years/2002/defense.htm",
    "https://www.pro-football-reference.com/years/2002/kicking.htm",
    "https://www.pro-football-reference.com/years/2002/returns.htm",
    "https://www.pro-football-reference.com/years/2002/games.htm",
    "https://www.pro-football-reference.com/years/2002/coaches.htm",
)
for stat_page_url in stat_page_urls:
    print(findTables(stat_page_url))

['passing']
['rushing']
['receiving']
['defense']
['kicking']
['returns']
['games']
['coaches']


In [30]:
# Save every per-statistic table of every season as <year>/<table id>.csv.
table_names = ['passing', 'rushing', 'receiving', 'defense', 'kicking', 'returns', 'games','coaches']
for year in range(YEAR_START, YEAR_END):
    url = "https://www.pro-football-reference.com/years/" + str(year) + "/"
    # to_csv does not create missing folders, so make the season folder first.
    os.makedirs(str(year), exist_ok = True)
    for table_entr in table_names:
        page_url = url + table_entr + ".htm"
        print(page_url)
        dest = str(year) + "/" + table_entr
        try:
            data = pullTable(page_url, table_entr)
        except Exception:
            # Fall back to a header-less frame when the column names cannot
            # be read.  "Exception" rather than a bare except, so that
            # interrupting the kernel still stops the loop.
            data = pullTable(page_url, table_entr, header = False)
        data.to_csv(dest + ".csv")

https://www.pro-football-reference.com/years/2002/passing.htm
https://www.pro-football-reference.com/years/2002/rushing.htm
https://www.pro-football-reference.com/years/2002/receiving.htm
https://www.pro-football-reference.com/years/2002/defense.htm
https://www.pro-football-reference.com/years/2002/kicking.htm
https://www.pro-football-reference.com/years/2002/returns.htm
https://www.pro-football-reference.com/years/2002/games.htm
https://www.pro-football-reference.com/years/2002/coaches.htm
https://www.pro-football-reference.com/years/2003/passing.htm
https://www.pro-football-reference.com/years/2003/rushing.htm
https://www.pro-football-reference.com/years/2003/receiving.htm
https://www.pro-football-reference.com/years/2003/defense.htm
https://www.pro-football-reference.com/years/2003/kicking.htm
https://www.pro-football-reference.com/years/2003/returns.htm
https://www.pro-football-reference.com/years/2003/games.htm
https://www.pro-football-reference.com/years/2003/coaches.htm
https://

https://www.pro-football-reference.com/years/2018/returns.htm
https://www.pro-football-reference.com/years/2018/games.htm
https://www.pro-football-reference.com/years/2018/coaches.htm
https://www.pro-football-reference.com/years/2019/passing.htm
https://www.pro-football-reference.com/years/2019/rushing.htm
https://www.pro-football-reference.com/years/2019/receiving.htm
https://www.pro-football-reference.com/years/2019/defense.htm
https://www.pro-football-reference.com/years/2019/kicking.htm
https://www.pro-football-reference.com/years/2019/returns.htm
https://www.pro-football-reference.com/years/2019/games.htm
https://www.pro-football-reference.com/years/2019/coaches.htm
https://www.pro-football-reference.com/years/2020/passing.htm
https://www.pro-football-reference.com/years/2020/rushing.htm
https://www.pro-football-reference.com/years/2020/receiving.htm
https://www.pro-football-reference.com/years/2020/defense.htm
https://www.pro-football-reference.com/years/2020/kicking.htm
https://

In [31]:
# List the ids of the tables available on the stadiums page.
findTables("https://www.pro-football-reference.com/stadiums")

['stadiums']

In [32]:
# Download the table with id "stadiums" and write it to stadiums.csv.
pullTable("https://www.pro-football-reference.com/stadiums", "stadiums").to_csv("stadiums.csv")

In [33]:
# List the ids of the tables available on the AP NFL MVP award page.
findTables("https://www.pro-football-reference.com/awards/ap-nfl-mvp-award.htm")

['awards']

In [36]:
# List the ids of the tables available on the AP offensive player of the year page.
findTables("https://www.pro-football-reference.com/awards/ap-offensive-player-of-the-year.htm")

['awards']

In [38]:
# List the ids of the tables available on the AP defensive player of the year page.
findTables("https://www.pro-football-reference.com/awards/ap-defensive-player-of-the-year.htm")

['awards']

In [39]:
# Download the "awards" table of each award page and write it to its own csv file.
award_exports = (
    ("https://www.pro-football-reference.com/awards/ap-offensive-player-of-the-year.htm", "OPOY.csv"),
    ("https://www.pro-football-reference.com/awards/ap-defensive-player-of-the-year.htm", "DPOY.csv"),
    ("https://www.pro-football-reference.com/awards/ap-nfl-mvp-award.htm", "MVP.csv"),
)
for award_url, award_csv in award_exports:
    award_table = pullTable(award_url, "awards")
    award_table.to_csv(award_csv)

In [40]:
# List the ids of the tables available on the 2020 draft page.
findTables("https://www.pro-football-reference.com/years/2020/draft.htm")

['drafts']

In [43]:
# Save the draft table of every season as <year>/draft.csv.
for year in range(YEAR_START, YEAR_END):
    url = "https://www.pro-football-reference.com/years/" + str(year) + "/draft.htm"
    # to_csv does not create missing folders, so make the season folder first.
    os.makedirs(str(year), exist_ok = True)
    dest = str(year) + "/" + "draft"
    try:
        data = pullTable(url, "drafts")
    except Exception:
        # Fall back to a header-less frame when the column names cannot be
        # read.  "Exception" rather than a bare except, so that interrupting
        # the kernel still stops the loop.
        data = pullTable(url, "drafts", header = False)
    data.to_csv(dest + ".csv")