In [1]:
import pandas as pd
import numpy as np
import requests
import time

from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
from sklearn.dummy import DummyRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GridSearchCV,\
RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from operator import itemgetter
from math import sqrt

import seaborn as sns
from matplotlib import pyplot as plt

# Part 1 - Webscraping

1. Here we use Request to get access to the desired website
    - Side note, this took a little bit of time because I wasn't aware of headers and kept getting a 403 FORBIDDEN error. This video https://www.youtube.com/watch?v=6RfyXcf_vQo , helped me get through this
2. Then we use urlopen to get the raw html code
3. Finally we use BeautifulSoup to parse the html code and have it in a decipherable format
4. Inspect the variable soup to make sure it's working!


In [None]:
# Fetch the teams index page. A browser-like User-Agent header is sent with the request
# (see the note about the 403 FORBIDDEN error in the markdown above).
url = 'https://www.footballdb.com/teams/index.html'
req = Request(url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'}) # Make a get request to retrieve the page
# Use the response as a context manager so the connection is closed once the body is read.
with urlopen(req) as response:
    webpage = response.read()
soup = BeautifulSoup(webpage, 'html.parser') # Pass the page contents to beautiful soup for parsing
soup

# Part 1.1 - Team url's

We're going to use the soup from the previous part and extract each NFL team's url

1. The urls are located in a table, so we have to find the table body 'tbody' and then find all the table rows 'tr' where they're located
2. Unfortunately in all the '< a >' tags the href was only the latter part of the url
    - ie. '/teams/nfl/buffalo-bills/stats'
3. Once we have the latter part of the url, we add it to the base part of the url that every team website needs in order for us to later scrape information on each team.
    - ie. 'https://www.footballdb.com' + '/teams/nfl/buffalo-bills/stats'
4. Then we append it to a list that contains each teams url

In [None]:
"""
Getting all the teams stats pages, and appending them to a list for the next step.

Next step will involve going to each team's respective stats page and scraping each player url in passing, rushing, 
and receiving categories.

"""

# `soup` is the parsed teams index page from Part 1. (The previous `soup2` is only
# created much further down, so this cell failed on a fresh kernel.)
teams = soup.find('tbody').find_all('tr')

url_part_one = 'https://www.footballdb.com'

team_urls = []
for team in teams:
    anchors = team.find_all('a')
    # The href is read from the third anchor of the row, so the row must have at least three.
    if len(anchors) > 2:
        team_urls.append(url_part_one + anchors[2]['href'])

team_urls

# Part 1.2 - Player url's

We're using the list of team URLs to scrape the URLs of all of their players who have played offense

1. In order to retrieve each player's URL we have to do what we did in Part 1 to pull and parse each team's HTML
2. After we have the team's parsed html, we're going to find all of their tables that contain player stats
    - This happens to be the first three tables! Passing, Rushing, Receiving.
3. For each table we have to find all the 'span' tags that contained the 'hidden-xs' class which had each players url
4. Within the variable sub_table we have all the 'hidden-xs', so for example in the Passing table for each 'hidden-xs' class we're going to do the same as part 1.1 and scrape the 'href' of each one
5. We have the same problem as part 1.1 where the href is not complete, so before appending it to the list we have to add the base url and a specific ending so that we can just have their stats for the most recent year
    - 'https://www.footballdb.com' + '/players/josh-allen-allenjo06' + '/gamelogs/2022'
6. Then, before moving on to the next team, we have the function pause for five seconds, so that the website doesn't think anything fishy is going on and our IP does not get banned

In [None]:
# Module-level accumulator that team_urls_to_player_list() appends player game-log URLs to.
# Re-running this cell resets it to an empty list.
complete_players_url_list = []

In [None]:
def team_urls_to_player_list(list_of_team_urls):
    """Collect the 2022 game-log URL of every player linked from each team stats page.

    For each team URL the page is downloaded and parsed, and the player links found in
    ``span.hidden-xs`` elements of the first three tables are turned into full URLs of the
    form ``https://www.footballdb.com<href>/gamelogs/2022``. New URLs are appended to the
    module-level ``complete_players_url_list``; URLs already present are skipped.

    Parameters
    ----------
    list_of_team_urls : iterable of str
        Team stats-page URLs.

    Returns
    -------
    None
        Results are accumulated in ``complete_players_url_list``.

    Notes
    -----
    The function sleeps five seconds after every team and prints progress messages.
    """
    url_part_one = 'https://www.footballdb.com'
    url_part_two = '/gamelogs/2022'
    request_headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'}

    for team_url in list_of_team_urls:
        req = Request(team_url, headers=request_headers) # Make a get request to retrieve the page
        # Close the HTTP response as soon as the body has been read.
        with urlopen(req) as response:
            webpage = response.read()
        soup = BeautifulSoup(webpage, 'html.parser') # Pass the page contents to beautiful soup for parsing

        # Only the first three tables are scanned; slicing (instead of indexing 0..2)
        # keeps pages with fewer tables from raising IndexError.
        for table in soup.find_all('table')[:3]:
            for span in table.find_all('span', class_='hidden-xs'):
                link = span.find('a')
                href = link.get('href') if link is not None else None
                if not href:
                    # A span without a usable link carries no player URL.
                    continue
                player_url = url_part_one + href + url_part_two
                if player_url not in complete_players_url_list:
                    complete_players_url_list.append(player_url)
        print('Team retrieved')
        # Pause between teams to avoid hammering the site.
        time.sleep(5)
    print('All players urls retrieved!')

In [None]:
# Fill complete_players_url_list from every team page (sleeps five seconds per team).
team_urls_to_player_list(team_urls)

## Checking that all the players url's are there!

In [None]:
# Display the accumulated player game-log URLs.
complete_players_url_list

### Below is the process of iterating to see how to retrieve certain parts of the website

In [None]:
# Exploratory fetch of a single team stats page (Buffalo Bills).
url = 'https://www.footballdb.com/teams/nfl/buffalo-bills/stats'
req = Request(url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'}) # Make a get request to retrieve the page
# Use the response as a context manager so the connection is closed once the body is read.
with urlopen(req) as response:
    webpage = response.read()
soup = BeautifulSoup(webpage, 'html.parser') # Pass the page contents to beautiful soup for parsing
tables = soup.find_all('table')

In [None]:
# href of the first link inside the first 'hidden-xs' span of the first table.
tables[0].find('span',{'class':'hidden-xs'}).find('a')['href']

In [None]:
# Every span in the first table, to see which ones carry the player links.
tables[0].find_all('span')

In [None]:
# Prototype of the per-team extraction: build full game-log URLs from the player links
# in the first three tables, keeping each URL once.
url_part_one = 'https://www.footballdb.com'
url_part_two = '/gamelogs/2022'
players_list = []

for table_idx in range(3):
    sub_table = tables[table_idx].find_all('span', class_='hidden-xs')
    for span in sub_table:
        player_url = url_part_one + span.find('a')['href'] + url_part_two
        if player_url in players_list:
            continue
        players_list.append(player_url)
players_list

### Iterating on player's url to see how to extract certain information such as position
- Not every player website had the same format, so I had to figure out a method that would work no matter the formatting
- The solution was to only extract the text if the 'b' tag had the exact word 'Position'

In [None]:
# Exploratory fetch of one player's 2022 game-log page.
url1 = 'https://www.footballdb.com/players/laquon-treadwell-treadla01/gamelogs/2022'
req1 = Request(url1, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'}) # Make a get request to retrieve the page
# Use the response as a context manager so the connection is closed once the body is read.
with urlopen(req1) as response1:
    webpage1 = response1.read()
soup1 = BeautifulSoup(webpage1, 'html.parser') # Pass the page contents to beautiful soup for parsing
soup1

In [None]:
# All <b> tags inside the player banner; their texts are the field labels.
test = soup1.find('div', id='playerbanner').find_all('b')

[bold_tag.text for bold_tag in test]

In [None]:
# Take the text that follows the <b> tag whose label is exactly 'Position:'.
for bold_tag in test:
    if bold_tag.text == 'Position:':
        a = bold_tag.next.next.strip()
a

# Part 1.3 - Fetching row headers

- Here we went into Josh Allen's page and retrieved the relevant row headers that will be used later on in a dataframe
- Josh Allen had the relevant headers in his first three tables, so the code below won't work for all players url
- The headers were in the 'header right' class, so in each 'header right' we find all the table headers 'th' and then we append each one as a str to the row_headers list

In [None]:
# Fetch Josh Allen's 2022 game-log page; its tables supply the column headers.
url1 = 'https://www.footballdb.com/players/josh-allen-allenjo06/gamelogs/2022'
req1 = Request(url1, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'}) # Make a get request to retrieve the page
# Use the response as a context manager so the connection is closed once the body is read.
with urlopen(req1) as response1:
    webpage1 = response1.read()
soup1 = BeautifulSoup(webpage1, 'html.parser') # Pass the page contents to beautiful soup for parsing

jo_allen = soup1.find_all('table')

In [None]:
# Header texts of the 'header right' rows in the first three tables, in page order.
row_headers = [
    header_cell.text
    for table_idx in range(3)
    for header_row in jo_allen[table_idx].find_all('tr', class_='header right')
    for header_cell in header_row.find_all('th')
]
row_headers

In [None]:
# We're inserting 'Name' and 'Pos' at the front of the row_headers list because we know we're
# going to be scraping those for each player as well.
# The guard keeps the cell idempotent: re-running it must not insert the two headers again,
# which would shift every index used later on.
if row_headers[:2] != ['Name', 'Pos']:
    row_headers[:0] = ['Name', 'Pos']

In [None]:
# Print (index, header) pairs to see which position each header occupies.
test_e = enumerate(row_headers)
print([indexed_header for indexed_header in test_e])

# Part 1.4 - Scraping stats for each player

1. Here we created a function that takes a players soup and scrapes their name and position as well as all of their stats for Passing, Rushing, and Receiving
2. We create four blank lists,
    - player_stats to contain all of their game stats, ie. [[game1], [game2] ...]
    - pass_stats for their passing stats for each game
    - rush_stats for their rushing stats for each game
    - rec_stats for their receiving stats for each game
    
3. First we scrape the name and position of the player and store those as separate variables
4. Find  all the tables for the player
5. In each table we find all the table headers, and make a list of them
6. If the table header is Passing, Rushing, or Receiving then we go through each table row and find all the table data
    - ie. each table row is one game and for each game we want all the table data which is the game's statistics
7. If the player does not have one of the three aforementioned stats, then we populate that stat as 0's so that every game scraped will have the same total length
    - This is very important for when we turn this list of lists into a dataframe
8. Then at the end we combine each respective game's Passing, Rushing, and Receiving stat to the player_stats variable
    - ie. [pass gm 1] + [rush gm 1] + [rec gm 1] -> append to player_stats, and do this for all their games

In [None]:
def _table_game_rows(stat_table):
    """Return the cell texts of every ``tr`` in ``stat_table`` that contains ``td`` cells."""
    game_rows = []
    for body in stat_table.find_all('tbody'):
        for row in body.find_all('tr'):
            cells = row.find_all('td')
            if not cells:
                # Rows without td cells carry no game data.
                continue
            game_rows.append([cell.text for cell in cells])
    return game_rows


def players_table_to_stat_list(player_soup):
    """Turn one player's parsed game-log page into a list of per-game stat rows.

    The player's name is read from ``div.teamlabel`` and the position from the text that
    follows the ``<b>`` tag labelled ``'Position:'`` inside ``div#playerbanner``. Every
    table whose headers contain 'Passing', 'Rushing' or 'Receiving' (checked in that order,
    first match wins) contributes its data rows to the matching stat list.

    A stat category with no rows at all is padded with zero rows (20 passing, 12 rushing,
    14 receiving columns); the number of padding rows is the number of ``tr`` elements in
    the last table on the page.

    Parameters
    ----------
    player_soup : bs4.BeautifulSoup
        Parsed player game-log page.

    Returns
    -------
    list of list
        For each index present in all three stat lists, one row of the form
        ``[name, position] + passing + rushing + receiving``. Passing rows beyond the
        shortest of the three lists are returned as-is, without name/position. An empty
        list is returned when the page has no tables.

    Notes
    -----
    If no 'Position:' label is found the position is ``None`` (previously this raised an
    unbound-variable error).
    """
    player_name = player_soup.find('div', class_='teamlabel').text

    # Not every player page has the same banner layout, so look for the exact label.
    player_position = None
    for label in player_soup.find('div', id='playerbanner').find_all('b'):
        if label.text == 'Position:':
            player_position = label.next.next.strip()

    player_tables = player_soup.find_all('table')
    if not player_tables:
        # Nothing to scrape and no table to size the zero padding from.
        return []

    pass_stats = []  # 20 cols for players that have this table
    rush_stats = []  # 12 cols for players that have this table
    rec_stats = []   # 14 cols for players that have this table

    # Not all players have the same table structure: some have pass/rush/rec and some
    # only rush/rec, pass/rush, etc., so each table is routed by its header texts.
    for stat_table in player_tables:
        header_texts = [th.text for th in stat_table.find_all('th')]
        if 'Passing' in header_texts:
            pass_stats.extend(_table_game_rows(stat_table))
        elif 'Rushing' in header_texts:
            rush_stats.extend(_table_game_rows(stat_table))
        elif 'Receiving' in header_texts:
            rec_stats.extend(_table_game_rows(stat_table))

    # Missing categories are zero-filled so every merged game row has the same width.
    # The row count comes from the last table on the page (made explicit here; the old
    # code relied on the loop variable left over from the table loop).
    padding_row_count = sum(
        len(body.find_all('tr')) for body in player_tables[-1].find_all('tbody')
    )
    if not pass_stats:
        pass_stats = [[0] * 20 for _ in range(padding_row_count)]
    if not rush_stats:
        rush_stats = [[0] * 12 for _ in range(padding_row_count)]
    if not rec_stats:
        rec_stats = [[0] * 14 for _ in range(padding_row_count)]

    # Merge game-by-game; zip stops at the shortest list, which is exactly the set of
    # indices the former triple nested loop merged.
    player_stats = [
        [player_name, player_position] + passing + rushing + receiving
        for passing, rushing, receiving in zip(pass_stats, rush_stats, rec_stats)
    ]
    merged_count = len(player_stats)
    # Passing rows with no rushing/receiving counterpart are kept un-merged, as before.
    player_stats.extend(pass_stats[merged_count:])
    return player_stats

In [None]:
## Testing it out and making sure the function works
url2 = 'https://www.footballdb.com/players/laquon-treadwell-treadla01/gamelogs/2022'
req2 = Request(url2, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'}) # Make a get request to retrieve the page
# Use the response as a context manager so the connection is closed once the body is read.
with urlopen(req2) as response2:
    webpage2 = response2.read()
soup2 = BeautifulSoup(webpage2, 'html.parser')

In [None]:
# Testing it out and making sure the function works:
# run the scraper on the single player page parsed into soup2 and display the rows.
x = players_table_to_stat_list(soup2)
x

# Part 1.5 - Compiling every players stats

Every player in complete_players_url_list will have their stats scraped and stored in the complete_players_stats variable

1. We have a loop that goes through each player's URL and parses their HTML, following the same steps as in Part 1
2. Then the player's soup is fed into the function players_table_to_stat_list, and all of their games are stored into a variable called stats
3. These stats are then extended into complete_players_stats, so that each list represents one game and not a set of games per player (as would be the case if we were to append)
4. Then we have the function print out the URL of the player that was just added, the number of players scraped so far, and a little message letting us know it was successful.
5. Before moving to the next player we have the function pause for five seconds so that our IP does not get banned. This will take around an hour or two to fully run
6. Once it's all done, the function lets us know via a print!

In [None]:
# Module-level accumulator: team_players_url_to_stats_table() extends it with one list per game.
# Re-running this cell resets it to an empty list.
complete_players_stats = []

In [None]:
# Module-level log of the player URLs that team_players_url_to_stats_table() has processed.
players_url_retrieved = []

In [None]:
def team_players_url_to_stats_table(players_url_list):
    """Scrape every player's game-log page and accumulate the per-game stat rows.

    For each URL the page is downloaded, parsed, and passed to
    ``players_table_to_stat_list``; the returned rows are added to the module-level
    ``complete_players_stats`` and the URL is recorded in ``players_url_retrieved``.

    URLs already present in ``players_url_retrieved`` are skipped, so the function can be
    re-run after an interruption without re-downloading pages or duplicating rows.

    Parameters
    ----------
    players_url_list : iterable of str
        Player game-log URLs.

    Returns
    -------
    None
        Results are accumulated in ``complete_players_stats`` and ``players_url_retrieved``.

    Notes
    -----
    The function sleeps five seconds after every downloaded player and prints progress.
    """
    request_headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'}
    already_retrieved = set(players_url_retrieved)

    for url in players_url_list:
        if url in already_retrieved:
            # Scraped in an earlier (possibly interrupted) run; adding it again would
            # duplicate its games in complete_players_stats.
            continue

        req = Request(url, headers=request_headers) # Make a get request to retrieve the page
        # Close the HTTP response as soon as the body has been read.
        with urlopen(req) as response:
            webpage = response.read()
        soup = BeautifulSoup(webpage, 'html.parser')

        stats = players_table_to_stat_list(soup)

        # extend (not append) so each element of complete_players_stats is a single game.
        complete_players_stats.extend(stats)

        print(url)
        players_url_retrieved.append(url)
        already_retrieved.add(url)
        print(len(players_url_retrieved))
        print('Player Stats added!')
        print()

        # Pause between players to avoid hammering the site.
        time.sleep(5)

    print('All Players added!')

In [None]:
# Scrape every collected player URL (sleeps five seconds per player, so this is slow).
team_players_url_to_stats_table(complete_players_url_list)

### Sanity checks, making sure that the amount of players scraped matches

- Also checking the amount of games in the complete_players_stats

In [None]:
# Number of rows accumulated across all players.
len(complete_players_stats)

In [None]:
# Number of player URLs that were processed.
len(players_url_retrieved)

In [None]:
# Number of player URLs collected; should match the number processed above.
len(complete_players_url_list)

In [None]:
# if the length of the list is less than 48, then that means the player either did not play or was an irrelevant
# summary statistic such as their totals for the season.
# Testing random elements in the list
# The indices are hand-picked, so guard against a scrape that produced fewer rows
# instead of failing with an IndexError.
for sample_idx in (22, 10578):
    if sample_idx < len(complete_players_stats):
        print(len(complete_players_stats[sample_idx]))
    else:
        print('No row at index', sample_idx)

In [None]:
"""
Cleaning up the list to not include any list that does not have 48 elements/features.
"""
# 48 = Name + Pos + 20 passing + 12 rushing + 14 receiving columns.
complete_players_stats_cleaned = list(
    filter(lambda game_row: len(game_row) == 48, complete_players_stats)
)

In [None]:
# Confirming that the clean list has reduced entries, and only games where the player played.
print(len(complete_players_stats_cleaned))
# One 'error' line is printed per row that still has the wrong width.
wrong_width_rows = [game_row for game_row in complete_players_stats_cleaned if len(game_row) != 48]
for _ in wrong_width_rows:
    print('error')

### Checking the row headers again
- Making sure that certain headers do not repeat and can be differentiated based on which stat group they belong to

In [None]:
# Print (index, header) pairs again to pick out the headers that repeat across stat groups.
test_e = enumerate(row_headers)
print([indexed_header for indexed_header in test_e])

In [None]:
# Give the headers that repeat across the passing / rushing / receiving groups unique names.
# Keys are positions in row_headers (see the enumerated print-out above).
header_renames = {
    4: 'Opp1',
    24: 'Opp2',
    36: 'Opp3',
    9: 'Pass_Yds',
    11: 'Pass_TD',
    27: 'Rush_Yds',
    30: 'Rush_TD',
    39: 'Rec_Yds',
    42: 'Rec_TD',
}
for header_idx, new_label in header_renames.items():
    row_headers[header_idx] = new_label
row_headers

# Part 1.6 - Making a dataframe

In [None]:
# One DataFrame row per cleaned game row; row_headers supplies the column names.
offense_nfl_df = pd.DataFrame(complete_players_stats_cleaned, columns=row_headers)

In [None]:
# Save the scraped stats to a CSV file in the working directory.
# NOTE(review): to_csv writes the DataFrame index as the first column by default —
# confirm whatever reads this file expects that extra column.
offense_nfl_df.to_csv('offense_nfl_2022.csv')