Data scraped from https://www.espn.com/ and https://www.teamrankings.com/

In [None]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import datapane as dp

## Scraping teams and scoring margins (through 8 weeks) from 2003-2020

In [None]:
# Scrape every team's average scoring margin as of 4 November for each season
# from 2003 through 2020. `teams` and `point_diff` are parallel lists with one
# entry per team per season, in the order the values are read off each page.
teams, point_diff = [], []

for year in range(2003, 2021):

    url = f"https://www.teamrankings.com/nfl/stat/average-scoring-margin?date={year}-11-04"

    r = requests.get(url, timeout=2.5)
    # Stop on an HTTP error instead of parsing an error page, which would
    # silently contribute no rows for this season.
    r.raise_for_status()
    r_html = r.text

    soup = BeautifulSoup(r_html, 'html.parser')

    classes = ["text-left nowrap", "text-right"]

    NFL_data = soup.find_all("td", attrs={"class": classes})

    NFL_data = [str(l) for l in NFL_data]
    NFL_stats = []

    for l in NFL_data:
        # Each marker needs its own membership test: `'a' and 'b' in l` only
        # checks 'b', so a missing 'a' would make .index() raise ValueError.
        if '">' in l and '</td>' in l:
            NFL_stats.append(l[l.index('">')+len('">'):l.index('</td>')])
        # A cell that also carries a sort="..." attribute and a link
        # contributes a second entry: the attribute's value.
        if 'sort="' in l and '"><a' in l:
            NFL_stats.append(l[l.index('sort="')+len('sort="'):l.index('"><a')])

    # NFL_stats is consumed in strides of 8: within each stride, index 1 is
    # stored as the team name and index 2 as the scoring margin.
    # NOTE(review): assumes every table row yields exactly 8 entries - verify
    # against the live page layout.
    for ct_pd in range(2, len(NFL_stats) - 5, 8):
        teams.append(NFL_stats[ct_pd - 1])
        point_diff.append(NFL_stats[ct_pd])

point_diff = [float(l) for l in point_diff]

## Changing team names to match what we get from ESPN for consistency

In [None]:
# Short team label -> full franchise name. Labels with no entry here
# (for example "Washington") are left as they are by change_names.
_FULL_TEAM_NAMES = {
    "Kansas City": "Kansas City Chiefs",
    "Tampa Bay": "Tampa Bay Buccaneers",
    "Baltimore": "Baltimore Ravens",
    "Pittsburgh": "Pittsburgh Steelers",
    "Indianapolis": "Indianapolis Colts",
    "Miami": "Miami Dolphins",
    "Arizona": "Arizona Cardinals",
    "Seattle": "Seattle Seahawks",
    "LA Rams": "Los Angeles Rams",
    "Green Bay": "Green Bay Packers",
    "San Francisco": "San Francisco 49ers",
    "Tennessee": "Tennessee Titans",
    "New Orleans": "New Orleans Saints",
    "Buffalo": "Buffalo Bills",
    "Chicago": "Chicago Bears",
    "LA Chargers": "Los Angeles Chargers",
    "Carolina": "Carolina Panthers",
    "Atlanta": "Atlanta Falcons",
    "Las Vegas": "Las Vegas Raiders",
    "Philadelphia": "Philadelphia Eagles",
    "Cincinnati": "Cincinnati Bengals",
    "Cleveland": "Cleveland Browns",
    "Detroit": "Detroit Lions",
    "New England": "New England Patriots",
    "Minnesota": "Minnesota Vikings",
    "Denver": "Denver Broncos",
    "NY Giants": "New York Giants",
    "Houston": "Houston Texans",
    "Jacksonville": "Jacksonville Jaguars",
    "Dallas": "Dallas Cowboys",
    "NY Jets": "New York Jets",
}


def change_names(list):
    """Expand short team labels to full franchise names, in place.

    Args:
        list: Mutable sequence of team labels. Entries that match a known
            short label are overwritten with the full name; every other
            entry is left untouched.

    Returns:
        The same sequence object that was passed in, after modification.
    """
    for position, label in enumerate(list):
        full_name = _FULL_TEAM_NAMES.get(label)
        if full_name is not None:
            list[position] = full_name

    return list

In [None]:
# change_names rewrites `teams` in place and returns that same list, so
# NFL_teams and teams refer to one list object after this call.
NFL_teams = change_names(teams)

## Scraping final league standings from 2003-2020

In [None]:
# Scrape the league-wide standings page for each season from 2003 through
# 2020. `teams_standings` collects team names and `wins` collects one value
# per team, both in page order, season after season.
teams_standings = []
wins = []

for year in range(2003, 2021):

    url = f"https://www.espn.com/nfl/standings/_/season/{year}/group/league"

    r = requests.get(url, timeout=2.5)
    # Stop on an HTTP error instead of parsing an error page.
    r.raise_for_status()
    r_html = r.text

    soup = BeautifulSoup(r_html, 'html.parser')

    data = [str(cell) for cell in soup.find_all("td")]

    # The first 32 cells are treated as the team cells; only the last 135
    # characters of each are searched for the name. A dedicated name is used
    # here so the `teams` list built by the scoring-margin scrape is not
    # overwritten.
    team_cells = [cell[-135:] for cell in data[0:32]]

    name_open, name_close = 'tabindex="0">', '</a>'
    title_open = 'none" title="'
    # Fallback end markers, tried in this order when the link markup is absent.
    abbr_markers = ('">WSH', '">OAK', '">SD', '">STL')

    for cell in team_cells:
        # Both markers are tested explicitly: `left and right in l` only
        # checks the right-hand one, so .index(left) could raise ValueError.
        if name_open in cell and name_close in cell:
            teams_standings.append(
                cell[cell.index(name_open)+len(name_open):cell.index(name_close)]
            )
        elif title_open in cell:
            for marker in abbr_markers:
                if marker in cell:
                    teams_standings.append(
                        cell[cell.index(title_open)+len(title_open):cell.index(marker)]
                    )
                    break

    stats = data[32:]

    left, right = 'cell">', '</sp'
    stats = [l[l.index(left)+len(left):l.index(right)] for l in stats if left in l and right in l]

    # Every 11th extracted value, starting with the first, is stored as a
    # team's win total.
    # NOTE(review): assumes 11 stat cells per team row - verify against the page.
    wins.extend(stats[::11])


## Some team names have changed since 2003, so updating them to the current names

In [None]:
# Old franchise name -> name it is replaced with; anything else is kept.
renamed_franchises = {
    "St. Louis Rams": "Los Angeles Rams",
    "Washington Redskins": "Washington",
    "Oakland Raiders": "Las Vegas Raiders",
    "San Diego Chargers": "Los Angeles Chargers",
}

# Slice assignment keeps teams_standings the same list object.
teams_standings[:] = [renamed_franchises.get(name, name) for name in teams_standings]

## Re-organizing lists so that standings and wins match up with the team names and scoring margins from the first scrape

In [None]:
NFL_standings, NFL_wins = [], []

# Walk the scoring-margin team order and pull the first still-unused
# standings entry with the same name. Used entries are blanked so the next
# occurrence of that team matches a later entry.
for team_name in NFL_teams:
    try:
        slot = teams_standings.index(team_name)
    except ValueError:
        # No remaining standings entry for this name: the team is skipped.
        continue
    NFL_standings.append(team_name)
    teams_standings[slot] = ''
    NFL_wins.append(wins[slot])

NFL_wins = [float(w) for w in NFL_wins]

## New list called seasons. Each season is appended 32 times (one for each team)

In [None]:
# Row k belongs to season 2003 + k // 32: the year advances after every
# block of 32 rows.
seasons = [2003 + k // 32 for k in range(len(NFL_wins))]

# Leave the counters at the values the explicit loop would have ended on.
ct = len(seasons)
year = 2003 + ct // 32

In [None]:
# One row per team-season; the four lists must all have the same length.
season_columns = {
    'Team': NFL_standings,
    'Season': seasons,
    'Scoring Margin': point_diff,
    'Actual Wins': NFL_wins,
}
df = pd.DataFrame(data=season_columns)

df

## Running DataFrame through the model to get predicted wins for each team since 2003

In [None]:
from sklearn import linear_model

# Single-feature regression: scoring margin -> end-of-season wins.
X = df[['Scoring Margin']]
y = df['Actual Wins']

# Name (including its spelling) retained in case other cells reference it.
scorig_margin = df['Scoring Margin'].tolist()
actual_wins = df['Actual Wins'].tolist()

# The training data never changes, so the model is fitted once rather than
# once per row.
regr = linear_model.LinearRegression()
regr.fit(X, y)

# Predict every row in one call on the training frame itself. This keeps the
# feature name seen during fit and avoids calling float() on a one-element
# array.
predicted_wins = [round(float(p), 2) for p in regr.predict(X)]

df['Predicted Wins'] = predicted_wins

# Actual minus predicted wins, rounded to two decimals (positive means the
# team won more games than the model predicted).
diff_list = [round(actual - predicted, 2) for actual, predicted in zip(actual_wins, predicted_wins)]

df['Difference'] = diff_list
df

In [None]:
# Rank on a copy so the type cast never touches df itself.
difference_col = 'Difference'

df_copy = df.copy()
df_copy[difference_col] = df_copy[difference_col].astype(float)

# The ten rows with the largest actual-minus-predicted difference.
overperforming = df_copy.nlargest(n=10, columns=difference_col)

## Finding average difference between projected and observed win totals

In [None]:
# Mean absolute gap between actual and predicted win totals.
diff_list_float = list(map(float, diff_list))

abs_vals = list(map(abs, diff_list_float))

avg_difference = sum(abs_vals) / len(abs_vals)

avg_difference

## Plotly object

In [None]:
# Scatter of scoring margin against actual wins with an OLS trendline.
# Marker colour follows the scoring margin and marker size follows the win
# total; team, season and predicted wins are added to the hover text.
fig = px.scatter(
    df,
    x=df['Scoring Margin'],
    y=df['Actual Wins'],
    color=df['Scoring Margin'],
    size=df['Actual Wins'],
    size_max=17,
    trendline='ols',
    template='gridon',
    title='NFL Win Totals and Week 8 Scoring Margins | 2003-2020',
    hover_data=[df['Team'], df['Season'], df['Predicted Wins']],
)

## Creating linear regression model

In [None]:
# Degree-1 least-squares fit of actual wins against scoring margin.
model = np.polyfit(x=df['Scoring Margin'], y=df['Actual Wins'], deg=1)

In [None]:
# Wrap the fitted coefficients in a callable polynomial, evaluate it at every
# scoring margin and score those fitted values against the actual wins.
predict = np.poly1d(model)
fitted_wins = predict(point_diff)

r2 = r2_score(y_true=NFL_wins, y_pred=fitted_wins)
r2  # R2 value

In [None]:
# Correlation coefficient. For a one-variable least-squares line, sqrt(R2)
# gives only the magnitude of r, so the sign is taken from the fitted slope
# (model[0], the highest-degree coefficient returned by np.polyfit).
r_magnitude = np.sqrt(r2)
r = r_magnitude if model[0] >= 0 else -r_magnitude

## Scraping for current season scoring margins to run through the model

In [None]:
# Scrape each team's average scoring margin as of 4 November 2021.
# `current_teams` and `current_point_diff` are parallel lists in page order.
year = 2021

current_teams, current_point_diff = [], []

url = f"https://www.teamrankings.com/nfl/stat/average-scoring-margin?date={year}-11-04"

r = requests.get(url, timeout=2.5)
# Stop on an HTTP error instead of parsing an error page into empty lists.
r.raise_for_status()
r_html = r.text

soup = BeautifulSoup(r_html, 'html.parser')

classes = ["text-left nowrap", "text-right"]

NFL_data = soup.find_all("td", attrs={"class": classes})

NFL_data = [str(l) for l in NFL_data]
NFL_stats = []

for l in NFL_data:
    # Each marker needs its own membership test: `'a' and 'b' in l` only
    # checks 'b', so a missing 'a' would make .index() raise ValueError.
    if '">' in l and '</td>' in l:
        NFL_stats.append(l[l.index('">')+len('">'):l.index('</td>')])
    # A cell that also carries a sort="..." attribute and a link contributes
    # a second entry: the attribute's value.
    if 'sort="' in l and '"><a' in l:
        NFL_stats.append(l[l.index('sort="')+len('sort="'):l.index('"><a')])

# NFL_stats is consumed in strides of 8: within each stride, index 1 is
# stored as the team name and index 2 as the scoring margin.
# NOTE(review): assumes every table row yields exactly 8 entries - verify
# against the live page layout.
for ct_pd in range(2, len(NFL_stats) - 5, 8):
    current_teams.append(NFL_stats[ct_pd - 1])
    current_point_diff.append(NFL_stats[ct_pd])

# NOTE(review): this increment looks like a leftover from the multi-season
# loop; it is kept so `year` ends at the same value (2022) as before.
year += 1

current_point_diff = [float(l) for l in current_point_diff]

In [None]:
# change_names edits the list in place and returns the same object, so this
# rebinding leaves current_teams pointing at the list it already named.
current_teams = change_names(current_teams)

## Putting current data through model

In [None]:
# Fit the scoring-margin -> wins regression once on the historical frame,
# then project every current team in a single predict call.
X = df[['Scoring Margin']]
y = df['Actual Wins']

regr = linear_model.LinearRegression()
regr.fit(X, y)

projected_wins = []

# predict() rejects an empty input, so it is skipped when nothing was scraped.
if current_point_diff:
    # Use the training column name so the features match what fit() saw.
    current_X = pd.DataFrame({'Scoring Margin': current_point_diff})
    projected_wins = [round(float(p), 2) for p in regr.predict(current_X)]

In [None]:
# One row per current team: its scraped scoring margin and projected wins.
projection_columns = {
    "Team": current_teams,
    "Week-8 Scoring Margin": current_point_diff,
    "Projected 2021 Wins": projected_wins,
}
new_df = pd.DataFrame(data=projection_columns)

new_df

## DataPane objects

In [None]:
# Wrap the current-season projection frame in a DataPane table and upload it
# as a report under the given name.
# NOTE(review): presumably needs an authenticated DataPane session - confirm.
table = dp.DataTable(new_df)
dp.Report(table).upload(name='Projected Win Totals for the 2021 NFL Season')

In [None]:
# Upload the full historical frame (margins, actual and predicted wins,
# difference) as a DataPane report. `table` is rebound for each report.
table = dp.DataTable(df)
dp.Report(table).upload(name='Week 8 Scoring Margin compared to End-of-Season Wins')

In [None]:
# Upload the ten rows with the largest 'Difference' values as a DataPane report.
table = dp.DataTable(overperforming)
dp.Report(table).upload(name='10 Most Overperforming Teams by Observed Win Total')

In [None]:
# The ten rows with the smallest 'Difference' values, i.e. the mirror image
# of `overperforming`, taken from the same float-typed copy.
underperforming = df_copy.nsmallest(10, 'Difference')

# Upload that selection as a DataPane report.
table = dp.DataTable(underperforming)
dp.Report(table).upload(name='10 Most Underperforming Teams by Observed Win Total')

In [None]:
# Wrap the plotly figure in a DataPane plot block and upload it as a report.
graph = dp.Plot(fig)
dp.Report(graph).upload(name='NFL Win Totals and Scoring Margins | 2003-2020')