In [5]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from datetime import date
from dateutil.relativedelta import *
import json
import os

import numpy as np

import re

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support.ui import WebDriverWait

from selenium.common.exceptions import TimeoutException

In [6]:
# Get MyFantasyLeague players' name, team name, position, and ages
urlString = "https://api.myfantasyleague.com/2022/export?TYPE=players"
# A timeout keeps a stalled connection from hanging the kernel; raise_for_status
# surfaces HTTP errors instead of parsing an error page as if it were player data.
response = requests.get(urlString, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'xml')
data = [
    [tag.get("id"), tag.get("name"), tag.get("position"), tag.get("team")]
    for tag in soup.find_all('player')
]
player_df = pd.DataFrame(data, columns=['PlayerID', 'Name', 'Position', 'Team'])

# Get ages
to_query_age = player_df
# Break player list into chunks small enough for the API server
n = 50  # chunk row size
# Collect plain [id, dob] rows and build the frame once after the loop
# (DataFrame.append is deprecated/removed and re-copies the frame every pass).
dob_rows = []
for start in range(0, len(to_query_age), n):
    idList = ",".join(to_query_age.PlayerID.iloc[start:start + n])

    # Get playerProfiles
    urlString = f"https://api.myfantasyleague.com/2022/export?TYPE=playerProfile&P={idList}"
    response = requests.get(urlString, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'xml')
    profiles = soup.find_all('playerProfile')
    players = soup.find_all('player')
    # Profiles and player tags are paired by document position.
    # NOTE(review): assumes exactly one <player> tag per <playerProfile> - confirm against the API response.
    for idx, profile in enumerate(profiles):
        dob_rows.append([profile.get("id"), players[idx].get("dob")])

# Explicit column names keep the frame well-formed even when nothing was collected
player_dobs = pd.DataFrame(dob_rows, columns=['PlayerID', 'DOB'])

# Convert string to datetime
player_dobs['DOB'] = pd.to_datetime(player_dobs['DOB'])
# Convert DOB to Age
today = date.today()
def age(born):
    """Return whole years elapsed between `born` and the module-level `today`."""
    # The boolean subtracts one year when this year's birthday has not happened yet
    return today.year - born.year - ((today.month, today.day) < (born.month, born.day))
player_dobs['Age'] = player_dobs['DOB'].apply(age)

# Merge all dfs
player_df = player_df.merge(player_dobs, on='PlayerID', how='left')
player_df = player_df.drop(columns='DOB')

In [7]:
# Clean MFL data
# Select only relevant positions (filtering returns a new frame, so player_df is untouched)
mfl_df = player_df.loc[player_df['Position'].isin(['QB', 'WR', 'RB', 'TE', 'PK', 'Def'])]
mfl_df = mfl_df.reset_index(drop=True)

# Clean Name column: "Last, First" -> "First Last"
to_join = mfl_df['Name'].str.split(", ", n=1, expand=True)
to_join.columns = ['lname', 'fname']
to_join['Name'] = to_join['fname'] + " " + to_join['lname']
# Change to Upper Case
mfl_df['Name'] = to_join['Name'].str.upper()
# Drop punctuation. regex=False makes the literal match explicit, so "." can never
# be read as the regex wildcard regardless of the installed pandas default.
for punct in (".", ",", "'"):
    mfl_df['Name'] = mfl_df['Name'].str.replace(punct, "", regex=False)

# Clean position column
mfl_df['Position'] = mfl_df['Position'].replace('Def', 'DF')

# Clean Team column
mfl_df['Team'] = mfl_df['Team'].replace('FA*', 'FA')

# Change column names (by label rather than by position, so a reordered frame cannot be mislabeled)
mfl_df = mfl_df.rename(columns={
    'PlayerID': 'id_mfl',
    'Name': 'player',
    'Position': 'pos_mfl',
    'Team': 'team',
    'Age': 'age',
})
mfl_df


  from ipykernel import kernelapp as app


Unnamed: 0,id_mfl,player,pos_mfl,team,age
0,0501,BUFFALO BILLS,DF,BUF,
1,0502,INDIANAPOLIS COLTS,DF,IND,
2,0503,MIAMI DOLPHINS,DF,MIA,
3,0504,NEW ENGLAND PATRIOTS,DF,NEP,
4,0505,NEW YORK JETS,DF,NYJ,
...,...,...,...,...,...
1093,15996,JALEN VIRGIL,WR,DEN,24.0
1094,15997,DANNY DAVIS,WR,GBP,24.0
1095,15999,RAMIZ AHMED,PK,GBP,27.0
1096,16000,PEYTON HENDERSHOT,TE,DAL,23.0


In [8]:
# Get OurLads data
# Set Selenium settings on a private copy so the library-wide CHROME defaults are not mutated
capa = DesiredCapabilities.CHROME.copy()
# "none" returns control right after navigation starts; the explicit wait below does the gating
capa["pageLoadStrategy"] = "none"
# Scrape web for stats
url = "https://www.ourlads.com/nfldepthcharts/depthcharts.aspx"

PATH = "/Applications/chromedriver"
# NOTE(review): newer Selenium 4 releases are reported to drop the `desired_capabilities`
# keyword in favour of an Options object - confirm against the installed version.
driver = webdriver.Chrome(service=Service(PATH), desired_capabilities=capa)
try:
    wait = WebDriverWait(driver, 20)
    driver.get(url)

    # Block (up to 20 s) until the depth-chart table exists, then stop loading the rest of the page
    wait.until(EC.presence_of_element_located((By.XPATH, "//table[@id='ctl00_phContent_gvChart']")))
    driver.execute_script("window.stop();")

    chart_html = driver.find_element(By.XPATH, value="//table[@id='ctl00_phContent_gvChart']").get_attribute("outerHTML")
finally:
    # Always shut the browser down, even when the wait times out, so no Chrome process is leaked
    driver.quit()

# read_html returns one DataFrame per <table>; the snippet holds exactly the chart table
ourlads_scrape = pd.read_html(chart_html)
ourlads_scrape = ourlads_scrape[0]

In [40]:
# Clean ourlads_df data
depthCols = ['Player 1', 'Player 2', 'Player 3', 'Player 4', 'Player 5']
df = ourlads_scrape[['Team', 'Pos'] + depthCols].copy()

# Transform columns into rows: one (Team, Pos, Player) row per depth-chart slot,
# with the slot number kept as a string in posRank ("1".."5")
complete = pd.concat(
    [
        df[['Team', 'Pos', col]].rename(columns={col: 'Player'}).assign(posRank=str(rank))
        for rank, col in enumerate(depthCols, start=1)
    ],
    axis=0,
    ignore_index=True,
)

# Clean Position column
# Select only relevant positions
posList = ['LWR', 'RWR', 'SWR', 'TE', 'QB', 'RB', 'PK', 'PR', 'KR', 'RES']
# .copy() gives an independent frame so the assignments below write to ourlads_df itself
# instead of a temporary slice (the source of the SettingWithCopy warnings).
ourlads_df = complete.loc[complete['Pos'].isin(posList)].copy()
# Convert WR roles to "WR"
ourlads_df['Pos'] = ourlads_df['Pos'].replace(["LWR", "RWR", "SWR"], "WR")
# posRank becomes position + slot, e.g. "WR1", "KR2", "RES3"
ourlads_df['posRank'] = ourlads_df['Pos'] + ourlads_df['posRank']
ourlads_df = (
    ourlads_df
    .reset_index(drop=True)
    .dropna()
    .drop_duplicates(subset=['Player', 'Team', 'Pos'])
)

# Create columns for KRs and PRs (the returner's posRank is carried over as the KR / PR value)
krs = (
    ourlads_df.loc[ourlads_df.Pos == 'KR']
    .drop(columns=['Pos'])
    .rename(columns={'posRank': 'KR'})
)
prs = (
    ourlads_df.loc[ourlads_df.Pos == 'PR']
    .drop(columns=['Pos'])
    .rename(columns={'posRank': 'PR'})
)
# Join kr and pr dfs back onto main ourlads df
ourlads_df = ourlads_df.merge(krs, how='left', on=['Player', 'Team']).merge(prs, how='left', on=['Player', 'Team'])
ourlads_df['KR'] = ourlads_df['KR'].fillna("NO")
ourlads_df['PR'] = ourlads_df['PR'].fillna("NO")

# Clean name column: the first token (comma removed) and the second token are swapped;
# anything after the second space is discarded.
# NOTE(review): presumably the source format is "Last, First <extra>" - confirm against the site.
names = ourlads_df['Player'].str.split(" ", n=2, expand=True)
# Address the parts by position so a table without any third token does not break the cell
firstToken = names[0].str.replace(",", "", regex=False)
ourlads_df['Player'] = names[1] + " " + firstToken
# Change to Upper Case
ourlads_df['Player'] = ourlads_df['Player'].str.upper()
# Drop punctuation (regex=False: "." must be a literal dot, never the regex wildcard)
for punct in (".", ",", "'"):
    ourlads_df['Player'] = ourlads_df['Player'].str.replace(punct, "", regex=False)

# Remove separate rows for PRs and KRs, drop the position column,
# and change column names and order
ourlads_df = (
    ourlads_df
    .loc[~ourlads_df['Pos'].isin(['KR', 'PR']), ['Player', 'Team', 'posRank', 'KR', 'PR']]
    .rename(columns={'Player': 'player', 'Team': 'team'})
    .copy()
)

# Rename team abbreviations (keys are the scraped codes, values the codes used in mfl_df);
# a code missing from the dict maps to NaN
teamDict = {
    'ARZ':'ARI', 'ATL':'ATL', 'BAL':'BAL', 'BUF':'BUF', 'CAR':'CAR', 'CHI':'CHI', 'CIN':'CIN', 'CLE':'CLE', 
    'DAL':'DAL', 'DEN':'DEN', 'DET':'DET', 'GB':'GBP', 'HOU':'HOU', 'IND':'IND', 'JAX':'JAC', 'KC':'KCC', 
    'LAC':'LAC', 'LAR':'LAR', 'LV':'LVR', 'MIA':'MIA', 'MIN':'MIN', 'NE':'NEP', 'NO':'NOS', 'NYG':'NYG', 
    'NYJ':'NYJ', 'PHI':'PHI', 'PIT':'PIT', 'SEA':'SEA', 'SF':'SFO', 'TB':'TBB', 'TEN':'TEN', 'WAS':'WAS'
    }
ourlads_df['team'] = ourlads_df['team'].map(teamDict)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [41]:
# Print the sorted team codes of both sources, one list per line, for a visual comparison
for teamFrame in (mfl_df, ourlads_df):
    print(sorted(teamFrame['team'].unique()))





['ARI', 'ATL', 'BAL', 'BUF', 'CAR', 'CHI', 'CIN', 'CLE', 'DAL', 'DEN', 'DET', 'FA', 'GBP', 'HOU', 'IND', 'JAC', 'KCC', 'LAC', 'LAR', 'LVR', 'MIA', 'MIN', 'NEP', 'NOS', 'NYG', 'NYJ', 'PHI', 'PIT', 'SEA', 'SFO', 'TBB', 'TEN', 'WAS']
['ARI', 'ATL', 'BAL', 'BUF', 'CAR', 'CHI', 'CIN', 'CLE', 'DAL', 'DEN', 'DET', 'GBP', 'HOU', 'IND', 'JAC', 'KCC', 'LAC', 'LAR', 'LVR', 'MIA', 'MIN', 'NEP', 'NOS', 'NYG', 'NYJ', 'PHI', 'PIT', 'SEA', 'SFO', 'TBB', 'TEN', 'WAS']


In [42]:
# Merge MyFantasyLeague and OurLads data
mfl_ol_df = mfl_df.merge(ourlads_df, how='left', on=['player', 'team'])
# Clean merged df
# Every team defense gets the single rank "DF1"
mfl_ol_df.loc[mfl_ol_df['pos_mfl']=='DF', 'posRank'] = "DF1"
# Plain assignment instead of a chained inplace fillna, which acts on a temporary
# column object and is not guaranteed to update the frame
mfl_ol_df[['KR', 'PR']] = mfl_ol_df[['KR', 'PR']].fillna("NO")

# Clean posRanks
# Depth slots 4 and 5 collapse into slot 3 for QB/RB/WR/TE, and every RES slot becomes "RES".
# Ranks absent from the map (for example PK4) become NaN and receive the default further down.
posRankMap = {f"RES{rank}": "RES" for rank in range(1, 6)}
for pos in ('QB', 'RB', 'WR', 'TE'):
    for rank in range(1, 6):
        posRankMap[f"{pos}{rank}"] = f"{pos}{min(rank, 3)}"
posRankMap.update({'PK1': 'PK1', 'PK2': 'PK2', 'PK3': 'PK3', 'DF1': 'DF1'})
mfl_ol_df['posRank'] = mfl_ol_df['posRank'].map(posRankMap)

# Flag reserve players before their posRank is overwritten
isReserve = mfl_ol_df['posRank'] == "RES"
mfl_ol_df['RES'] = "NO"
mfl_ol_df.loc[isReserve, 'RES'] = "RES"
# Players with no depth-chart rank, and reserves, default to slot 3 of their MFL position
needsDefault = isReserve | mfl_ol_df['posRank'].isna()
mfl_ol_df.loc[needsDefault, 'posRank'] = mfl_ol_df.loc[needsDefault, 'pos_mfl'] + "3"
# Specify current season
mfl_ol_df['season'] = 2022
mfl_ol_df

Unnamed: 0,id_mfl,player,pos_mfl,team,age,posRank,KR,PR,RES,season
0,0501,BUFFALO BILLS,DF,BUF,,DF1,NO,NO,NO,2022
1,0502,INDIANAPOLIS COLTS,DF,IND,,DF1,NO,NO,NO,2022
2,0503,MIAMI DOLPHINS,DF,MIA,,DF1,NO,NO,NO,2022
3,0504,NEW ENGLAND PATRIOTS,DF,NEP,,DF1,NO,NO,NO,2022
4,0505,NEW YORK JETS,DF,NYJ,,DF1,NO,NO,NO,2022
...,...,...,...,...,...,...,...,...,...,...
1093,15996,JALEN VIRGIL,WR,DEN,24.0,WR3,NO,NO,NO,2022
1094,15997,DANNY DAVIS,WR,GBP,24.0,WR3,NO,NO,NO,2022
1095,15999,RAMIZ AHMED,PK,GBP,27.0,PK2,NO,NO,NO,2022
1096,16000,PEYTON HENDERSHOT,TE,DAL,23.0,TE3,NO,NO,NO,2022


In [43]:
# Get historical data
histdf = pd.read_csv('data_cleaned/modelSource.csv', index_col=0)

# Keep seasons 2020 and later; .copy() makes the result an independent frame so the
# column assignments below do not write to a slice of the loaded data
histdf = histdf.loc[histdf['season']>=2020].copy()

# Normalize player names in historical data to upper case
histdf['player'] = histdf['player'].str.upper()
# Drop punctuation (regex=False: "." must be a literal dot, never the regex wildcard)
for punct in (".", ",", "'"):
    histdf['player'] = histdf['player'].str.replace(punct, "", regex=False)

histdf


  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,season,week,team,player,age,pos,posRank,opponent,passA,passC,...,defSack_prior1_opp,defI_prior1_opp,defSaf_prior1_opp,defFum_prior1_opp,defBlk_prior1_opp,defT_prior1_opp,defPtsAgainst_prior1_opp,defPassYAgainst_prior1_opp,defRushYAgainst_prior1_opp,defYdsAgainst_prior1_opp
4749,2020.0,1.0,GBP,AARON RODGERS,37.0,QB,QB1,MIN,44.0,32.0,...,3.0000,1.0625,0.0625,0.8750,0.0625,0.1250,18.9375,233.5625,108.0000,341.5625
4750,2020.0,1.0,SEA,RUSSELL WILSON,32.0,QB,QB1,ATL,35.0,31.0,...,1.7500,0.7500,0.0000,0.5000,0.0625,0.1875,24.9375,244.8750,110.9375,355.8125
4751,2020.0,1.0,BUF,JOSH ALLEN,24.0,QB,QB1,NYJ,46.0,33.0,...,2.1875,0.7500,0.1250,0.5625,0.0625,0.3750,22.4375,236.1875,86.9375,323.1250
4752,2020.0,1.0,BAL,LAMAR JACKSON,23.0,QB,QB1,CLE,25.0,20.0,...,2.3750,0.8750,0.0000,0.3750,0.1250,0.0625,24.5625,216.8750,144.6875,361.5625
4753,2020.0,1.0,CHI,MITCHELL TRUBISKY,26.0,QB,QB1,DET,36.0,20.0,...,1.7500,0.4375,0.0000,0.6875,0.0000,0.1875,26.4375,284.4375,115.9375,400.3750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57478,2021.0,17.0,MIA,MIAMI DOLPHINS,0.0,DF,DF1,TEN,0.0,0.0,...,1.1875,0.9375,0.0000,0.5000,0.0625,0.1250,27.4375,277.4375,120.8125,398.2500
57479,2021.0,17.0,DET,DETROIT LIONS,0.0,DF,DF1,SEA,0.0,0.0,...,2.8750,0.8750,0.1875,0.5000,0.1250,0.0000,23.1875,285.0000,95.5625,380.5625
57480,2021.0,17.0,DEN,DENVER BRONCOS,0.0,DF,DF1,LAC,0.0,0.0,...,1.6875,0.7500,0.0625,0.4375,0.1250,0.1250,26.6250,223.6250,119.8125,343.4375
57481,2021.0,17.0,MIN,MINNESOTA VIKINGS,0.0,DF,DF1,GBP,0.0,0.0,...,2.5625,0.6875,0.0625,0.4375,0.0000,0.1250,23.0625,221.1875,112.8125,334.0000


In [44]:
# Start the model frame from the merged roster: rename the position column and drop the MFL id
df = mfl_ol_df.rename(columns={'pos_mfl': 'pos'}).drop(columns='id_mfl')

# Handle duplicates
# Historical duplicates from "allData.csv"
dupList_hist = [
    name.upper()
    for name in ('Alex Smith', 'Mike Williams', 'Steve Smith', 'Ryan Griffin', 'Zach Miller', 'David Johnson')
]
# New-year duplicates come first, followed by the historical ones
dupList = ['JOSH JOHNSON'] + dupList_hist
# Disambiguate each duplicated name by appending the player's position to it
for dupName in dupList:
    isDup = df['player'] == dupName
    df.loc[isDup, 'player'] = df.loc[isDup, 'player'] + df.loc[isDup, 'pos']

# Clean age column: team defenses get age 0
df.loc[df['pos'] == 'DF', 'age'] = 0
# Players whose age is still unknown get an arbitrary age of 25
df['age'] = df['age'].fillna(25)
df

Unnamed: 0,player,pos,team,age,posRank,KR,PR,RES,season
0,BUFFALO BILLS,DF,BUF,0.0,DF1,NO,NO,NO,2022
1,INDIANAPOLIS COLTS,DF,IND,0.0,DF1,NO,NO,NO,2022
2,MIAMI DOLPHINS,DF,MIA,0.0,DF1,NO,NO,NO,2022
3,NEW ENGLAND PATRIOTS,DF,NEP,0.0,DF1,NO,NO,NO,2022
4,NEW YORK JETS,DF,NYJ,0.0,DF1,NO,NO,NO,2022
...,...,...,...,...,...,...,...,...,...
1093,JALEN VIRGIL,WR,DEN,24.0,WR3,NO,NO,NO,2022
1094,DANNY DAVIS,WR,GBP,24.0,WR3,NO,NO,NO,2022
1095,RAMIZ AHMED,PK,GBP,27.0,PK2,NO,NO,NO,2022
1096,PEYTON HENDERSHOT,TE,DAL,23.0,TE3,NO,NO,NO,2022


In [45]:
# Column layout of the model input, assembled from the shared stat names instead of
# being spelled out one entry at a time.
_idCols = ['season', 'week', 'team', 'player', 'age', 'pos', 'posRank', 'opponent']
_statCols = [
    'passA', 'passC', 'passY', 'passT', 'passI', 'pass2',
    'rushA', 'rushY', 'rushT', 'rush2',
    'recC', 'recY', 'recT', 'rec2',
    'fum',
    'XPA', 'XPM', 'FGA', 'FGM', 'FG50',
    'defSack', 'defI', 'defSaf', 'defFum', 'defBlk', 'defT',
    'defPtsAgainst', 'defPassYAgainst', 'defRushYAgainst', 'defYdsAgainst',
]
# Only the defensive stats get opponent ("_opp") variants
_defCols = [stat for stat in _statCols if stat.startswith('def')]

modelCols = (
    _idCols
    + _statCols
    # current season: stats first, games played last
    + [stat + '_curr' for stat in _statCols] + ['gamesPlayed_curr']
    # prior summaries: games played first, then stats
    + ['gamesPlayed_prior1'] + [stat + '_prior1' for stat in _statCols]
    + ['gamesPlayed_prior2'] + [stat + '_prior2' for stat in _statCols]
    # opponent defense, current and prior
    + [stat + '_curr_opp' for stat in _defCols]
    + [stat + '_prior1_opp' for stat in _defCols]
)
len(modelCols)

151

In [46]:
# Columns carried from the historical data into the per-player summaries
summaryCols = [
    'season', 'week', 'team', 'player', 'pos', 'posRank', 'opponent',
    'passA', 'passC', 'passY', 'passT', 'passI', 'pass2',
    'rushA', 'rushY', 'rushT', 'rush2',
    'recC', 'recY', 'recT', 'rec2',
    'fum',
    'XPA', 'XPM', 'FGA', 'FGM', 'FG50',
    'defSack', 'defI', 'defSaf', 'defFum', 'defBlk', 'defT',
    'defPtsAgainst', 'defPassYAgainst', 'defRushYAgainst', 'defYdsAgainst',
]


def _summarize_player_games(games, suffix):
    """Average each player's per-game stats and count the games played.

    Parameters
    ----------
    games : pandas.DataFrame
        One row per player-game; must contain 'season', 'player' and 'week'.
    suffix : str
        Text appended to every output column name (e.g. "_prior1").

    Returns
    -------
    pandas.DataFrame
        Indexed by (season, player). The first column is 'gamesPlayed' + suffix
        (number of non-null 'week' values), followed by the per-game mean of every
        numeric column except 'week', each with the suffix appended.
    """
    grouped = games.groupby(['season', 'player'])
    # numeric_only=True skips the string columns (team, pos, ...) explicitly;
    # recent pandas raises instead of silently dropping them.
    summary = grouped.mean(numeric_only=True).drop(columns='week')
    summary.insert(0, 'gamesPlayed', grouped['week'].count())
    summary.columns = [col + suffix for col in summary.columns]
    return summary


#Create summary dfs for each season
# Only 2021 games; the season is relabelled 2022 so the summary joins onto the 2022 roster.
# .copy() makes the relabelling write to an independent frame, not a slice of histdf.
seasonsPrior1_df = histdf.loc[histdf.season == 2021, summaryCols].copy()
seasonsPrior1_df['season'] = 2022
seasonsPrior1 = _summarize_player_games(seasonsPrior1_df, "_prior1")

# Create summary dfs for combined two seasons before
# Every row of histdf is pooled under the 2022 label
seasonsPrior2_df = histdf[summaryCols].copy()
seasonsPrior2_df['season'] = 2022
seasonsPrior2 = _summarize_player_games(seasonsPrior2_df, "_prior2")
seasonsPrior2

Unnamed: 0_level_0,Unnamed: 1_level_0,gamesPlayed_prior2,passA_prior2,passC_prior2,passY_prior2,passT_prior2,passI_prior2,pass2_prior2,rushA_prior2,rushY_prior2,rushT_prior2,...,defSack_prior2,defI_prior2,defSaf_prior2,defFum_prior2,defBlk_prior2,defT_prior2,defPtsAgainst_prior2,defPassYAgainst_prior2,defRushYAgainst_prior2,defYdsAgainst_prior2
season,player,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2022,AARON JONES,29,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,12.827586,65.620690,0.448276,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022,AARON RODGERS,31,33.516129,23.354839,266.967742,2.677419,0.290323,0.000000,2.258065,7.838710,0.193548,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022,ADAM HUMPHRIES,22,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022,ADAM PRENTICE,3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.666667,1.333333,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022,ADAM SHAHEEN,15,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022,ZACH PASCAL,30,0.033333,0.000000,0.000000,0.000000,0.000000,0.000000,0.066667,0.700000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022,ZACH WILSON,12,30.250000,17.166667,187.250000,0.666667,0.916667,0.083333,2.250000,13.416667,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022,ZACK MOSS,25,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,8.120000,32.720000,0.320000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022,ZANE GONZALEZ,24,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
# Attach both prior-season summaries to the roster frame (left joins on season + player),
# then zero-fill the gaps left for players who have no historical rows
final = (
    df
    .merge(seasonsPrior1, how='left', on=['season', 'player'])
    .merge(seasonsPrior2, how='left', on=['season', 'player'])
    .fillna(0)
)
final

Unnamed: 0,player,pos,team,age,posRank,KR,PR,RES,season,gamesPlayed_prior1,...,defSack_prior2,defI_prior2,defSaf_prior2,defFum_prior2,defBlk_prior2,defT_prior2,defPtsAgainst_prior2,defPassYAgainst_prior2,defRushYAgainst_prior2,defYdsAgainst_prior2
0,BUFFALO BILLS,DF,BUF,0.0,DF1,NO,NO,NO,2022,16.0,...,2.21875,1.06250,0.00000,0.68750,0.03125,0.15625,20.43750,202.87500,116.62500,319.50000
1,INDIANAPOLIS COLTS,DF,IND,0.0,DF1,NO,NO,NO,2022,16.0,...,2.25000,1.06250,0.12500,0.75000,0.09375,0.28125,21.90625,238.40625,100.03125,338.43750
2,MIAMI DOLPHINS,DF,MIA,0.0,DF1,NO,NO,NO,2022,16.0,...,2.71875,0.96875,0.00000,0.65625,0.06250,0.21875,21.46875,239.06250,112.34375,351.40625
3,NEW ENGLAND PATRIOTS,DF,NEP,0.0,DF1,NO,NO,NO,2022,16.0,...,1.84375,1.28125,0.00000,0.34375,0.06250,0.21875,19.46875,207.34375,125.34375,332.68750
4,NEW YORK JETS,DF,NYJ,0.0,DF1,NO,NO,NO,2022,16.0,...,2.00000,0.53125,0.03125,0.50000,0.06250,0.09375,29.18750,267.62500,124.15625,391.78125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1093,JALEN VIRGIL,WR,DEN,24.0,WR3,NO,NO,NO,2022,0.0,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
1094,DANNY DAVIS,WR,GBP,24.0,WR3,NO,NO,NO,2022,0.0,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
1095,RAMIZ AHMED,PK,GBP,27.0,PK2,NO,NO,NO,2022,0.0,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
1096,PEYTON HENDERSHOT,TE,DAL,23.0,TE3,NO,NO,NO,2022,0.0,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000


In [48]:
# Spot-check a single team-defense row of the merged frame
final[final['player'].eq('SAN FRANCISCO 49ERS')]

Unnamed: 0,player,pos,team,age,posRank,KR,PR,RES,season,gamesPlayed_prior1,...,defSack_prior2,defI_prior2,defSaf_prior2,defFum_prior2,defBlk_prior2,defT_prior2,defPtsAgainst_prior2,defPassYAgainst_prior2,defRushYAgainst_prior2,defYdsAgainst_prior2
29,SAN FRANCISCO 49ERS,DF,SFO,0.0,DF1,NO,NO,NO,2022,16.0,...,2.28125,0.59375,0.0,0.59375,0.0625,0.09375,22.84375,207.375,106.21875,313.59375


In [49]:
# Load the schedule and upper-case both team columns so they match the lookup keys below
schedule = pd.read_csv('data_raw/schedule.csv')
for teamCol in ('team', 'opponent'):
    schedule[teamCol] = schedule[teamCol].str.upper()

# Full team name -> three-letter code used by the rest of the notebook
initialsZipped = {
    'BUFFALO BILLS': 'BUF',
    'INDIANAPOLIS COLTS': 'IND',
    'MIAMI DOLPHINS': 'MIA',
    'NEW ENGLAND PATRIOTS': 'NEP',
    'NEW YORK JETS': 'NYJ',
    'CINCINNATI BENGALS': 'CIN',
    'CLEVELAND BROWNS': 'CLE',
    'TENNESSEE TITANS': 'TEN',
    'JACKSONVILLE JAGUARS': 'JAC',
    'PITTSBURGH STEELERS': 'PIT',
    'DENVER BRONCOS': 'DEN',
    'KANSAS CITY CHIEFS': 'KCC',
    'LAS VEGAS RAIDERS': 'LVR',
    'LOS ANGELES CHARGERS': 'LAC',
    'SEATTLE SEAHAWKS': 'SEA',
    'DALLAS COWBOYS': 'DAL',
    'NEW YORK GIANTS': 'NYG',
    'PHILADELPHIA EAGLES': 'PHI',
    'ARIZONA CARDINALS': 'ARI',
    'WASHINGTON COMMANDERS': 'WAS',
    'CHICAGO BEARS': 'CHI',
    'DETROIT LIONS': 'DET',
    'GREEN BAY PACKERS': 'GBP',
    'MINNESOTA VIKINGS': 'MIN',
    'TAMPA BAY BUCCANEERS': 'TBB',
    'ATLANTA FALCONS': 'ATL',
    'CAROLINA PANTHERS': 'CAR',
    'LOS ANGELES RAMS': 'LAR',
    'NEW ORLEANS SAINTS': 'NOS',
    'SAN FRANCISCO 49ERS': 'SFO',
    'BALTIMORE RAVENS': 'BAL',
    'HOUSTON TEXANS': 'HOU',
}
# The parallel lists are kept available under their original names
teamList = list(initialsZipped)
initialList = list(initialsZipped.values())

# Swap full names for codes in both columns; values not in the lookup are left as they are
schedule = schedule.replace({'team': initialsZipped, 'opponent': initialsZipped})
schedule


Unnamed: 0,week,team,opponent
0,1,BUF,LAR
1,1,NOS,ATL
2,1,CLE,CAR
3,1,SFO,CHI
4,1,PIT,CIN
...,...,...,...
539,18,PIT,CLE
540,18,LVR,KCC
541,18,SEA,LAR
542,18,SFO,ARI


In [50]:
# Left-join the weekly schedule onto the player frame: each player row is repeated
# once per schedule row of its team (teams absent from the schedule keep a single row)
df2 = final.merge(schedule, how='left', on='team')

In [51]:
# Get opponent historical data
defStats = [
    'defSack', 'defI', 'defSaf', 'defFum', 'defBlk', 'defT',
    'defPtsAgainst', 'defPassYAgainst', 'defRushYAgainst', 'defYdsAgainst',
]
priorOppCols = [f"{stat}_prior1_opp" for stat in defStats]
currOppCols = [f"{stat}_curr_opp" for stat in defStats]

# Select only defenses, tag every column with "_opp", then restore the plain names of the key columns
allDef = (
    final.loc[final['pos'] == 'DF']
    .add_suffix("_opp")
    .rename(columns={
        'season_opp': 'season',
        'week_opp': 'week',
        'team_opp': 'team',
        'player_opp': 'player',
    })
)
allDef = allDef[['season', 'team'] + priorOppCols]

# The current-season opponent columns start out as copies of the prior-season values
currDef = allDef.copy()
currDef.columns = ['season', 'team'] + currOppCols
allDef = allDef.merge(currDef, how='left', on=['season', 'team'])

# Connect opponents to defenses: match each row's opponent to that defense's team code,
# then discard the duplicated team column the merge introduces
df2 = (
    df2
    .merge(allDef, how='left', left_on=['season', 'opponent'], right_on=['season', 'team'])
    .drop(columns=['team_y'])
    .rename(columns={'team_x': 'team'})
)
# Quick total as a sanity check that opponent stats were attached
df2['defPassYAgainst_prior1_opp'].sum()

3229220.8125

In [52]:
# merge in current season stats (before season all stats are zero)
statNames = [
    'passA', 'passC', 'passY', 'passT', 'passI', 'pass2',
    'rushA', 'rushY', 'rushT', 'rush2',
    'recC', 'recY', 'recT', 'rec2',
    'fum',
    'XPA', 'XPM', 'FGA', 'FGM', 'FG50',
    'defSack', 'defI', 'defSaf', 'defFum', 'defBlk', 'defT',
    'defPtsAgainst', 'defPassYAgainst', 'defRushYAgainst', 'defYdsAgainst',
]
currCols = [f"{stat}_curr" for stat in statNames] + ['gamesPlayed_curr']
# Create the columns by name and fill them with 0. This replaces the positional
# slice through .loc (df2.loc[:, -31:]), which pandas warns about and later rejects,
# and it stays correct if the cell is executed more than once.
df2 = df2.assign(**{col: 0 for col in currCols})
df2

  indexer = self._get_setitem_indexer(key)


Unnamed: 0,player,pos,team,age,posRank,KR,PR,RES,season,gamesPlayed_prior1,...,defI_curr,defSaf_curr,defFum_curr,defBlk_curr,defT_curr,defPtsAgainst_curr,defPassYAgainst_curr,defRushYAgainst_curr,defYdsAgainst_curr,gamesPlayed_curr
0,BUFFALO BILLS,DF,BUF,0.0,DF1,NO,NO,NO,2022,16.0,...,0,0,0,0,0,0,0,0,0,0
1,BUFFALO BILLS,DF,BUF,0.0,DF1,NO,NO,NO,2022,16.0,...,0,0,0,0,0,0,0,0,0,0
2,BUFFALO BILLS,DF,BUF,0.0,DF1,NO,NO,NO,2022,16.0,...,0,0,0,0,0,0,0,0,0,0
3,BUFFALO BILLS,DF,BUF,0.0,DF1,NO,NO,NO,2022,16.0,...,0,0,0,0,0,0,0,0,0,0
4,BUFFALO BILLS,DF,BUF,0.0,DF1,NO,NO,NO,2022,16.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14837,JARED BERNHARDT,WR,ATL,24.0,WR3,NO,NO,NO,2022,0.0,...,0,0,0,0,0,0,0,0,0,0
14838,JARED BERNHARDT,WR,ATL,24.0,WR3,NO,NO,NO,2022,0.0,...,0,0,0,0,0,0,0,0,0,0
14839,JARED BERNHARDT,WR,ATL,24.0,WR3,NO,NO,NO,2022,0.0,...,0,0,0,0,0,0,0,0,0,0
14840,JARED BERNHARDT,WR,ATL,24.0,WR3,NO,NO,NO,2022,0.0,...,0,0,0,0,0,0,0,0,0,0


In [53]:

# Put the columns into their final order (this also drops any column not listed)
keyCols = ['season', 'week', 'team', 'player', 'age', 'KR', 'PR', 'RES', 'pos', 'posRank', 'opponent']
statNames = [
    'passA', 'passC', 'passY', 'passT', 'passI', 'pass2',
    'rushA', 'rushY', 'rushT', 'rush2',
    'recC', 'recY', 'recT', 'rec2',
    'fum',
    'XPA', 'XPM', 'FGA', 'FGM', 'FG50',
    'defSack', 'defI', 'defSaf', 'defFum', 'defBlk', 'defT',
    'defPtsAgainst', 'defPassYAgainst', 'defRushYAgainst', 'defYdsAgainst',
]
# Opponent columns exist only for the defensive stats
defStatNames = [stat for stat in statNames if stat.startswith('def')]

orderedCols = (
    keyCols
    # current season: stats first, games played last
    + [f"{stat}_curr" for stat in statNames] + ['gamesPlayed_curr']
    # prior summaries: games played first, then stats
    + ['gamesPlayed_prior1'] + [f"{stat}_prior1" for stat in statNames]
    + ['gamesPlayed_prior2'] + [f"{stat}_prior2" for stat in statNames]
    # opponent defense, current and prior
    + [f"{stat}_curr_opp" for stat in defStatNames]
    + [f"{stat}_prior1_opp" for stat in defStatNames]
)
df2 = df2[orderedCols]
df2

Unnamed: 0,season,week,team,player,age,KR,PR,RES,pos,posRank,...,defSack_prior1_opp,defI_prior1_opp,defSaf_prior1_opp,defFum_prior1_opp,defBlk_prior1_opp,defT_prior1_opp,defPtsAgainst_prior1_opp,defPassYAgainst_prior1_opp,defRushYAgainst_prior1_opp,defYdsAgainst_prior1_opp
0,2022,1.0,BUF,BUFFALO BILLS,0.0,NO,NO,NO,DF,DF1,...,2.9375,1.0625,0.0000,0.3750,0.0000,0.0625,21.5625,237.1875,101.1875,338.3750
1,2022,3.0,BUF,BUFFALO BILLS,0.0,NO,NO,NO,DF,DF1,...,2.8750,0.8125,0.0000,0.6250,0.0625,0.2500,21.8125,226.6250,108.3125,334.9375
2,2022,4.0,BUF,BUFFALO BILLS,0.0,NO,NO,NO,DF,DF1,...,2.0625,0.5000,0.0000,0.3750,0.0625,0.0625,23.5000,281.6875,84.8125,366.5000
3,2022,6.0,BUF,BUFFALO BILLS,0.0,NO,NO,NO,DF,DF1,...,1.8750,0.9375,0.0000,0.8125,0.0625,0.1875,21.2500,256.2500,113.0000,369.2500
4,2022,9.0,BUF,BUFFALO BILLS,0.0,NO,NO,NO,DF,DF1,...,2.0625,0.4375,0.0000,0.4375,0.0000,0.1250,29.8125,259.6875,136.3125,396.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14837,2022,9.0,ATL,JARED BERNHARDT,24.0,NO,NO,NO,WR,WR3,...,2.0000,0.6875,0.0000,0.6250,0.0000,0.1250,26.5000,224.3125,136.6875,361.0000
14838,2022,11.0,ATL,JARED BERNHARDT,24.0,NO,NO,NO,WR,WR3,...,2.8750,0.5000,0.0625,0.5000,0.0000,0.1875,23.5000,189.3750,126.4375,315.8125
14839,2022,13.0,ATL,JARED BERNHARDT,24.0,NO,NO,NO,WR,WR3,...,3.2500,0.6875,0.0000,0.5000,0.0625,0.0625,24.0625,220.2500,139.6250,359.8750
14840,2022,17.0,ATL,JARED BERNHARDT,24.0,NO,NO,NO,WR,WR3,...,2.5000,0.7500,0.0000,0.8125,0.0000,0.0625,20.5000,213.5000,109.3750,322.8750


In [54]:
# Write the assembled prediction input to disk (the index is written as the first column)
outputPath = 'data_cleaned/toPredict.csv'
df2.to_csv(outputPath)