In [54]:
#import required modules
import numpy as py
import pandas as pd
import re

In [55]:
#import 2020 rosters
with open("2020rosters.html") as file:
        output = file.read()

In [56]:
#regex to find players - eventually going to condense into positions
pattern = """
(?P<PlayerNames>[A-Z]+[A-Z]*\s[A-Z]+[a-z]*)
"""
playerList = []
for player in re.finditer(pattern, output, flags = re.S | re.X):
        playerList.append(player.group("PlayerNames"))

In [57]:
positionList = [x.split(" ")[0] for x in playerList]
#Some teams have free safties, others have strong safeties, and some have both 
#We'll just replace the position with a singlular "safety"
i = 0
while i < len(positionList):
        if positionList[i] == "FS" or positionList[i] == "SS":
                positionList[i] = "S"
        i += 1

In [58]:
#regex to find player ratings
pattern = "[0-9][0-9]\.[0-9]"
ratingList = []
for rating in re.finditer(pattern, output, flags = re.S | re.X):
        ratingList.append(rating.group())

In [59]:
#hand-inputed team list to line up with the rankings in the article
#note the Commanders have gone through a series of name changes over these past few years, 
#so I'm just giving them their current name to simplify things
teamList = ["Ravens", "Saints", "49ers", "Chiefs", 
            "Buccaneers", "Cowboys", "Bills", "Titans", 
            "Eagles", "Steelers", "Browns", "Packers", "Colts",
            "Broncos", "Seahawks", "Chargers", "Vikings", "Patriots",
            "Lions", "Falcons", "Bears", "Rams", "Texans", "Cardinals",
            "Raiders", "Bengals", "Giants", "Jets", "Panthers", "Dolphins",
            "Commanders", "Jaguars"]

#create a dictionary of teams with respective position rankings, sort from highest to lowest, dropping duplicates
index = 0
teamDict2020 = {}
for team in teamList:
        teamDict2020[team] = pd.Series(ratingList[index:index + 24], index = positionList[index:index + 24]).sort_values(ascending = False)
        teamDict2020[team] = teamDict2020[team][~teamDict2020[team].index.duplicated(keep = 'first')]
        index += 24
        
#For some reason Kyle Jusczyk was the only Fullback included in these roster rankings
#So we'll just drop him to keep the variables constant across all teams
teamDict2020["49ers"] = teamDict2020["49ers"].drop("FB")

#Demonstrating position rankings for one team
teamDict2020["Ravens"]

DT      90.3
QB      90.1
LT      89.4
TE      88.5
CB      83.9
RB      83.8
LB      82.8
S       76.9
RT      73.6
WR      72.8
EDGE    69.2
C       68.7
LG      63.8
RG      63.6
dtype: object

In [60]:
#import 2021 rosters
with open("2021rosters.html") as file:
        output = file.read()

In [61]:
#regex to find players - eventually going to condense into positions
pattern = """
(?P<PlayerNames>[A-Z]+[A-Z]*\s[A-Z]+[a-z]*)
"""

playerList = []
for player in re.finditer(pattern, output, flags = re.S | re.X):
        playerList.append(player.group("PlayerNames"))

In [62]:
#Some entries weirdly have no break spaces while others have regular spaces
#Let's just remove them both
positionList = [x.split("\xa0")[0] for x in playerList]
positionList = [x.split(" ")[0] for x in positionList]

#No issues with safeties on this data set, but DTs are oddly labled as "DI"s, so we'll switch them back
i = 0
while i < len(positionList):
        if positionList[i] == "DI":
                positionList[i] = "DT"
        i += 1

In [63]:
#regex to find player ratings
pattern = "[0-9][0-9]\.[0-9]"
ratingList = []
for rating in re.finditer(pattern, output, flags = re.S | re.X):
        ratingList.append(rating.group())

In [64]:
#hand-inputed team list to line up with the rankings in the article
teamList = ["Buccaneers", "Chiefs", "Browns", "Bills", "Ravens",
            "Packers", "Rams", "Cowboys", "Vikings", "Broncos", "49ers",
            "Commanders", "Titans", "Colts", "Seahawks", "Steelers", "Saints",
            "Patriots", "Giants", "Chargers", "Cardinals", "Dolphins", "Bears",
            "Bengals", "Falcons", "Raiders", "Panthers", "Jaguars", "Eagles",
            "Jets", "Lions", "Texans"]

#create a dictionary of teams with respective position rankings, sort from highest to lowest, dropping duplicates
index = 0
teamDict2021 = {}
for team in teamList:
        teamDict2021[team] = pd.Series(ratingList[index:index + 24], index = positionList[index:index + 24]).sort_values(ascending = False)
        teamDict2021[team] = teamDict2021[team][~teamDict2021[team].index.duplicated(keep = 'first')]
        teamDict2021[team] = pd.to_numeric(teamDict2021[team])
        index += 24

#Same issue with FB as before, also for some reason EDGE Andrew Van Ginkel was mislabled as an "ED"
teamDict2021["Vikings"] = teamDict2021["Vikings"].drop("FB")
teamDict2021["49ers"] = teamDict2021["49ers"].drop("FB")
teamDict2021["Dolphins"] = teamDict2021["Dolphins"].drop("ED")

#Demonstrating position rankings for one team
teamDict2021["Buccaneers"]

QB      93.3
DT      89.9
RT      84.1
WR      82.2
LG      80.7
LB      78.8
CB      76.4
EDGE    74.7
LT      72.8
TE      70.9
S       70.0
RG      67.7
RB      65.5
C       63.6
dtype: float64

In [65]:
#import 2022 rosters
with open("2022rosters.html") as file:
        output = file.read()

In [66]:
#regex to find players - eventually going to condense into positions
#this time we have to use a slightly different regex becasue the abbreviations have become incosistent
#i.e. initial names are weird - like how D.J. Moore is "DJ Moore", while A.J. Brown is "A.J. Brown"
pattern = """
(?P<PlayerNames>[A-Z]+[A-Z]*\s[A-Z]+)
"""

#for some reason EDGEs are labled as "Edge", so we must fix this to maintain consistency
output = re.sub("Edge", "EDGE", output)
playerList = []
for player in re.finditer(pattern, output, flags = re.S | re.X):
        playerList.append(player.group("PlayerNames"))

#no issues with break spaces in this data set
positionList = [x.split(" ")[0] for x in playerList]
#the issue with "DI"s is back though
i = 0
while i < len(positionList):
        if positionList[i] == "DI":
                positionList[i] = "DT"
        i += 1

In [67]:
#regex to find player ratings
pattern = "[0-9][0-9]\.[0-9]"
ratingList = []
for rating in re.finditer(pattern, output, flags = re.S | re.X):
        ratingList.append(rating.group())
        
#for some inexplicable reason, Derek Carr's (77.6) rating was left out
ratingList.insert(480, "77.6")

In [68]:
#hand-inputed team list to line up with the rankings in the article
teamList = ["Bills", "Buccaneers", "Chargers", "Rams", "Packers", "Browns",
            "Eagles", "Bengals", "Chiefs", "Dolphins", "Saints", "Ravens", "49ers",
            "Broncos", "Colts", "Cowboys", "Commanders", "Patriots", "Vikings",
            "Titans", "Raiders", "Steelers", "Cardinals", "Jets", "Lions", "Giants",
            "Panthers", "Jaguars", "Seahawks", "Bears", "Falcons", "Texans"]

#create a dictionary of teams with respective position rankings, sort from highest to lowest, dropping duplicates
index = 0
teamDict2022 = {}
for team in teamList:
        teamDict2022[team] = pd.Series(ratingList[index:index + 24], index = positionList[index:index + 24]).sort_values(ascending = False)
        teamDict2022[team] = teamDict2022[team][~teamDict2022[team].index.duplicated(keep = 'first')]
        teamDict2022[team] = pd.to_numeric(teamDict2022[team])
        index += 24
        
#once again, the issue with FBs
teamDict2022["Dolphins"] = teamDict2022["Dolphins"].drop("FB")
teamDict2022["Ravens"] = teamDict2022["Ravens"].drop("FB")
teamDict2022["49ers"] = teamDict2022["49ers"].drop("FB")
teamDict2022["Vikings"] = teamDict2022["Vikings"].drop("FB")

#Demonstrating position rankings for one team
teamDict2022["Bills"]

EDGE    91.2
QB      90.9
RB      85.0
S       83.8
WR      81.5
LT      79.4
DT      71.8
CB      70.3
LG      68.1
RG      66.3
TE      63.6
C       63.4
RT      59.3
LB      50.4
dtype: float64

In [69]:
#manually removing rookie college rankings because they are very inaccurate predictors
teamDict2020["Ravens"]["RB"] = 78.7
teamDict2020["Ravens"]["LB"] = 74.4
teamDict2020["Saints"]["RG"] = 58.6
teamDict2020["49ers"]["WR"] = 77.5
teamDict2020["49ers"]["DT"] = 89.6
teamDict2020["Chiefs"]["RB"] = 65.7
teamDict2020["Chiefs"]["LB"] = 50.4
teamDict2020["Buccaneers"]["WR"] = 90.7
teamDict2020["Buccaneers"]["RB"] = 59.9
teamDict2020["Buccaneers"]["RT"] = 81.8
teamDict2020["Buccaneers"]["S"] = 67.1
teamDict2020["Cowboys"]["WR"] = 84.1
teamDict2020["Cowboys"]["C"] = 53.5
teamDict2020["Cowboys"]["CB"] = 72.2
teamDict2020["Bills"]["RB"] = 69.3
teamDict2020["Titans"]["CB"] = 79.9
teamDict2020["Eagles"]["WR"] = 74.8
teamDict2020["Eagles"]["LB"] = 62.2
teamDict2020["Browns"]["LT"] = 51.5
teamDict2020["Browns"]["LB"] = 58.4
teamDict2020["Packers"]["RB"] = 84.8
teamDict2020["Packers"]["TE"] = 68.0
teamDict2020["Packers"]["DT"] = 79.5
teamDict2020["Colts"]["RB"] = 83.9
teamDict2020["Colts"]["WR"] = 75.1
teamDict2020["Broncos"]["WR"] = 83.1
teamDict2020["Broncos"]["C"] = 40.5
teamDict2020["Broncos"]["CB"] = 81.3
teamDict2020["Seahawks"]["RG"] = 70.2
teamDict2020["Seahawks"]["LB"] = 76.4
teamDict2020["Chargers"]["LB"] = 66.6
teamDict2020["Vikings"]["WR"] = 90.4
teamDict2020["Vikings"]["LT"] = 66.2
teamDict2020["Vikings"]["RG"] = 33.1
teamDict2020["Vikings"]["CB"] = 70.1
teamDict2020["Patriots"]["QB"] = 42.6
teamDict2020["Patriots"]["TE"] = 51.6
teamDict2020["Rams"]["RB"] = 66.7
teamDict2020["Rams"]["RG"] = 57.0
teamDict2020["Rams"]["CB"] = 70.3
teamDict2020["Falcons"]["RG"] = 47.0
teamDict2020["Falcons"]["CB"] = 56.8
teamDict2020["Bears"]["CB"] = 62.5
teamDict2020["Rams"]["RB"] = 68.4
teamDict2020["Rams"]["LB"] = 60.7
teamDict2020["Texans"]["DT"] = 59.8
teamDict2020["Texans"]["CB"] = 64.5
teamDict2020["Cardinals"]["DT"] = 65.1
teamDict2020["Cardinals"]["LB"] = 61.8
teamDict2020["Raiders"]["S"] = 69.3
teamDict2020["Bengals"]["QB"] = 75.1
teamDict2020["Bengals"]["WR"] = 85.2
teamDict2020["Bengals"]["LT"] = 70.1
teamDict2020["Bengals"]["LB"] = 51.2
teamDict2020["Giants"]["S"] = 66.6
teamDict2020["Giants"]["LT"] = 62.4
teamDict2020["Jets"]["WR"] = 73.0
teamDict2020["Jets"]["LT"] = 74.4
teamDict2020["Panthers"]["DT"] = 83.7
teamDict2020["Panthers"]["EDGE"] = 54.7
teamDict2020["Panthers"]["S"] = 76.4
teamDict2020["Dolphins"]["LT"] = 52.3
teamDict2020["Dolphins"]["RG"] = 65.8
teamDict2020["Dolphins"]["DT"] = 64.4
teamDict2020["Commanders"]["EDGE"] = 60.2
teamDict2020["Jaguars"]["DT"] = 75.6
teamDict2020["Jaguars"]["EDGE"] = 68.4
teamDict2021["Chiefs"]["C"] = 91.4
teamDict2021["Browns"]["LB"] = 45.5
teamDict2021["Browns"]["CB"] = 70.5
teamDict2021["Ravens"]["RG"] = 55.8
teamDict2021["Packers"]["C"] = 58.3
teamDict2021["Rams"]["LB"] = 40.0
teamDict2021["Rams"]["S"] = 65.6
teamDict2021["Cowboys"]["LB"] = 89.8
teamDict2021["Cowboys"]["CB"] = 62.7
teamDict2021["Vikings"]["LT"] = 71.9
teamDict2021["Vikings"]["RG"] = 54.4
teamDict2021["Broncos"]["RB"] = 72.5
teamDict2021["49ers"]["RG"] = 56.9
teamDict2021["Commanders"]["WR"] = 78.5
teamDict2021["Commanders"]["RT"] = 74.9
teamDict2021["Commanders"]["LB"] = 64.8
teamDict2021["Titans"]["RT"] = 49.7
teamDict2021["Titans"]["CB"] = 66.7
teamDict2021["Colts"]["EDGE"] = 62.9
teamDict2021["Seahawks"]["WR"] = 82.5
teamDict2021["Steelers"]["RB"] = 70.7
teamDict2021["Steelers"]["TE"] = 57.5
teamDict2021["Steelers"]["C"] = 52.4
teamDict2021["Saints"]["LB"] = 77.8
teamDict2021["Saints"]["CB"] = 60.3
teamDict2021["Giants"]["EDGE"] = 67.7
teamDict2021["Chargers"]["LT"] = 83.6
teamDict2021["Chargers"]["CB"] = 63.1
teamDict2021["Cardinals"]["LB"] = 59.9
teamDict2021["Dolphins"]["RT"] = 50.8
teamDict2021["Dolphins"]["EDGE"] = 66.4
teamDict2021["Dolphins"]["S"] = 84.7
teamDict2021["Bears"]["LT"] = 47.5
teamDict2021["Bengals"]["WR"] = 83.1
teamDict2021["Bengals"]["RG"] = 56.3
teamDict2021["Falcons"]["TE"] = 80.3
teamDict2021["Falcons"]["LG"] = 49.2
teamDict2021["Falcons"]["S"] = 65.3
teamDict2021["Raiders"]["RT"] = 45.0
teamDict2021["Raiders"]["S"] = 72.5
teamDict2021["Panthers"]["TE"] = 61.7
teamDict2021["Panthers"]["LT"] = 61.6
teamDict2021["Panthers"]["CB"] = 70.4
teamDict2021["Jaguars"]["QB"] = 59.6
teamDict2021["Jaguars"]["RB"] = 72.9
teamDict2021["Jaguars"]["DT"] = 71.7
teamDict2021["Jaguars"]["CB"] = 64.6
teamDict2021["Eagles"]["WR"] = 71.2
teamDict2021["Eagles"]["CB"] = 62.9
teamDict2021["Jets"]["QB"] = 59.3
teamDict2021["Jets"]["WR"] = 85.3
teamDict2021["Jets"]["LG"] = 66.8
teamDict2021["Lions"]["WR"] = 79.9
teamDict2021["Lions"]["RT"] = 77.0
teamDict2021["Lions"]["DT"] = 63.9
teamDict2022["Bills"]["RB"] = 69.0
teamDict2022["Buccaneers"]["RB"] = 75.1
teamDict2022["Buccaneers"]["LG"] = 51.6
teamDict2022["Chargers"]["RG"] = 65.9
teamDict2022["Rams"]["RG"] = 48.2
teamDict2022["Packers"]["WR"] = 70.1
teamDict2022["Packers"]["DT"] = 75.4
teamDict2022["Browns"]["WR"] = 73.0
teamDict2022["Browns"]["DT"] = 41.6
teamDict2022["Eagles"]["DT"] = 68.7
teamDict2022["Eagles"]["LB"] = 66.5
teamDict2022["Bengals"]["S"] = 72.1
teamDict2022["Chiefs"]["WR"] = 70.0
teamDict2022["Chiefs"]["RT"] = 67.2
teamDict2022["Chiefs"]["EDGE"] = 54.9
teamDict2022["Chiefs"]["CB"] = 79.1
teamDict2022["Saints"]["WR"] = 79.6
teamDict2022["Ravens"]["C"] = 75.4
teamDict2022["49ers"]["LG"] = 55.5
teamDict2022["Broncos"]["DT"] = 73.7
teamDict2022["Colts"]["S"] = 64.8
teamDict2022["Cowboys"]["LG"] = 72.3
teamDict2022["Commanders"]["WR"] = 78.3
teamDict2022["Commanders"]["S"] = 70.9
teamDict2022["Patriots"]["LG"] = 66.3
teamDict2022["Vikings"]["CB"] = 63.0
teamDict2022["Vikings"]["S"] = 77.9
teamDict2022["Titans"]["WR"] = 75.7
teamDict2022["Titans"]["CB"] = 64.1
teamDict2022["Raiders"]["RG"] = 50.6
teamDict2022["Jets"]["WR"] = 71.2
teamDict2022["Jets"]["EDGE"] = 80.3
teamDict2022["Jets"]["CB"] = 78.6
teamDict2022["Lions"]["WR"] = 79.9
teamDict2022["Lions"]["EDGE"] = 68.8
teamDict2022["Giants"]["WR"] = 72.8
teamDict2022["Giants"]["RT"] = 65.8
teamDict2022["Giants"]["EDGE"] = 58.3
teamDict2022["Panthers"]["LT"] = 71.6
teamDict2022["Panthers"]["CB"] = 61.3
teamDict2022["Jaguars"]["RB"] = 67.4
teamDict2022["Jaguars"]["LB"] = 47.0
teamDict2022["Seahawks"]["RB"] = 80.5
teamDict2022["Seahawks"]["LT"] = 66.6
teamDict2022["Seahawks"]["RT"] = 54.0
teamDict2022["Bears"]["WR"] = 74.7
teamDict2022["Bears"]["RT"] = 61.4
teamDict2022["Bears"]["CB"] = 64.2
teamDict2022["Bears"]["S"] = 59.0
teamDict2022["Falcons"]["WR"] = 62.9
teamDict2022["Falcons"]["EDGE"] = 65.3
teamDict2022["Texans"]["RB"] = 65.4
teamDict2022["Texans"]["WR"] = 77.4
teamDict2022["Texans"]["LG"] = 59.6
teamDict2022["Texans"]["S"] = 83.8

In [70]:
#use pickle to export this data so it can be used in data cleaning
import pickle

pickle.dump(teamDict2020, open("2020Dict.p", "wb"))
pickle.dump(teamDict2021, open("2021Dict.p", "wb"))
pickle.dump(teamDict2022, open("2022Dict.p", "wb"))
