In [34]:
#import required modules
import numpy as py
import pandas as pd
import re

In [35]:
# Read the whole 2020 rosters HTML file into a single string.
with open("2020rosters.html") as roster_file:
    output = roster_file.read()

In [36]:
# Regex to find players. Each match is a run of capital letters (the first token,
# later used as the position abbreviation), one whitespace character, and a word
# that starts with capital letter(s) optionally followed by lowercase letters.
# The pattern is a raw string so that "\s" reaches the regex engine untouched
# instead of being parsed as an (invalid) string escape, which newer Python
# versions warn about.
pattern = r"""
(?P<PlayerNames>[A-Z]+[A-Z]*\s[A-Z]+[a-z]*)
"""
# re.X makes the regex ignore the newlines surrounding the pattern text above.
playerList = [
    player.group("PlayerNames")
    for player in re.finditer(pattern, output, flags = re.S | re.X)
]

In [37]:
# The position abbreviation is the first space-separated token of every match.
# Some teams list free safeties ("FS"), others strong safeties ("SS"), and some
# list both, so both labels are collapsed into a single safety label, "S".
positionList = [
    "S" if position in ("FS", "SS") else position
    for position in (entry.split(" ")[0] for entry in playerList)
]

In [38]:
# Regex to find player ratings: two digits, a decimal point, one digit (e.g. "90.3").
# Written as a raw string so that "\." is not parsed as an (invalid) string escape,
# which newer Python versions warn about.
pattern = r"[0-9][0-9]\.[0-9]"
# The pattern has no capture groups, so findall returns the full matched text of
# every rating, in document order. Ratings stay strings at this point.
ratingList = re.findall(pattern, output, flags = re.S | re.X)

In [39]:
# Team names entered by hand, ordered to line up with the rankings in the article.
# Note: the Commanders have gone through a series of name changes over the past
# few years, so they are simply given their current name here.
teamList = [
    "Ravens", "Saints", "49ers", "Chiefs",
    "Buccaneers", "Cowboys", "Bills", "Titans",
    "Eagles", "Steelers", "Browns", "Packers",
    "Colts", "Broncos", "Seahawks", "Chargers",
    "Vikings", "Patriots", "Lions", "Falcons",
    "Bears", "Rams", "Texans", "Cardinals",
    "Raiders", "Bengals", "Giants", "Jets",
    "Panthers", "Dolphins", "Commanders", "Jaguars",
]

# Build a dictionary mapping each team to its position ratings. Every team takes
# the next consecutive run of 24 entries from ratingList / positionList. Within a
# team the ratings are sorted from highest to lowest, only the first entry kept
# for each position label, and the rating strings are converted to numbers.
teamDict2020 = {}
for team_number, team in enumerate(teamList):
    start = team_number * 24
    stop = start + 24
    ratings = pd.Series(ratingList[start:stop], index = positionList[start:stop])
    ratings = ratings.sort_values(ascending = False)
    is_repeat = ratings.index.duplicated(keep = 'first')
    teamDict2020[team] = pd.to_numeric(ratings[~is_repeat])

# For some reason Kyle Juszczyk was the only fullback included in these roster
# rankings, so he is dropped to keep the set of positions the same for all teams.
teamDict2020["49ers"] = teamDict2020["49ers"].drop("FB")

# Demonstrating position rankings for one team
teamDict2020["Ravens"]

DT      90.3
QB      90.1
LT      89.4
TE      88.5
CB      83.9
RB      83.8
LB      82.8
S       76.9
RT      73.6
WR      72.8
EDGE    69.2
C       68.7
LG      63.8
RG      63.6
dtype: float64

In [40]:
# Read the whole 2021 rosters HTML file into a single string
# (this replaces the 2020 text previously held in `output`).
with open("2021rosters.html") as roster_file:
    output = roster_file.read()

In [41]:
# Regex to find players. Each match is a run of capital letters (the first token,
# later used as the position abbreviation), one whitespace character, and a word
# that starts with capital letter(s) optionally followed by lowercase letters.
# The pattern is a raw string so that "\s" reaches the regex engine untouched
# instead of being parsed as an (invalid) string escape, which newer Python
# versions warn about.
pattern = r"""
(?P<PlayerNames>[A-Z]+[A-Z]*\s[A-Z]+[a-z]*)
"""

# re.X makes the regex ignore the newlines surrounding the pattern text above.
playerList = [
    player.group("PlayerNames")
    for player in re.finditer(pattern, output, flags = re.S | re.X)
]

In [42]:
# Some entries separate the position from the name with a non-breaking space
# ("\xa0") while others use a regular space, so cut each entry at both kinds:
# first keep the text before any non-breaking space, then before any space.
positionList = [entry.split("\xa0")[0].split(" ")[0] for entry in playerList]

# No issues with safeties in this data set, but DTs are oddly labeled "DI",
# so they are switched back to "DT".
positionList = ["DT" if position == "DI" else position for position in positionList]

In [43]:
# Regex to find player ratings: two digits, a decimal point, one digit (e.g. "93.3").
# Written as a raw string so that "\." is not parsed as an (invalid) string escape,
# which newer Python versions warn about.
pattern = r"[0-9][0-9]\.[0-9]"
# The pattern has no capture groups, so findall returns the full matched text of
# every rating, in document order. Ratings stay strings at this point.
ratingList = re.findall(pattern, output, flags = re.S | re.X)

In [44]:
# Team names entered by hand, ordered to line up with the rankings in the article.
teamList = [
    "Buccaneers", "Chiefs", "Browns", "Bills",
    "Ravens", "Packers", "Rams", "Cowboys",
    "Vikings", "Broncos", "49ers", "Commanders",
    "Titans", "Colts", "Seahawks", "Steelers",
    "Saints", "Patriots", "Giants", "Chargers",
    "Cardinals", "Dolphins", "Bears", "Bengals",
    "Falcons", "Raiders", "Panthers", "Jaguars",
    "Eagles", "Jets", "Lions", "Texans",
]

# Build a dictionary mapping each team to its position ratings. Every team takes
# the next consecutive run of 24 entries from ratingList / positionList. Within a
# team the ratings are sorted from highest to lowest, only the first entry kept
# for each position label, and the rating strings are converted to numbers.
teamDict2021 = {}
for team_number, team in enumerate(teamList):
    start = team_number * 24
    stop = start + 24
    ratings = pd.Series(ratingList[start:stop], index = positionList[start:stop])
    ratings = ratings.sort_values(ascending = False)
    is_repeat = ratings.index.duplicated(keep = 'first')
    teamDict2021[team] = pd.to_numeric(ratings[~is_repeat])

# Same issue with fullbacks ("FB") as in 2020; in addition, EDGE Andrew Van Ginkel
# was for some reason mislabeled as an "ED". Drop these one-off labels so every
# team ends up with the same set of positions.
for team, stray_label in (("Vikings", "FB"), ("49ers", "FB"), ("Dolphins", "ED")):
    teamDict2021[team] = teamDict2021[team].drop(stray_label)

# Demonstrating position rankings for one team
teamDict2021["Buccaneers"]

QB      93.3
DT      89.9
RT      84.1
WR      82.2
LG      80.7
LB      78.8
CB      76.4
EDGE    74.7
LT      72.8
TE      70.9
S       70.0
RG      67.7
RB      65.5
C       63.6
dtype: float64

In [45]:
# Read the whole 2022 rosters HTML file into a single string
# (this replaces the 2021 text previously held in `output`).
with open("2022rosters.html") as roster_file:
    output = roster_file.read()

In [46]:
# Regex to find players - eventually condensed into positions.
# This year needs a slightly different regex because the name abbreviations have
# become inconsistent: e.g. D.J. Moore appears as "DJ Moore" while A.J. Brown
# appears as "A.J. Brown". The name part therefore only matches the leading
# capital letter(s) instead of a whole capitalised word.
# The pattern is a raw string so that "\s" reaches the regex engine untouched
# instead of being parsed as an (invalid) string escape, which newer Python
# versions warn about.
pattern = r"""
(?P<PlayerNames>[A-Z]+[A-Z]*\s[A-Z]+)
"""

# For some reason EDGEs are labeled "Edge" in this file; rewrite the label in the
# text first so the all-capitals pattern above can match it, consistent with the
# other seasons. A plain string replacement is enough for this literal.
output = output.replace("Edge", "EDGE")

# re.X makes the regex ignore the newlines surrounding the pattern text above.
playerList = [
    player.group("PlayerNames")
    for player in re.finditer(pattern, output, flags = re.S | re.X)
]

# Per the original note there are no non-breaking-space issues in this data set,
# so the position is simply the first space-separated token of each match.
# The "DI" label for DTs is back, though, so it is switched to "DT" again.
positionList = [
    "DT" if position == "DI" else position
    for position in (entry.split(" ")[0] for entry in playerList)
]

In [47]:
# Regex to find player ratings: two digits, a decimal point, one digit (e.g. "91.2").
# Written as a raw string so that "\." is not parsed as an (invalid) string escape,
# which newer Python versions warn about.
pattern = r"[0-9][0-9]\.[0-9]"
# The pattern has no capture groups, so findall returns the full matched text of
# every rating, in document order. Ratings stay strings at this point.
ratingList = re.findall(pattern, output, flags = re.S | re.X)

# For some inexplicable reason, Derek Carr's rating (77.6) was left out of the
# file, so it is inserted by hand to keep ratings and positions aligned.
# Index 480 = 20 * 24: with 24 ratings per team, this is the first slot of the
# 21st team's slice, which is the Raiders in the 2022 team list.
ratingList.insert(480, "77.6")

In [48]:
# Team names entered by hand, ordered to line up with the rankings in the article.
teamList = [
    "Bills", "Buccaneers", "Chargers", "Rams",
    "Packers", "Browns", "Eagles", "Bengals",
    "Chiefs", "Dolphins", "Saints", "Ravens",
    "49ers", "Broncos", "Colts", "Cowboys",
    "Commanders", "Patriots", "Vikings", "Titans",
    "Raiders", "Steelers", "Cardinals", "Jets",
    "Lions", "Giants", "Panthers", "Jaguars",
    "Seahawks", "Bears", "Falcons", "Texans",
]

# Build a dictionary mapping each team to its position ratings. Every team takes
# the next consecutive run of 24 entries from ratingList / positionList. Within a
# team the ratings are sorted from highest to lowest, only the first entry kept
# for each position label, and the rating strings are converted to numbers.
teamDict2022 = {}
for team_number, team in enumerate(teamList):
    start = team_number * 24
    stop = start + 24
    ratings = pd.Series(ratingList[start:stop], index = positionList[start:stop])
    ratings = ratings.sort_values(ascending = False)
    is_repeat = ratings.index.duplicated(keep = 'first')
    teamDict2022[team] = pd.to_numeric(ratings[~is_repeat])

# Once again a few teams list a fullback ("FB"); drop it so every team ends up
# with the same set of positions.
for team in ("Dolphins", "Ravens", "49ers", "Vikings"):
    teamDict2022[team] = teamDict2022[team].drop("FB")

# Demonstrating position rankings for one team
teamDict2022["Bills"]

EDGE    91.2
QB      90.9
RB      85.0
S       83.8
WR      81.5
LT      79.4
DT      71.8
CB      70.3
LG      68.1
RG      66.3
TE      63.6
C       63.4
RT      59.3
LB      50.4
dtype: float64

In [49]:
# Use pickle to export this data so it can be used in data cleaning.
import pickle

# Write each season's dictionary to its own pickle file. Opening the files in a
# with-block guarantees every handle is flushed and closed right after its dump,
# rather than leaving three open file objects behind in the kernel.
for file_name, team_dict in (
    ("2020Dict.p", teamDict2020),
    ("2021Dict.p", teamDict2021),
    ("2022Dict.p", teamDict2022),
):
    with open(file_name, "wb") as pickle_file:
        pickle.dump(team_dict, pickle_file)
