In [5]:
#imports
import datetime as dt
from io import StringIO

import bs4
import numpy as np
import pandas as pd
import requests

In [6]:
#Loads PFR's career passing-yards leaderboard (first table on the page)
qbs = pd.read_html('https://www.pro-football-reference.com/leaders/pass_yds_career.htm')
qbs = qbs[0]

#Splits the 'Years' column on '-' into integer first/last year columns
qbs['First Year'] = qbs['Years'].apply(lambda x: int(x.split('-')[0]))
qbs['Last Year'] = qbs['Years'].apply(lambda x: int(x.split('-')[1]))

#Finds QBs in the timeframe wanted for this study.
#.copy() makes the filtered rows an independent frame, so the column
#assignments below do not raise SettingWithCopyWarning or write to a view.
qbs = qbs[(qbs['First Year']>=2002) | (qbs['Last Year']>=2002)].copy()
qbs = qbs.drop(['Rank', 'Yds', 'Years', 'Tm', 'First Year', 'Last Year'], axis=1)

#Cleans qb names so they can be searched later using PFR's URL scheme.
#regex=False is required because '+' is a regex metacharacter.
qbs['Player'] = (qbs['Player']
                 .str.replace('+', '', regex=False)
                 .str.replace("'", '', regex=False))
qb_names = qbs['Player'].values.tolist()



In [7]:
def filter_stats(df):
    """Reduce a raw game-log table to the started games and passing columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Game-log table with two-level column headers, as returned by
        ``pd.read_html``. The input is not modified.

    Returns
    -------
    pandas.DataFrame or None
        One row per started, non-neutral-site game with the columns
        Date, H/A, Opp, GameLoc, Result, Cmp%, Yds1, TD1, Int, Rate, Tm.
        ``None`` is returned when no row survives the filter.
    """
    #Works on a copy so the caller's table keeps its original headers and the
    #function can safely be called again on the same input
    stats = df.copy()

    #goes to the proper level
    stats.columns = stats.columns.droplevel()

    #renames columns without a name
    stats = stats.rename(columns={'Unnamed: 7_level_1': 'H/A', 'Rk': 'Week'})

    #creates a more readable home/away column (blank means home)
    stats['H/A'] = stats['H/A'].fillna(value='H')

    #replaces confusing symbols with more readable ones
    stats = stats.replace(['@', '*'], ['A', 'Y'])

    #drops games the QB did not start and games flagged 'N' in H/A
    #(the original author notes these are Super Bowls, not relevant for the study)
    stats = stats[(stats['GS'] == 'Y') & (stats['H/A'] != 'N')]

    #'Yds' and 'TD' appear more than once (passing, rushing, ...); numbering
    #each occurrence makes the labels unique so the first one can be selected
    occurrences = {'Yds': 0, 'TD': 0}
    cols = []
    for column in stats.columns:
        if column in occurrences:
            occurrences[column] += 1
            cols.append(f'{column}{occurrences[column]}')
        else:
            cols.append(column)
    stats.columns = cols

    #Nothing left to report (e.g. the QB never started a game in this table)
    if stats.empty:
        return None

    games = stats[['Date', 'H/A', 'Opp', 'Result', 'Cmp%',
                   'Yds1', 'TD1', 'Int', 'Rate', 'Tm']].reset_index(drop=True)

    #location of the game: the opponent for away games, the QB's team otherwise
    game_loc = games['Opp'].where(games['H/A'] == 'A', games['Tm'])

    #placed right after 'Opp' for easier readability
    games.insert(3, 'GameLoc', game_loc)

    games['Date'] = pd.to_datetime(games['Date']).dt.date
    return games


#Weather data comes from a GitHub repository
#can be found here    https://github.com/ThompsonJamesBliss/WeatherData
_WEATHER_DATA_URL = 'https://raw.githubusercontent.com/ThompsonJamesBliss/WeatherData/master/data/'

#Prepared weather table, filled on first use so the CSVs are downloaded and
#processed once per kernel instead of once per QB
_weather_table_cache = None


def _load_weather_table():
    """Download the weather CSVs and build one weather/stadium row per game.

    The result is cached in ``_weather_table_cache``; later calls return the
    same DataFrame without touching the network. Callers must not mutate it.
    """
    global _weather_table_cache
    if _weather_table_cache is not None:
        return _weather_table_cache

    games = pd.read_csv(_WEATHER_DATA_URL + 'games.csv')
    games_weather = pd.read_csv(_WEATHER_DATA_URL + 'games_weather.csv')
    df_stadium = pd.read_csv(_WEATHER_DATA_URL + 'stadium_coordinates.csv')

    #merges the two datasets to match weather with games played and stadiums
    readings = pd.merge(games_weather, games, on='game_id')

    #Converts columns into datetime
    time_cols = ['TimeMeasure', 'TimeEndGame', 'TimeStartGame']
    readings[time_cols] = readings[time_cols].apply(pd.to_datetime)

    #keeps only the weather measurements taken while the game was being played
    in_game = readings[(readings['TimeStartGame'] <= readings['TimeMeasure']) &
                       (readings['TimeEndGame'] >= readings['TimeMeasure'])]

    #average in-game temperature per game. Only 'Temperature' is averaged:
    #a frame-wide .mean() raises TypeError on pandas >= 2.0 because of the
    #text columns, and no other average was used anyway.
    avg_temp = in_game.groupby('game_id')['Temperature'].mean()

    #one row per game (first in-game measurement), minus its own temperature
    #reading since the average is used instead
    per_game = in_game.drop_duplicates(subset=['game_id']).drop('Temperature', axis=1)

    #attaches the average temperature (avg_temp is indexed by game_id)
    per_game = pd.merge(per_game, avg_temp, on='game_id')

    #Drops unnecessary columns
    per_game = per_game.drop(['game_id', 'Source',
                              'DistanceToStation', 'TimeStartGame', 'TimeEndGame',
                              'WindDirection', 'TZOffset', 'Season'], axis=1)

    #RT is the rounded average temperature; it replaces the raw column
    per_game['RT'] = per_game['Temperature'].round().astype(int)
    per_game = per_game.drop('Temperature', axis=1)

    #merges weather data/stadiums to get the team that plays there and type of roof data
    weather = pd.merge(per_game, df_stadium, on='StadiumName')
    weather = weather.drop(['StadiumAzimuthAngle'], axis=1)

    #changes the values to simple 0's and 1's based on outdoor vs indoor
    #for simplicity all retractable stadiums are treated as indoor or 1 since
    #the stadium would close the roof if the weather was extreme
    mapping = {'Indoor':1, 'Outdoor':0, 'Retractable':1}
    weather = weather.replace({'RoofType': mapping})

    #team codes used by the weather dataset (only values of up to 3 characters)
    nfl_teams = sorted(weather['HomeTeam'].unique())
    nfl_teams = [team for team in nfl_teams if len(team) <= 3]

    #team codes used in the QB game logs
    real_abv = sorted(['IND', 'MIA', 'SDG', 'DEN', 'ATL', 'BUF', 'STL', 'NOR', 'NYJ',
                       'CLE', 'CAR', 'OAK', 'PIT', 'KAN', 'GNB', 'CHI', 'MIN', 'DET',
                       'TEN', 'PHI', 'WAS', 'NYG', 'DAL', 'HOU', 'JAX', 'ARI', 'SEA',
                       'BAL', 'CIN', 'SFO', 'TAM', 'LAR', 'LAC', 'LVR', 'NWE'])

    #NE/NO sort differently from NWE/NOR, so they are taken out of the
    #positional pairing and mapped explicitly afterwards
    nfl_teams = [team for team in nfl_teams if team not in ('NE', 'NO')]
    real_abv = [team for team in real_abv if team not in ('NWE', 'NOR')]

    #NOTE(review): this pairing is positional - it assumes both sorted lists
    #line up team-for-team. Confirm against the dataset's actual HomeTeam
    #values (e.g. the Los Angeles codes); an explicit dict would be safer.
    weather['HomeTeam'] = weather['HomeTeam'].replace(nfl_teams, real_abv)
    weather['HomeTeam'] = weather['HomeTeam'].replace(['NE', 'NO'], ['NWE', 'NOR'])

    #chronological order with a clean index
    weather = weather.sort_values(by='TimeMeasure').reset_index(drop=True)

    #converts 'TimeMeasure' to a plain date and renames the key columns so
    #they match the QB game-log table
    weather['TimeMeasure'] = pd.to_datetime(weather['TimeMeasure']).dt.date
    weather = weather.rename(columns={'TimeMeasure': 'Date', 'HomeTeam': 'GameLoc'})

    _weather_table_cache = weather
    return weather


def season_type(qb_test, weather=None):
    """Attach game-day weather and stadium data to a QB's game log.

    Parameters
    ----------
    qb_test : pandas.DataFrame
        Game log as produced by ``filter_stats`` (needs the columns shared
        with the weather table, i.e. 'Date' and 'GameLoc').
    weather : pandas.DataFrame, optional
        Prepared weather table. When omitted it is downloaded and built on
        first use, then reused for later calls.

    Returns
    -------
    pandas.DataFrame
        ``qb_test`` left-merged with the weather table on their common
        columns; games with no matching weather row are dropped.
    """
    if weather is None:
        weather = _load_weather_table()

    #merges the two dataframes (on their shared column names) and drops
    #the games for which no weather/stadium row was found
    final = pd.merge(qb_test, weather, how='left')
    final = final.dropna(subset=['StadiumName'])

    return final

In [8]:
#Checks to see if the QB played in playoffs. The extra DF for playoffs needs to be added and level dropped
def play_in_playoffs(df):
    """Return True when ``df`` has multi-level column headers, else False.

    The notebook treats a second game-log table with more than one header
    level as the playoff table. Any object without such columns (including
    ``None``) yields False rather than raising.
    """
    #Explicit level check instead of calling droplevel() inside a bare
    #except, which also hid unrelated errors such as KeyboardInterrupt
    columns = getattr(df, 'columns', None)
    return getattr(columns, 'nlevels', 1) > 1

In [9]:
#Reports whether the given page text mentions the QB position
def qb_check(link):
    """Return True if the substring 'QB' occurs in ``link``, otherwise False."""
    return 'QB' in link
    
#Study sample: the first 30 names from the leaderboard minus the QBs in
#remove_list, plus a few newer QBs added by hand
remove_list = ['Vinny Testaverde', 'Drew Bledsoe', 
               'Mark Brunell', 'Steve McNair', 'Kerry Collins']
test = [name for name in qb_names[0:30] if name not in remove_list]
new_qbs = ['Patrick Mahomes', 'Josh Allen', 'Andrew Luck', 'Dak Prescott', 'Jared Goff']
test.extend(new_qbs)

qb_data_store = {}

#Player ids end in '0N'; N distinguishes players who share the same name
#stem (ex Josh Allen (DE)/Josh Allen (QB)). Only 00-09 are tried.
MAX_ID_SUFFIX = 10

#iterates through qb names and stores the final df in a dictionary
for names in test:
    q_b_name = names.split()
    link1 = q_b_name[1][0].capitalize()
    #first 4 letters of the last name + first 2 letters of the first name
    id_stem = q_b_name[1][0:4].capitalize() + q_b_name[0][0:2].capitalize()

    #Tries each id suffix until a page describing a QB is found.
    #A bounded for-loop replaces the old while-loop, whose "give up" branch
    #never actually left the loop.
    q_b = None
    for name_int in range(MAX_ID_SUFFIX):
        link2 = f'{id_stem}0{name_int}'
        url = f'https://www.pro-football-reference.com/players/{link1}/{link2}/gamelog/'

        result = requests.get(url, timeout=30)
        soup = bs4.BeautifulSoup(result.text, 'lxml')

        #pages without a 'meta' block (e.g. unknown id) count as "not a QB"
        meta = soup.find('div', {'id': 'meta'})
        if meta is not None and qb_check(meta.getText()):
            #parses the tables from the page already downloaded instead of
            #requesting the same URL a second time
            q_b = pd.read_html(StringIO(result.text))
            break

    #Not able to find a gamelog link: skip this QB rather than reusing the
    #tables left over from the previous one
    if q_b is None:
        print(f'Did not find a link for {names}')
        continue

    #The first table is the regular season. A second table is only used when
    #it has multi-level headers, which play_in_playoffs takes as the sign
    #that the QB started a playoff game.
    game_logs = []
    if len(q_b) > 1 and play_in_playoffs(q_b[1]):
        game_logs.append(filter_stats(q_b[1]))
    game_logs.append(filter_stats(q_b[0]))

    #filter_stats returns None when a table holds no started games
    game_logs = [log for log in game_logs if log is not None]
    if not game_logs:
        print(f'Did not find any started games for {names}')
        continue

    if len(game_logs) > 1:
        #Combines playoff games and regular season games into one df,
        #ordered by date with a fresh index
        total_games = (pd.concat(game_logs, ignore_index=True)
                       .sort_values(by=['Date'])
                       .reset_index(drop=True))
    else:
        total_games = game_logs[0]

    #adds weather/stadium data and makes sure the passer rating is numeric
    all_games = season_type(total_games)
    all_games = all_games.astype({'Rate':float})

    #Stores the df in a dictionary with the key as the QB name.
    qb_data_store[names] = all_games
        
    

In [10]:
#Spot check: displays the merged game/weather table stored for one QB
qb_data_store['Drew Brees']

Unnamed: 0,Date,H/A,Opp,GameLoc,Result,Cmp%,Yds1,TD1,Int,Rate,...,Humidity,Precipitation,WindSpeed,Pressure,EstimatedCondition,StadiumName,RT,RoofType,Longitude,Latitude
0,2002-09-08,A,CIN,CIN,W 34-6,78.95,160,2,0,136.8,...,40.0,0.0,0.00,30.1944,Clear,Paul Brown Stadium,94.0,0.0,-84.516000,39.095000
1,2002-09-15,H,HOU,SDG,W 24-3,53.57,163,1,1,68.0,...,73.0,0.0,10.31,29.8696,Clear,Qualcomm Stadium,72.0,0.0,-117.119444,32.783056
2,2002-09-22,A,ARI,ARI,W 23-15,54.84,181,0,1,58.7,...,8.0,0.0,3.36,29.7898,Clear,Sun Devil Stadium,104.0,0.0,-111.932500,33.426389
3,2002-09-29,H,NWE,SDG,W 21-14,55.56,104,1,0,91.0,...,68.0,0.0,10.31,29.9700,Clear,Qualcomm Stadium,70.0,0.0,-117.119444,32.783056
4,2002-10-06,A,DEN,DEN,L 9-26,61.90,235,1,2,65.1,...,37.0,0.0,12.00,24.6700,Clear,Empower Field at Mile High,62.0,0.0,-105.020000,39.743889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,2020-12-20,H,KAN,NOR,L 29-32,44.12,234,3,1,84.7,...,69.0,,11.18,30.0497,,Mercedes-Benz Superdome,58.0,1.0,-90.811111,29.950833
299,2020-12-25,H,MIN,NOR,W 52-33,73.08,311,0,2,80.8,...,29.0,,9.32,30.2387,,Mercedes-Benz Superdome,48.0,1.0,-90.811111,29.950833
300,2021-01-03,A,CAR,CAR,W 33-7,68.75,201,3,0,116.8,...,77.0,0.0,5.59,29.9227,Clear,Bank of America Stadium,53.0,0.0,-80.852778,35.225833
301,2021-01-10,H,CHI,NOR,W 21-9,71.79,265,2,0,107.3,...,75.0,0.0,12.43,30.1855,Clear,Mercedes-Benz Superdome,44.0,1.0,-90.811111,29.950833


In [11]:
#Converts the dictionary to a single DF; the dictionary keys (QB names)
#become the outer level of the row index
total= pd.concat(qb_data_store)
total

Unnamed: 0,Unnamed: 1,Date,H/A,Opp,GameLoc,Result,Cmp%,Yds1,TD1,Int,Rate,...,Humidity,Precipitation,WindSpeed,Pressure,EstimatedCondition,StadiumName,RT,RoofType,Longitude,Latitude
Tom Brady,0,2001-09-30,H,IND,NWE,W 44-13,56.52,168,0,0,79.6,...,70.0,0.000,24.17,30.2003,Clear,Foxboro Stadium,56.0,0.0,-71.267442,42.092700
Tom Brady,1,2001-10-07,A,MIA,MIA,L 10-30,50.00,86,0,0,58.7,...,63.0,0.000,5.84,30.0497,Clear,Hard Rock Stadium,87.0,0.0,-80.238889,25.958056
Tom Brady,2,2001-10-14,H,SDG,NWE,W 29-26,61.11,364,2,0,93.4,...,100.0,0.012,8.08,30.1265,Light Rain,Foxboro Stadium,56.0,0.0,-71.267442,42.092700
Tom Brady,3,2001-10-21,A,IND,IND,W 38-17,80.00,202,3,0,148.3,...,68.0,0.000,11.62,30.0792,Clear,RCA Dome,71.0,1.0,-86.163333,39.763611
Tom Brady,4,2001-10-28,A,DEN,DEN,L 20-31,65.79,203,2,4,57.1,...,20.0,0.000,10.31,29.9493,Clear,Empower Field at Mile High,73.0,0.0,-105.020000,39.743889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Jared Goff,68,2020-12-06,A,ARI,ARI,W 38-28,78.72,351,1,0,104.9,...,12.0,0.000,0.00,30.1147,Clear,State Farm Stadium,69.0,1.0,-112.263000,33.528000
Jared Goff,69,2020-12-10,H,NWE,LAR,W 24-3,64.00,137,1,1,74.9,...,72.0,0.000,0.00,30.0527,Clear,SoFi Stadium,57.0,0.0,-118.339200,33.953450
Jared Goff,70,2020-12-20,H,NYJ,LAR,L 20-23,64.71,209,2,1,89.0,...,16.0,0.000,6.84,30.0615,Clear,SoFi Stadium,73.0,0.0,-118.339200,33.953450
Jared Goff,71,2020-12-27,A,SEA,SEA,L 9-20,55.81,234,0,1,61.6,...,83.0,0.000,3.11,30.0408,Clear,CenturyLink Field,46.0,0.0,-122.331600,47.595200


In [12]:
#Saves the combined table to a CSV file, index included
total.to_csv('qb_list.csv')

In [13]:
#Reads the saved CSV back in: the first unnamed column (the QB name) gets a
#proper header and the second unnamed column (the per-QB row number) is removed
top_30 = (
    pd.read_csv('qb_list.csv')
    .rename(columns={'Unnamed: 0':'QB Name'})
    .drop("Unnamed: 1", axis=1)
)

#Writes the flattened table for visualisation and loads it again as a check
top_30.to_csv('qbs_for_viz.csv', index=False)
test = pd.read_csv('qbs_for_viz.csv')

In [14]:
#Displays the table re-read from 'qbs_for_viz.csv'
test

Unnamed: 0,QB Name,Date,H/A,Opp,GameLoc,Result,Cmp%,Yds1,TD1,Int,...,Humidity,Precipitation,WindSpeed,Pressure,EstimatedCondition,StadiumName,RT,RoofType,Longitude,Latitude
0,Tom Brady,2001-09-30,H,IND,NWE,W 44-13,56.52,168,0.0,0.0,...,70.0,0.000,24.17,30.2003,Clear,Foxboro Stadium,56.0,0.0,-71.267442,42.092700
1,Tom Brady,2001-10-07,A,MIA,MIA,L 10-30,50.00,86,0.0,0.0,...,63.0,0.000,5.84,30.0497,Clear,Hard Rock Stadium,87.0,0.0,-80.238889,25.958056
2,Tom Brady,2001-10-14,H,SDG,NWE,W 29-26,61.11,364,2.0,0.0,...,100.0,0.012,8.08,30.1265,Light Rain,Foxboro Stadium,56.0,0.0,-71.267442,42.092700
3,Tom Brady,2001-10-21,A,IND,IND,W 38-17,80.00,202,3.0,0.0,...,68.0,0.000,11.62,30.0792,Clear,RCA Dome,71.0,1.0,-86.163333,39.763611
4,Tom Brady,2001-10-28,A,DEN,DEN,L 20-31,65.79,203,2.0,4.0,...,20.0,0.000,10.31,29.9493,Clear,Empower Field at Mile High,73.0,0.0,-105.020000,39.743889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4903,Jared Goff,2020-12-06,A,ARI,ARI,W 38-28,78.72,351,1.0,0.0,...,12.0,0.000,0.00,30.1147,Clear,State Farm Stadium,69.0,1.0,-112.263000,33.528000
4904,Jared Goff,2020-12-10,H,NWE,LAR,W 24-3,64.00,137,1.0,1.0,...,72.0,0.000,0.00,30.0527,Clear,SoFi Stadium,57.0,0.0,-118.339200,33.953450
4905,Jared Goff,2020-12-20,H,NYJ,LAR,L 20-23,64.71,209,2.0,1.0,...,16.0,0.000,6.84,30.0615,Clear,SoFi Stadium,73.0,0.0,-118.339200,33.953450
4906,Jared Goff,2020-12-27,A,SEA,SEA,L 9-20,55.81,234,0.0,1.0,...,83.0,0.000,3.11,30.0408,Clear,CenturyLink Field,46.0,0.0,-122.331600,47.595200
