In [1]:
import pandas as pd
from bs4 import BeautifulSoup as bs
import requests
import time
import numpy as np
import random
from matplotlib import pyplot as plt
import seaborn as sns
import regex as re

In [5]:
def scrape_fantasy_pros(start_year = 2018, end_year = 2021, position = None):
    
    years = [i for i in range(start_year, end_year+1)]
    positions = ['qb', 'rb', 'wr', 'te']
    
    weekly_df = pd.DataFrame()
    for yr in years:
        print(yr)
        for wk in range(1, 18):
            print(wk, end = '\r')
            for pos in positions:

                time.sleep(0.5)

                BASE_URL = 'https://www.fantasypros.com/nfl/stats/{position}.php?year={year}&week={week}&range=week&scoring=HALF'.format(position=pos,year=yr,week=wk)
                page = requests.get(BASE_URL)

                col_dict = {'rb': ['rank','player', 'rush_att', 'rush_yards', 'rush_y/a', 'rush_longest',
                                  'rush_20+', 'rush_td', 'rec', 'target', 'rec_yards', 'rec_y/r', 
                                  'rec_td', 'fumbles', 'games', 'fpts', 'fpts/g', 'rostered'],
                          'wr': ['rank','player','rec','target','rec_yards','rec_y/r','rec_longest','rec_20+',
                                'rec_td','rush_att','rush_yards','rush_td','fumbles','games',
                                'fpts','fpts/g','rostered'],
                          'te': ['rank','player','rec','target','rec_yards','rec_y/r','rec_longest','rec_20+',
                                'rec_td','rush_att','rush_yards','rush_td','fumbles','games',
                                'fpts','fpts/g','rostered'],
                          'qb': ['rank','player','completions','attempts','percent','pass_yards','pass_y/a',
                                'pass_td','interceptions','sacks','rush_att','rush_yards','rush_td','fumbles',
                                'games','fpts','fpts/g','rostered']}

                if page.ok:
                    soup = bs(page.content, "html.parser")
                    table = soup.find_all('table')
                    df = pd.read_html(str(table))[0]
                    df.columns = col_dict[pos]
                    df['fpts'].replace(0.0, np.nan, inplace=True)
                    df.drop(['rank'], axis = 1)
                    df['player'] = df['player'].str.split('(', expand=True)[0]
                    df['player'] = df['player'].str.rstrip()
                    df["player"] = df.player.str.replace('[^\w\s]','', regex=True).str.strip()
                    first = df['player'].str.split(' ', expand=True)[0]
                    last = df['player'].str.split(' ', expand=True)[1]
                    df['player'] = first + " " + last
                    df['pos'] = pos
                    df['season'] = yr
                    df['week'] = wk
                weekly_df = pd.concat([weekly_df, df], ignore_index=True)
    weekly_df = weekly_df.fillna(0)
    return weekly_df

def scrape_ADP(start_year = 2018, end_year = 2021, game_type = 'half-ppr', team_size = 10):
    years = [i for i in range(start_year, end_year+1)]

    BASE_URL = 'https://fantasyfootballcalculator.com/adp/{}/12-team/all/{}'

    adp_df = pd.DataFrame()
    rounds = []
    for i in range(1, 30):
        rounds += [i for j in range(team_size)]

    for current_year in years:
        URL = BASE_URL.format(game_type, current_year)


        page = requests.get(URL)
        if page.ok:
            print(URL)
            soup = bs(page.content, 'html.parser')
            table = soup.find_all("table")
            df = pd.read_html(str(table))[0]
            df['Season'] = current_year
            df['Round'] = rounds[:df.shape[0]]
            df['Season'] = current_year
            df["Name"] = df.Name.str.replace('[^\w\s]','', regex=True).str.strip()
            first = df['Name'].str.split(' ', expand=True)[0]
            last = df['Name'].str.split(' ', expand=True)[1]
            df['Name'] = first + " " + last
            adp_df = pd.concat([adp_df,df], ignore_index=True)
            time.sleep(3)

    #adp_df = adp_df.drop(['TimesDrafted', 'Unnamed: 10', 'Bye', 'Graph'], axis = 1) #if 2022 included
    adp_df = adp_df.drop(['TimesDrafted', 'Unnamed: 10'], axis = 1)
    adp_df["Low_STD"] = adp_df['Overall'] - adp_df["Std.Dev"]
    adp_df["High_STD"] = adp_df['Overall'] + adp_df["Std.Dev"]
    adp_df['Pos'] = adp_df['Pos'].str.lower()
    adp_df.columns = adp_df.columns.str.lower()
    adp_df.rename(columns = {'name': 'player'}, inplace = True)
    return adp_df

def get_season_results(df):
    
    for i in range(1,18): 
        print(i, end = '\r')
        if i == 1:
            temp = df[df['week'] == i]
            Season_df = temp[['player', 'pos', 'season', 'fpts']]
            Season_df.columns = ['player', 'pos', 'season', 'week1']
        else:
            temp = df[df['week'] == i]
            temp = temp[['player', 'pos', 'season', 'fpts']]
            temp.columns = ['player', 'pos', 'season', 'week{}'.format(i)]
            Season_df = Season_df.merge(temp, on=['player', 'pos', 'season'], how='outer')
         
    VOR_dict = {'qb':10, 'rb':20, 'wr': 30, 'te': 10}  
    weeks = ['week{}'.format(i) for i in range(1,18)]
    seasons = [i for i in range(2018,2022)]
            
    Season_df['total'] = Season_df[weeks].sum(axis = 1)
    Season_df['games_played'] = Season_df[weeks].count(axis = 1)
    Season_df['avg'] = Season_df.total.div(Season_df.games_played)
    Season_df.loc[~np.isfinite(Season_df['avg']), 'avg'] = 0 

    ranking_df = pd.DataFrame()
    for s in seasons:
        for pos in VOR_dict.keys():
            temp = Season_df[(Season_df['pos'] == pos) & (Season_df['season'] == s)]
            temp['pos_rank'] = temp['total'].rank(ascending = False)
            for wk in weeks:
                temp[str(wk) + '_rank'] = temp[wk].rank(ascending = False)
            temp['rank_total'] = temp[weeks].sum(axis = 1)
            temp['rank_avg'] = temp.rank_total.div(temp.games_played)
            temp.loc[~np.isfinite(temp['rank_avg']), 'rank_avg'] = 0 
            temp['rank_median'] = temp[weeks].median(axis = 1)
            ranking_df = pd.concat([ranking_df, temp], ignore_index=True)
    Season_df = ranking_df
    
    Season_df['VOR_Total'] = np.nan
    Season_df['VOR_Avg'] = np.nan
    
    
    
    for s in seasons:
        print(s)
        for position in VOR_dict.keys():
            print(position)

            # Get VOR player data
            replacement_player = Season_df[(Season_df['pos'] == position) & (Season_df['season']==s) & (Season_df['pos_rank'] == VOR_dict[position])]
            replacement_total_value = replacement_player['total'].iloc[0]
            avg_value = replacement_player['avg']

            # Get player position df
            temp = Season_df[Season_df['pos'] == position]
            temp = temp.sort_values(by=['total'], ascending = False)
            counter = 0


            for index, row in temp.iterrows():
                counter += 1
                print(str(counter) + '/' + str(temp.shape[0]), end = '\r')

                # Add VOR
                VOR_Total = row['total'] - replacement_total_value
                VOR_Avg = VOR_Total / 16
                Season_df.at[index, 'VOR_Total'] = VOR_Total
                Season_df.at[index, 'VOR_Avg'] = VOR_Avg
            
    

    
    return Season_df.sort_values(by = 'total', ascending = False)

def combine_adp(first_df, second_df):
    
    new_df = first_df.merge(second_df, on=['player', 'pos', 'season'], how = 'outer')
    new_df = new_df.drop(['pick', '#', 'low', 'high', 'team'], axis = 1)

    
    return new_df

def scrape_teams():
    
    url_teams = {'CAR':'CAR', 'BUF':'BUF', 'SEA':'SEA', 'ATL':'ATL', 'DAL':'DAL', 'PIT':'PIT', 'NYG':'NYG',
             'CIN':'CIN', 'PHI':'PHI', 'MIN':'MIN', 'DET':'DET', 'CHI':'CHI', 'CLE':'CLE', 'JAC':'JAX', 
             'DEN':'DEN', 'NYJ':'NYJ', 'WAS':'WAS', 'MIA':'MIA', 'BAL':'BAL', 'KC':'KC', 'ARI':'ARI', 
             'GB':'GB', 'HOU':'HOU', 'LAC':'LAC', 'TB':'TB', 'TEN':'TEN', 'LAN':'LAR', 'IND':'IND', 
             'NO':'NO', 'NE':'NE', 'SF':'SF'}
    
    team_df = pd.DataFrame()
    
    for t in url_teams.keys():
        for yr in range(2018, 2022):
            time.sleep(1)
            #URL = 'https://www.pro-football-reference.com/teams/{team}/{year}_roster.htm'.format(team = t, year = yr)
            URL = 'https://www.statscrew.com/football/roster/t-{team}/y-{year}'.format(team = t, year = yr)
            page = requests.get(URL)

            if page.ok:
                print("GOOD! ", URL)
                soup = bs(page.content, "html.parser")
                table = soup.find_all('table')
                df = pd.read_html(str(table))[0]
                df = df[['Player', 'Pos.']]
                df = df[(df['Pos.'] == 'QB') | (df['Pos.'] == 'RB') | (df['Pos.'] == 'WR') | (df['Pos.'] == 'TE')]
                df['Pos.'] = df['Pos.'].str.lower()
                df['Player'] = df['Player'].str.rstrip()
                df["Player"] = df.Player.str.replace('[^\w\s]','', regex=True).str.strip()
                first = df['Player'].str.split(' ', expand=True)[0]
                last = df['Player'].str.split(' ', expand=True)[1]
                df['Player'] = first + " " + last
                df.columns = ['player', 'pos']
                df['team'] = url_teams[t]
                df['season'] = yr
                team_df = pd.concat([team_df, df])
             
    for t, yr in zip(['OAK', 'OAK', 'OAK', 'LVR'], range(2018, 2022)):
        
        time.sleep(1)
        #URL = 'https://www.pro-football-reference.com/teams/{team}/{year}_roster.htm'.format(team = t, year = yr)
        URL = 'https://www.statscrew.com/football/roster/t-{team}/y-{year}'.format(team = t, year = yr)
        page = requests.get(URL)

        if page.ok:
            print("GOOD! ", URL)
            soup = bs(page.content, "html.parser")
            table = soup.find_all('table')
            df = pd.read_html(str(table))[0]
            df = df[['Player', 'Pos.']]
            df = df[(df['Pos.'] == 'QB') | (df['Pos.'] == 'RB') | (df['Pos.'] == 'WR') | (df['Pos.'] == 'TE')]
            df['Pos.'] = df['Pos.'].str.lower()
            df['Player'] = df['Player'].str.rstrip()
            df["Player"] = df.Player.str.replace('[^\w\s]','', regex=True).str.strip()
            first = df['Player'].str.split(' ', expand=True)[0]
            last = df['Player'].str.split(' ', expand=True)[1]
            df['Player'] = first + " " + last
            df.columns = ['player', 'pos']
            df['team'] = 'RAI'
            df['season'] = yr
            team_df = pd.concat([team_df, df])
                
    return team_df


In [6]:
game_df = scrape_fantasy_pros()
adp_df = scrape_ADP()
team_df = scrape_teams()
Season_df = get_season_results(game_df)
new_df = combine_adp(Season_df, adp_df)
df = new_df.merge(team_df, on = ['player', 'pos', 'season'], how = 'outer')

2018
2019
2020
2021
https://fantasyfootballcalculator.com/adp/half-ppr/12-team/all/2018
https://fantasyfootballcalculator.com/adp/half-ppr/12-team/all/2019
https://fantasyfootballcalculator.com/adp/half-ppr/12-team/all/2020
https://fantasyfootballcalculator.com/adp/half-ppr/12-team/all/2021
GOOD!  https://www.statscrew.com/football/roster/t-CAR/y-2018
GOOD!  https://www.statscrew.com/football/roster/t-CAR/y-2019
GOOD!  https://www.statscrew.com/football/roster/t-CAR/y-2020
GOOD!  https://www.statscrew.com/football/roster/t-CAR/y-2021
GOOD!  https://www.statscrew.com/football/roster/t-BUF/y-2018
GOOD!  https://www.statscrew.com/football/roster/t-BUF/y-2019
GOOD!  https://www.statscrew.com/football/roster/t-BUF/y-2020
GOOD!  https://www.statscrew.com/football/roster/t-BUF/y-2021
GOOD!  https://www.statscrew.com/football/roster/t-SEA/y-2018
GOOD!  https://www.statscrew.com/football/roster/t-SEA/y-2019
GOOD!  https://www.statscrew.com/football/roster/t-SEA/y-2020
GOOD!  https://www.statscr

151617

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['pos_rank'] = temp['total'].rank(ascending = False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp[str(wk) + '_rank'] = temp[wk].rank(ascending = False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['rank_total'] = temp[weeks].sum(axis = 1)
A value is trying to be set on a copy of a

2018
qb
rb3/383
wr2974/262974
te25/1225
2019731
qb
rb3/383
wr2974/262974
te25/1225
2020731
qb
rb3/383
wr2974/262974
te25/1225
2021731
qb
rb3/383
wr2974/262974
te25/1225
731/731

In [7]:
df.to_csv("fatansy_data.csv")
game_df.to_csv("game_data.csv")