### Scraping Fantasy Football Data
Need to scrape the following data:
- Weekly Player PPR Projections: ESPN, CBS, Fantasy Football Today, Fantasy Sharks
- Previous Week Player Actual PPR Results
- Weekly Fanduel Player Salary (can manually download csv from a Thurs-Sun contest)

In [1]:
import pandas as pd
import numpy as np
import requests
import json
from bs4 import BeautifulSoup

import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# from selenium.common.exceptions import NoSuchElementException

In [4]:
#function to initialize selenium web scraper
def instantiate_selenium_driver():
    """Create and return a Chrome selenium webdriver.

    The browser is started with a fixed 1420x1080 window and with the
    sandbox and GPU disabled; headless mode is left switched off.

    Returns:
        The selenium Chrome driver. The caller is responsible for calling
        ``quit()`` on it when finished.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--window-size=1420,1080')
    #chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    #raw string: the Windows backslashes must never be read as escape sequences
    #("\p" and "\c" are invalid escapes that Python only tolerates with a warning)
    driver = webdriver.Chrome(r'..\plugins\chromedriver.exe',
        chrome_options=chrome_options)
    return driver

In [5]:
#archive a dataframe as a timestamped pickle file
#file_name: base name only, without extension - a local timestamp and ".pkl" get appended
#directory_name: target folder, given without a trailing slash
def save_to_pickle(df, directory_name, file_name):
    """Pickle ``df`` into ``directory_name`` under a timestamped file name.

    The file is named ``<file_name>_<year>-<month>-<day>-<hour>-<minute>.pkl``
    using the local time (numbers are not zero padded), and the final path
    is printed.
    """
    #first five struct_time fields are year, month, day, hour, minute
    stamp = "-".join(str(part) for part in time.localtime()[:5])
    path = f"{directory_name}/{file_name}_{stamp}.pkl"
    df.to_pickle(path)
    print(f"Pickle saved to: {path}")

In [6]:
#remove name suffixes of II III IV or Jr. or Sr. or random * from names to easier match other databases
def remove_suffixes(name):
    """Return ``name`` with generational suffixes and ``*`` markers removed.

    Every occurrence of " III", " II", " IV", " Jr.", " Sr." and "*" is
    deleted from the string so that player names match across data sources.
    """
    #longest suffix first: " III" contains " II", so stripping " II" first
    #would turn "Robert Griffin III" into "Robert GriffinI"
    for suffix in [" III", " II", " IV", " Jr.", " Sr.", "*"]:
        name = name.replace(suffix, "")
    return name

#### Get Weekly Player Actual Fantasy PPR Points
Get from ESPN's Scoring Leaders table

http://games.espn.com/ffl/leaders?&scoringPeriodId=1&seasonId=2018&slotCategoryId=0&leagueID=0
- scoringPeriodId = week of the season
- seasonId = year
- slotCategoryId = position, where 'QB':0, 'RB':2, 'WR':4, 'TE':6, 'K':17, 'D/ST':16
- leagueID = scoring type, PPR Standard is 0

In [123]:
##SCRAPE ESPN SCORING LEADERS TABLE FOR ACTUAL FANTASY PPR POINTS##

#input needs to be year as four digit number and week as number 
#returns dataframe of scraped data
def scrape_actual_PPR_player_points_ESPN(week, year):
    """Scrape ESPN's scoring-leaders tables for one week into one dataframe.

    Opens the leaders page of every position (QB, RB, WR, TE, K, D/ST) in a
    selenium browser, follows the NEXT link until it disappears, and stacks
    the 'playerTableTable' table of every page.

    Args:
        week: week of the season (ESPN's scoringPeriodId).
        year: four digit season year (ESPN's seasonId).

    Returns:
        DataFrame of the raw scraped columns plus 'POS', 'WEEK' and 'SEASON'.
        Placeholder cells ('--' and '--/--') are replaced with None and
        columns that are completely empty are dropped.
    """
    from io import StringIO
    #imported explicitly instead of being reached through the expected_conditions module
    from selenium.common.exceptions import NoSuchElementException

    #url that returns info has different code for each position
    position_ids = {'QB':0, 'RB':2, 'WR':4, 'TE':6, 'K':17, 'D/ST':16}

    #one dataframe per scraped page; combined in a single concat at the end
    page_tables = []

    #instantiate the driver
    driver = instantiate_selenium_driver()
    try:
        #cycle through each position webpage to create comprehensive dataframe
        for pos, pos_id in position_ids.items():
            #note leagueID=0 is for PPR standard scoring
            url_start_pos = f"http://games.espn.com/ffl/leaders?&scoringPeriodId={week}&seasonId={year}&slotCategoryId={pos_id}&leagueID=0"
            driver.get(url_start_pos)

            #results are paginated, so cycle through next button until next button no longer exists
            while True:
                #read in the table from ESPN, by using the class, and use the 1st row index for column header
                player_actual_ppr_table_page = pd.read_html(
                    StringIO(driver.page_source), #wrapped because newer pandas deprecates literal html strings
                    attrs={'class': 'playerTableTable'}, #return only the table of this class, which has the player data
                    header=[1])[0] #returns table in a list, so get zeroth table

                #easier to just assign the player position rather than try to scrape it out
                player_actual_ppr_table_page['POS'] = pos

                #replace any placeholder string -- or --/-- with None type to not confuse calculations later
                player_actual_ppr_table_page.replace({'--': None, '--/--': None}, inplace=True)

                #NOTE: per-position stat columns are left exactly as scraped. If the detailed stats
                #are ever needed, rename them per position here so they don't get misassigned to
                #same-named columns of other positions:
                #  D/ST: {'SCK':'D/ST_Sack', 'FR':'D/ST_FR', 'INT':'D/ST_INT',
                #         'TD':'D/ST_TD', 'BLK':'D/ST_BLK', 'PA':'D/ST_PA'}
                #  K:    {'1-39':'KICK_FG_1-39', '40-49':'KICK_FG_40-49', '50+':'KICK_FG_50+',
                #         'TOT':'KICK_FG', 'XP':'KICK_XP'}
                #Kicker cells would then need splitting on "/" (first part = made, last part = attempts);
                #an earlier attempt errored because the None placeholders cannot be split.

                page_tables.append(player_actual_ppr_table_page)

                #click to next page to get next batch of results, but check that it exists
                try:
                    next_button = driver.find_element(By.PARTIAL_LINK_TEXT, 'NEXT')
                except NoSuchElementException:
                    break
                next_button.click()
    finally:
        #always close the browser, even when a page fails to load or parse
        driver.quit()

    #add all page data to one overall dataframe
    player_actual_ppr = pd.concat(page_tables, ignore_index=True, sort=False)

    #drop any completely blank columns
    player_actual_ppr.dropna(axis='columns', how='all', inplace=True)

    #add columns that give week/season
    player_actual_ppr['WEEK'] = week
    player_actual_ppr['SEASON'] = year

    return player_actual_ppr

In [180]:
###FORMAT/EXTRACT ACTUAL PLAYER PPR DATA###
#(you could make this more complex if want to extract some of the subdata)

def format_extract_PPR_player_points_ESPN(df_scraped_ppr_espn):
    """Reduce a raw ESPN scrape to PLAYER / POS / TEAM / PTS / WEEK.

    Args:
        df_scraped_ppr_espn: dataframe holding at least the columns
            'PLAYER, TEAM POS', 'POS', 'PTS' and 'WEEK'. It is not modified.

    Returns:
        New dataframe with the columns 'PLAYER', 'POS', 'TEAM', 'PTS', 'WEEK',
        sorted by 'PTS' descending, with name suffixes removed from 'PLAYER'
        and 'PTS' converted to float64.
    """
    #split out player, team based on ESPN's formatting
    def split_player_team_pos_espn(play_team_pos):
        #incoming string for players: 'Todd Gurley II, LAR RB' or 'Drew Brees, NO\xa0QB'
        #incoming string for players with special designations: 'Aaron Rodgers, GB\xa0QB Q'
        #incoming string for D/ST: 'Jaguars D/ST\xa0D/ST'

        #operations if D/ST
        if "D/ST" in play_team_pos:
            #cut at the first ' D/ST' so it works whether a non-breaking or a plain space follows
            player = play_team_pos.split(' D/ST')[0]
            team = player.split()[0]

        #operations for regular players
        else:
            pieces = play_team_pos.split(',')
            player = pieces[0]
            #split() with no argument also splits on the non-breaking space
            team = pieces[1].split()[0]

        return player, team

    #work on a copy so the caller's scraped dataframe is left untouched
    df_espn = df_scraped_ppr_espn.copy()

    df_espn[['PLAYER', 'TEAM']] = df_espn.apply(
                                       lambda x: split_player_team_pos_espn(x['PLAYER, TEAM POS']),
                                       axis='columns',
                                       result_type='expand')

    #need to remove name suffixes so can match players easier to other data - see function defined above
    df_espn['PLAYER'] = df_espn['PLAYER'].map(remove_suffixes)

    #convert PTS to float type (sometimes zeros have been stored as strings)
    df_espn['PTS'] = df_espn['PTS'].astype('float64')

    #for this function only extract 'PLAYER', 'POS', 'TEAM', 'PTS', 'WEEK'
    return df_espn[['PLAYER', 'POS', 'TEAM', 'PTS', 'WEEK']].sort_values('PTS', ascending=False)

In [127]:
#CALL SCRAPE AND FORMATTING OF ACTUAL PPR WEEK 1 - AND SAVE TO PICKLES FOR LATER USE

#scrape week 1 of the 2018 season and save the messy full dataframe
df_wk1_player_actual_ppr_scrape = scrape_actual_PPR_player_points_ESPN(1, 2018)
save_to_pickle(df_wk1_player_actual_ppr_scrape, 'pickle_archive', 'Week1_Player_Actual_PPR_messy_scrape')

#format data to extract just player pts/player/pos/team/week and save the data
df_wk1_player_actual_ppr = format_extract_PPR_player_points_ESPN(df_wk1_player_actual_ppr_scrape)
#rename PTS column to something more descriptive
df_wk1_player_actual_ppr.rename(columns={'PTS':'FPTS_PPR_ACTUAL'}, inplace=True)
save_to_pickle(df_wk1_player_actual_ppr, 'pickle_archive', 'Week1_Player_Actual_PPR')
#report (rows, columns) and preview the first rows
print(df_wk1_player_actual_ppr.shape)
df_wk1_player_actual_ppr.head()

Pickle saved to: pickle_archive/Week1_Player_Actual_PPR_messy_scrape_2018-9-15-8-29.pkl
Pickle saved to: pickle_archive/Week1_Player_Actual_PPR_2018-9-15-8-29.pkl
(1007, 5)


Unnamed: 0,PLAYER,POS,TEAM,FPTS_PPR_ACTUAL,WEEK
120,Alvin Kamara,RB,NO,43.1,1
0,Ryan Fitzpatrick,QB,TB,42.3,1
387,Tyreek Hill,WR,KC,42.3,1
388,Michael Thomas,WR,NO,38.0,1
121,James Conner,RB,Pit,34.2,1


#### Get ESPN Player Fantasy Points Projections for Week 
Get from ESPN's Projections Table

http://games.espn.com/ffl/tools/projections?&scoringPeriodId=1&seasonId=2018&slotCategoryId=0&leagueID=0
- scoringPeriodId = week of the season
- seasonId = year
- slotCategoryId = position, where 'QB':0, 'RB':2, 'WR':4, 'TE':6, 'K':17, 'D/ST':16
- leagueID = scoring type, PPR Standard is 0

In [181]:
##SCRAPE ESPN PROJECTIONS TABLE FOR PROJECTED FANTASY PPR POINTS##

#input needs to be year as four digit number and week as number 
#returns dataframe of scraped data
def scrape_weekly_player_projections_ESPN(week, year):
    """Scrape ESPN's weekly projection tables for one week into one dataframe.

    Opens the projections page of every position (QB, RB, WR, TE, K, D/ST)
    in a selenium browser, follows the NEXT link until it disappears, and
    stacks the 'playerTableTable' table of every page.

    Args:
        week: week of the season (ESPN's scoringPeriodId).
        year: four digit season year (ESPN's seasonId).

    Returns:
        DataFrame of the raw scraped columns plus 'POS', 'WEEK' and 'SEASON'.
        Placeholder cells ('--' and '--/--') are replaced with None and
        columns that are completely empty are dropped.
    """
    from io import StringIO
    #imported explicitly instead of being reached through the expected_conditions module
    from selenium.common.exceptions import NoSuchElementException

    #url that returns info has different code for each position
    position_ids = {'QB':0, 'RB':2, 'WR':4, 'TE':6, 'K':17, 'D/ST':16}

    #one dataframe per scraped page; combined in a single concat at the end
    page_tables = []

    #instantiate the driver
    driver = instantiate_selenium_driver()
    try:
        #cycle through each position webpage to create comprehensive dataframe
        for pos, pos_id in position_ids.items():
            #note leagueID=0 is for PPR standard scoring
            url_start_pos = f"http://games.espn.com/ffl/tools/projections?&scoringPeriodId={week}&seasonId={year}&slotCategoryId={pos_id}&leagueID=0"
            driver.get(url_start_pos)

            #results are paginated, so cycle through next button until next button no longer exists
            while True:
                #read in the table from ESPN, by using the class, and use the 1st row index for column header
                proj_ppr_espn_table_page = pd.read_html(
                    StringIO(driver.page_source), #wrapped because newer pandas deprecates literal html strings
                    attrs={'class': 'playerTableTable'}, #return only the table of this class, which has the player data
                    header=[1])[0] #returns table in a list, so get zeroth table

                #easier to just assign the player position rather than try to scrape it out
                proj_ppr_espn_table_page['POS'] = pos

                #replace any placeholder string -- or --/-- with None type to not confuse calculations later
                proj_ppr_espn_table_page.replace({'--': None, '--/--': None}, inplace=True)

                #NOTE: per-position stat columns are left exactly as scraped. If the detailed stats
                #are ever needed, rename them per position here so they don't get misassigned to
                #same-named columns of other positions:
                #  D/ST: {'SCK':'D/ST_Sack', 'FR':'D/ST_FR', 'INT':'D/ST_INT',
                #         'TD':'D/ST_TD', 'BLK':'D/ST_BLK', 'PA':'D/ST_PA'}
                #  K:    {'1-39':'KICK_FG_1-39', '40-49':'KICK_FG_40-49', '50+':'KICK_FG_50+',
                #         'TOT':'KICK_FG', 'XP':'KICK_XP'}
                #Kicker cells would then need splitting on "/" (first part = made, last part = attempts);
                #an earlier attempt errored because the None placeholders cannot be split.

                page_tables.append(proj_ppr_espn_table_page)

                #click to next page to get next batch of results, but check that it exists
                try:
                    next_button = driver.find_element(By.PARTIAL_LINK_TEXT, 'NEXT')
                except NoSuchElementException:
                    break
                next_button.click()
    finally:
        #always close the browser, even when a page fails to load or parse
        driver.quit()

    #add all page data to one overall dataframe
    proj_ppr_espn = pd.concat(page_tables, ignore_index=True, sort=False)

    #drop any completely blank columns
    proj_ppr_espn.dropna(axis='columns', how='all', inplace=True)

    #add columns that give week/season
    proj_ppr_espn['WEEK'] = week
    proj_ppr_espn['SEASON'] = year

    return proj_ppr_espn

In [131]:
#formatting/extracting function is same for ESPN Actual/PPR Projections, so don't need new function

In [183]:
#WEEK 1 PROJECTIONS
#CALL SCRAPE AND FORMATTING OF ESPN WEEKLY PROJECTIONS - AND SAVE TO PICKLES FOR LATER USE

#scrape week 1 of the 2018 season and save the messy full dataframe
df_wk1_ppr_proj_espn_scrape = scrape_weekly_player_projections_ESPN(1, 2018)
save_to_pickle(df_wk1_ppr_proj_espn_scrape, 'pickle_archive', 'Week1_PPR_Projections_ESPN_messy_scrape')

#format data to extract just player pts/player/pos/team/week and save the data
#(the ESPN actual-points formatter is reused because the table layout is the same)
df_wk1_ppr_proj_espn = format_extract_PPR_player_points_ESPN(df_wk1_ppr_proj_espn_scrape)
#rename PTS column to something more descriptive
df_wk1_ppr_proj_espn.rename(columns={'PTS':'FPTS_PPR_ESPN'}, inplace=True)
save_to_pickle(df_wk1_ppr_proj_espn, 'pickle_archive', 'Week1_PPR_Projections_ESPN')
#report (rows, columns) and preview the first rows
print(df_wk1_ppr_proj_espn.shape)
df_wk1_ppr_proj_espn.head()

Pickle saved to: pickle_archive/Week1_PPR_Projections_ESPN_messy_scrape_2018-9-15-9-11.pkl
Pickle saved to: pickle_archive/Week1_PPR_Projections_ESPN_2018-9-15-9-11.pkl
(1007, 5)


Unnamed: 0,PLAYER,POS,TEAM,FPTS_PPR_ESPN,WEEK
120,Alvin Kamara,RB,NO,22.2,1
121,David Johnson,RB,Ari,21.3,1
122,Todd Gurley,RB,LAR,21.2,1
387,Antonio Brown,WR,Pit,19.5,1
0,Tom Brady,QB,NE,19.4,1


In [184]:
#WEEK 2 PROJECTIONS
#CALL SCRAPE AND FORMATTING OF ESPN WEEKLY PROJECTIONS - AND SAVE TO PICKLES FOR LATER USE

#scrape week 2 of the 2018 season and save the messy full dataframe
df_wk2_ppr_proj_espn_scrape = scrape_weekly_player_projections_ESPN(2, 2018)
save_to_pickle(df_wk2_ppr_proj_espn_scrape, 'pickle_archive', 'Week2_PPR_Projections_ESPN_messy_scrape')

#format data to extract just player pts/player/pos/team/week and save the data
#(the ESPN actual-points formatter is reused because the table layout is the same)
df_wk2_ppr_proj_espn = format_extract_PPR_player_points_ESPN(df_wk2_ppr_proj_espn_scrape)
#rename PTS column to something more descriptive
df_wk2_ppr_proj_espn.rename(columns={'PTS':'FPTS_PPR_ESPN'}, inplace=True)
save_to_pickle(df_wk2_ppr_proj_espn, 'pickle_archive', 'Week2_PPR_Projections_ESPN')
#report (rows, columns) and preview the first rows
print(df_wk2_ppr_proj_espn.shape)
df_wk2_ppr_proj_espn.head()

Pickle saved to: pickle_archive/Week2_PPR_Projections_ESPN_messy_scrape_2018-9-15-9-14.pkl
Pickle saved to: pickle_archive/Week2_PPR_Projections_ESPN_2018-9-15-9-14.pkl
(1007, 5)


Unnamed: 0,PLAYER,POS,TEAM,FPTS_PPR_ESPN,WEEK
120,Alvin Kamara,RB,NO,22.8,2
121,Todd Gurley,RB,LAR,22.4,2
122,Ezekiel Elliott,RB,Dal,21.0,2
387,Antonio Brown,WR,Pit,20.6,2
388,Michael Thomas,WR,NO,19.3,2


#### Get CBS Player Fantasy Points Projections for Week 
Get from CBS's Projections Table

https://www.cbssports.com/fantasy/football/stats/sortable/points/QB/ppr/projections/2018/2?&print_rows=9999
- QB is where position goes
- 2018 is where season goes
- 2 is where week goes
- print_rows = 9999 gives all results in one table

In [188]:
##SCRAPE CBS PROJECTIONS TABLE FOR PROJECTED FANTASY PPR POINTS##

#input needs to be year as four digit number and week as number 
#returns dataframe of scraped data
def scrape_weekly_player_projections_CBS(week, year):
    """Scrape CBS's weekly PPR projection tables into one dataframe.

    CBS has a separate table for each position, but the url can return the
    full list (print_rows=9999), so one request per position is enough and
    no paging is needed.

    Args:
        week: week of the season, placed in the url.
        year: four digit season year, placed in the url.

    Returns:
        DataFrame of the raw scraped columns plus 'POS', 'WEEK' and 'SEASON',
        with the page-selector rows removed. Index labels of the removed
        rows are left as gaps (the index is not reset).
    """
    #position -> row index of the column-header row in that position's table
    header_row_index = {'QB':2, 'RB':2, 'WR':2, 'TE':2, 'K':1, 'DST':1}

    #one dataframe per position; combined in a single concat at the end
    position_tables = []

    for position, header_row in header_row_index.items():
        #url just needs to change position
        url = f"https://www.cbssports.com/fantasy/football/stats/sortable/points/{position}/ppr/projections/{year}/{week}?&print_rows=9999"

        #read in the table from CBS by class, using the position's header row for column names
        proj_ppr_cbs_pos = pd.read_html(url,
                                  attrs={'class': 'data'}, #return only the table of this class, which has the player data
                                  header=[header_row])[0] #returns table in a list, so get table
        proj_ppr_cbs_pos['POS'] = position
        position_tables.append(proj_ppr_cbs_pos)

    #add all position tables to the overall df
    proj_ppr_cbs = pd.concat(position_tables, ignore_index=True, sort=False)

    #some tables include the page selector as the bottom row of the table,
    #so need to find those rows and then drop them from the table
    #(na=False keeps a missing player cell from turning the mask into NaN, which cannot be used to select rows)
    is_page_selector = proj_ppr_cbs['Player'].str.contains('Pages', na=False)
    proj_ppr_cbs.drop(proj_ppr_cbs.index[is_page_selector], axis='index', inplace=True)

    #add columns that give week/season
    proj_ppr_cbs['WEEK'] = week
    proj_ppr_cbs['SEASON'] = year

    return proj_ppr_cbs

In [189]:
###FORMAT/EXTRACT CBS PROJECTED PLAYER PPR DATA###
#(you could make this more complex if want to extract some of the subdata)

def format_extract_PPR_player_points_CBS(df_scraped_ppr_cbs):
    """Reduce a raw CBS scrape to PLAYER / POS / TEAM / FPTS_PPR_CBS / WEEK.

    Args:
        df_scraped_ppr_cbs: dataframe holding at least the columns 'Player',
            'POS', 'FPTS' and 'WEEK'. It is not modified.

    Returns:
        New dataframe with the columns 'PLAYER', 'POS', 'TEAM',
        'FPTS_PPR_CBS', 'WEEK', sorted by 'FPTS_PPR_CBS' descending.
        'PLAYER' has name suffixes removed and 'TEAM' has surrounding
        whitespace stripped.
    """
# #could include this extra data if you want to extract it (not executed)
#     #calculate completion percentage
#     df_cbs_proj['COMPLETION_PERCENTAGE'] = df_cbs_proj.CMP/df_cbs_proj.ATT
#
#     #rename some of columns so don't lose meaning
#     df_cbs_proj.rename(columns={'ATT':'PASS_ATT', 'CMP':'PASS_COMP', 'COMPLETION_PERCENTAGE': 'PASS_COMP_PCT',
#                                        'YD': 'PASS_YD', 'TD':'PASS_TD', 'INT':'PASS_INT', 'RATE':'PASS_RATE', 
#                                        'ATT.1': 'RUSH_ATT', 'YD.1': 'RUSH_YD', 'AVG': 'RUSH_AVG', 'TD.1':'RUSH_TD',
#                                        'TARGT': 'RECV_TARGT', 'RECPT': 'RECV_RECPT', 'YD.2':'RECV_YD', 'AVG.1':'RECV_AVG', 'TD.2':'RECV_TD',
#                                        'FPTS':'PTS',
#                                        'FG':'KICK_FG', 'FGA': 'KICK_FGAtt', 'XP':'KICK_XP', 'XPAtt':'KICK_XPAtt', 
#                                        'Int':'D/ST_INT', 'Sty':'D/ST_Sty', 'Sack':'D/ST_Sack', 'TK':'D/ST_TK',
#                                        'DFR':'D/ST_FR', 'FF':'D/ST_FF', 'DTD':'D/ST_TD',
#                                        'Pa':'D/ST_PtsAll', 'PaNetA':'D/ST_PaYdA', 'RuYdA':'D/ST_RuYdA', 'TyDa':'D/ST_ToYdA'},
#                                 inplace=True)
#
#     #calculate passing, rushing, total yards/game
#     df_cbs_proj['D/ST_PaYd/G'] = df_cbs_proj['D/ST_PaYdA']/16
#     df_cbs_proj['D/ST_RuYd/G'] = df_cbs_proj['D/ST_RuYdA']/16
#     df_cbs_proj['D/ST_ToYd/G'] = df_cbs_proj['D/ST_ToYdA']/16

    #rename FPTS to FPTS_PPR_CBS; rename() returns a new dataframe,
    #so the caller's scraped dataframe is left untouched
    df_cbs = df_scraped_ppr_cbs.rename(columns={'FPTS':'FPTS_PPR_CBS'})

    #split out player, team
    def split_player_team(play_team):
        #incoming string for players: 'Todd Gurley, LAR'
        #incoming string for DST: 'Jaguars, JAC'
        #both shapes are 'name, team', so one code path handles them
        pieces = play_team.split(',')

        #remove any possible name suffixes to merge with other data better
        player = remove_suffixes(pieces[0])
        #strip the space that follows the comma so TEAM is 'LAR', not ' LAR'
        team = pieces[1].strip()

        return player, team

    df_cbs[['PLAYER', 'TEAM']] = df_cbs.apply(
                                        lambda x: split_player_team(x['Player']),
                                        axis='columns',
                                        result_type='expand')

    #for this function only extract 'PLAYER', 'POS', 'TEAM', 'FPTS_PPR_CBS', 'WEEK'
    return df_cbs[['PLAYER', 'POS', 'TEAM', 'FPTS_PPR_CBS', 'WEEK']].sort_values('FPTS_PPR_CBS', ascending=False)

In [190]:
#WEEK 1 PROJECTIONS
#CALL SCRAPE AND FORMATTING OF CBS WEEKLY PROJECTIONS - AND SAVE TO PICKLES FOR LATER USE

#scrape week 1 of the 2018 season and save the messy full dataframe
df_wk1_ppr_proj_cbs_scrape = scrape_weekly_player_projections_CBS(1, 2018)
save_to_pickle(df_wk1_ppr_proj_cbs_scrape, 'pickle_archive', 'Week1_PPR_Projections_CBS_messy_scrape')

#format data to extract just player pts/player/pos/team/week and save the data
df_wk1_ppr_proj_cbs = format_extract_PPR_player_points_CBS(df_wk1_ppr_proj_cbs_scrape)
save_to_pickle(df_wk1_ppr_proj_cbs, 'pickle_archive', 'Week1_PPR_Projections_CBS')
#report (rows, columns) and preview the first rows
print(df_wk1_ppr_proj_cbs.shape)
df_wk1_ppr_proj_cbs.head()

Pickle saved to: pickle_archive/Week1_PPR_Projections_CBS_messy_scrape_2018-9-15-9-19.pkl
Pickle saved to: pickle_archive/Week1_PPR_Projections_CBS_2018-9-15-9-19.pkl
(793, 5)


Unnamed: 0,PLAYER,POS,TEAM,FPTS_PPR_CBS,WEEK
0,Drew Brees,QB,NO,23.0,1
109,Todd Gurley,RB,LAR,23.0,1
1,Deshaun Watson,QB,HOU,23.0,1
306,Antonio Brown,WR,PIT,22.0,1
307,DeAndre Hopkins,WR,HOU,22.0,1


In [191]:
#WEEK 2 PROJECTIONS
#CALL SCRAPE AND FORMATTING OF CBS WEEKLY PROJECTIONS - AND SAVE TO PICKLES FOR LATER USE

#scrape week 2 of the 2018 season and save the messy full dataframe
df_wk2_ppr_proj_cbs_scrape = scrape_weekly_player_projections_CBS(2, 2018)
save_to_pickle(df_wk2_ppr_proj_cbs_scrape, 'pickle_archive', 'Week2_PPR_Projections_CBS_messy_scrape')

#format data to extract just player pts/player/pos/team/week and save the data
df_wk2_ppr_proj_cbs = format_extract_PPR_player_points_CBS(df_wk2_ppr_proj_cbs_scrape)
save_to_pickle(df_wk2_ppr_proj_cbs, 'pickle_archive', 'Week2_PPR_Projections_CBS')
#report (rows, columns) and preview the first rows
print(df_wk2_ppr_proj_cbs.shape)
df_wk2_ppr_proj_cbs.head()

Pickle saved to: pickle_archive/Week2_PPR_Projections_CBS_messy_scrape_2018-9-15-9-19.pkl
Pickle saved to: pickle_archive/Week2_PPR_Projections_CBS_2018-9-15-9-19.pkl
(821, 5)


Unnamed: 0,PLAYER,POS,TEAM,FPTS_PPR_CBS,WEEK
319,Antonio Brown,WR,PIT,25.0,2
0,Drew Brees,QB,NO,22.0,2
114,Alvin Kamara,RB,NO,21.0,2
1,Patrick Mahomes,QB,KC,21.0,2
115,Todd Gurley,RB,LAR,21.0,2


#### Get Fantasy Sharks Player Points Projection for Week
They have a json option that gets updated weekly (don't appear to store previous week projections).  The json defaults to PPR (which is lucky for us) and has an all players option.

https://www.fantasysharks.com/apps/Projections/WeeklyProjections.php?pos=ALL&format=json
It returns a list of players, each saved as a dictionary.

[
  {
    "Rank": 1,
    "ID": "4925",
    "Name": "Brees, Drew",
    "Pos": "QB",
    "Team": "NOS",
    "Opp": "CLE",
    "Comp": "27.49",
    "PassYards": "337",
    "PassTD": 2.15,
    "Int": "0.61",
    "Att": "1.5",
    "RushYards": "0",
    "RushTD": 0.12,
    "Rec": "0",
    "RecYards": "0",
    "RecTD": 0,
    "FantasyPoints": 26
  },
  
But the JSON only covers the current week, so data for other weeks can't be retrieved from it - instead, use a url like this example:
https://www.fantasysharks.com/apps/bert/forecasts/projections.php?Position=99&scoring=2&Segment=628&uid=4
- Segment is the week/season id - for 2018 week 1 starts at 628 and adds 1 for each additional week
- Position=99 is all positions
- scoring=2 is PPR default


In [192]:
##SCRAPE FANTASY SHARKS PROJECTIONS TABLE FOR PROJECTED FANTASY PPR POINTS##

#input needs to be week as number (year isn't used, but keep same format as others)
#returns dataframe of scraped data
def scrape_weekly_player_projections_Sharks(week, year):
    """Scrape the Fantasy Sharks weekly PPR projections table for one week.

    Args:
        week: week of the season; converted to the site's segment id as
            627 + week, a numbering that is only valid for the 2018 season.
        year: only written to the 'SEASON' column - it does not influence
            which data is requested.

    Returns:
        DataFrame of the raw 'toolData' table plus 'WEEK' and 'SEASON',
        with tier rows and repeated header rows removed. Index labels of
        the removed rows are left as gaps (the index is not reset).

    Raises:
        requests.HTTPError: if the site answers with an error status.
    """
    from io import StringIO

    #fantasy sharks url - segment for 2018 week 1 starts at 628 and adds 1 for each additional week
    segment = 627 + week
    #Position=99 is all positions, and scoring=2 is PPR default
    sharks_weekly_url = f"https://www.fantasysharks.com/apps/bert/forecasts/projections.php?Position=99&scoring=2&Segment={segment}&uid=4"

    #since don't need to iterate over pages, can just use requests instead of selenium scraper
    #however with requests, need to include headers because this website was rejecting the request since it knew python was running it - need to spoof a browser header
    #other possible headers: 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1)'}
    #response returns html; without a timeout requests would wait forever on a stalled connection
    response = requests.get(sharks_weekly_url, headers=headers, timeout=30)
    #fail with the real HTTP error instead of a confusing "no tables found" from the parser
    response.raise_for_status()

    #extract the table data from the html response (call response.text) and get table with player data
    proj_ppr_sharks = pd.read_html(StringIO(response.text), #wrapped because newer pandas deprecates literal html strings
                                   attrs={'id': 'toolData'}, #return only the table of this id, which has the player data
                                   header = 0 #header is the 0th row
                                   )[0] #pd.read_html returns a list of tables even though only one in it, select the table

    #the webpage uses different tiers, which add extra rows to the table - get rid of those
    #also sometimes repeats the column headers for readability as scrolling - get rid of those
    #astype(str) keeps the text match working even if the column was parsed as numbers or holds NaN
    is_filler_row = proj_ppr_sharks['#'].astype(str).str.contains('Tier|#')
    proj_ppr_sharks.drop(proj_ppr_sharks.index[is_filler_row], axis='index', inplace=True)

    #add columns that give week/season
    proj_ppr_sharks['WEEK'] = week
    proj_ppr_sharks['SEASON'] = year

    return proj_ppr_sharks

In [193]:
###FORMAT/EXTRACT FANTASY SHARKS PROJECTED PLAYER PPR DATA###
#(you could make this more complex if want to extract some of the subdata like opposing team (OPP)

def format_extract_PPR_player_points_Sharks(df_scraped_ppr_sharks):
    """Reduce a raw Fantasy Sharks scrape to PLAYER / POS / TEAM / FPTS_PPR_SHARKS / WEEK.

    Args:
        df_scraped_ppr_sharks: dataframe holding at least the columns
            'Player', 'Tm', 'Position', 'Pts' and 'WEEK'. It is not modified.

    Returns:
        New dataframe with the columns 'PLAYER', 'POS', 'TEAM',
        'FPTS_PPR_SHARKS', 'WEEK', sorted by 'FPTS_PPR_SHARKS' descending.
        Player names are reordered to 'First Last' with suffixes removed;
        defenses (POS 'D') keep only the part before the comma.
    """
    #rename Pts to FPTS_PPR_SHARKS and a few others; rename() returns a new
    #dataframe, so the caller's scraped dataframe is left untouched
    df_sharks = df_scraped_ppr_sharks.rename(columns={'Pts':'FPTS_PPR_SHARKS',
                                                      'Player': 'PLAYER',
                                                      'Tm': 'TEAM',
                                                      'Position': 'POS'})

    #they have player name as Last Name, First Name - reorder to First Last
    def modify_player_name(player, pos):
        #incoming string for players: 'Johnson, David' Change to: 'David Johnson'
        #incoming string for defense: 'Lions, Detroit' Change to: 'Lions'
        name_parts = player.split(', ')
        if pos == 'D':
            return name_parts[0]
        return remove_suffixes(' '.join(reversed(name_parts)))

    df_sharks['PLAYER'] = df_sharks.apply(
                                lambda row: modify_player_name(row['PLAYER'], row['POS']),
                                axis='columns')

    #convert FPTS to float type (currently stored as string)
    df_sharks['FPTS_PPR_SHARKS'] = df_sharks['FPTS_PPR_SHARKS'].astype('float64')

    #for this function only extract 'PLAYER', 'POS', 'TEAM', 'FPTS_PPR_SHARKS', 'WEEK'
    return df_sharks[['PLAYER', 'POS', 'TEAM', 'FPTS_PPR_SHARKS', 'WEEK']].sort_values('FPTS_PPR_SHARKS', ascending=False)

In [194]:
#WEEK 1 PROJECTIONS
#CALL SCRAPE AND FORMATTING OF FANTASY SHARKS WEEKLY PROJECTIONS - AND SAVE TO PICKLES FOR LATER USE

#scrape week 1 of the 2018 season and save the messy full dataframe
df_wk1_ppr_proj_sharks_scrape = scrape_weekly_player_projections_Sharks(1, 2018)
save_to_pickle(df_wk1_ppr_proj_sharks_scrape, 'pickle_archive', 'Week1_PPR_Projections_Sharks_messy_scrape')

#format data to extract just player pts/player/pos/team/week and save the data
df_wk1_ppr_proj_sharks = format_extract_PPR_player_points_Sharks(df_wk1_ppr_proj_sharks_scrape)
save_to_pickle(df_wk1_ppr_proj_sharks, 'pickle_archive', 'Week1_PPR_Projections_Sharks')
#report (rows, columns) and preview the first rows
print(df_wk1_ppr_proj_sharks.shape)
df_wk1_ppr_proj_sharks.head()

Pickle saved to: pickle_archive/Week1_PPR_Projections_Sharks_messy_scrape_2018-9-15-9-20.pkl
Pickle saved to: pickle_archive/Week1_PPR_Projections_Sharks_2018-9-15-9-20.pkl
(918, 5)


Unnamed: 0,PLAYER,POS,TEAM,FPTS_PPR_SHARKS,WEEK
0,Tom Brady,QB,NEP,27.4,1
1,David Johnson,RB,ARZ,23.6,1
3,Matthew Stafford,QB,DET,23.2,1
4,Ben Roethlisberger,QB,PIT,22.9,1
6,Russell Wilson,QB,SEA,22.8,1


In [195]:
#WEEK 2 PROJECTIONS
#Scrape and format the Fantasy Sharks weekly projections, and save both versions to pickles for later use

#scrape data (week 2, 2018) and save the messy full dataframe
#(pickled before the formatting step below; save_to_pickle appends a timestamp to the file name)
df_wk2_ppr_proj_sharks_scrape = scrape_weekly_player_projections_Sharks(2, 2018)
save_to_pickle(df_wk2_ppr_proj_sharks_scrape, 'pickle_archive', 'Week2_PPR_Projections_Sharks_messy_scrape')

#format data to extract just player/pos/team/pts/week and save the data
df_wk2_ppr_proj_sharks = format_extract_PPR_player_points_Sharks(df_wk2_ppr_proj_sharks_scrape)
save_to_pickle(df_wk2_ppr_proj_sharks, 'pickle_archive', 'Week2_PPR_Projections_Sharks')
#quick sanity check: row/column count and the top projected players
print(df_wk2_ppr_proj_sharks.shape)
df_wk2_ppr_proj_sharks.head()

Pickle saved to: pickle_archive/Week2_PPR_Projections_Sharks_messy_scrape_2018-9-15-9-20.pkl
Pickle saved to: pickle_archive/Week2_PPR_Projections_Sharks_2018-9-15-9-20.pkl
(1020, 5)


Unnamed: 0,PLAYER,POS,TEAM,FPTS_PPR_SHARKS,WEEK
0,Drew Brees,QB,NOR,25.6,2
1,Alvin Kamara,RB,NOR,24.1,2
3,Russell Wilson,QB,SEA,23.1,2
6,Patrick Mahomes,QB,KCC,22.7,2
4,Alex Smith,QB,WAS,22.7,2


##### !!!FFtoday apparently doesn't do weekly projections for Defenses, so don't use it for now (can check back in future and see if updated)!!!

#### Get FFtoday Player Fantasy Points Projections for Week 
Get from FFtoday's Projections Table

http://www.fftoday.com/rankings/playerwkproj.php?Season=2018&GameWeek=2&PosID=10&LeagueID=107644
- Season = year
- GameWeek = week
- PosID = the id for each position 'QB':10, 'RB':20, 'WR':30, 'TE':40, 'K':80, 'DEF':99
- LeagueID = the scoring type, 107644 gives FFToday PPR scoring

In [186]:
# ##SCRAPE FFtoday PROJECTIONS TABLE FOR PROJECTED FANTASY PPR POINTS##

# #input needs to be year as four digit number and week as number 
# #returns dataframe of scraped data
# def scrape_weekly_player_projections_FFtoday(week, year):
#     #instantiate selenium driver
#     driver = instantiate_selenium_driver()
    
#     #initialize dataframe for all data
#     proj_ppr_fft = pd.DataFrame()
    
#     #url that returns info has different code for each position and also takes year variable
#     position_ids = {'QB':10, 'RB':20, 'WR':30, 'TE':40, 'K':80, 'DEF':99}


#     #cycle through each position webpage to create comprehensive dataframe
#     for pos, pos_id in position_ids.items():
#         url_start_pos = f"http://www.fftoday.com/rankings/playerwkproj.php?Season={year}&GameWeek={week}&PosID={pos_id}&LeagueID=107644"
#         driver.get(url_start_pos)
        
#         #each page only gets 50 results, so cycle through next button until next button no longer exists
#         while True:
#             #read in table - no classes for tables so just need to find the right table in the list of tables from the page - 5th index
#             proj_ppr_fft_table_page = pd.read_html(driver.page_source, header=[1])[5]
            
#             proj_ppr_fft_table_page['POS'] = pos
            
          
#             #need to rename columns for different positions before concat because of differing column conventions
#             if pos == 'QB':
#                 proj_ppr_fft_table_page.rename(columns={'Player  Sort First: Last:':'PLAYER',
#                                                   'Comp':'PASS_COMP', 'Att': 'PASS_ATT', 'Yard':'PASS_YD',
#                                                   'TD':'PASS_TD', 'INT':'PASS_INT',
#                                                   'Att.1':'RUSH_ATT', 'Yard.1':'RUSH_YD', 'TD.1':'RUSH_TD'},
#                                          inplace=True)
#             elif pos == 'RB':
#                 proj_ppr_fft_table_page.rename(columns={'Player  Sort First: Last:':'PLAYER',
#                                                   'Att': 'RUSH_ATT', 'Yard':'RUSH_YD', 'TD':'RUSH_TD',
#                                                    'Rec':'RECV_RECPT', 'Yard.1':'RECV_YD', 'TD.1':'RECV_TD'},
#                                          inplace=True)
                
#             elif pos == 'WR':
#                 proj_ppr_fft_table_page.rename(columns={'Player  Sort First: Last:':'PLAYER',
#                                                   'Rec':'RECV_RECPT', 'Yard':'RECV_YD', 'TD':'RECV_TD',
#                                                   'Att':'RUSH_ATT', 'Yard.1':'RUSH_YD', 'TD.1':'RUSH_TD'},
#                                          inplace=True)
            
#             elif pos == 'TE':
#                 proj_ppr_fft_table_page.rename(columns={'Player  Sort First: Last:':'PLAYER',
#                                                   'Rec':'RECV_RECPT', 'Yard':'RECV_YD', 'TD':'RECV_TD'},
#                                          inplace=True)
                
#             elif pos == 'K':
#                 proj_ppr_fft_table_page.rename(columns={'Player  Sort First: Last:':'PLAYER',
#                                                   'FGM':'KICK_FG', 'FGA':'KICK_FGAtt', 'FG%':'KICK_FG%',
#                                                   'EPM':'KICK_XP', 'EPA':'KICK_XPAtt'},
#                                          inplace=True)
            
#             elif pos == 'DEF':
#                 proj_ppr_fft_table_page['PLAYER'] = proj_ppr_fft_table_page['Team'] #+ ' D/ST' #add player name with team name plus D/ST tag
#                 proj_ppr_fft_table_page.rename(columns={'Sack':'D/ST_Sack', 'FR':'D/ST_FR', 'DefTD':'D/ST_TD', 'INT':'D/ST_INT',
#                                                    'PA':'D/ST_PtsAll', 'PaYd/G':'D/ST_PaYd/G', 'RuYd/G':'D/ST_RuYd/G',
#                                                    'Safety':'D/ST_Sty', 'KickTD':'D/ST_RET_TD'},
#                                          inplace=True)
            
            
#             #add the position/page data to overall df
#             proj_ppr_fft = pd.concat([proj_ppr_fft, proj_ppr_fft_table_page],
#                                     ignore_index=True,
#                                     sort=False)
            
            
#             #click to next page to get next 50 results, but check that next button exists
#             try:
#                 next_button = driver.find_element_by_link_text("Next Page")
#                 next_button.click()
#             except EC.NoSuchElementException:
#                 break
    
    
#     driver.quit()
    
#     #add columns that give week/season
#     proj_ppr_fft['WEEK'] = week
#     proj_ppr_fft['SEASON'] = year
    
    
#     return proj_ppr_fft

In [187]:
# ###FORMAT/EXTRACT FFTODAY PROJECTED PLAYER PPR DATA###
# #(this could be extended to also extract some of the subdata)

# def format_extract_PPR_player_points_FFtoday(df_scraped_ppr_fft):
# # #optional data formatting for additional info
# #     #calculate completion percentage
# #     df_scraped_ppr_fft['PASS_COMP_PCT'] = df_scraped_ppr_fft.PASS_COMP/df_scraped_ppr_fft.PASS_ATT


# #     #calculate total PaYd and RuYd for season
# #     df_scraped_ppr_fft['D/ST_PaYdA'] = df_scraped_ppr_fft['D/ST_PaYd/G'] * 16
# #     df_scraped_ppr_fft['D/ST_RuYdA'] = df_scraped_ppr_fft['D/ST_RuYd/G'] * 16
# #     df_scraped_ppr_fft['D/ST_ToYd/G'] = df_scraped_ppr_fft['D/ST_PaYd/G'] + df_scraped_ppr_fft['D/ST_RuYd/G']
# #     df_scraped_ppr_fft['D/ST_ToYdA'] = df_scraped_ppr_fft['D/ST_ToYd/G'] * 16


#     #rename some of outstanding columns to match other dfs
#     df_scraped_ppr_fft.rename(columns={'Team':'TEAM', 'FPts':'FPTS_PPR_FFTODAY'},
#                              inplace=True)

#     #remove any possible name suffixes to merge with other data better
#     df_scraped_ppr_fft['PLAYER'] = df_scraped_ppr_fft['PLAYER'].map(remove_suffixes)
    
    
#     #for this function only extract 'PLAYER', 'POS', 'TEAM', 'PTS'
#     df_scraped_ppr_fft = df_scraped_ppr_fft[['PLAYER', 'POS', 'TEAM', 'FPTS_PPR_FFTODAY', 'WEEK']].sort_values('FPTS_PPR_FFTODAY', ascending=False)

#     return df_scraped_ppr_fft