In [1]:
import pandas as pd
import numpy as np
import requests
import json
from bs4 import BeautifulSoup

import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# from selenium.common.exceptions import NoSuchElementException

In [2]:
def instantiate_selenium_driver():
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--window-size=1420,1080')
    #chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome('..\plugins\chromedriver.exe', 
        chrome_options=chrome_options)
    return driver

#### Get ESPN Points Projections for Year

In [3]:
#scrape all ESPN projections for preseason yearlong projectison
def scrape_ESPN_preseason_projections():
    #instantiate the driver on the ESPN projections page
    driver = instantiate_selenium_driver()
    
    #initialize dataframe for all data
    espn_proj = pd.DataFrame()
    
    #url that returns info has different code for each position
    position_ids = {'QB':0, 'RB':2, 'WR':4, 'TE':6, 'K':17, 'D/ST':16}

    #cycle through each position webpage to create comprehensive dataframe
    for pos, pos_id in position_ids.items():
        url_start_pos = f"http://games.espn.com/ffl/tools/projections?slotCategoryId={pos_id}"
        driver.get(url_start_pos)
        
        #each page only gets 40 results, so cycle through next button until next button no longer exists
        while True:
            #read in the table from ESPN, by using the class, and use the 1st row index for column header
            espn_proj_table_page = pd.read_html(driver.page_source,
                                               attrs={'class': 'playerTableTable'}, #return only the table of this class, which has the player data
                                               header=[1])[0] #returns table in a list, so get table

            espn_proj_table_page['POS'] = pos

            #rename D/ST columns so don't get misassigned to wrong columns
            if pos == 'D/ST':
                espn_proj_table_page.rename(columns={'SCK':'D/ST_Sack', 'FF':'D/ST_FF',
                                                     'FR':'D/ST_FR', 'INT':'D/ST_INT',
                                                     'ITD':'D/ST_ITD', 'FTD':'D/ST_FTD'},
                                            inplace=True)
                espn_proj_table_page['D/ST_TD'] = espn_proj_table_page['D/ST_ITD'] + espn_proj_table_page['D/ST_FTD']
            
            #rename/recalculate Kicker columns so don't get misassigned to wrong columns
            elif pos == 'K':
                espn_proj_table_page.rename(columns={'1-39':'KICK_FG_1-39', '40-49':'KICK_FG_40-49',
                                                     '50+':'KICK_FG_50+', 'TOT':'KICK_FG',
                                                     'XP':'KICK_XP'},
                                            inplace=True)
                #just want made FG's for each bucket and overall FGAtt
                espn_proj_table_page['KICK_FGAtt'] = espn_proj_table_page['KICK_FG'].map(
                                                        lambda x: x.split("/")[-1]).astype('float64')
                espn_proj_table_page['KICK_XPAtt'] = espn_proj_table_page['KICK_XP'].map(
                                                        lambda x: x.split("/")[-1]).astype('float64')
                espn_proj_table_page['KICK_FG_1-39'] = espn_proj_table_page['KICK_FG_1-39'].map(
                                                        lambda x: x.split("/")[0]).astype('float64')
                espn_proj_table_page['KICK_FG_40-49'] = espn_proj_table_page['KICK_FG_40-49'].map(
                                                        lambda x: x.split("/")[0]).astype('float64')
                espn_proj_table_page['KICK_FG_50+'] = espn_proj_table_page['KICK_FG_50+'].map(
                                                        lambda x: x.split("/")[0]).astype('float64')
                espn_proj_table_page['KICK_FG'] = espn_proj_table_page['KICK_FG'].map(
                                                        lambda x: x.split("/")[0]).astype('float64')
                espn_proj_table_page['KICK_XP'] = espn_proj_table_page['KICK_XP'].map(
                                                        lambda x: x.split("/")[0]).astype('float64')
                espn_proj_table_page['KICK_FG%'] = espn_proj_table_page['KICK_FG'] / espn_proj_table_page['KICK_FGAtt']
                           
            
            #add page data to overall dataframe
            espn_proj = pd.concat([espn_proj, espn_proj_table_page],
                                         ignore_index=True,
                                         sort=False)

            #click to next page to get next 40 results, but check that it exists
            try:
                next_button = driver.find_element_by_partial_link_text('NEXT')
                next_button.click()
            except EC.NoSuchElementException:
                break
    
    driver.quit()
    return espn_proj

In [4]:
###FORMATTING/REORDERING###
def format_reduce_ESPN_preseason_projections(df_espn_proj):
    #split completions/attempts columns and add completion percentage
    ##first change any C/A as '--/--' to '0/0' so doesn't error out
    df_espn_proj['C/A'] = df_espn_proj['C/A'].map(
                                        lambda x: '0/0' if x=='--/--' else x)
    df_espn_proj['PASS_COMP'] = df_espn_proj['C/A'].map(
                                        lambda x: float(x.split('/')[0]) if isinstance(x, str) else np.nan)
    df_espn_proj['PASS_ATT'] = df_espn_proj['C/A'].map(
                                        lambda x: float(x.split('/')[1]) if isinstance(x, str) else np.nan)
    df_espn_proj['PASS_COMP_PCT'] = df_espn_proj.PASS_COMP/df_espn_proj.PASS_ATT


    #rename some of columns so don't lose meaning
    df_espn_proj.drop('C/A', axis='columns', inplace=True)
    df_espn_proj.rename(columns={'YDS': 'PASS_YD', 'TD': 'PASS_TD', 'INT': 'PASS_INT',
                                       'RUSH': 'RUSH_ATT', 'YDS.1': 'RUSH_YD', 'TD.1': 'RUSH_TD',
                                       'REC': 'RECV_RECPT', 'YDS.2': 'RECV_YD', 'TD.2': 'RECV_TD'},
                              inplace=True)


    #split out player, team, position
    def split_player_team_pos_espn(play_team_pos):
        #incoming string for players: 'Todd Gurley II, LAR RB'
        #incoming string for D/ST: 'Jaguars D/ST\xa0D/ST'

        #operations if D/ST
        if "D/ST" in play_team_pos:
            player = play_team_pos.split('\xa0')[0]
            team = player.split()[0]

        #operations for regular players
        else:
            player = play_team_pos.split(',')[0]
            team_pos = play_team_pos.split(',')[1]
            team = team_pos.split()[0]

        return player, team

    df_espn_proj[['PLAYER', 'TEAM']] = df_espn_proj.apply(
                                               lambda x: split_player_team_pos_espn(x['PLAYER, TEAM POS']),
                                               axis='columns',
                                               result_type='expand')


    #remove name suffixes of II or Jr. or Sr. so names match other databases
    def remove_suffixes(name):
        for suffix in [" II", " Jr.", " Sr."]:
            name = name.replace(suffix, "")
        return name

    df_espn_proj['PLAYER'] = df_espn_proj['PLAYER'].map(remove_suffixes)


    #reorder columns (and don't keep all)
    df_espn_proj = df_espn_proj[['RNK', 'PTS', 'PLAYER', 'TEAM', 'POS',
                                               'PASS_YD', 'PASS_TD', 'PASS_INT', 'PASS_COMP', 'PASS_ATT', 'PASS_COMP_PCT',
                                               'RUSH_ATT', 'RUSH_YD', 'RUSH_TD',
                                               'RECV_RECPT', 'RECV_YD', 'RECV_TD',
                                               'KICK_FG_1-39', 'KICK_FG_40-49', 'KICK_FG_50+', 'KICK_FG', 'KICK_FGAtt', 'KICK_FG%',
                                               'KICK_XP', 'KICK_XPAtt', 'D/ST_Sack', 'D/ST_FF', 'D/ST_FR', 'D/ST_INT', 'D/ST_TD']]


    return df_espn_proj

In [5]:
###CALL SCRAPING AND FORMAT/REDUCE
df_espn_projections = scrape_ESPN_preseason_projections()
df_espn_projections = format_reduce_ESPN_preseason_projections(df_espn_projections)
print(df_espn_projections.shape)
df_espn_projections.head()

(991, 30)


Unnamed: 0,RNK,PTS,PLAYER,TEAM,POS,PASS_YD,PASS_TD,PASS_INT,PASS_COMP,PASS_ATT,...,KICK_FG,KICK_FGAtt,KICK_FG%,KICK_XP,KICK_XPAtt,D/ST_Sack,D/ST_FF,D/ST_FR,D/ST_INT,D/ST_TD
0,1,316.8,Tom Brady,NE,QB,4704.8,32.9,7.8,390.5,606.0,...,,,,,,,,,,
1,2,315.8,Aaron Rodgers,GB,QB,4016.3,33.5,9.2,358.6,564.9,...,,,,,,,,,,
2,3,302.6,Cam Newton,Car,QB,3776.2,22.4,14.1,312.5,529.7,...,,,,,,,,,,
3,4,294.5,Russell Wilson,Sea,QB,3695.4,25.8,10.1,311.7,503.7,...,,,,,,,,,,
4,5,288.4,Andrew Luck,Ind,QB,4310.1,28.1,16.2,367.1,586.2,...,,,,,,,,,,


#### Get CBS Sports Points Projections for Year

In [6]:
def scrape_CBS_preseason_projections():
    ###GET PROJECTIONS FROM CBS###
    #CBS has separate tables for each position, so need to cycle through them
    #but url can return all list so don't need to go page by page
    cbs_proj = pd.DataFrame()
    
    positions = ['QB', 'RB', 'WR', 'TE', 'K', 'DST']
    header_row_index = {'QB':2, 'RB':2, 'WR':2, 'TE':2, 'K':1, 'DST':1}
       
    for position in positions:
        #url just needs to change position
        url = f"https://www.cbssports.com/fantasy/football/stats/sortable/points/{position}/standard/projections/2018/ytd?&print_rows=9999"
    
        #read in the table from CBS by class, and use the 2nd row index for column header
        cbs_proj_pos = pd.read_html(url, 
                                  attrs={'class': 'data'}, #return only the table of this class, which has the player data
                                  header=[header_row_index[position]])[0] #returns table in a list, so get table
        cbs_proj_pos['POS'] = position
        
        #add the table to the overall df
        cbs_proj = pd.concat([cbs_proj, cbs_proj_pos], 
                             ignore_index=True, 
                             sort=False)

    #some tables include the page selector as the bottom row of the table,
    #so need to find the index values of those rows and then drop them from the table
    index_pages_rows = list(cbs_proj[cbs_proj['Player'].str.contains('Pages')].index)
    cbs_proj.drop(index_pages_rows, axis='index', inplace=True)
    return cbs_proj    

In [7]:
###FORMATTING/REORDERING###
def format_reduce_CBS_preseason_projections(df_cbs_proj):
    #calculate completion percentage
    df_cbs_proj['COMPLETION_PERCENTAGE'] = df_cbs_proj.CMP/df_cbs_proj.ATT


    #rename some of columns so don't lose meaning
    df_cbs_proj.rename(columns={'ATT':'PASS_ATT', 'CMP':'PASS_COMP', 'COMPLETION_PERCENTAGE': 'PASS_COMP_PCT',
                                       'YD': 'PASS_YD', 'TD':'PASS_TD', 'INT':'PASS_INT', 'RATE':'PASS_RATE', 
                                       'ATT.1': 'RUSH_ATT', 'YD.1': 'RUSH_YD', 'AVG': 'RUSH_AVG', 'TD.1':'RUSH_TD',
                                       'TARGT': 'RECV_TARGT', 'RECPT': 'RECV_RECPT', 'YD.2':'RECV_YD', 'AVG.1':'RECV_AVG', 'TD.2':'RECV_TD',
                                       'FPTS':'PTS',
                                       'FG':'KICK_FG', 'FGA': 'KICK_FGAtt', 'XP':'KICK_XP', 'XPAtt':'KICK_XPAtt', 
                                       'Int':'D/ST_INT', 'Sty':'D/ST_Sty', 'Sack':'D/ST_Sack', 'TK':'D/ST_TK',
                                       'DFR':'D/ST_FR', 'FF':'D/ST_FF', 'DTD':'D/ST_TD',
                                       'Pa':'D/ST_PtsAll', 'PaNetA':'D/ST_PaYdA', 'RuYdA':'D/ST_RuYdA', 'TyDa':'D/ST_ToYdA'},
                                inplace=True)


    #calculate passing, rushing, total yards/game
    df_cbs_proj['D/ST_PaYd/G'] = df_cbs_proj['D/ST_PaYdA']/16
    df_cbs_proj['D/ST_RuYd/G'] = df_cbs_proj['D/ST_RuYdA']/16
    df_cbs_proj['D/ST_ToYd/G'] = df_cbs_proj['D/ST_ToYdA']/16


    #split out player, team
    def split_player_team(play_team):
        #incoming string for players: 'Todd Gurley, LAR'
        #incoming string for DST: 'Jaguars, JAC'

        #operations if D/ST (can tell if there is only two items in a list separated by a space, instead of three)
        if len(play_team.split()) == 2:
            player = play_team.split(',')[0] + ' D/ST'
            team = play_team.split(',')[1]

        #operations for regular players
        else:
            player = play_team.split(',')[0]
            team = play_team.split(',')[1]

        return player, team

    df_cbs_proj[['PLAYER', 'TEAM']] = df_cbs_proj.apply(
                                                    lambda x: split_player_team(x['Player']),
                                                    axis='columns',
                                                    result_type='expand')


    #reorder columns and don't keep all of them
    df_cbs_proj = df_cbs_proj[['PTS','PLAYER', 'POS', 'TEAM', 'PASS_ATT', 'PASS_COMP', 'PASS_COMP_PCT',
                                             'PASS_YD', 'PASS_TD', 'PASS_INT', 'PASS_RATE',
                                             'RUSH_ATT', 'RUSH_YD', 'RUSH_AVG', 'RUSH_TD',
                                             'RECV_TARGT', 'RECV_RECPT', 'RECV_YD', 'RECV_AVG', 'RECV_TD',
                                             'KICK_FG', 'KICK_FGAtt', 'KICK_XP', 'KICK_XPAtt',
                                             'D/ST_INT', 'D/ST_Sty', 'D/ST_Sack', 'D/ST_TK', 'D/ST_FR', 'D/ST_FF',
                                             'D/ST_TD', 'D/ST_PtsAll', 'D/ST_PaYdA', 'D/ST_RuYdA', 'D/ST_ToYdA',
                                             'D/ST_PaYd/G', 'D/ST_RuYd/G', 'D/ST_ToYd/G']].sort_values('PTS', ascending=False)


    return df_cbs_proj

In [8]:
###CALL SCRAPING AND FORMATTING###
df_cbs_projections = scrape_CBS_preseason_projections()
df_cbs_projections = format_reduce_CBS_preseason_projections(df_cbs_projections)
print(df_cbs_projections.shape)
df_cbs_projections.head()

(562, 38)


Unnamed: 0,PTS,PLAYER,POS,TEAM,PASS_ATT,PASS_COMP,PASS_COMP_PCT,PASS_YD,PASS_TD,PASS_INT,...,D/ST_FR,D/ST_FF,D/ST_TD,D/ST_PtsAll,D/ST_PaYdA,D/ST_RuYdA,D/ST_ToYdA,D/ST_PaYd/G,D/ST_RuYd/G,D/ST_ToYd/G
0,361.0,Deshaun Watson,QB,HOU,561.0,335.0,0.597148,3992.0,27.0,20.0,...,,,,,,,,,,
1,357.0,Aaron Rodgers,QB,GB,600.0,391.0,0.651667,3858.0,32.0,10.0,...,,,,,,,,,,
2,349.0,Tom Brady,QB,NE,617.0,396.0,0.641815,4651.0,31.0,10.0,...,,,,,,,,,,
3,336.0,Russell Wilson,QB,SEA,560.0,340.0,0.607143,3790.0,25.0,18.0,...,,,,,,,,,,
4,330.0,Andrew Luck,QB,IND,584.0,367.0,0.628425,4189.0,26.0,16.0,...,,,,,,,,,,


#### Get NFL Points Projections for Year

In [9]:
#scrape all NFL.com projections for preseason yearlong projectison
def scrape_NFL_preseason_projections():
    #instantiate selenium driver
    driver = instantiate_selenium_driver()
    
    #initialize dataframe for all data
    nfl_proj = pd.DataFrame()
    
    #url that returns info has different code for each position and also takes year variable
    position_url_codes = {'QB':1, 'RB':2, 'WR':3, 'TE':4, 'K':7, 'DEF':8 }
    statSeason = 2018
    #if want a weekly projection, at the end of web_url will need to add '&statWeek=1' when want the weekly projection

    #cycle through each position webpage to create comprehensive dataframe
    for pos, url_code in position_url_codes.items():
        url_start_pos = f"http://fantasy.nfl.com/research/projections?position={url_code}&sort=projectedPts&statCategory=projectedStats&statSeason={statSeason}&statType=seasonProjectedStats"
        driver.get(url_start_pos)
        
        #each page only gets 25 results, so cycle through next button until next button no longer exists
        while True:
            #read in the table from NFL, by using the class, and use the 1st row index for column headers
            nfl_proj_table_page = pd.read_html(driver.page_source,
                                               attrs={'class': 'tableType-player'}, #return only the table of this class, which has the player data
                                               header=[1])[0] #returns table in a list, so get table by going to first index

            #need to rename DEF columns so don't go to wrong columns
            if pos == 'DEF':
                nfl_proj_table_page.rename(columns={'Team':'Player', 'Sack':'D/ST_Sack',
                                                    'Int':'D/ST_INT', 'Fum Rec':'D/ST_FR',
                                                    'Saf':'D/ST_Sty', 'TD': 'D/ST_TD',
                                                    'TD.1':'D/ST_RET_TD', 'Pts Allow':'D/ST_PtsAll'},
                                           inplace=True)
                      
            nfl_proj = pd.concat([nfl_proj, nfl_proj_table_page],
                                    ignore_index=True,
                                    sort=False)
            
            
            #click to next page to get next 25 results, but check that next button exists
            try:
                next_button = driver.find_element_by_class_name("next")
                next_button.click()
            except EC.NoSuchElementException:
                break
    
    driver.quit()
    return nfl_proj

In [10]:
###FORMATTING/REORDERING###
def format_reduce_NFL_preseason_projections(df_nfl_proj):
    #doesn't have attempts and completions - so can't calculate completion percentage

    #rename some of columns so don't lose meaning
    df_nfl_proj.rename(columns={'Yds':'PASS_YD', 'TD':'PASS_TD', 'Int':'PASS_INT',
                                       'Yds.1':'RUSH_YD', 'TD.1':'RUSH_TD',
                                       'Yds.2':'RECV_YD', 'TD.2':'RECV_TD',
                                       'Lost':'FUM_LOST', 
                                       'Made':'KICK_XP', '0-19':'KICK_FG_0-19', '20-29':'KICK_FG_20-29',
                                       '30-39':'KICK_FG_30-39', '40-49':'KICK_FG_40-49',
                                       '50+':'KICK_FG_50+'}, 
                                       inplace=True)


    #get total FG made, summing up all the different yardages
    df_nfl_proj['KICK_FG'] = df_nfl_proj[['KICK_FG_0-19', 'KICK_FG_20-29', 'KICK_FG_30-39',
                                                       'KICK_FG_40-49', 'KICK_FG_50+']].sum(axis='columns')


    #split out player, team, position
    def split_player_team_pos_nfl(play_team_pos):
        #incoming string for players: 'Deshaun Watson QB - HOU View Videos'
        #incoming string for DEF: 'New York Jets DEF View Videos'

        #some players have View Videos or News, others don't, so first strip off those
        play_team_pos = play_team_pos.split(' View')[0]

        #operations if DEF
        if "DEF" in play_team_pos:
            player = play_team_pos.split(' DEF')[0] + ' D/ST'
            team = play_team_pos.split(' DEF')[0]
            pos = 'D/ST'

        #operations for regular players
        else:
            split = play_team_pos.split()        
            player = " ".join(split[0:2])
            pos = split[2]
            #free agents have no team listed, so add in FA for free agent
            if len(split) < 5:
                team = 'FA'
            else: 
                team = split[4]

        return player, team, pos

    df_nfl_proj[['PLAYER', 'TEAM', 'POS']] = df_nfl_proj.apply(
                                                    lambda x: split_player_team_pos_nfl(x['Player']),
                                                    axis='columns',
                                                    result_type='expand')


    #reorder columns and don't keep all of them
    df_nfl_proj = df_nfl_proj[['Points', 'PLAYER', 'POS', 'TEAM', 'PASS_YD', 'PASS_TD', 'PASS_INT',
                                             'RUSH_YD', 'RUSH_TD', 'RECV_YD', 'RECV_TD',  
                                             'KICK_XP', 'KICK_FG', 'KICK_FG_0-19', 'KICK_FG_20-29', 'KICK_FG_30-39',
                                             'KICK_FG_40-49', 'KICK_FG_50+', 
                                             'D/ST_Sack', 'D/ST_INT', 'D/ST_FR', 'D/ST_Sty', 'D/ST_TD',
                                             'D/ST_RET_TD', 'D/ST_PtsAll']].sort_values('Points', ascending=False)
    return df_nfl_proj

In [11]:
#call NFL scraping function into df
df_nfl_projections = scrape_NFL_preseason_projections()
df_nfl_projections = format_reduce_NFL_preseason_projections(df_nfl_projections)
print(df_nfl_projections.shape)
df_nfl_projections.head()

(1075, 25)


Unnamed: 0,Points,PLAYER,POS,TEAM,PASS_YD,PASS_TD,PASS_INT,RUSH_YD,RUSH_TD,RECV_YD,...,KICK_FG_30-39,KICK_FG_40-49,KICK_FG_50+,D/ST_Sack,D/ST_INT,D/ST_FR,D/ST_Sty,D/ST_TD,D/ST_RET_TD,D/ST_PtsAll
0,336.81,Deshaun Watson,QB,HOU,4112.1,34.63,14.12,435.09,4.07,-,...,,,,,,,,,,
25,336.81,Deshaun Watson,QB,HOU,4112.1,34.63,14.12,435.09,4.07,-,...,,,,,,,,,,
1,327.55,Aaron Rodgers,QB,GB,4348.78,33.98,9.61,278.63,2.11,-,...,,,,,,,,,,
26,327.55,Aaron Rodgers,QB,GB,4348.78,33.98,9.61,278.63,2.11,-,...,,,,,,,,,,
2,314.32,Tom Brady,QB,NE,4914.07,31.81,8.11,72.42,0.65,-,...,,,,,,,,,,


#### Get FFtoday Points Projections for Year

In [12]:
#scrape all FFtoday.com projections for preseason yearlong projections
def scrape_FFtoday_preseason_projections():
    #instantiate selenium driver
    driver = instantiate_selenium_driver()
    
    #initialize dataframe for all data
    ff_proj = pd.DataFrame()
    
    #url that returns info has different code for each position and also takes year variable
    position_ids = {'QB':10, 'RB':20, 'WR':30, 'TE':40, 'K':80, 'DEF':99}
    Season = 2018
    #if want weekly projections just add in GameWeek string/variable into the url: f"http://www.fftoday.com/rankings/playerwkproj.php?Season={Season}&GameWeek={GameWeek}&PosID={pos_id}"

    #cycle through each position webpage to create comprehensive dataframe
    for pos, pos_id in position_ids.items():
        url_start_pos = f"http://www.fftoday.com/rankings/playerproj.php?Season={Season}&PosID={pos_id}"
        driver.get(url_start_pos)
        
        #each page only gets 50 results, so cycle through next button until next button no longer exists
        while True:
            #read in table - no classes for tables so just need to find the right table in the list of tables from the page - 5th index
            ff_proj_table_page = pd.read_html(driver.page_source, header=[1])[5]
            
            ff_proj_table_page['POS'] = pos
            
            #need to rename columns for different positions before concat because of differing column conventions
            if pos == 'QB':
                ff_proj_table_page.rename(columns={'Player  Sort First: Last:':'PLAYER',
                                                  'Comp':'PASS_COMP', 'Att': 'PASS_ATT', 'Yard':'PASS_YD',
                                                  'TD':'PASS_TD', 'INT':'PASS_INT',
                                                  'Att.1':'RUSH_ATT', 'Yard.1':'RUSH_YD', 'TD.1':'RUSH_TD'},
                                         inplace=True)
            elif pos == 'RB':
                ff_proj_table_page.rename(columns={'Player  Sort First: Last:':'PLAYER',
                                                  'Att': 'RUSH_ATT', 'Yard':'RUSH_YD', 'TD':'RUSH_TD',
                                                   'Rec':'RECV_RECPT', 'Yard.1':'RECV_YD', 'TD.1':'RECV_TD'},
                                         inplace=True)
                
            elif pos == 'WR':
                ff_proj_table_page.rename(columns={'Player  Sort First: Last:':'PLAYER',
                                                  'Rec':'RECV_RECPT', 'Yard':'RECV_YD', 'TD':'RECV_TD',
                                                  'Att':'RUSH_ATT', 'Yard.1':'RUSH_YD', 'TD.1':'RUSH_TD'},
                                         inplace=True)
            
            elif pos == 'TE':
                ff_proj_table_page.rename(columns={'Player  Sort First: Last:':'PLAYER',
                                                  'Rec':'RECV_RECPT', 'Yard':'RECV_YD', 'TD':'RECV_TD'},
                                         inplace=True)
                
            elif pos == 'K':
                ff_proj_table_page.rename(columns={'Player  Sort First: Last:':'PLAYER',
                                                  'FGM':'KICK_FG', 'FGA':'KICK_FGAtt', 'FG%':'KICK_FG%',
                                                  'EPM':'KICK_XP', 'EPA':'KICK_XPAtt'},
                                         inplace=True)
            
            elif pos == 'DEF':
                ff_proj_table_page['PLAYER'] = ff_proj_table_page['Team'] + ' D/ST' #add player name with team name plus D/ST tag
                ff_proj_table_page.rename(columns={'Sack':'D/ST_Sack', 'FR':'D/ST_FR', 'DefTD':'D/ST_TD', 'INT':'D/ST_INT',
                                                   'PA':'D/ST_PtsAll', 'PaYd/G':'D/ST_PaYd/G', 'RuYd/G':'D/ST_RuYd/G',
                                                   'Safety':'D/ST_Sty', 'KickTD':'D/ST_RET_TD'},
                                         inplace=True)
            
            
            #add the position/page data to overall df
            ff_proj = pd.concat([ff_proj, ff_proj_table_page],
                                    ignore_index=True,
                                    sort=False)
            
            
            #click to next page to get next 50 results, but check that next button exists
            try:
                next_button = driver.find_element_by_link_text("Next Page")
                next_button.click()
            except EC.NoSuchElementException:
                break
    
    
    driver.quit()
    return ff_proj

In [13]:
###FORMATTING/REORDERING###
def format_reduce_FFtoday_preseason_projections(df_fft_proj):
    #calculate completion percentage
    df_fft_proj['PASS_COMP_PCT'] = df_fft_proj.PASS_COMP/df_fft_proj.PASS_ATT


    #calculate total PaYd and RuYd for season
    df_fft_proj['D/ST_PaYdA'] = df_fft_proj['D/ST_PaYd/G'] * 16
    df_fft_proj['D/ST_RuYdA'] = df_fft_proj['D/ST_RuYd/G'] * 16
    df_fft_proj['D/ST_ToYd/G'] = df_fft_proj['D/ST_PaYd/G'] + df_fft_proj['D/ST_RuYd/G']
    df_fft_proj['D/ST_ToYdA'] = df_fft_proj['D/ST_ToYd/G'] * 16


    #rename some of outstanding columns to match other dfs
    df_fft_proj.rename(columns={'Team':'TEAM'},
                             inplace=True)


    #reorder columns and don't keep all of them
    df_fft_proj = df_fft_proj[['FPts', 'PLAYER', 'POS', 'TEAM', 
                                             'PASS_YD', 'PASS_TD', 'PASS_INT', 'PASS_COMP', 'PASS_ATT', 'PASS_COMP_PCT',
                                             'RUSH_ATT', 'RUSH_YD', 'RUSH_TD',
                                             'RECV_RECPT', 'RECV_YD', 'RECV_TD',
                                             'KICK_XP', 'KICK_XPAtt', 'KICK_FG', 'KICK_FGAtt', 'KICK_FG%',
                                             'D/ST_Sack', 'D/ST_FR', 'D/ST_INT', 'D/ST_Sty', 'D/ST_TD',
                                             'D/ST_RET_TD', 'D/ST_PtsAll', 'D/ST_PaYd/G', 'D/ST_RuYd/G', 'D/ST_ToYd/G',
                                              'D/ST_PaYdA', 'D/ST_RuYdA', 'D/ST_ToYdA']].sort_values('FPts', ascending=False)

    return df_fft_proj

In [14]:
#call FFtoday scraping and reduce/format function into df
df_fft_projections = scrape_FFtoday_preseason_projections()
df_fft_projections = format_reduce_FFtoday_preseason_projections(df_fft_projections)
print(df_fft_projections.shape)
df_fft_projections.head()

(447, 34)


Unnamed: 0,FPts,PLAYER,POS,TEAM,PASS_YD,PASS_TD,PASS_INT,PASS_COMP,PASS_ATT,PASS_COMP_PCT,...,D/ST_Sty,D/ST_TD,D/ST_RET_TD,D/ST_PtsAll,D/ST_PaYd/G,D/ST_RuYd/G,D/ST_ToYd/G,D/ST_PaYdA,D/ST_RuYdA,D/ST_ToYdA
0,407.2,Aaron Rodgers,QB,GB,4338.0,36.0,8.0,391.0,611.0,0.639935,...,,,,,,,,,,
1,373.2,Russell Wilson,QB,SEA,3880.0,28.0,10.0,325.0,524.0,0.620229,...,,,,,,,,,,
2,355.7,Tom Brady,QB,NE,4350.0,32.0,9.0,359.0,544.0,0.659926,...,,,,,,,,,,
3,352.1,Cam Newton,QB,CAR,3425.0,21.0,13.0,302.0,511.0,0.590998,...,,,,,,,,,,
4,349.5,Drew Brees,QB,NO,4585.0,28.0,15.0,400.0,580.0,0.689655,...,,,,,,,,,,


In [15]:
test = time.localtime()
print(test.tm_year, test.tm_mon, test.tm_mday, test.tm_hour, test.tm_min)

2018 9 3 7 43


In [16]:
def save_to_pickle(df, source):
    lt = time.localtime()
    file_name = f"df_{source}_projections_{lt.tm_year}-{lt.tm_mon}-{lt.tm_mday}-{lt.tm_hour}-{lt.tm_min}.pkl"
    dir_name = "pickle_archive/"
    path = dir_name + file_name
    df.to_pickle(path)
    print(f"Pickle saved to: {path}")

In [17]:
#save all the dataframes to pickles for easy access later, when cache gets cleared/other notebooks
save_to_pickle(df_espn_projections, 'espn')
save_to_pickle(df_cbs_projections, 'cbs')
save_to_pickle(df_nfl_projections, 'nfl')
save_to_pickle(df_fft_projections, 'fft')

Pickle saved to: pickle_archive/df_espn_projections_2018-9-3-7-43.pkl
Pickle saved to: pickle_archive/df_cbs_projections_2018-9-3-7-43.pkl
Pickle saved to: pickle_archive/df_nfl_projections_2018-9-3-7-43.pkl
Pickle saved to: pickle_archive/df_fft_projections_2018-9-3-7-43.pkl


In [18]:
#check all column namings for errors/inconsistencies
print(df_espn_projections.columns)
print(df_cbs_projections.columns)
print(df_nfl_projections.columns)
print(df_fft_projections.columns)

Index(['RNK', 'PTS', 'PLAYER', 'TEAM', 'POS', 'PASS_YD', 'PASS_TD', 'PASS_INT',
       'PASS_COMP', 'PASS_ATT', 'PASS_COMP_PCT', 'RUSH_ATT', 'RUSH_YD',
       'RUSH_TD', 'RECV_RECPT', 'RECV_YD', 'RECV_TD', 'KICK_FG_1-39',
       'KICK_FG_40-49', 'KICK_FG_50+', 'KICK_FG', 'KICK_FGAtt', 'KICK_FG%',
       'KICK_XP', 'KICK_XPAtt', 'D/ST_Sack', 'D/ST_FF', 'D/ST_FR', 'D/ST_INT',
       'D/ST_TD'],
      dtype='object')
Index(['PTS', 'PLAYER', 'POS', 'TEAM', 'PASS_ATT', 'PASS_COMP',
       'PASS_COMP_PCT', 'PASS_YD', 'PASS_TD', 'PASS_INT', 'PASS_RATE',
       'RUSH_ATT', 'RUSH_YD', 'RUSH_AVG', 'RUSH_TD', 'RECV_TARGT',
       'RECV_RECPT', 'RECV_YD', 'RECV_AVG', 'RECV_TD', 'KICK_FG', 'KICK_FGAtt',
       'KICK_XP', 'KICK_XPAtt', 'D/ST_INT', 'D/ST_Sty', 'D/ST_Sack', 'D/ST_TK',
       'D/ST_FR', 'D/ST_FF', 'D/ST_TD', 'D/ST_PtsAll', 'D/ST_PaYdA',
       'D/ST_RuYdA', 'D/ST_ToYdA', 'D/ST_PaYd/G', 'D/ST_RuYd/G',
       'D/ST_ToYd/G'],
      dtype='object')
Index(['Points', 'PLAYER', 'POS', 'TEAM',

#### Need to Store Data to Database
Use MySQL database on PythonAnywhere

In [19]:
# Databases:
# mtsuomi$fantasy_football

# create tables for each of the different preseason data
# should add columns for season year, date scraped in those tables too