# Prep data for modeling by pulling current season in realtime
### NOTE: this requires an updated history file, which must be manually refreshed each season.

In [None]:
# Refresh the history file. Only run this cell when the history file needs
# updating; the weekly workflow below does not require it.
from jobs import SRjobs
SRjobs.create_history()

### Run the next sections every week, or every time you add new feature engineering

1. Create past_games df from history and in-season scrape
2. Apply basic features (game_conf, SRTD, CRTD) to past_games df
3. Update the GL files (offense and defense)
4. Apply GL features to past_games df
5. Apply basic and GL features to future games

In [1]:
from jobs import SRjobs, SRfeatures, SRgamelogs
import pandas as pd
import importlib

# PAST GAMES
# Scrape and/or load everything needed from the website/disk.
# schedule_mstr holds the games already played in the current season.
schedule_mstr = SRjobs.get_season_played_games()
# history holds the previous seasons (presumably read from the history file).
history = SRjobs.get_history()

# Combine previous seasons with the current season's played games into one
# frame; features are applied to it in the following cells.
past_games = pd.concat([history, schedule_mstr])

Initiate get_season_played_games
Finished scraping 2018 schedule with 884 rows
Successfully loaded 845.0 played games from 2018


In [2]:
# Add the basic features to the played games. The cell output lists each feature
# as it is added (Game_conf, then SRTD and CRTD for Team and Opp).
print('Applying features to games already played')
past_games = SRfeatures.apply_features(past_games)

Applying features to games already played
Added Game_conf
Added Team_SRTD
Added Team_CRTD
Added Opp_SRTD
Added Opp_CRTD


In [3]:
# Checkpoint: write the played games with their basic features to disk.
past_games.to_csv('/Users/markvonoven/Projects/CollegeFootball/SRoutput/past_games_w_features.csv')

In [4]:
# Reload the basic-features checkpoint, restore the (Season, Gamecode) index,
# parse the Date column and order the games chronologically.
past_games = (
    pd.read_csv('/Users/markvonoven/Projects/CollegeFootball/SRoutput/past_games_w_features.csv')
    .set_index(['Season', 'Gamecode'])
    .assign(Date=lambda games: pd.to_datetime(games['Date']))
    .sort_values(by=['Date'])
)

Update the game-log (GL) files for offense and defense (O&D) here.

In [5]:
# Add the raw GL stats for all played games and save the result as a checkpoint.
# The cell output reports the Team offense/defense sum and mean stats being updated.
past_games_w_GLstats = SRgamelogs.add_GLstats(past_games)
past_games_w_GLstats.to_csv('/Users/markvonoven/Projects/CollegeFootball/SRoutput/past_games_w_GLstats.csv')

Updated Team_Off_sumstats
Updated Team_Off_meanstats
Updated Team_Def_sumstats
Updated Team_Def_meanstats
Finished updating the GLstats for the new past_games


In [6]:
# Add the GL features (OffStrat and DefStren for Team and Opp, per the cell output)
# using the raw GL stats created in the previous cell.
# NOTE(review): the features are applied to past_games, not to the
# past_games_w_GLstats frame returned by add_GLstats -- presumably SRGLfeatures
# reads the updated GL stats itself; confirm.
from jobs import SRGLfeatures
past_games = SRGLfeatures.apply_features(past_games)

Added Team_OffStrat
Added Opp_OffStrat
Added Team_DefStren
Added Opp_DefStren


In [7]:
# Checkpoint: write the played games with basic and GL features to disk.
past_games.to_csv('/Users/markvonoven/Projects/CollegeFootball/SRoutput/past_games_w_SRGLfeatures.csv')

In [None]:
# Add strength-of-schedule (SOS) ratings for both sides of every played game.
from jobs import SRSOS


def _sos_for(side):
    """Build a row function returning the SOS of the team in column `side` as of the game's Date."""
    return lambda game: SRSOS.strength_of_schedule(game[side], game['Date'])


past_games['Team_SOS'] = past_games.apply(_sos_for('Team'), axis=1)
print('Added Team_SOS')
past_games['Opp_SOS'] = past_games.apply(_sos_for('Opp'), axis=1)
print('Added Opp_SOS')

In [None]:
# Checkpoint: write the played games, now including the SOS columns, to disk.
past_games.to_csv('/Users/markvonoven/Projects/CollegeFootball/SRoutput/past_games_w_SOS.csv')

In [10]:
# Shortcut: reload the SOS checkpoint from disk, restore the (Season, Gamecode)
# index, parse Date and sort chronologically.
# NOTE(review): the original comment read "ignore this later!!!" -- presumably this
# cell is a temporary stand-in for re-running the feature cells above; confirm.
past_games = (
    pd.read_csv('/Users/markvonoven/Projects/CollegeFootball/SRoutput/past_games_w_SOS.csv')
    .set_index(['Season', 'Gamecode'])
    .assign(Date=lambda games: pd.to_datetime(games['Date']))
    .sort_values(by=['Date'])
)
from jobs import SRSOS

In [11]:
# Scrape and/or load the remaining (unplayed) games of the current season.
future_games = SRjobs.get_season_future_games()

# Add the potential national championship matchups to the list of future games.
date = pd.Timestamp(2019, 1, 7, 18)

# Each team carries the same rank value in every row it appears in:
# Alabama 0.96, Clemson 0.92, Notre Dame 0.88, Oklahoma 0.84.
# (Oklahoma's Team_rank in the last row was previously 0.92, Clemson's value.)
# NOTE(review): these Gamecodes use an unpadded month/day ('201917'), unlike the
# scraped codes seen in the output (e.g. '20181215LouTul'); confirm this is intended.
df2 = pd.DataFrame(
    [[2018, '201917AlaCle', date, 'Alabama', 'Clemson', 'TBD', 'Neutral', 0.96, 0.92],
     [2018, '201917AlaNot', date, 'Alabama', 'Notre Dame', 'TBD', 'Neutral', 0.96, 0.88],
     [2018, '201917CleOkl', date, 'Clemson', 'Oklahoma', 'TBD', 'Neutral', 0.92, 0.84],
     [2018, '201917OklNot', date, 'Oklahoma', 'Notre Dame', 'TBD', 'Neutral', 0.84, 0.88]],
    columns=['Season', 'Gamecode', 'Date', 'Team', 'Opp', 'Won', 'Game_home', 'Team_rank', 'Opp_rank'])

df2.set_index(['Season', 'Gamecode'], inplace=True)
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
future_games = pd.concat([future_games, df2], sort=False)

# History frame used to derive the basic features of the future games.
# NOTE(review): past_games_w_GLstats is created by the add_GLstats cell, so that
# cell must have run in this kernel session; reloading past_games from CSV alone
# does not define it.
full_history = past_games_w_GLstats.copy()

# Apply basic, GL and SOS features to the future games.
print('Applying features to future games')
future_games = SRfeatures.apply_future_features(future_games, full_history)
future_games = SRGLfeatures.apply_features(future_games)
future_games['Team_SOS'] = future_games.apply(lambda x: SRSOS.strength_of_schedule(x['Team'], x['Date']), axis=1)
future_games['Opp_SOS'] = future_games.apply(lambda x: SRSOS.strength_of_schedule(x['Opp'], x['Date']), axis=1)

Initiate get_season_future_games
Finished scraping 2018 schedule with 884 rows
Successfully loaded future_mstr with 39 remaining games in 2018
Applying features to future games
Added Game_conf
Added Team_SRTD
Added Team_CRTD
Added Opp_SRTD
Added Opp_CRTD
Added Team_OffStrat
Added Opp_OffStrat
Added Team_DefStren
Added Opp_DefStren


In [None]:
# Spot-check the most recent played games after feature engineering.
past_games.tail()

In [None]:
# Spot-check the last ten future games, which include the manually added championship matchups.
future_games.tail(10)

### Normalize any features which require it

In [12]:
def nz_num(num, feature, reference=None):
    """Min-max scale ``num`` against the range of ``feature`` in the played games.

    Args:
        num: Raw value(s) to scale; a scalar or a pandas Series.
        feature: Name of the column whose minimum and maximum define the scale.
        reference: DataFrame supplying that column. Defaults to the notebook-level
            ``past_games`` frame.

    Returns:
        ``(num - min) / (max - min)``. Inputs outside the reference range map
        outside [0, 1]. A constant reference column (max == min) is not handled
        specially.
    """
    source = past_games if reference is None else reference
    # Series.min/max skip NaN, unlike the built-in min/max used previously.
    low = source[feature].min()
    high = source[feature].max()
    return (num - low) / (high - low)


def nz_features(feature_list, past=None, future=None):
    """Replace each listed feature with a min-max normalized ``<feature>_nz`` column.

    Both frames are scaled with the range observed in the played games, so the
    future games share the same scale. The frames are modified in place: the
    ``_nz`` column is added and the raw column is dropped.

    Args:
        feature_list: Names of the columns to normalize.
        past: Played-games frame; defaults to the notebook-level ``past_games``.
        future: Future-games frame; defaults to the notebook-level ``future_games``.
    """
    past = past_games if past is None else past
    future = future_games if future is None else future
    for feature in feature_list:
        # Scale whole columns at once; a per-row apply would recompute the
        # column min and max for every single row.
        past_scaled = nz_num(past[feature], feature, reference=past)
        future_scaled = nz_num(future[feature], feature, reference=past)

        past[feature + '_nz'] = past_scaled
        print('Normalized ' + feature + ' on past_games')
        future[feature + '_nz'] = future_scaled
        print('Normalized ' + feature + ' on future_games')
        past.drop(feature, axis=1, inplace=True)
        future.drop(feature, axis=1, inplace=True)

# GL features to rescale; nz_features replaces each with a '<name>_nz' column
# in both past_games and future_games.
feature_list = ['Team_OffStrat', 'Opp_OffStrat', 'Team_DefStren', 'Opp_DefStren']

nz_features(feature_list)

Normalized Team_OffStrat on past_games
Normalized Team_OffStrat on future_games
Normalized Opp_OffStrat on past_games
Normalized Opp_OffStrat on future_games
Normalized Team_DefStren on past_games
Normalized Team_DefStren on future_games
Normalized Opp_DefStren on past_games
Normalized Opp_DefStren on future_games


In [13]:
# One-hot encode the venue column of the played games into 'Game_<value>' indicator columns.
past_games = pd.get_dummies(past_games, prefix=['Game'], columns=['Game_home'])

In [14]:
# One-hot encode the venue column of the future games; only the values actually
# present in Game_home produce a 'Game_<value>' column.
future_games = pd.get_dummies(future_games, prefix=['Game'], columns=['Game_home'])

In [15]:
# just because bowl games have no home or away games
future_games['Game_Home'] = 0
future_games['Game_Away'] = 0

In [16]:
# Final column order for the future games: identifiers and outcome, ranks,
# basic features, SOS, normalized GL features, then the venue indicators.
cols = [
    'Date', 'Team', 'Opp', 'Won',
    'Team_rank', 'Opp_rank',
    'Game_conf',
    'Team_SRTD', 'Team_CRTD', 'Opp_SRTD', 'Opp_CRTD',
    'Team_SOS', 'Opp_SOS',
    'Team_OffStrat_nz', 'Opp_OffStrat_nz', 'Team_DefStren_nz', 'Opp_DefStren_nz',
    'Game_Away', 'Game_Home', 'Game_Neutral',
]
future_games = future_games.loc[:, cols]

In [17]:
# Display the prepared future games for a visual check before saving.
future_games

Unnamed: 0_level_0,Unnamed: 1_level_0,Date,Team,Opp,Won,Team_rank,Opp_rank,Game_conf,Team_SRTD,Team_CRTD,Opp_SRTD,Opp_CRTD,Team_SOS,Opp_SOS,Team_OffStrat_nz,Opp_OffStrat_nz,Team_DefStren_nz,Opp_DefStren_nz,Game_Away,Game_Home,Game_Neutral
Season,Gamecode,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018,20181215LouTul,2018-12-15 13:30:00,Louisiana,Tulane,TBD,0.0,0.0,0,0.538,0.556,0.5,0.714,0.537,0.54,0.1575,0.0,0.435206,0.439065,0,0,1
2018,20181215NorUta,2018-12-15 14:00:00,North Texas,Utah State,TBD,0.0,0.0,0,0.75,0.571,0.833,0.875,0.421,0.46,0.27975,0.22325,0.442032,0.43871,0,0,1
2018,20181215AriFre,2018-12-15 15:30:00,Arizona State,Fresno State,TBD,0.0,0.24,0,0.583,0.556,0.846,0.889,0.517,0.514,0.19475,0.222,0.437574,0.438535,0,0,1
2018,20181215EasGeo,2018-12-15 17:30:00,Eastern Michigan,Georgia Southern,TBD,0.0,0.0,0,0.583,0.625,0.75,0.75,0.497,0.512,0.1975,0.046,0.433097,0.437574,0,0,1
2018,20181215AppMid,2018-12-15 21:00:00,Appalachian State,Middle Tennessee State,TBD,0.0,0.0,0,0.833,0.889,0.615,0.778,0.525,0.535,0.1515,0.229,0.435994,0.43711,0,0,1
2018,20181218AlaNor,2018-12-18 19:00:00,Alabama-Birmingham,Northern Illinois,TBD,0.0,0.0,0,0.769,0.875,0.615,0.778,0.485,0.527,0.0,0.16325,0.436806,0.442245,0,0,1
2018,20181219OhiSan,2018-12-19 20:00:00,Ohio,San Diego State,TBD,0.0,0.0,0,0.667,0.75,0.583,0.5,0.489,0.521,0.14825,0.14975,0.439316,0.442084,0,0,1
2018,20181220MarSou,2018-12-20 20:00:00,Marshall,South Florida,TBD,0.0,0.0,0,0.667,0.714,0.583,0.375,0.486,0.499,0.2355,0.21375,0.442361,0.433529,0,0,1
2018,20181221FloTol,2018-12-21 12:30:00,Florida International,Toledo,TBD,0.0,0.0,0,0.667,0.714,0.583,0.625,0.438,0.492,0.19825,0.0,0.433987,0.437406,0,0,1
2018,20181221BriWes,2018-12-21 16:00:00,Brigham Young,Western Michigan,TBD,0.0,0.0,0,0.5,1.0,0.583,0.625,0.564,0.498,0.202,0.1675,0.438077,0.435865,0,0,1


In [18]:
# Write the prepared played games to the current input folder (path is relative to the working directory).
past_games.to_csv('./SRinput/current/past_games.csv')

In [19]:
# Write the prepared future games to the current input folder (path is relative to the working directory).
future_games.to_csv('./SRinput/current/future_games.csv')