### Scrape the SportsReference data

In [1]:
# import packages
import urllib3
import csv
from bs4 import BeautifulSoup
import re
import pandas as pd
import datetime

#settings
http = urllib3.PoolManager()
today = datetime.datetime.today()
G_year = today.year
# Seasons to load, as strings.  Derived from G_year so it always matches the
# two years the scrape loop writes (range(G_year-1, G_year+1)) instead of a
# hard-coded pair that silently goes stale.
yrs_list = [str(yr) for yr in range(G_year - 1, G_year + 1)]

#define some helper functions
def monthToNum(shortMonth):
    """Map a three-letter English month abbreviation (e.g. 'Sep') to 1-12.

    The lookup is case-sensitive; any other string raises KeyError.
    """
    abbreviations = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
    month_numbers = {abbr: number for number, abbr in enumerate(abbreviations, start=1)}
    return month_numbers[shortMonth]

def dateToTimeStamp(datestring, timestring):
    """Combine a schedule date and kickoff time into a datetime.datetime.

    datestring -- date in the form 'May 28, 2005' (three-letter month,
                  day, comma, four-digit year).
    timestring -- 12-hour clock time in the form '2:00 PM', or the
                  sentinel '0' meaning "time unknown".

    Returns midnight of the given date when timestring is '0', otherwise
    the date with the hour/minute converted to 24-hour time.
    Raises KeyError for an unknown month abbreviation and ValueError when
    the day, year, hour or minute cannot be parsed.
    """
    mon = monthToNum(datestring[0:3])
    year = int(datestring[-4:])
    # Whatever sits between the month and the year is the day plus a comma
    day = int(datestring[3:-4].strip().strip(','))
    if timestring == '0':
        return datetime.datetime(year, mon, day)

    colon = timestring.find(':')
    meridiem = timestring[-2:]
    hour = int(timestring[:colon])
    minute = int(timestring[colon + 1:colon + 3])
    if meridiem == 'PM' and hour != 12:
        hour += 12
    elif meridiem == 'AM' and hour == 12:
        # 12:xx AM is just after midnight, i.e. hour 0 on a 24-hour clock
        hour = 0
    return datetime.datetime(year, mon, day, hour, minute)

def winner_home(string):
    """Return 0 when the location marker is '@', and 1 for anything else."""
    return 0 if string == '@' else 1

def concat_mult_ref_tables(filename, yrs):
    """Load one header-less CSV per year and stack them into one DataFrame.

    filename -- base name of the per-year file; the file read for a year is
                './SRinput/preds/<yr>/<filename><yr>.csv'.
    yrs      -- iterable of years (strings or ints).

    Returns a DataFrame with a fresh 0..n-1 index and the schedule column
    names (Year, Weeknum, Date, Time, Day, Team, Team_Pts, At_sym, Opp,
    Opp_Pts, Notes).  The first CSV column already holds the year.
    Raises ValueError if `yrs` is empty (nothing to concatenate) or if the
    files do not have exactly eleven columns.
    """
    headerlist = ['Year', 'Weeknum', 'Date', 'Time', 'Day', 'Team', 'Team_Pts', 'At_sym', 'Opp', 'Opp_Pts', 'Notes']

    # Honour `filename` instead of hard-coding 'schedule' in the path
    df_list = [
        pd.read_csv('./SRinput/preds/' + str(yr) + '/' + filename + str(yr) + '.csv', header=None)
        for yr in yrs
    ]

    final_df = pd.concat(df_list, ignore_index=True)
    final_df.columns = headerlist

    return final_df

#scrape the data and write the .csv files
# One CSV per season: ./SRinput/preds/<year>/schedule<year>.csv (no header row).
for year in range(G_year-1, G_year+1):
    url = "http://www.sports-reference.com/cfb/years/"+str(year)+"-schedule.html"
    response = http.request('GET', url)
    if response.status != 200:
        # Fetch first, open the file later: a failed request must not
        # truncate a CSV written by a previous successful run.
        print("Skipping " + str(year) + " schedule: HTTP status " + str(response.status))
        continue
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(response.data, 'html.parser')
    cnt = 0
    # newline='' is what the csv module expects; utf-8 keeps non-ASCII
    # characters in team names writable on every platform.
    with open ('./SRinput/preds/' + str(year) + '/schedule' + str(year) + '.csv','w', newline='', encoding='utf-8') as csvfile:
        wrtr = csv.writer(csvfile, delimiter=',', quotechar='"')
        for row in soup.findAll('tr'):
            col1 = row.findAll('th')
            col = row.findAll('td')
            # A game row has a leading <th> cell and at least 11 <td> cells;
            # anything else (e.g. repeated header rows) is skipped explicitly
            # rather than via a blanket except.
            if not col1 or len(col) < 11:
                continue
            # td cells 0-8 in page order; cell 9 (TV) is not stored
            (Weeknum, Date, Time, Day, Winner,
             Pts, At_sym, Loser, Pts2) = (cell.get_text() for cell in col[:9])
            Notes = col[10].get_text()
            record = (year, Weeknum, Date, Time, Day, Winner, Pts, At_sym, Loser, Pts2, Notes)
            wrtr.writerow(record)
            cnt += 1
    print("Finished writing " + str(year) + " schedule with " + str(cnt) + " rows")



Finished writing 2017 schedule with 876 rows




Finished writing 2018 schedule with 832 rows


### Import the data and concatenate it to create the current schedule_mstr dataframe

In [2]:
def get_nz_rank(team):
    """Turn a '(N) Team' rank prefix into a score of 1 - N/25.

    Only the first four characters are searched for the parentheses, so
    the prefix is recognised for one- and two-digit ranks.  Names without
    such a prefix score 0.
    """
    prefix = team[:4]
    open_at = prefix.find('(')
    close_at = prefix.find(')')
    if close_at <= 0:
        return 0
    rank = int(team[open_at + 1:close_at])
    return 1 - (rank / 25)
    
def drop_rank(team):
    """Remove a leading '(N)' rank prefix from a team name.

    A prefix is recognised when ')' appears within the first four
    characters (but not as the very first one); otherwise the name is
    returned unchanged.
    """
    close_at = team[:4].find(')')
    if close_at <= 0:
        return team
    # strip() also removes the odd whitespace character that follows the
    # rank on ranked teams (presumably a non-breaking space - TODO confirm)
    return team[close_at + 1:].strip()
    
def flipper(flag):
    """Invert a 0/1 flag: 0 -> 1 and 1 -> 0.

    Any other value falls through and yields None.
    """
    if flag == 0:
        return 1
    return 0 if flag == 1 else None

In [3]:
schedule_mstr = concat_mult_ref_tables('schedule', yrs_list)
# Fill any missing Time values with 0 ('0' is the "time unknown" sentinel
# that dateToTimeStamp understands)
values = {'Time': '0'}
schedule_mstr = schedule_mstr.fillna(value = values)
# Adjust the date column to a true datetime dtype
schedule_mstr['Date'] = [
    dateToTimeStamp(date_str, time_str)
    for date_str, time_str in zip(schedule_mstr['Date'], schedule_mstr['Time'])
]
# Save the upcoming games for later.  .copy() makes future_mstr an
# independent frame, so the columns added to it further down do not trigger
# SettingWithCopy warnings.
future_mstr = schedule_mstr[schedule_mstr['Date'] > today].copy()
schedule_mstr = schedule_mstr[schedule_mstr['Date'] < today].copy()
# Drop Time column, no longer necessary
schedule_mstr = schedule_mstr.drop('Time', axis=1)
future_mstr = future_mstr.drop('Time', axis=1)
# Extract rankings where applicable from team names (must happen before the
# names are cleaned, because the rank lives in the name prefix)
schedule_mstr['Team_rank'] = schedule_mstr['Team'].apply(get_nz_rank)
schedule_mstr['Opp_rank'] = schedule_mstr['Opp'].apply(get_nz_rank)
# Create binary value for home games and drop @ symbol column
schedule_mstr['Game_home'] = schedule_mstr['At_sym'].apply(winner_home)
schedule_mstr = schedule_mstr.drop('At_sym', axis=1)
# Clean up team names
schedule_mstr['Team'] = schedule_mstr['Team'].apply(drop_rank)
schedule_mstr['Opp'] = schedule_mstr['Opp'].apply(drop_rank)
# Drop any cancelled games (games with NaN in the scores column)
canc_games_list = schedule_mstr[schedule_mstr['Team_Pts'].isnull()]
schedule_mstr = schedule_mstr.drop(canc_games_list.index.values.astype(int))
# Add won column before adding loser rows: at this point every row is
# written from the winner's side
schedule_mstr['Won'] = 1

In [5]:
# Spot-check one team: at this stage only the games it won are listed under 'Team'
schedule_mstr.loc[schedule_mstr['Team'] == 'Florida']

Unnamed: 0,Year,Weeknum,Date,Day,Team,Team_Pts,Opp,Opp_Pts,Notes,Team_rank,Opp_rank,Game_home,Won
178,2017,4,2017-09-16 00:00:00,Sat,Florida,26.0,Tennessee,20.0,"Ben Hill Griffin Stadium - Gainesville, Florida",0.04,0.08,1,1
237,2017,5,2017-09-23 00:00:00,Sat,Florida,28.0,Kentucky,27.0,"Kroger Field - Lexington, Kentucky",0.2,0.0,0,1
301,2017,6,2017-09-30 00:00:00,Sat,Florida,38.0,Vanderbilt,24.0,"Ben Hill Griffin Stadium - Gainesville, Florida",0.16,0.0,1,1
711,2017,13,2017-11-18 00:00:00,Sat,Florida,36.0,Alabama-Birmingham,7.0,"Ben Hill Griffin Stadium - Gainesville, Florida",0.0,0.0,1,1
915,2018,2,2018-09-01 19:30:00,Sat,Florida,53.0,Charleston Southern,6.0,"Ben Hill Griffin Stadium - Gainesville, Florida",0.0,0.0,1,1
1062,2018,4,2018-09-15 16:00:00,Sat,Florida,48.0,Colorado State,10.0,"Ben Hill Griffin Stadium - Gainesville, Florida",0.0,0.0,1,1
1125,2018,5,2018-09-22 19:00:00,Sat,Florida,47.0,Tennessee,21.0,"Neyland Stadium - Knoxville, Tennessee",0.0,0.0,0,1
1186,2018,6,2018-09-29 18:00:00,Sat,Florida,13.0,Mississippi State,6.0,"Davis Wade Stadium - Starkville, Mississippi",0.0,0.08,0,1
1244,2018,7,2018-10-06 15:30:00,Sat,Florida,27.0,Louisiana State,19.0,"Ben Hill Griffin Stadium - Gainesville, Florida",0.2,0.8,1,1
1303,2018,8,2018-10-13 12:00:00,Sat,Florida,37.0,Vanderbilt,27.0,"Vanderbilt Stadium - Nashville, Tennessee",0.44,0.0,0,1


### Extend the schedule_mstr dataframe to become a full dataset with all team game results, by team

In [6]:
# Build the opponent's view of every game: swap the Team/Opp roles by
# renaming the paired columns, then restore the original column order.
role_swap = {'Team': 'Opp', 'Team_Pts': 'Opp_Pts', 'Team_rank': 'Opp_rank',
             'Opp': 'Team', 'Opp_Pts': 'Team_Pts', 'Opp_rank': 'Team_rank'}
cols = ['Year', 'Weeknum', 'Date', 'Day', 'Team', 'Team_Pts',
        'Opp', 'Opp_Pts', 'Notes', 'Team_rank', 'Opp_rank', 'Game_home', 'Won']
dopplegngr = schedule_mstr.rename(columns=role_swap)[cols]
# From the other side, the home flag and the result are both inverted
dopplegngr = dopplegngr.assign(
    Game_home=dopplegngr.apply(lambda game: flipper(game['Game_home']), axis=1),
    Won=dopplegngr.apply(lambda game: flipper(game['Won']), axis=1),
)

# Stack both views; sorting by the (now duplicated) index puts the two rows
# of a game next to each other
schedule_mstr = pd.concat([schedule_mstr, dopplegngr]).sort_index()

In [7]:
# Same spot-check after mirroring: losses now appear as well
schedule_mstr.loc[schedule_mstr['Team'] == 'Florida']

Unnamed: 0,Year,Weeknum,Date,Day,Team,Team_Pts,Opp,Opp_Pts,Notes,Team_rank,Opp_rank,Game_home,Won
57,2017,2,2017-09-02 00:00:00,Sat,Florida,17.0,Michigan,33.0,"AT&T Stadium - Arlington, Texas",0.32,0.56,0,0
178,2017,4,2017-09-16 00:00:00,Sat,Florida,26.0,Tennessee,20.0,"Ben Hill Griffin Stadium - Gainesville, Florida",0.04,0.08,1,1
237,2017,5,2017-09-23 00:00:00,Sat,Florida,28.0,Kentucky,27.0,"Kroger Field - Lexington, Kentucky",0.2,0.0,0,1
301,2017,6,2017-09-30 00:00:00,Sat,Florida,38.0,Vanderbilt,24.0,"Ben Hill Griffin Stadium - Gainesville, Florida",0.16,0.0,1,1
366,2017,7,2017-10-07 00:00:00,Sat,Florida,16.0,Louisiana State,17.0,"Ben Hill Griffin Stadium - Gainesville, Florida",0.16,0.0,1,0
447,2017,8,2017-10-14 00:00:00,Sat,Florida,17.0,Texas A&M,19.0,"Ben Hill Griffin Stadium - Gainesville, Florida",0.0,0.0,1,0
533,2017,10,2017-10-28 00:00:00,Sat,Florida,7.0,Georgia,42.0,"EverBank Field - Jacksonville, Florida",0.0,0.88,0,0
608,2017,11,2017-11-04 00:00:00,Sat,Florida,16.0,Missouri,45.0,"Memorial Stadium/Faurot Field - Columbia, Miss...",0.0,0.0,0,0
677,2017,12,2017-11-11 00:00:00,Sat,Florida,20.0,South Carolina,28.0,"Williams-Brice Stadium - Columbia, South Carolina",0.0,0.0,0,0
711,2017,13,2017-11-18 00:00:00,Sat,Florida,36.0,Alabama-Birmingham,7.0,"Ben Hill Griffin Stadium - Gainesville, Florida",0.0,0.0,1,1


In [8]:
# Keep only the columns used for feature engineering, index the frame by
# season year and also expose that year as a leading 'Season' column.
cols = ['Year', 'Date', 'Team', 'Opp', 'Won', 'Game_home', 'Team_rank', 'Opp_rank']
cols1 = ['Season', 'Date', 'Team', 'Opp', 'Won', 'Game_home', 'Team_rank', 'Opp_rank']
gametime_mstr = (
    schedule_mstr[cols]
    .set_index('Year')
    .assign(Season=lambda frame: frame.index)
    .loc[:, cols1]
)

In [9]:
# Earliest season covered by the scrape (the scrape loop starts at G_year - 1).
# The record-to-date helpers compare against it to decide whether a previous
# season can be used as a fallback.
first_year = G_year - 1

def get_season_str_yr(gamedate):
    """Return the season year of a game date as a string.

    Games played in January (bowl games) are counted towards the season
    that started in the previous calendar year.
    """
    season = gamedate.year - 1 if gamedate.month == 1 else gamedate.year
    return str(season)

def get_season_yr(gamedate):
    """Return the season year of a game date as an int.

    Games played in January (bowl games) are counted towards the season
    that started in the previous calendar year.
    """
    if gamedate.month != 1:
        return gamedate.year
    return gamedate.year - 1

def season_record_to_date(team, date):
    """Return a team's season win percentage up to, but not including, `date`.

    Reads the module-level `gametime_mstr` frame (indexed by season year,
    using its 'Team', 'Date' and 'Won' columns) and `first_year`.

    * If the team has games earlier in the same season, the result is
      wins / games over those games.
    * Otherwise the full previous season's win percentage is used.
    * 0 is returned when there is nothing to fall back on: the season is
      `first_year`, or the team has no games in the previous season.

    Percentages are rounded to three decimals.
    """
    # account for bowl games that occur in next calendar year
    season = get_season_yr(date)
    team_games = gametime_mstr[gametime_mstr['Team'] == team]

    # Boolean masks on the index always give a DataFrame, including for a
    # team with exactly one game in the season (.loc[year] would return a
    # Series there and the row count would be wrong).
    this_season = team_games[team_games.index == season]
    games_to_date = this_season[this_season['Date'] < date]
    games = games_to_date.shape[0]
    if games > 0:
        return round(games_to_date['Won'].sum() / games, 3)

    # First game of the season - use last year unless we don't have it
    if season == first_year:
        return 0
    # Previous season is relative to the season year, not the calendar
    # year, so a January game does not fall back onto its own season
    last_season = team_games[team_games.index == season - 1]
    games = last_season.shape[0]
    # don't allow to divide by zero
    if games == 0:
        return 0
    return round(last_season['Won'].sum() / games, 3)

def conf_record_to_date(team, date):
    """Return a team's conference win percentage up to, but not including, `date`.

    Reads the module-level `gametime_mstr` frame (indexed by season year,
    using its 'Team', 'Date', 'Won' and 'Game_conf' columns) and
    `first_year`.  Only rows with Game_conf == 1 are counted.

    * If the team has conference games earlier in the same season, the
      result is wins / games over those games.
    * Otherwise the previous season's conference win percentage is used.
    * 0 is returned when there is nothing to fall back on: the season is
      `first_year`, or the team has no conference games in the previous
      season.

    Percentages are rounded to three decimals.
    Raises KeyError if `gametime_mstr` has no 'Game_conf' column yet,
    instead of silently reporting 0 for every team.
    """
    # account for bowl games that occur in next calendar year
    season = get_season_yr(date)
    team_games = gametime_mstr[gametime_mstr['Team'] == team]
    conf_games = team_games[team_games['Game_conf'] == 1]

    # Boolean masks on the index always give a DataFrame, including for a
    # team with exactly one row in the season.
    this_season = conf_games[conf_games.index == season]
    games_to_date = this_season[this_season['Date'] < date]
    games = games_to_date.shape[0]
    if games > 0:
        return round(games_to_date['Won'].sum() / games, 3)

    # First conference game of the season - use last year unless we don't have it
    if season == first_year:
        return 0
    # Previous season is relative to the season year, not the calendar
    # year, so a January game does not fall back onto its own season
    last_season = conf_games[conf_games.index == season - 1]
    games = last_season.shape[0]
    # don't allow to divide by zero
    if games == 0:
        return 0
    return round(last_season['Won'].sum() / games, 3)

def get_conf(team, yr):
    """Look up the conference a school belonged to in a given year.

    Reads the module-level `conference_mstr` frame: rows whose 'School'
    equals `team` and whose From..To range contains `yr`.  Returns the
    'Conf' value of the first matching row, or the string 'missing' when
    no row matches.
    """
    school_rows = conference_mstr[conference_mstr['School'] == team]
    in_range = (school_rows['From'] <= yr) & (school_rows['To'] >= yr)
    matches = school_rows[in_range]['Conf'].values
    if len(matches) == 0:
        return 'missing'
    return matches[0]

def in_conf_game(team1, team2, yr):
    """Return 1 if both teams were in the same conference in `yr`, else 0.

    get_conf() reports 'missing' for a team it cannot find.  Two unknown
    teams are not evidence of a shared conference, so a 'missing' lookup
    always yields 0 instead of matching another 'missing'.
    """
    conf1 = get_conf(team1, yr)
    if conf1 == 'missing':
        return 0
    return 1 if conf1 == get_conf(team2, yr) else 0
    


### Import any necessary files for feature engineering

In [10]:
# Conference membership table; get_conf() reads its 'School', 'From', 'To'
# and 'Conf' columns.
conference_mstr = pd.read_csv('./input/conference_master.csv')

### Apply the helper functions to generate features

In [11]:
# Conference flag first: conf_record_to_date filters on the Game_conf column.
gametime_mstr['Game_conf'] = gametime_mstr.apply(
    lambda game: in_conf_game(game['Team'], game['Opp'], game['Season']), axis=1)
# Season (SRTD) and conference (CRTD) record-to-date, for the team and then
# for its opponent
for feature, side, record_fn in (
        ('Team_SRTD', 'Team', season_record_to_date),
        ('Team_CRTD', 'Team', conf_record_to_date),
        ('Opp_SRTD', 'Opp', season_record_to_date),
        ('Opp_CRTD', 'Opp', conf_record_to_date)):
    gametime_mstr[feature] = gametime_mstr.apply(
        lambda game: record_fn(game[side], game['Date']), axis=1)

In [13]:
# Spot-check the engineered features for one team
gametime_mstr.loc[gametime_mstr['Team'] == 'Florida']


Unnamed: 0_level_0,Season,Date,Team,Opp,Won,Game_home,Team_rank,Opp_rank,Game_conf,Team_SRTD,Team_CRTD,Opp_SRTD,Opp_CRTD
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2017,2017,2017-09-02 00:00:00,Florida,Michigan,0,0,0.32,0.56,0,0.0,0.0,0.0,0.0
2017,2017,2017-09-16 00:00:00,Florida,Tennessee,1,1,0.04,0.08,1,0.0,0.0,1.0,0.0
2017,2017,2017-09-23 00:00:00,Florida,Kentucky,1,0,0.2,0.0,1,0.5,1.0,1.0,1.0
2017,2017,2017-09-30 00:00:00,Florida,Vanderbilt,1,1,0.16,0.0,1,0.667,1.0,0.75,0.0
2017,2017,2017-10-07 00:00:00,Florida,Louisiana State,0,1,0.16,0.0,0,0.75,1.0,0.6,1.0
2017,2017,2017-10-14 00:00:00,Florida,Texas A&M,0,1,0.0,0.0,1,0.6,1.0,0.667,0.667
2017,2017,2017-10-28 00:00:00,Florida,Georgia,0,0,0.0,0.88,1,0.5,0.75,1.0,1.0
2017,2017,2017-11-04 00:00:00,Florida,Missouri,0,0,0.0,0.0,1,0.429,0.6,0.375,0.0
2017,2017,2017-11-11 00:00:00,Florida,South Carolina,0,0,0.0,0.0,1,0.375,0.5,0.667,0.571
2017,2017,2017-11-18 00:00:00,Florida,Alabama-Birmingham,1,1,0.0,0.0,0,0.333,0.429,0.7,1.0


In [15]:
# Peek at the first five upcoming games (still in raw scraped form)
future_mstr.head(5)

Unnamed: 0,Year,Weeknum,Date,Day,Team,Team_Pts,At_sym,Opp,Opp_Pts,Notes
1642,2018,14,2018-11-22 15:30:00,Thu,Colorado State,,@,Air Force,,
1643,2018,14,2018-11-22 19:30:00,Thu,(22) Mississippi State,,@,Mississippi,,
1644,2018,14,2018-11-23 12:00:00,Fri,Akron,,@,Ohio,,
1645,2018,14,2018-11-23 14:30:00,Fri,Arkansas,,@,Missouri,,
1646,2018,14,2018-11-23 12:00:00,Fri,Buffalo,,@,Bowling Green State,,


In [16]:
# Derive the rank, home-game and cleaned-name columns for the upcoming games.
# Every new column is computed from the raw values first and attached with
# assign(), which returns a new frame: ranks are read before the names are
# cleaned, nothing is written into a filtered slice, and the column-wise
# applies also work when there are no upcoming games.
future_mstr = future_mstr.assign(
    # Extract rankings where applicable from team names
    Team_rank=future_mstr['Team'].apply(get_nz_rank),
    Opp_rank=future_mstr['Opp'].apply(get_nz_rank),
    # Create binary value for home games
    Game_home=future_mstr['At_sym'].apply(winner_home),
    # Clean up team names
    Team=future_mstr['Team'].apply(drop_rank),
    Opp=future_mstr['Opp'].apply(drop_rank),
).drop('At_sym', axis=1)  # the @ symbol column is no longer needed


In [17]:
# Build the opponent's view of every upcoming game: swap the Team/Opp roles
# by renaming the paired columns, then restore the original column order.
role_swap = {'Team': 'Opp', 'Team_Pts': 'Opp_Pts', 'Team_rank': 'Opp_rank',
             'Opp': 'Team', 'Opp_Pts': 'Team_Pts', 'Opp_rank': 'Team_rank'}
cols = ['Year', 'Weeknum', 'Date', 'Day', 'Team', 'Team_Pts',
        'Opp', 'Opp_Pts', 'Notes', 'Team_rank', 'Opp_rank', 'Game_home']
dopplegngr = future_mstr.rename(columns=role_swap)[cols]
# From the other side the home flag is inverted
dopplegngr = dopplegngr.assign(
    Game_home=dopplegngr.apply(lambda game: flipper(game['Game_home']), axis=1))

# Stack both views; sorting by the (now duplicated) index puts the two rows
# of a game next to each other
future_mstr = pd.concat([future_mstr, dopplegngr]).sort_index()

In [18]:
# Spot-check the upcoming games for one team
future_mstr.loc[future_mstr['Team'] == 'Florida']

Unnamed: 0,Year,Weeknum,Date,Day,Team,Team_Pts,Opp,Opp_Pts,Notes,Team_rank,Opp_rank,Game_home
1667,2018,14,2018-11-24 12:00:00,Sat,Florida,,Florida State,,,0.48,0.0,0


In [19]:
# Keep only the columns used for feature engineering, index the frame by
# season year and also expose that year as a leading 'Season' column.
cols = ['Year', 'Date', 'Team', 'Opp', 'Game_home', 'Team_rank', 'Opp_rank']
cols1 = ['Season', 'Date', 'Team', 'Opp', 'Game_home', 'Team_rank', 'Opp_rank']
prediction_mstr = (
    future_mstr[cols]
    .set_index('Year')
    .assign(Season=lambda frame: frame.index)
    .loc[:, cols1]
)

In [20]:
# Spot-check the prediction frame before the features are added
prediction_mstr.loc[prediction_mstr['Team'] == 'Florida']

Unnamed: 0_level_0,Season,Date,Team,Opp,Game_home,Team_rank,Opp_rank
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018,2018,2018-11-24 12:00:00,Florida,Florida State,0,0.48,0.0


In [21]:
# Conference flag for each upcoming game
prediction_mstr['Game_conf'] = prediction_mstr.apply(
    lambda game: in_conf_game(game['Team'], game['Opp'], game['Season']), axis=1)
# Season (SRTD) and conference (CRTD) record-to-date, for the team and then
# for its opponent; the record helpers look the history up in gametime_mstr
for feature, side, record_fn in (
        ('Team_SRTD', 'Team', season_record_to_date),
        ('Team_CRTD', 'Team', conf_record_to_date),
        ('Opp_SRTD', 'Opp', season_record_to_date),
        ('Opp_CRTD', 'Opp', conf_record_to_date)):
    prediction_mstr[feature] = prediction_mstr.apply(
        lambda game: record_fn(game[side], game['Date']), axis=1)

In [35]:
# Write the prediction frame (including its Year index) to disk as CSV.
prediction_mstr.to_csv('./testarea/prediction_mstr.csv')

In [22]:
# Spot-check the engineered features for one upcoming game
prediction_mstr.loc[prediction_mstr['Team'] == 'Florida']

Unnamed: 0_level_0,Season,Date,Team,Opp,Game_home,Team_rank,Opp_rank,Game_conf,Team_SRTD,Team_CRTD,Opp_SRTD,Opp_CRTD
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2018,2018,2018-11-24 12:00:00,Florida,Florida State,0,0.48,0.0,0,0.727,0.571,0.455,0.375


In [26]:
# Name the model inputs explicitly instead of slicing columns by position:
# the selection then stays the same when the cell is re-run after extra
# columns (e.g. model predictions) have been added to prediction_mstr.
# The order matches the original positional slice (columns[4:]).
features = ['Game_home', 'Team_rank', 'Opp_rank', 'Game_conf',
            'Team_SRTD', 'Team_CRTD', 'Opp_SRTD', 'Opp_CRTD']
X_new = prediction_mstr[features]
X_new.head()

Unnamed: 0_level_0,Game_home,Team_rank,Opp_rank,Game_conf,Team_SRTD,Team_CRTD,Opp_SRTD,Opp_CRTD
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018,0,0.0,0.0,1,0.273,0.286,0.364,0.286
2018,1,0.0,0.0,1,0.364,0.286,0.273,0.286
2018,0,0.12,0.0,0,0.636,0.5,0.455,0.5
2018,1,0.0,0.12,0,0.455,0.5,0.636,0.5
2018,0,0.0,0.0,1,0.4,0.286,0.636,0.714


### Bring in latest XGB model and predict

In [46]:
import joblib
# Path of the saved XGB model file
filename = './SRmodels/finalized_XGmodel.sav'
#load saved model
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load model files that come from a trusted source.
xgb = joblib.load(filename)

In [50]:
# Score every upcoming game row with the loaded XGB model
XG_preds = xgb.predict(X_new)
prediction_mstr['Predictions'] = XG_preds
cols = ['Date', 'Team', 'Opp', 'Predictions']
prediction_mstr_all = prediction_mstr.loc[:, cols]
# Keep only the rows whose prediction is 1
prediction_mstr_final = prediction_mstr_all.loc[prediction_mstr_all['Predictions'].eq(1)]
prediction_mstr_final

Unnamed: 0_level_0,Date,Team,Opp,Predictions
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,2018-11-22 15:30:00,Air Force,Colorado State,1
2018,2018-11-22 19:30:00,Mississippi,Mississippi State,1
2018,2018-11-23 12:00:00,Ohio,Akron,1
2018,2018-11-23 14:30:00,Missouri,Arkansas,1
2018,2018-11-23 12:00:00,Buffalo,Bowling Green State,1
2018,2018-11-23 12:00:00,Central Florida,South Florida,1
2018,2018-11-23 12:00:00,Toledo,Central Michigan,1
2018,2018-11-23 12:00:00,Cincinnati,East Carolina,1
2018,2018-11-23 12:00:00,Eastern Michigan,Kent State,1
2018,2018-11-23 12:00:00,Houston,Memphis,1


### Bring in the latest neural network model from disk and predict

In [44]:
# Import from the public tensorflow.keras namespace; tensorflow.python.* is
# TensorFlow's internal package and not a supported import path.
from tensorflow.keras.models import model_from_json
from tensorflow.keras.models import load_model

# load model
loaded_model = load_model("./SRmodels/Keras_model.h5")
print("Loaded model from disk")


Loaded model from disk


In [45]:
# predict_classes / predict_proba are not available on current Keras models.
# predict() returns the same probabilities, and for a single-output model
# thresholding at 0.5 is what predict_classes did.
# NOTE(review): assumes the network has one sigmoid output unit (the Proba
# column holds one value per game) - confirm against the saved model.
y_new_proba = loaded_model.predict(X_new).ravel()  # (n, 1) -> (n,) for column assignment
y_new = (y_new_proba > 0.5).astype('int32')
prediction_mstr_w_preds = prediction_mstr.copy()
prediction_mstr_w_preds['Prediction'] = y_new
prediction_mstr_w_preds['Proba'] = y_new_proba
cols = ['Date', 'Team', 'Opp', 'Prediction', 'Proba']
prediction_final = prediction_mstr_w_preds[cols]
# Keep only the rows whose predicted class is 1
prediction_final = prediction_final[prediction_final['Prediction'] == 1]
prediction_final

Unnamed: 0_level_0,Date,Team,Opp,Prediction,Proba
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018,2018-11-22 15:30:00,Colorado State,Air Force,1,0.502099
2018,2018-11-22 19:30:00,Mississippi State,Mississippi,1,0.500588
2018,2018-11-23 12:00:00,Akron,Ohio,1,0.502623
2018,2018-11-23 14:30:00,Arkansas,Missouri,1,0.503111
2018,2018-11-23 12:00:00,Buffalo,Bowling Green State,1,0.503206
2018,2018-11-23 12:00:00,South Florida,Central Florida,1,0.502739
2018,2018-11-23 12:00:00,Central Michigan,Toledo,1,0.501506
2018,2018-11-23 12:00:00,East Carolina,Cincinnati,1,0.502900
2018,2018-11-23 12:00:00,Eastern Michigan,Kent State,1,0.502635
2018,2018-11-23 12:00:00,Houston,Memphis,1,0.502651
