### Scrape the SportsReference data

In [7]:
# import packages
import urllib3
import csv
from bs4 import BeautifulSoup
import re
import pandas as pd
import datetime

#settings
http = urllib3.PoolManager()
# Timestamp of this run; used later to split played games from upcoming ones.
today = datetime.datetime.today()
G_year = today.year
# Seasons to load back from disk. Derived from G_year so it always matches the
# scrape loop below, which writes the files for G_year-1 and G_year.
yrs_list = [str(G_year - 1), str(G_year)]

#define some helper functions
def monthToNum(shortMonth):
    """Translate a three-letter month abbreviation ('Jan' .. 'Dec') into 1 .. 12.

    The lookup is case-sensitive; any other string raises KeyError.
    """
    abbreviations = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
    number_by_abbr = {abbr: position for position, abbr in enumerate(abbreviations, start=1)}
    return number_by_abbr[shortMonth]

def dateToTimeStamp(datestring, timestring):
    """
    Build a datetime from the schedule's separate date and time strings.

    datestring = 'May 28, 2005'   (three-letter month, day, four-digit year)
    timestring = '2:00 PM'        (12-hour clock), or the sentinel '0' when the
                                  kickoff time is unknown

    Returns a datetime.datetime. With timestring == '0' the time part is midnight.
    '12:xx AM' maps to hour 0 and '12:xx PM' stays at hour 12.
    Raises KeyError for an unknown month abbreviation and ValueError when the
    numeric parts cannot be parsed.
    """
    mon = monthToNum(datestring[0:3])
    year = int(datestring[-4:])
    # What sits between the month and the year is ' 28, ' -> '28'
    day = int(datestring[3:-4].strip().strip(','))
    if timestring == '0':
        return datetime.datetime(year, mon, day)

    colon = timestring.find(':')
    meridiem = timestring.strip()[-2:].upper()
    hour = int(timestring[:colon])
    minute = int(timestring[colon+1:colon+3])
    if meridiem == 'PM' and hour != 12:
        hour += 12
    elif meridiem == 'AM' and hour == 12:
        # 12:xx AM is just after midnight, not noon
        hour = 0
    return datetime.datetime(year, mon, day, hour, minute)

def winner_home(string):
    """Return 0 when the location marker is exactly '@', otherwise 1.

    Anything that is not the '@' symbol (blank, NaN, other text) yields 1.
    """
    played_away = (string == '@')
    return 0 if played_away else 1

def concat_mult_ref_tables(filename, yrs):
    """Return a dataframe that concatenates the per-year CSV files for a list of years.

    filename -- base name of the per-year files; for each year the file read is
                './SRinput/preds/<year>/<filename><year>.csv'
    yrs      -- iterable of years (strings or ints)

    The files are read without a header row and the combined frame gets the fixed
    schedule column names below, so every file must have exactly those 11 columns.
    An empty `yrs` returns an empty frame that still carries the column names.
    """
    headerlist = ['Year', 'Weeknum', 'Date', 'Time', 'Day', 'Team', 'Team_Pts', 'At_sym', 'Opp', 'Opp_Pts', 'Notes']

    df_list = [
        pd.read_csv('./SRinput/preds/' + str(yr) + '/' + filename + str(yr) + '.csv', header=None)
        for yr in yrs
    ]
    # pd.concat refuses an empty list, so hand back an empty, correctly-labelled frame
    if not df_list:
        return pd.DataFrame(columns=headerlist)

    final_df = pd.concat(df_list, ignore_index = True)
    final_df.columns = headerlist

    return final_df

#scrape the data and write the .csv files
# One CSV per season (last year and this year), one row per game.
for year in range(G_year-1, G_year+1):
    # newline='' is what the csv module asks for on files it writes to; utf-8 matches
    # the default encoding pandas.read_csv uses when the file is read back.
    with open('./SRinput/preds/' + str(year) + '/schedule' + str(year) + '.csv', 'w',
              newline='', encoding='utf-8') as csvfile:
        wrtr = csv.writer(csvfile, delimiter=',', quotechar='"')
        url = "http://www.sports-reference.com/cfb/years/"+str(year)+"-schedule.html"
        response = http.request('GET', url)
        # NOTE(review): no parser is named, so bs4 picks whichever is installed;
        # consider pinning one explicitly.
        soup = BeautifulSoup(response.data)
        cnt = 0
        for row in soup.findAll('tr'):
            col = row.findAll('td')
            # Only rows that have a leading <th> cell and all 11 <td> cells are games;
            # anything else is skipped explicitly rather than through a blanket
            # try/except that would also hide real errors.
            if not row.findAll('th') or len(col) < 11:
                continue
            (Weeknum, Date, Time, Day, Winner,
             Pts, At_sym, Loser, Pts2) = (cell.get_text() for cell in col[:9])
            # col[9] is not written to the CSV
            Notes = col[10].get_text()
            wrtr.writerow((year, Weeknum, Date, Time, Day, Winner, Pts, At_sym, Loser, Pts2, Notes))
            cnt += 1
        print("Finished writing " + str(year) + " schedule with " + str(cnt) + " rows")



Finished writing 2017 schedule with 876 rows




Finished writing 2018 schedule with 832 rows


### import the data, concatenate it to create the current schedule_mstr dataframe

In [8]:
def get_nz_rank(team):
    """Turn a leading '(N)' ranking prefix on a team name into a score.

    Only the first four characters are inspected. When a ')' is found there (at any
    position other than 0) the text between '(' and ')' is parsed as the rank and
    1 - rank/25 is returned; otherwise the result is 0.
    """
    prefix = team[:4]
    close_idx = prefix.find(')')
    if close_idx <= 0:
        return 0
    open_idx = prefix.find('(')
    rank = int(team[open_idx + 1:close_idx])
    return 1 - (rank / 25)
    
def drop_rank(team):
    """Strip a leading '(N)' ranking prefix from a team name.

    Only the first four characters are searched for ')'. When one is found (at any
    position other than 0) everything after it is returned with surrounding
    whitespace removed, which also drops the odd whitespace character that follows
    the rank in the scraped names. Otherwise the name is returned untouched.
    """
    close_idx = team[:4].find(')')
    return team[close_idx + 1:].strip() if close_idx > 0 else team
    
def flipper(flag):
    """Invert a 0/1 flag: 0 becomes 1 and 1 becomes 0.

    Any other value falls through and yields None.
    """
    if flag == 0:
        return 1
    return 0 if flag == 1 else None

In [9]:
# Load every scraped season into one frame
schedule_mstr = concat_mult_ref_tables('schedule', yrs_list)
# Fill any missing Time values with '0', the sentinel dateToTimeStamp reads as "date only"
schedule_mstr = schedule_mstr.fillna(value={'Time': '0'})
# Adjust the date column to a true datetime dtype and remove Time column
schedule_mstr['Date'] = schedule_mstr.apply(lambda x: dateToTimeStamp(x['Date'], x['Time']), axis=1)
schedule_mstr = schedule_mstr.drop('Time', axis=1)
# Save the upcoming games for later. `>=` / `<` so that every game lands in exactly
# one of the two frames; .copy() so later column assignments do not write into a
# slice of the combined frame.
future_mstr = schedule_mstr[schedule_mstr['Date'] >= today].copy()
schedule_mstr = schedule_mstr[schedule_mstr['Date'] < today].copy()
# Extract rankings where applicable from team names
# (Series.map works column-wise and, unlike row-wise apply, also copes with an empty frame)
schedule_mstr['Team_rank'] = schedule_mstr['Team'].map(get_nz_rank)
schedule_mstr['Opp_rank'] = schedule_mstr['Opp'].map(get_nz_rank)
# Create binary value for home games and drop @ symbol column
schedule_mstr['Game_home'] = schedule_mstr['At_sym'].map(winner_home)
schedule_mstr = schedule_mstr.drop('At_sym', axis=1)
# Clean up team names (must happen after the ranks were extracted from them)
schedule_mstr['Team'] = schedule_mstr['Team'].map(drop_rank)
schedule_mstr['Opp'] = schedule_mstr['Opp'].map(drop_rank)
# Drop any cancelled games (games with NaN in the scores column)
schedule_mstr = schedule_mstr.dropna(subset=['Team_Pts'])
# Add won column before adding loser rows: every row so far is the winner's view
schedule_mstr['Won'] = 1

### Extend the schedule_mstr dataframe to become a full dataset with all team game results, by team

In [10]:
# Build the loser's view of every game: swap the Team/Opp column pairs by name
# (not by position, so a change in column order cannot silently mislabel data).
swap = {'Team': 'Opp', 'Opp': 'Team',
        'Team_Pts': 'Opp_Pts', 'Opp_Pts': 'Team_Pts',
        'Team_rank': 'Opp_rank', 'Opp_rank': 'Team_rank'}
# rearrange the columns so the axis matches the original
cols = list(schedule_mstr.columns)
dopplegngr = schedule_mstr.rename(columns=swap)[cols].copy()
# From the other side the home flag and the result are both inverted
dopplegngr['Game_home'] = dopplegngr['Game_home'].map(flipper)
dopplegngr['Won'] = dopplegngr['Won'].map(flipper)

schedule_mstr = pd.concat([schedule_mstr, dopplegngr])
# Every index label now occurs twice; a stable sort keeps the winner's row ahead
# of the mirrored row for the same game.
schedule_mstr = schedule_mstr.sort_index(kind='mergesort')

In [None]:
# Preview only the first rows instead of rendering the whole frame
schedule_mstr.head()

In [11]:
# Slim the per-team results down to the modelling columns. The frame ends up
# indexed by Year, with a Season column that mirrors that index.
cols = ['Year', 'Date', 'Team', 'Opp', 'Won', 'Game_home', 'Team_rank', 'Opp_rank']
cols1 = ['Season', 'Date', 'Team', 'Opp', 'Won', 'Game_home', 'Team_rank', 'Opp_rank']
gametime_mstr = schedule_mstr.loc[:, cols].set_index('Year')
gametime_mstr['Season'] = gametime_mstr.index
gametime_mstr = gametime_mstr.loc[:, cols1]

In [44]:
# Earliest season the scrape loop covers (it writes G_year-1 and G_year). The
# record helpers below return 0 rather than look further back than this season.
first_year = G_year - 1

def get_season_str_yr(gamedate):
    """Return the season a game belongs to, as a string.

    Games played in January (bowl games) count towards the previous calendar
    year's season; every other month maps to its own calendar year.
    """
    years_back = 1 if gamedate.month == 1 else 0
    return str(gamedate.year - years_back)

def get_season_yr(gamedate):
    """Return the season a game belongs to, as an int.

    January games (bowl games) are attributed to the previous calendar year;
    all other months map to the game's own calendar year.
    """
    return gamedate.year - 1 if gamedate.month == 1 else gamedate.year

def season_record_to_date(team, date):
    """Given a team and date, return the team's season win percentage as a float
    up to, but not including, that date.

    Reads the module-level `gametime_mstr` frame (indexed by season year, with
    'Team', 'Date' and 'Won' columns) and `first_year`.

    If the team has no earlier game in the season of `date`, the win percentage of
    the whole previous season is returned instead. The result is 0 when that
    previous season is not available either, or when the season of `date` is
    `first_year` (no older data is loaded). Values are rounded to 3 decimals.
    """
    # account for bowl games that occur in next calendar year
    season = get_season_yr(date)
    team_games = gametime_mstr[gametime_mstr['Team'] == team]
    # Boolean masks always give back a DataFrame, including for a team with a single
    # row in that season, where .loc[season] would collapse to a Series.
    this_season = team_games[team_games.index == season]
    played = this_season[this_season['Date'] < date]

    if played.shape[0] == 0:
        # first game of the season - use last year unless we don't have it
        if season == first_year:
            return 0
        # `season - 1`, not `date.year - 1`: for a January game the latter would be
        # the current season again.
        played = team_games[team_games.index == season - 1]

    games = played.shape[0]
    # don't allow to divide by zero
    if games == 0:
        return 0
    return round(played['Won'].sum() / games, 3)

def conf_record_to_date(team, date):
    """Given a team and date, return the team's season win percentage against
    conference teams as a float up to, but not including, that date.

    Reads the module-level `gametime_mstr` frame (indexed by season year, with
    'Team', 'Date', 'Won' and 'Game_conf' columns) and `first_year`. The
    'Game_conf' column must already exist; a KeyError is raised otherwise.

    If the team has no earlier conference game in the season of `date`, the
    conference win percentage of the whole previous season is returned instead.
    The result is 0 when that previous season has no conference games either, or
    when the season of `date` is `first_year`. Values are rounded to 3 decimals.
    """
    # account for bowl games that occur in next calendar year
    season = get_season_yr(date)
    team_games = gametime_mstr[gametime_mstr['Team'] == team]
    conf_games = team_games[team_games['Game_conf'] == 1]
    # Boolean masks always give back a DataFrame, including for a team with a single
    # row in that season, where .loc[season] would collapse to a Series.
    this_season = conf_games[conf_games.index == season]
    played = this_season[this_season['Date'] < date]

    if played.shape[0] == 0:
        # first conference game of the season - use last year unless we don't have it
        if season == first_year:
            return 0
        # `season - 1`, not `date.year - 1`: for a January game the latter would be
        # the current season again.
        played = conf_games[conf_games.index == season - 1]

    games = played.shape[0]
    # don't allow to divide by zero
    if games == 0:
        return 0
    return round(played['Won'].sum() / games, 3)

def get_conf(team, yr):
    """Look up the conference a school belonged to in a given year.

    Searches the module-level `conference_mstr` frame for rows whose 'School'
    equals `team` and whose 'From'..'To' range contains `yr`, and returns the
    'Conf' value of the first such row. Returns the string 'missing' when no
    row matches.
    """
    school_rows = conference_mstr[conference_mstr['School'] == team]
    in_range = (school_rows['From'] <= yr) & (school_rows['To'] >= yr)
    matches = school_rows[in_range]['Conf'].values
    if len(matches) == 0:
        return 'missing'
    return matches[0]

def in_conf_game(team1, team2, yr):
    """Return 1 when both teams were in the same conference in year `yr`, else 0.

    Conferences come from get_conf, which reports 'missing' for a team it cannot
    place. A team with an unknown conference never counts as a conference
    opponent, so two unknown teams are not treated as sharing a conference.
    """
    conf1 = get_conf(team1, yr)
    if conf1 == 'missing':
        return 0
    return 1 if conf1 == get_conf(team2, yr) else 0
    


### import any necessary files for feature engineering

In [13]:
# Conference membership table used by get_conf, which reads its 'School', 'From',
# 'To' and 'Conf' columns and compares 'From'/'To' against a season year.
conference_mstr = pd.read_csv('./input/conference_master.csv')

### working apply functions to generate features

In [14]:
# Game_conf has to be filled in first: conf_record_to_date reads it from gametime_mstr.
# Iterating the needed columns directly avoids building a row Series per game the
# way a row-wise apply(axis=1) does.
gametime_mstr['Game_conf'] = [
    in_conf_game(team, opp, season)
    for team, opp, season in zip(gametime_mstr['Team'], gametime_mstr['Opp'], gametime_mstr['Season'])
]
# Season (SRTD) and conference (CRTD) record-to-date, first for the team, then the opponent
for side in ('Team', 'Opp'):
    team_dates = list(zip(gametime_mstr[side], gametime_mstr['Date']))
    gametime_mstr[side + '_SRTD'] = [season_record_to_date(team, date) for team, date in team_dates]
    gametime_mstr[side + '_CRTD'] = [conf_record_to_date(team, date) for team, date in team_dates]

In [16]:
# Preview the first rows of the historical feature frame
gametime_mstr.head()


Unnamed: 0_level_0,Season,Date,Team,Opp,Won,Game_home,Team_rank,Opp_rank,Game_conf,Team_SRTD,Team_CRTD,Opp_SRTD,Opp_CRTD
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2017,2017,2017-08-26,Brigham Young,Portland State,1,1,0.0,0.0,0,0.0,0.0,0.0,0.0
2017,2017,2017-08-26,Portland State,Brigham Young,0,0,0.0,0.0,0,0.0,0.0,0.0,0.0
2017,2017,2017-08-26,Colorado State,Oregon State,1,1,0.0,0.0,0,0.0,0.0,0.0,0.0
2017,2017,2017-08-26,Oregon State,Colorado State,0,0,0.0,0.0,0,0.0,0.0,0.0,0.0
2017,2017,2017-08-26,Hawaii,Massachusetts,1,0,0.0,0.0,0,0.0,0.0,0.0,0.0


In [18]:
# Preview the upcoming games that were split off from the schedule
future_mstr.head()

Unnamed: 0,Year,Weeknum,Date,Day,Team,Team_Pts,At_sym,Opp,Opp_Pts,Notes
1640,2018,14,2018-11-20 19:00:00,Tue,Ball State,,@,Miami (OH),,
1641,2018,14,2018-11-20 19:00:00,Tue,Northern Illinois,,@,Western Michigan,,
1642,2018,14,2018-11-22 15:30:00,Thu,Colorado State,,@,Air Force,,
1643,2018,14,2018-11-22 19:30:00,Thu,(22) Mississippi State,,@,Mississippi,,
1644,2018,14,2018-11-23 12:00:00,Fri,Akron,,@,Ohio,,


In [19]:
# Work on an independent copy so the column assignments below never write into a
# slice of another frame.
future_mstr = future_mstr.copy()
# Extract rankings where applicable from team names
# (Series.map also copes with an empty frame, i.e. when no games are upcoming,
# where a row-wise apply does not hand back a usable column)
future_mstr['Team_rank'] = future_mstr['Team'].map(get_nz_rank)
future_mstr['Opp_rank'] = future_mstr['Opp'].map(get_nz_rank)
# Create binary value for home games and drop @ symbol column
future_mstr['Game_home'] = future_mstr['At_sym'].map(winner_home)
future_mstr = future_mstr.drop('At_sym', axis=1)
# Clean up team names (must happen after the ranks were extracted from them)
future_mstr['Team'] = future_mstr['Team'].map(drop_rank)
future_mstr['Opp'] = future_mstr['Opp'].map(drop_rank)


In [21]:
# Build the opponent's view of every upcoming game: swap the Team/Opp column pairs
# by name (not by position, so a change in column order cannot silently mislabel data).
swap = {'Team': 'Opp', 'Opp': 'Team',
        'Team_Pts': 'Opp_Pts', 'Opp_Pts': 'Team_Pts',
        'Team_rank': 'Opp_rank', 'Opp_rank': 'Team_rank'}
# rearrange the columns so the axis matches the original
cols = list(future_mstr.columns)
dopplegngr = future_mstr.rename(columns=swap)[cols].copy()
# From the other side the home flag is inverted
dopplegngr['Game_home'] = dopplegngr['Game_home'].map(flipper)

future_mstr = pd.concat([future_mstr, dopplegngr])
# Every index label now occurs twice; a stable sort keeps the original row ahead
# of the mirrored row for the same game.
future_mstr = future_mstr.sort_index(kind='mergesort')

In [24]:
# Slim the upcoming games down to the model inputs. The frame ends up indexed by
# Year, with a Season column that mirrors that index.
cols = ['Year', 'Date', 'Team', 'Opp', 'Game_home', 'Team_rank', 'Opp_rank']
cols1 = ['Season', 'Date', 'Team', 'Opp', 'Game_home', 'Team_rank', 'Opp_rank']
prediction_mstr = future_mstr.loc[:, cols].set_index('Year')
prediction_mstr['Season'] = prediction_mstr.index
prediction_mstr = prediction_mstr.loc[:, cols1]

In [46]:
# Same features as for the historical frame. The record helpers read the played
# games from gametime_mstr, so these are each team's records going into the game.
# Iterating the needed columns directly also works when there are no upcoming
# games, where a row-wise apply(axis=1) does not hand back a usable column.
prediction_mstr['Game_conf'] = [
    in_conf_game(team, opp, season)
    for team, opp, season in zip(prediction_mstr['Team'], prediction_mstr['Opp'], prediction_mstr['Season'])
]
# Season (SRTD) and conference (CRTD) record-to-date, first for the team, then the opponent
for side in ('Team', 'Opp'):
    team_dates = list(zip(prediction_mstr[side], prediction_mstr['Date']))
    prediction_mstr[side + '_SRTD'] = [season_record_to_date(team, date) for team, date in team_dates]
    prediction_mstr[side + '_CRTD'] = [conf_record_to_date(team, date) for team, date in team_dates]

In [47]:
# Preview the first rows of the prediction feature frame
prediction_mstr.head()

Unnamed: 0_level_0,Season,Date,Team,Opp,Game_home,Team_rank,Opp_rank,Game_conf,Team_SRTD,Team_CRTD,Opp_SRTD,Opp_CRTD
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2018,2018,2018-11-20 19:00:00,Ball State,Miami (OH),0,0.0,0.0,1,0.364,0.429,0.455,0.714
2018,2018,2018-11-20 19:00:00,Miami (OH),Ball State,1,0.0,0.0,1,0.455,0.714,0.364,0.429
2018,2018,2018-11-20 19:00:00,Northern Illinois,Western Michigan,0,0.0,0.0,1,0.636,0.857,0.545,0.571
2018,2018,2018-11-20 19:00:00,Western Michigan,Northern Illinois,1,0.0,0.0,1,0.545,0.571,0.636,0.857
2018,2018,2018-11-22 15:30:00,Colorado State,Air Force,0,0.0,0.0,1,0.273,0.286,0.364,0.286


### Code below to bring in latest XGBoost model from disk and predict

### Code below to bring in latest neural network model from disk and predict

In [None]:
# NOTE(review): this import would normally sit with the other imports in the first cell.
from keras.models import model_from_yaml

# load YAML and create model
# NOTE(review): model_from_yaml deserialises the YAML text into a model, so only load
# model.yaml from a trusted source. Newer Keras/TensorFlow releases are reported to have
# removed this function for that reason - confirm against the installed version.
with open('model.yaml', 'r') as yaml_file:
    loaded_model_yaml = yaml_file.read()
loaded_model = model_from_yaml(loaded_model_yaml)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")
 
# evaluate loaded model on test data
loaded_model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
# NOTE(review): X and Y are not defined anywhere in the visible notebook; this cell
# raises NameError on a fresh kernel unless they are created first.
score = loaded_model.evaluate(X, Y, verbose=0)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))