In [16]:
# hello-world-predictor.ipynb
# Objective: Read the dataset, do a simple prediction for games on a given date using data prior to that date,
#            and then record some stats on how well it did.
# 
#
# 12/21/19, Alexis: Currently this "helloworld" predictor is very simple. It does the following:
#  1. Reads in the 2010 season as the baseline games dataset. It does some simple selection
#     of columns and generates a few calculated fields.
#  2. It prompts the user to specify 'game day' which are the games it is to predict.
#  3. It calculates the avg net number of runs for the home team (when playing at home) and
#     the avg net number of runs for the visiting team (when playing away).
#  4. For a given game, the predicted winner is the team with the greatest avg net runs at
#     home or away (as applicable).
#  5. The prediction is compared with the actual results that day, with % correct reported.
#
#  Proposed next steps:
#  1. Integrate Venkat's "concat" capability to combine datasets across year blocks to result
#     in a much larger dataset. Note that for development we may want to keep the dataset
#     smaller size so that it doesn't take a long time to run.
#  2. Move this out of jupyter notebook into standard Python and embed the prediction logic
#     into a function that can be called repeatedly with different dates.
#  3. Run trials with large numbers of dates to produce large numbers of predictions and
#     results.
#  4. Write results to a file. Possibly generate some plot of results as a function of
#     training set window size.
#  5. Start experimenting with different prediction functions, initially across individual
#     factors, and then with multiple factors.
#  6. Consider a statistically meaningful regression analysis to select factors and training
#     set window size, by factor.
#  7. If someone has energy, consider using a web API to hit a website with current day
#     game schedule so we can predict games more recent than the dataset.

In [17]:
# Modules
import os
import csv
import pprint
import pandas as pd

# Location of the 2010 game-log CSV, relative to this notebook.
csvpath = os.path.join("..", "datasets", "Final_Data_Files", "GL2010.csv")
print(csvpath)

# Placeholder column names Col_1 .. Col_161, passed to read_csv via names=.
col_headers = [f"Col_{i}" for i in range(1, 162)]

season_df = pd.read_csv(csvpath, sep=",", names=col_headers)
season_df.head()


../datasets/Final_Data_Files/GL2010.csv


Unnamed: 0,Col_1,Col_2,Col_3,Col_4,Col_5,Col_6,Col_7,Col_8,Col_9,Col_10,...,Col_152,Col_153,Col_154,Col_155,Col_156,Col_157,Col_158,Col_159,Col_160,Col_161
0,20100404,0,Sun,NYA,AL,1,BOS,AL,1,7,...,J.D. Drew,9,camem001,Mike Cameron,8,scutm001,Marco Scutaro,6,,Y
1,20100405,0,Mon,MIN,AL,1,ANA,AL,1,3,...,Howie Kendrick,4,woodb003,Brandon Wood,5,mathj001,Jeff Mathis,2,,Y
2,20100405,0,Mon,CLE,AL,1,CHA,AL,1,0,...,A.J. Pierzynski,2,teahm001,Mark Teahen,5,ramia003,Alexei Ramirez,6,,Y
3,20100405,0,Mon,DET,AL,1,KCA,AL,1,8,...,Yuniesky Betancourt,6,kendj001,Jason Kendall,2,getzc001,Chris Getz,4,,Y
4,20100405,0,Mon,SEA,AL,1,OAK,AL,1,5,...,Mark Ellis,4,buckt001,Travis Buck,7,pennc001,Cliff Pennington,6,,Y


In [18]:
season_df.shape # Useful to make sure we don't lose rows when adding column headers.

(2430, 161)

In [19]:
# Map the placeholder headers we care about to descriptive names. The same mapping
# drives both the rename and the column selection, so the two cannot drift apart.
column_names = {
    'Col_1': 'Date',
    'Col_4': 'Visiting Team',
    'Col_5': 'Visiting League',
    'Col_7': 'Home Team',
    'Col_8': 'Home League',
    'Col_10': 'Visiting Score',
    'Col_11': 'Home Score',
}
season_df = season_df.rename(columns=column_names)

# Keep only the renamed columns, in the order listed above.
season_df = season_df[list(column_names.values())]
season_df.head(5)

Unnamed: 0,Date,Visiting Team,Visiting League,Home Team,Home League,Visiting Score,Home Score
0,20100404,NYA,AL,BOS,AL,7,9
1,20100405,MIN,AL,ANA,AL,3,6
2,20100405,CLE,AL,CHA,AL,0,6
3,20100405,DET,AL,KCA,AL,8,4
4,20100405,SEA,AL,OAK,AL,5,3


In [20]:
# Add the per-game outcome columns plus empty placeholders for the prediction fields.
#
# .assign() returns a new DataFrame instead of writing columns into the existing one.
# season_df was produced by a column-subset selection, and setting columns in place on
# such a frame raises SettingWithCopyWarning on pandas versions without copy-on-write.
season_df = season_df.assign(**{
    # True when the home team outscored the visitors.
    'Home Winner': season_df['Home Score'] > season_df['Visiting Score'],
    # Run differential from each side's point of view (one is the negative of the other).
    'V NetRuns': season_df['Visiting Score'] - season_df['Home Score'],
    'H NetRuns': season_df['Home Score'] - season_df['Visiting Score'],
    # Placeholders, filled in later for the games being predicted.
    'V AvgNetRuns': "",
    'H AvgNetRuns': "",
    'Predict Home Wins?': "",
    'Prediction Correct?': "",
})
season_df.head()

Unnamed: 0,Date,Visiting Team,Visiting League,Home Team,Home League,Visiting Score,Home Score,Home Winner,V NetRuns,H NetRuns,V AvgNetRuns,H AvgNetRuns,Predict Home Wins?,Prediction Correct?
0,20100404,NYA,AL,BOS,AL,7,9,True,-2,2,,,,
1,20100405,MIN,AL,ANA,AL,3,6,True,-3,3,,,,
2,20100405,CLE,AL,CHA,AL,0,6,True,-6,6,,,,
3,20100405,DET,AL,KCA,AL,8,4,False,4,-4,,,,
4,20100405,SEA,AL,OAK,AL,5,3,False,2,-2,,,,


In [21]:
# Date bounds of the dataset, as plain ints in YYYYMMDD form.
start_date = int(season_df['Date'].min())
end_date = int(season_df['Date'].max())

# Keep prompting until the user supplies a date that is inside the dataset and has games.
# On success this cell defines:
#   game_day    - the chosen date (int, YYYYMMDD)
#   game_day_df - the games played on that date (the ones to predict)
#   train_df    - every game played strictly before that date (the training set)
good_date = False
while not good_date:
    raw_game_day = input(f"What is the game day (YYYYMMDD) you want us to predict (between {start_date} and {end_date}: ")
#    raw_game_day = "20100715"
    try:
        game_day = int(raw_game_day)
    except ValueError:
        # Non-numeric input used to crash the cell; ask again instead.
        print("Sorry, that is not a valid date. Please use the YYYYMMDD format.")
        continue

    # The lower bound is exclusive: the first date in the dataset has no earlier games
    # to train on.
    if game_day > start_date and game_day <= end_date:
        game_day_df = season_df.loc[(season_df['Date'] == (game_day))]
        if len(game_day_df)==0:
            print("Sorry, no games are scheduled for that day")
        else:
            train_df = season_df.loc[(season_df['Date'] < (game_day))]
            gds = str(game_day)
            print(f"Nice! There are {len(game_day_df)} games on {gds[0:4]}-{gds[4:6]}-{gds[6:]}.")
            print(f"  We also have {len(train_df)} games in our training set, which should be plenty!")
            good_date = True
    else:
        print("Sorry, the date you selected is outside the range of our dataset.")


What is the game day (YYYYMMDD) you want us to predict (between 20100404 and 20101003: 20100815
Nice! There are 15 games on 2010-08-15.
  We also have 1747 games in our training set, which should be plenty!


In [22]:
# The games to predict (game_day_df) and the training set (train_df) are now defined.
# Work on an independent copy of the game-day rows so the prediction columns can be
# filled in without touching season_df (DataFrame.copy() is deep by default).
game_day_df2 = game_day_df.copy()
game_day_df2.head()

Unnamed: 0,Date,Visiting Team,Visiting League,Home Team,Home League,Visiting Score,Home Score,Home Winner,V NetRuns,H NetRuns,V AvgNetRuns,H AvgNetRuns,Predict Home Wins?,Prediction Correct?
1747,20100815,TOR,AL,ANA,AL,4,1,False,3,-3,,,,
1748,20100815,DET,AL,CHA,AL,13,8,False,5,-5,,,,
1749,20100815,SEA,AL,CLE,AL,1,9,True,-8,8,,,,
1750,20100815,NYA,AL,KCA,AL,0,1,True,-1,1,,,,
1751,20100815,OAK,AL,MIN,AL,2,4,True,-2,2,,,,


In [23]:
# Average net runs per team over the training set, computed once per team instead of
# re-filtering train_df for every game:
#   - visiting teams: averaged over their games played away
#   - home teams:     averaged over their games played at home
v_avg_by_team = train_df.groupby('Visiting Team')['V NetRuns'].mean()
h_avg_by_team = train_df.groupby('Home Team')['H NetRuns'].mean()

# Look up each game-day team's average. A team with no matching games in the training
# set gets NaN.
game_day_df2['V AvgNetRuns'] = game_day_df2['Visiting Team'].map(v_avg_by_team)
game_day_df2['H AvgNetRuns'] = game_day_df2['Home Team'].map(h_avg_by_team)

# Predict a home win when the home team's average beats the visitor's.
# Ties (and any NaN average, since comparisons with NaN are False) predict the visitor.
game_day_df2['Predict Home Wins?'] = game_day_df2['H AvgNetRuns'] > game_day_df2['V AvgNetRuns']

game_day_df2.head()

Unnamed: 0,Date,Visiting Team,Visiting League,Home Team,Home League,Visiting Score,Home Score,Home Winner,V NetRuns,H NetRuns,V AvgNetRuns,H AvgNetRuns,Predict Home Wins?,Prediction Correct?
1747,20100815,TOR,AL,ANA,AL,4,1,False,3,-3,-0.12069,0.050847,True,
1748,20100815,DET,AL,CHA,AL,13,8,False,5,-5,-1.296296,0.827586,True,
1749,20100815,SEA,AL,CLE,AL,1,9,True,-8,8,-1.642857,-0.655172,True,
1750,20100815,NYA,AL,KCA,AL,0,1,True,-1,1,1.050847,-0.909091,False,
1751,20100815,OAK,AL,MIN,AL,2,4,True,-2,2,-0.824561,1.381818,True,


In [26]:
# Score the predictions: a prediction is correct when it matches the actual home result.
game_day_df2['Prediction Correct?'] = game_day_df2['Home Winner'] == game_day_df2['Predict Home Wins?']
game_day_df2_correct = game_day_df2[game_day_df2['Prediction Correct?']]

# Report the hit count and the hit rate as a percentage rounded to one decimal place.
total_games = len(game_day_df2)
correct_games = len(game_day_df2_correct)
print(f"{game_day}: {total_games} games with {correct_games} predicted correctly.")
print(f"{round(correct_games / total_games * 100., 1)}%")

# Show only the columns needed to compare predictions with actual results.
summary_columns = ['Date', 'Visiting Team', 'Home Team', 'Visiting Score', 'Home Score',
                   'Home Winner', 'V AvgNetRuns', 'H AvgNetRuns',
                   'Predict Home Wins?', 'Prediction Correct?']
game_day_df2[summary_columns]

20100815: 15 games with 9 predicted correctly.
60.0%


Unnamed: 0,Date,Visiting Team,Home Team,Visiting Score,Home Score,Home Winner,V AvgNetRuns,H AvgNetRuns,Predict Home Wins?,Prediction Correct?
1747,20100815,TOR,ANA,4,1,False,-0.12069,0.050847,True,False
1748,20100815,DET,CHA,13,8,False,-1.296296,0.827586,True,False
1749,20100815,SEA,CLE,1,9,True,-1.642857,-0.655172,True,True
1750,20100815,NYA,KCA,0,1,True,1.050847,-0.909091,False,False
1751,20100815,OAK,MIN,2,4,True,-0.824561,1.381818,True,True
1752,20100815,BAL,TBA,2,3,True,-1.423729,0.711864,True,True
1753,20100815,BOS,TEX,3,7,True,0.721311,1.262295,True,True
1754,20100815,LAN,ATL,1,13,True,0.122807,1.357143,True,True
1755,20100815,FLO,CIN,0,2,True,0.25,0.42623,True,True
1756,20100815,MIL,COL,5,6,True,0.0,1.263158,True,True
