In [1]:
# first nine commands from data wrangling project, steps were the same
import pandas as pd
import numpy as np
from scipy import stats, integrate
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import json
from pandas.io.json import json_normalize

In [2]:
# The first data set lives in a single .csv file.
# header=None makes pandas treat every line, including the file's own heading
# lines, as an ordinary data row.
df = pd.read_csv(
    'data/mlb-4.1.2016-4.30.2016-2.csv',
    header=None,
    encoding='utf-8',
)

In [3]:
# Drop the first two rows, which hold the file's heading lines rather than data.
# NOTE: re-running this cell on its own drops two more rows each time.
df = df.iloc[2:]

In [4]:
# Give the positional columns meaningful names (one market per line below).
heading_row = [
    'date', 'time',
    'ml-fav', 'ml-dog',
    'rl-fav', 'rl-dog',
    'total-over', 'total-under',
    '1st-half-fav', '1st-half-dog',
    '2nd-half-fav', '2nd-half-dog',
    'filename',
]
df.columns = heading_row

In [5]:
# Renumber the rows from 0 after the heading rows were dropped; drop=True discards
# the old index instead of keeping it as a new column.
df = df.reset_index(drop=True)

In [6]:
# After determining the number of odds changes, we may want to determine when they
# generally occur, so a full timestamp column is added to the dataframe.
# The original .csv file holds only day-month and a 12-hour clock time, with no year.
# The year is put into the string before parsing, rather than parsing into the
# default year 1900 and then shifting by a hard-coded 42,369 days: that offset is
# only right for dates after February (1900 has no Feb 29, 2016 does).
SEASON_YEAR = '2016'  # year named in the input file (4.1.2016-4.30.2016)
df['date-time'] = pd.to_datetime(SEASON_YEAR + ' ' + df['date'] + ' ' + df['time'],
                                 format='%Y %d-%b %I:%M%p')

In [7]:
# For every row, record the earliest and latest timestamp seen for its game
# (rows are grouped by 'filename'), plus the span between the two.
game_times = df.groupby('filename')['date-time']
df['opening-time'] = game_times.transform('min')
df['closing-time'] = game_times.transform('max')
df['open-close-dif'] = df['closing-time'].sub(df['opening-time'])

In [8]:
# Fraction of the open-to-close window that has elapsed at each row:
# 0 at the opening time, 1 at the closing time.
# Rows whose game has a zero-length window divide 0 by 0 and come out as NaN.
time_remaining = df['closing-time'] - df['date-time']
df['percent-time-to-close'] = 1 - time_remaining / df['open-close-dif']

In [9]:
# Time remaining until the closing line, as a timedelta and as a float where 1 day = 1.
# The remaining time is taken directly as closing-time minus the row's timestamp.
# Rebuilding it as open-close-dif * (1 - percent-time-to-close) gives the same
# quantity but goes through floating point and yields NaT for games whose opening
# and closing times coincide (0/0 in the percentage).
df['date-literal-time-to-close'] = df['closing-time'] - df['date-time']
df['date-literal-time-to-close-float'] = df['date-literal-time-to-close'] / np.timedelta64(1, 'D')

In [10]:
# Build one row per game (indexed by 'filename') holding the opening values
# followed by the closing values, side by side.
# NOTE(review): GroupBy.first()/last() return the first/last non-null value of each
# column within the group, which is not necessarily a single physical row, and they
# rely on the rows already being in chronological order - confirm both are intended.
dfmm = df.groupby('filename')
dfmin, dfmax = dfmm.first(), dfmm.last()
dfnew = pd.concat((dfmin, dfmax), axis=1)

In [11]:
# Relabel dfnew's columns: the first half came from dfmin (opening values) and the
# second half from dfmax (closing values), so they get '-open' and '-close' suffixes.
# Many of these columns could be removed, but we won't bother at this point.
# The labels are derived from the frames that were concatenated, so they stay in
# step with df's columns instead of relying on a hand-typed positional list.
heading_row = ([col + '-open' for col in dfmin.columns] +
               [col + '-close' for col in dfmax.columns])
dfnew.columns = heading_row

In [12]:
# Total number of games, ignoring games where the favorite's money line is the
# same at opening and closing.
dfnew['fav-odds-same'] = (dfnew['ml-fav-open'] == dfnew['ml-fav-close'])
# .copy() makes the filtered frame independent of the original, so adding columns
# to dfnew afterwards does not raise pandas' SettingWithCopyWarning.
dfnew = dfnew[~dfnew['fav-odds-same']].copy()
dfnew['date-open'].count()

338

In [13]:
# Flag games where the favorite changed between open and close (meaning the dog
# became a worse bet), then count them.
# The first three characters of the money-line string identify the favorite
# (presumably the team code - confirm against the data).
open_prefix = dfnew['ml-fav-open'].str[:3]
close_prefix = dfnew['ml-fav-close'].str[:3]
dfnew['fav-change'] = open_prefix.ne(close_prefix)
dfnew['fav-change'].sum()

43

In [17]:
# Flag games where the odds got worse for the favorite, i.e. the number after the
# three-character prefix became more negative from open to close.
open_line = pd.to_numeric(dfnew['ml-fav-open'].str[3:])
close_line = pd.to_numeric(dfnew['ml-fav-close'].str[3:])
dfnew['fav-odds-worse'] = open_line > close_line
# Count only the games where the favorite itself did not change.
dfnew.loc[~dfnew['fav-change'], 'fav-odds-worse'].sum()

151

In [18]:
# Out of 338 games, the favorite got worse odds 151 times and the dog got worse
# odds 187 times. Could that split be coincidence?
# Standard deviation of a binomial count with n = 338 and p = 0.5: sqrt(n * p * (1 - p)).
n_games = 338
p_even = 0.5
(n_games * p_even * (1 - p_even)) ** 0.5

9.192388155425117

In [19]:
# If fav/dog doesn't matter, the expected count is 338 / 2 = 169; we observed 187,
# which is 18 more. Express that excess in standard deviations.
observed_dog_worse = 187
expected_dog_worse = 338 / 2
std_dev = 9.192388155425117
(observed_dog_worse - expected_dog_worse) / std_dev

1.9581418555935164

In [20]:
# checking a normal distribution table shows the chance of a deviation this large in this direction (one-tailed) is about 2.5% - is this significant?  Possibly.