# This Notebook is superseded by the series of .py files

In [58]:
import pandas as pd
import numpy as np
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup
import string
import itertools
from datetime import datetime, timedelta
import operator

In [59]:
# Base URL of the site being scraped; get_soup() appends the sport and event path to it.
url = 'https://www.oddschecker.com/'

In [60]:
# Country codes passed to get_races(); they become the top-level keys of the events dict.
country_code = ['UK','IRE','USA','AUS']

In [61]:
def get_soup(base_url, sport = 'horses', event_url = None, timeout = 30):
    '''Download a page and return it parsed with BeautifulSoup.

    The requested address is ``base_url + sport + event_url``.

    base_url  = str, site root including the trailing slash,
                e.g. 'https://www.oddschecker.com/'
    sport     = str, path segment for the sport; the shorthand 'horses' is
                translated to 'horse-racing', anything else is used as given
    event_url = str or None, extra path (starting with '/') appended after the
                sport to reach a specific event page; omitted when None
    timeout   = seconds to wait on the connection before urlopen raises,
                so a stalled request cannot hang the caller forever

    Returns the page parsed with the "html.parser" backend.
    Network and HTTP errors raised by urlopen are propagated to the caller.
    '''
    if sport == 'horses':
        sport = 'horse-racing'

    url = base_url + sport
    if event_url is not None:
        url += event_url

    # A browser-like User-Agent is sent instead of urllib's default one.
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # The context manager closes the HTTP response even if read() fails.
    with urlopen(req, timeout=timeout) as response:
        webpage = response.read()
    return soup(webpage, "html.parser")
    

In [62]:
# Fetch and parse the default (horse racing) landing page.
page_soup = get_soup(url)


In [63]:
print(type(page_soup))

<class 'bs4.BeautifulSoup'>


In [64]:
def get_races(bsoup, country_codes, sport = 'horses'):
    '''Return today's races found on a parsed oddschecker page.

    Only handles the horse racing page layout at the moment.

    bsoup         = the page parsed with beautifulsoup4
    country_codes = country codes to collect events for; a venue is assigned
                    to the first code found within the first three characters
                    of its label
    sport         = currently unused, kept for interface compatibility

    Returns a nested dict ``events[country_code][venue][race_time]`` where
    race_time is the 'HH:MM' string shown on the page and the value is a
    datetime five hours before that race (today's date), used as the marker
    for when to start collecting odds.  Every requested country code is
    present as a key, even if no venue was found for it.

    Venues whose label matches none of country_codes are skipped.
    Raises ValueError if a race time is not in 'HH:MM' format.
    '''
    events = {code: {} for code in country_codes}

    # The page shows both today's and tomorrow's races; only the sections
    # tagged data-day="today" are wanted.  There can be several of them
    # (UK and International races sit in different sections) or none.
    containers = []
    for section in bsoup.findAll('div', {'data-day' : 'today'}):
        containers.extend(section.findAll('div', {'class' : 'race-details'}))

    today = datetime.today().date()
    lead_time = timedelta(hours=5)

    for container in containers:
        venue_tag = container.find('div', {'class' : 'venue-details'})
        if venue_tag is None:
            continue
        txt = venue_tag.text

        # The label is the country code immediately followed by the venue name.
        cc = next((code for code in country_codes if code in txt[:3]), None)
        if cc is None:
            # Not a requested country: skip it rather than filing the times
            # under whichever venue happened to be processed before.
            continue
        # Strip only the leading code so a venue name that happens to
        # contain the same letters further on is left intact.
        venue = txt.replace(cc, '', 1)

        race_times = events[cc][venue] = {}
        for time_tag in container.findAll('div', {'class' : 'racing-time'}):
            t = time_tag.text
            race_start = datetime.combine(today, datetime.strptime(t, '%H:%M').time())
            race_times[t] = race_start - lead_time

    return events


In [65]:
# Build the events[country_code][venue][race_time] dict from the parsed page.
events = get_races(page_soup, country_code)

In [109]:
class race():
    '''A single race at a venue, holding one horse object per runner.

    Constructing a race downloads its odds page, so it needs network access.

    Attributes:
        url, sport, cc, venue, time -- the constructor arguments, stored as given
        datetime  -- today's date combined with the race time
        url_ext   -- path of the race's "winner" market, appended to url + sport
        race_info -- dict of label -> value parsed from the page's info list
        horses    -- list of horse objects, favourite first
    '''
    def __init__(self,base_url, sport, cc, venue, time):
        '''Fetch the race page and create a horse object for every runner.

        base_url = str, site root passed on to get_soup
        sport    = str, sport name passed on to get_soup
        cc       = str, country code of the venue
        venue    = str, venue name as listed on the site
        time     = str, race time in 'HH:MM' format
        '''
        self.url  = base_url
        self.sport = sport
        self.cc = cc
        self.venue = venue
        self.time = time
        # strptime alone yields a placeholder date, so the parsed time is
        # combined with today's date.
        self.datetime = datetime.combine(datetime.today(),datetime.strptime(self.time, '%H:%M').time())
        print(self.venue)
        print(self.time)

        self.url_ext = '/' + self.venue.replace(' ','-') + '/' + self.time + '/' + 'winner'
        page = get_soup(self.url, self.sport, event_url = self.url_ext)

        self.race_info = self._parse_race_info(page)

        # These rows are the lines of the odds table on the page, one per runner.
        rows = page.findAll('tr', {'class' : 'diff-row evTabRow bc'})
        self.horses = [horse(row) for row in rows]
        self.rank_horses()

    def __str__(self):
        return f'{self.venue}, {self.cc} at {self.time}'

    @staticmethod
    def _parse_race_info(page):
        '''Return the "label: value" list items of the race page as a dict.

        Items without a colon are ignored and everything after the first
        colon is kept as the value (not stripped).  Returns an empty dict
        when the page has no 'content-right' block.

        NOTE (from the original author): the labels on the page do not line
        up with the values, so keys may be paired with the wrong value.
        '''
        panel = page.find('div', {'class':'content-right'})
        if panel is None:
            return {}
        info = {}
        for item in panel.findAll('li'):
            label, sep, value = item.text.partition(':')
            if sep:
                info[label] = value
        return info

    def get_current_odds(self):
        '''Re-download the race page and append the current odds to every horse.

        A horse is updated only when exactly one table row carries its name;
        horses with zero or several matching rows are left untouched while
        the remaining horses are still updated.

        Returns None when every horse was updated, otherwise a string
        starting with 'Error - ' that lists the horses that were skipped.
        '''
        page = get_soup(base_url = self.url, sport = self.sport, event_url = self.url_ext)

        problems = []
        for runner in self.horses:
            # This should find the row for the horse we want.
            rows = page.findAll('tr', {'data-bname': runner.name})
            if len(rows) != 1:
                problems.append(f'{runner.name} ({len(rows)} rows)')
                continue
            runner.update_odds(rows[0])

        if problems:
            return 'Error - expected exactly one row per horse name: ' + ', '.join(problems)
        return None

    def rank_horses(self):
        '''Sort self.horses in place by latest win probability, favourite first.

        Horses whose probability is missing (NaN) are placed last.  Horses
        with equal probability keep their current relative order.
        '''
        def win_prob(h):
            prob = h.latest_prob.values[0]
            # NaN compares false with everything and would make the sort
            # order arbitrary, so map it to the lowest possible value.
            return float('-inf') if pd.isna(prob) else prob

        self.horses.sort(key=win_prob, reverse = True)
        # TODO: assign ranks (with a time stamp) to the horses and/or to
        # some object associated with the race.



In [122]:
class horse():
    '''One runner in a race together with the history of its odds.

    Attributes:
        name        -- text of the runner's link in the odds table row
        jockey      -- text of the row's jockey cell (this also contains the
                       jockey form, which would need separating before use)
        odds        -- DataFrame with one column of decimal odds per snapshot,
                       each column labelled with the snapshot time (to the minute)
        latest_odds -- Series holding the most recent snapshot
        stats       -- DataFrame with one column of summary stats per snapshot
        latest_prob -- one-element Series: 1 / mean of the latest odds
    '''
    def __init__(self, container):
        '''Creates a horse object and initialises the odds and stats dataframes.

        container = the row of the main odds table that belongs to this horse
        '''
        # The runner link carries one of two class strings depending on the row.
        link = container.find('a', {'class' : 'popup selTxt'})
        if link is None:
            link = container.find('a', {'class' : 'popup selTxt has-tip'})
        self.name = link.text
        self.jockey = container.find('div' ,{'class' :'bottom-row jockey'}).text

        # latest_odds is a Series from the start, the same type update_odds
        # assigns later, so the stats have the same shape for every snapshot.
        self.latest_odds = self._snapshot(container)
        self.odds = self.latest_odds.to_frame()
        self.stats = self.get_stats().to_frame()

    def __str__(self):
        return f'{self.name} ridden by {self.jockey}'

    @staticmethod
    def _to_decimal(text):
        '''Convert one odds cell to decimal odds, or None when it has no price.

        'a/b' becomes a / b + 1 and a plain number n becomes n + 1.  'SP',
        empty cells, other non-numeric text and a zero denominator give None.
        '''
        text = text.strip()
        if text == 'SP':
            return None
        try:
            if '/' in text:
                numerator, denominator = text.split('/', 1)
                return float(numerator) / float(denominator) + 1.0
            return float(text) + 1.0
        except (ValueError, ZeroDivisionError):
            return None

    def _snapshot(self, container):
        '''Return the row's current odds as a float Series named by the current minute.'''
        stamp = datetime.now().replace(second = 0, microsecond=0)
        # dtype=float turns the None placeholders into NaN, which the
        # summary statistics skip.
        return pd.Series(self.get_odds(container), name = stamp, dtype = float)

    def get_odds(self, container):
        '''returns a list of the odds for the horse as decimal odds
        the container needs to be the row in the main table with the odds info in it.
        Cells without a usable price are returned as None.'''
        # the cells come as strings of fractional odds
        return [self._to_decimal(cell.text) for cell in container.findAll('p')]

    def get_stats(self):
        '''Return some basic stats for the horses odds at a certain time

        Also refreshes self.latest_prob (1 / mean odds), which is used to
        order the horses.  The returned Series is indexed by
        'win_prob', 'mean', 'std', 'max', 'min' and named after the
        snapshot it was computed from.
        '''
        latest = self.latest_odds
        mean = latest.mean()
        std = latest.std()
        maxx = latest.max()
        minn = latest.min()

        stamp = latest.name
        if stamp is None:
            stamp = datetime.now().replace(second = 0, microsecond=0)

        win_prob = 1 / mean
        # Kept as a one-element Series so callers can always read
        # latest_prob.values[0], before and after update_odds.
        self.latest_prob = pd.Series([win_prob], index = [stamp])
        return pd.Series( (win_prob, mean,std,maxx,minn), index = ['win_prob','mean','std','max','min'],
                         name = stamp)

    def update_odds(self, container):
        '''Appends another column of raw odds and stats to their respective dataframes'''
        self.latest_odds = self._snapshot(container)
        self.odds = pd.concat([self.odds, self.latest_odds], axis = 1)
        self.stats = pd.concat([self.stats, self.get_stats()], axis = 1)
                  

In [119]:
events.keys()

dict_keys(['UK', 'IRE', 'USA', 'AUS'])

In [120]:
events['UK'].keys()

dict_keys(['York', 'Yarmouth', 'Newton Abbot', 'Bath', 'Perth'])

In [121]:

# This would be the loop structure required to access all the points in the event dict
# Run this cell to init the objects
for (cc,v) in events.items():
    for venue, times in v.items():
        for time in times:
            # Building the race object downloads and parses that race's page.
            x = race(url,'horses', cc, venue, time)
            # The three breaks stop after the first time of the first venue of
            # the first country, so only a single race object is created.
            break
        break
    break


York
13:50
18
Fujaira Prince (2)
First Eleven (5)
Crystal King (11)
Corgi (7)
Stealth Fighter (6)
Collide (1)
Rare Groove (4)
Proschema (8)
Everything For You (16)
Caliburn (3)
Byron Flyer (10)
Red Galileo (18)
Perfect City (14)
Indianapolis (13)
Twin Star (9)
Blakeney Point (17)
Sir Chauvelin (15)
My Reward (12)


In [92]:
x.race_info

NameError: name 'x' is not defined

In [42]:
x.race_info

{'Starters': '5f',
 'Distance': ' 5',
 'Class': '£4033',
 'Prize': 'Good to Soft, Soft in places'}

In [15]:
# Show each runner's name with its latest win probability (1 / mean odds).
for hor in x.horses:
    print(f'Name: {hor.name} , proability of win: {hor.latest_prob.values}')

Name: Quiet Place (3) , proability of win: [0.62012569]
Name: Auchterarder (1) , proability of win: [0.15069319]
Name: Out Of Here (4) , proability of win: [0.12130034]
Name: Richard R H B (6) , proability of win: [0.09838646]
Name: Bezzas Lad (2) , proability of win: [0.08355615]
Name: War of Clans (10) , proability of win: [0.05463287]
Name: Corndavon Lad (8) , proability of win: [0.04834655]
Name: Geepower (5) , proability of win: [0.0304878]
Name: Youthfilly (9) , proability of win: [0.02025932]
Name: Bosuns Chair (7) , proability of win: [0.0242483]


In [87]:


for (cc,v) in events.items():
    for venue, times in v.items():
        for time in times:
            # Refresh the odds of the race object x created in an earlier cell;
            # the loop variables are not used by the call itself.
            x.get_current_odds()
            
            # set some sort of pause statement here depending on how long we want between requests
            break
        break
    break

In [None]:
x.datetime

In [88]:
x.horses[0].odds

Unnamed: 0,2019-05-11 09:48:00
0,7.0
1,7.0
2,7.0
3,7.0
4,7.5
5,7.0
6,7.0
7,7.0
8,6.5
9,7.0


In [None]:
# Cell to run 

# 1. Get the day's races

# 2. Have a while loop running every X number of minutes until after the last race of the day

# 3. Check the current time against each race's start time; once it is within Y hours of the start,
# start collecting odds data.  Odds data will get appended every X minutes as the while loop runs round

# 4. Recalculate odds stats (need to see what those are, as I haven't read the paper fully)

# 5. Highlight if we should bet based on betting strategy

# 6. Stop grabbing odds data once race has started.