In [81]:
import pandas as pd
import numpy as np
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup
import string
import itertools
from datetime import datetime


In [82]:
url = 'https://www.oddschecker.com/'

In [83]:
country_code = ['UK','IRE','USA','AUS']

In [84]:
def get_soup(base_url, sport = 'horses', event_url = None):
    '''Uses beautiful soup to get parse the url
    base_url = str, www.oddschecker.com/
    sport = str, which sport do you want to look at
    event_url = str, of the url extension which will take you to the '''
    
    if sport == 'horses':
        sport = 'horse-racing'
    
    url = base_url + sport
    if event_url != None:
        url += event_url
    
    req = Request(url , headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()
    return soup(webpage, "html.parser")
    

In [85]:
page_soup = get_soup(url)


In [86]:
print(type(page_soup))

<class 'bs4.BeautifulSoup'>


In [87]:
def get_races(bsoup, country_codes, sport = 'horses'):
    '''Will return a dictionary of the events displayed on www.oddschecker.com
        Only does horse_racing atm.
        dict structure = events[countrycode][venue][list of event times]
        bsoup = the page parse with beautifulsoup4
        country_codes = countries you want to get events for
        sport = the sport you want''' 
    events = {code:{} for code in country_codes}
    
    # website has both todays and tomorrows races on it.  Need to only get todays races
    # this returns two objects as UK and International races are in different sections
    today  = bsoup.findAll('div', {'data-day' : 'today'}) 

    for i in range(len(today)):
        result = today[i].findAll('div', {'class' : 'race-details'})
        containers = result if i == 0  else containers + result

    for container in containers:
        txt = container.find('div', {'class' : 'venue-details'}).text
        
        for code in country_codes:
            # get country code and venue
            if code in txt[:3]:
                cc = code
                venue = txt.replace(code, '')
                break

        # get event times   
        times = [x.text for x in container.findAll('div', {'class' : 'racing-time'})]
        events[cc][venue] = times
        
    return events


In [88]:
events = get_races(page_soup, country_code)

In [103]:
class race():
    def __init__(self,base_url, sport, cc, venue, time):
        '''have a race as a class which we can add horse classes to.'''
        self.url  = base_url
        self.sport = sport
        self.cc = cc
        self.venue = venue
        self.time = time
        # this returns that the day is 1/1/1990 need to make it today
        self.datetime = datetime.strptime(self.time, '%H:%M') 
        
        self.url_ext = '/' + self.venue.replace(' ','-') + '/' + self.time + '/' + 'winner'
        # soup the url
        soup = get_soup(base_url, sport, event_url = self.url_ext)
        
        # These containers are the rows in the table on the url
        containers = soup.findAll('tr', {'class' : 'diff-row evTabRow bc'})
        
        # init horse class
        self.horses = [horse(container) for container in containers] 
    
        
        
    def __str__(self):
        return f'{self.venue}, {self.cc} at {self.time}'
        
    def get_current_odds(self):
        '''Will update the odds in the horses class'''
        # soup the url
        soup = get_soup(base_url = self.url, sport = self.sport, event_url = self.url_ext)
        
        for horse in self.horses:
            #this should find the row for the horse we want
            container = soup.findAll('tr', {'data-bname': horse.name}) 
            if len(container) != 1 :
                return 'Error - more than one row with horse name found - fix the bug'
            
            horse.update_odds(container[0])
        

In [97]:
class horse():
    def __init__(self, container):
        '''Creates a horse object. Will initialise the dataframe to contain the odds data '''
        self.name = container.find('a', {'class' : 'popup selTxt'}).text
        # this also contains jockey form, need to seperate if we are going to use
        self.jockey = container.find('div' ,{'class' :'bottom-row jockey'}).text 
        
        # Get the odds
        odds = self.get_odds(container)
        #start a dataframe of the odds
        self.odds = pd.DataFrame(odds,columns = [datetime.now()])
                  
    def __str__(self,soup):
        return f'{self.name} ridden by {self.jockey}'
    
    def get_odds(self, container):
        '''returns a list of the odds for the horse
        the container needs to be the row in the main table with the odds info in it.'''
        odds = container.findAll('p') # these come as strings of fractional odds
        odds_list = []
        for odd in odds:
            if '/' in odd.text:
                numbers = odd.text.split('/')
                new_odd = float(numbers[0]) / float(numbers[1]) + 1.0
            
            elif odd.text == 'SP':
                new_odd = None
            else:
                new_odd = float(odd.text) + 1.0
            odds_list.append(new_odd)
        return odds_list
    
    
    def update_odds(self, container):
        '''Appends another column of odds to the dataframe'''
        odds = pd.Series(self.get_odds(container), name = datetime.now() )
        self.odds = pd.concat([self.odds, odds], axis =1)

                  

In [98]:
events.keys()

dict_keys(['UK', 'IRE', 'USA', 'AUS'])

In [99]:
events['UK'].keys()

dict_keys(['Market Rasen', 'Chester', 'Ascot', 'Nottingham', 'Wolverhampton', 'Ripon'])

In [100]:

# This would be the loop structure required to access all the points in the event dict
# Run this cell to init the objects
for (cc,v) in events.items():
    for venue, times in v.items():
        for time in times:
            x = race(url,'horses', cc, venue, time)
            break
        break
    break


In [101]:


for (cc,v) in events.items():
    for venue, times in v.items():
        for time in times:
            x.get_current_odds()
            
            # set some sort of pause statement here depending on how long we want between requests
            break
        break
    break

In [102]:
x.datetime

datetime.datetime(1900, 1, 1, 13, 40)

In [46]:
x.horses[0].odds

Unnamed: 0,2019-05-10 07:39:41.746402,2019-05-10 07:39:48.132698,2019-05-10 07:43:48.618135
0,3.5,3.5,3.5
1,3.75,3.75,3.75
2,3.5,3.5,3.5
3,3.5,3.5,3.5
4,,,
5,3.5,3.5,3.5
6,3.75,3.75,3.75
7,3.5,3.5,3.5
8,3.5,3.5,3.5
9,3.5,3.5,3.5


In [None]:
# Cell to run 

# 1. get days races

# 2. Have a while loop running every X number of minutes until after the last race of the day

# 3. check time against time race starts, once it is Y number of hours before the start.  
#Start collecting odds data.  Odds data will get appended every X minutes as the while loop runs round

# 4.  re calculate odds stats (need to see what those are as haven't read the paper fully)

# 5. Highlight if we should bet based on betting strategy

# 6. Stop grabbing odds data once race has started.