In [1]:
import os
import re
from math import cos, pi, floor
from typing import Dict, List

import pandas as pd
import pyprind
import requests
from bs4 import BeautifulSoup

In [2]:
def parse_challenge(page):
    """
    Pull the challenge values out of a challenge page and compute the answer.

    The text following the first '<script>' marker is split into lines; the
    value assigned on line 1 is the challenge and the value assigned on line 2
    is the challenge id. The caller sends these back as request headers to
    obtain the real page.

    The approach follows
    https://github.com/R3dy/jigsaw-rails/blob/master/lib/breakbot.rb

    Returns a dict with the keys 'challenge', 'challenge_id' and
    'challenge_result' (the latter computed by get_challenge_answer).
    """
    script_lines = page.split('<script>')[1].split('\n')

    def assigned_value(line):
        # Keep the first ';'-terminated statement and take what follows its '='.
        statement = line.split(';')[0]
        return statement.split('=')[1]

    challenge = assigned_value(script_lines[1])
    challenge_id = assigned_value(script_lines[2])
    return {
        'challenge': challenge,
        'challenge_id': challenge_id,
        'challenge_result': get_challenge_answer(challenge),
    }

def telegram_bot_sendtext(bot_message: str) -> None:
    """
    Send a Markdown-formatted text message through the Telegram Bot API.

    Credentials are read from the environment instead of being stored in the
    notebook:

    * ``TELEGRAM_BOT_TOKEN`` - the bot token
    * ``TELEGRAM_CHAT_ID``   - the chat the message is sent to

    Parameters
    ----------
    bot_message : str
        Text to send. It is passed as a query parameter, so characters such
        as '&', '#' or spaces are URL-encoded rather than corrupting the URL.

    Raises
    ------
    RuntimeError
        If either environment variable is missing or empty.
    """
    bot_token = os.environ.get("TELEGRAM_BOT_TOKEN")
    chat_id = os.environ.get("TELEGRAM_CHAT_ID")
    if not bot_token or not chat_id:
        raise RuntimeError(
            "Set TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID before sending Telegram messages"
        )

    # NOTE(review): a token that was ever committed in plain text should be
    # treated as leaked and revoked.
    requests.get(
        f"https://api.telegram.org/bot{bot_token}/sendMessage",
        params={"chat_id": chat_id, "parse_mode": "Markdown", "text": bot_message},
        timeout=30,  # never let a notification hang a long-running scrape
    )

def get_challenge_answer(challenge):
    """
    Compute the response string for a numeric challenge.

    `challenge` is a string of digits. The result is built from the
    challenge's integer value, its last digit, and the three smallest digits
    (after sorting the characters), followed by a textual suffix.
    """
    final_digit = int(challenge[-1])

    ordered = sorted(challenge)
    smallest = int(ordered[0])
    second = int(ordered[1])
    doubled_third = 2 * int(ordered[2])

    weight = doubled_third + second
    suffix = f"{doubled_third}{ordered[1]}"

    # cos(pi * k) acts as a sign flip: +1 for even k, -1 for odd k.
    signed_base = (int(challenge) * 3 + weight) * cos(pi * weight)
    total = signed_base - (smallest + 2) ** second + (smallest - final_digit)

    return f"{floor(total)}{suffix}"

def get_racing_days(start: str = "2000/01/01", end: str = "2020/09/24") -> List[str]:
    """
    List candidate racing days between two dates (both inclusive).

    A day is kept when it falls on a Wednesday, Saturday or Sunday and is
    not in August; every other day is dropped.

    Parameters
    ----------
    start, end : str
        Range boundaries in any format ``pd.date_range`` accepts. The
        defaults reproduce the range this notebook was originally run with.

    Returns
    -------
    list of str
        Dates formatted as ``YYYY/MM/DD`` in chronological order.
    """
    racing_weekdays = (2, 5, 6)  # pandas dayofweek: Monday=0, so Wed / Sat / Sun
    candidates = pd.date_range(start, end)
    keep = candidates.dayofweek.isin(racing_weekdays) & (candidates.month != 8)
    return candidates[keep].strftime("%Y/%m/%d").tolist()

def make_hkjc_request(url: str) -> str:
    """
    Fetch `url` and return the body of the final response as text.

    Three requests are made within one session:

    1. a GET with a browser-like User-agent;
    2. if the body of that response contains 'X-AA-Challenge', a second GET
       carrying the solved challenge as headers, whose cookies are kept
       (otherwise the cookies of the first response are kept);
    3. a POST to the same URL with those cookies, whose text is returned.
    """
    browser_headers = {'User-agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'}

    with requests.Session() as session:
        first_response = session.get(url, headers = browser_headers)

        if 'X-AA-Challenge' not in first_response.text:
            jar = first_response.cookies
        else:
            solved = parse_challenge(first_response.text)
            challenge_headers = {
                'X-AA-Challenge': solved['challenge'],
                'X-AA-Challenge-ID': solved['challenge_id'],
                'X-AA-Challenge-Result': solved['challenge_result']
            }
            # Only the challenge headers are sent on this request.
            jar = session.get(url, headers=challenge_headers).cookies

        final_response = session.post(url, cookies = jar)

    return final_response.text

def evaluate_course(title):
    """
    Map a page title to a racecourse code.

    Spaces are removed and the text is lower-cased before matching.
    Returns "ST" when the text contains "shatin", otherwise "HV" when it
    contains "happyvalley", otherwise None.
    """
    normalized = title.replace(" ","").lower()
    # Order matters: "shatin" wins when both names are present.
    for needle, code in (("shatin", "ST"), ("happyvalley", "HV")):
        if needle in normalized:
            return code
    return None


def scrape_race_day(date) -> Dict:
    """
    Scrape the result pages of one race meeting and collect the runners of each race.

    Parameters
    ----------
    date : str
        Meeting date as used in the ``RaceDate`` query parameter,
        e.g. ``"2000/06/04"``.

    Returns
    -------
    dict
        ``date``          - the ``date`` argument, unchanged.
        ``first_race_no`` - the digits found in parentheses in the blue header
                            row of the first page (kept as a string).
        ``horse_id_list`` - one list per fetched page holding the code parsed
                            from the trailing "(...)" of every "Horse" cell.
        ``horse_no_list`` - one list per fetched page holding the
                            "Horse No." column.

        Both lists are in page order (first page, then RaceNo 2, 3, ...). A
        page without a parsable result table contributes an empty list to
        each, so positions stay aligned with race numbers.

    Raises
    ------
    AttributeError
        When an expected element or regex match is missing from a page
        (``find`` / ``re.search`` returned None). Callers in this notebook
        treat that as "skip this day".
    """
    base_url = "https://racing.hkjc.com/racing/information/english/Racing/LocalResults.aspx"
    horse_id_pattern = re.compile(r"\(([\w\d]+)\)$")

    # The date-only URL is used for the first race of the meeting.
    first_race = make_hkjc_request(f"{base_url}?RaceDate={date}")
    soup = BeautifulSoup(first_race, "lxml")
    html_list = [soup]

    # Meeting-level facts are read from the first page only. Raw strings are
    # used for the patterns so the backslashes reach the regex engine as-is.
    num_of_races = len(soup.find("table", attrs = {"class": re.compile(r"f_fs12.+racecard$")}).find_all("a"))
    course = evaluate_course(str(soup.find("span", attrs = {"class": "f_fl f_fs13"})))
    overhead_text = soup.find("tr", attrs = {"class": re.compile(r"bg_blue.+font_wb$")}).get_text()
    first_race_no = re.search(r"\((\d+)\)", overhead_text).group(1)

    # NOTE(review): `course` is None when the title names neither course, in
    # which case the URLs below carry "Racecourse=None" - confirm that is acceptable.
    for race_no in range(2, num_of_races + 1):
        url = f"{base_url}?RaceDate={date}&Racecourse={course}&RaceNo={race_no}"
        html_list.append(BeautifulSoup(make_hkjc_request(url), "lxml"))

    horse_id_list = []
    horse_no_list = []
    for race_soup in html_list:
        try:
            df_rc = pd.read_html(str(race_soup.find("div", attrs = {"class": "performance"})))[0]
            race_horse_ids = [horse_id_pattern.search(cell.strip()).group(1) for cell in df_rc["Horse"]]
            race_horse_nos = list(df_rc["Horse No."])
        except ValueError:
            # No result table could be read from this page. Record an empty
            # race instead of dropping it: consumers derive the race number
            # from the position in these lists, so skipping an entry would
            # shift every later race of the day.
            race_horse_ids, race_horse_nos = [], []

        horse_id_list.append(race_horse_ids)
        horse_no_list.append(race_horse_nos)

    return {
        "date": date,
        "first_race_no": first_race_no,
        "horse_id_list": horse_id_list,
        "horse_no_list": horse_no_list
    }



In [4]:
# Example results page:
# https://racing.hkjc.com/racing/information/english/Racing/LocalResults.aspx?RaceDate=2000/06/04

# Race days are taken from two earlier scrapes. Calling get_racing_days()
# instead would brute-force every candidate day.
# Forward slashes keep the paths usable on Windows and POSIX alike.
date_list = list(pd.read_pickle("data_20200822/horse_id_history.pkl")["date"])
date_list += list(pd.read_pickle("2020_season/horse_id_history.pkl")["date"])
date_list = list(dict.fromkeys(date_list))  # drop duplicates, keep first-seen order
print(f"{len(date_list)} days in total.")

bar = pyprind.ProgBar(len(date_list))
scraped_days = []
for race_date in date_list:
    try:
        scraped_days.append(scrape_race_day(race_date))
    except (AttributeError, requests.ConnectionError):
        # Best effort: a day whose page is missing an expected element or
        # whose request failed is skipped here; the retry cell picks it up.
        pass
    finally:
        bar.update()

# Build the frame once from the collected records instead of appending row
# by row (quadratic, and DataFrame.append no longer exists in pandas 2.x).
df = pd.DataFrame(scraped_days, columns = ["date", "first_race_no", "horse_id_list", "horse_no_list"])
df.to_pickle("horse_id_history.pkl")

1615 days in total.


0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 25:51:12


## Retry

In [6]:
# The full set of race days does not change between attempts, so load it once.
# Forward slashes keep the paths usable on Windows and POSIX alike.
all_dates = list(pd.read_pickle("data_20200822/horse_id_history.pkl")["date"])
all_dates += list(pd.read_pickle("2020_season/horse_id_history.pkl")["date"])
all_dates = list(dict.fromkeys(all_dates))  # drop duplicates, keep first-seen order

for i in range(5):
    # Only days that are still absent from `df` are attempted again.
    already_scraped = set(df["date"])
    date_list = [x for x in all_dates if x not in already_scraped]
    print(f"{len(date_list)} days in total.")

    bar = pyprind.ProgBar(len(date_list))
    recovered_days = []
    for race_date in date_list:
        try:
            recovered_days.append(scrape_race_day(race_date))
        except (AttributeError, requests.ConnectionError):
            # Still failing; the day stays missing and is retried next round.
            pass
        finally:
            bar.update()

    if recovered_days:
        # One concat per round instead of one DataFrame.append per day.
        df = pd.concat([df, pd.DataFrame(recovered_days)], ignore_index = True)

    # Persist after every round so progress survives an interruption.
    df.to_pickle("horse_id_history.pkl")

0 days in total.
0 days in total.
0 days in total.
0 days in total.
0 days in total.


In [7]:
# Inspect the scraped result: one row per race day, with the per-race horse
# ids and horse numbers stored as nested lists.
df

Unnamed: 0,date,first_race_no,horse_id_list,horse_no_list
0,2000/01/02,253,"[[BV125, BV099, BV093, BV051, BV008, BV049, BV...","[[8, 5, 6, 1, 7, 4, 9, 2, 3], [1, 2, 13, 8, 14..."
1,2000/01/23,304,"[[BL153, BP269, BP185, BL117, BM210, BT130, BP...","[[7, 4, 9, 10, 2, 1, 6, 5, 12, 3, 13, 11, 8], ..."
2,2000/02/20,363,"[[BV240, BV074, BV099, BV122, BV112, BV050, BV...","[[12, 11, 5, 6, 3, 2, 7, 14, 9, 1, 10, 8, 4, 1..."
3,2000/03/05,397,"[[BV146, BV122, BV240, BV105, BV164, BV100, BV...","[[10, 5, 9, 4, 11, 7, 8, 2, 13, 3, 12, 6, 1], ..."
4,2000/03/26,448,"[[BV105, BV224, BV146, BV240, BV096, BV051, BV...","[[5, 2, 1, 11, 9, 3, 8, 12, 6, 4, 10, 7], [8, ..."
...,...,...,...,...
1610,2017/07/01,769,"[[A277, S349, A123, V089, V185, T373, T345, A2...","[[7, 2, 12, 14, 3, 1, 13, 9, 11, 5, 10, 4, 8, ..."
1611,2016/10/16,102,"[[V347, A051, V259, V049, V303, V307, V367, V3...","[[4, 7, 11, 3, 10, 1, 9, 12, 6, 8, 2, 5], [1, ..."
1612,2017/01/01,297,"[[P272, P293, P415, S354, P088, S205, P405, T0...","[[2, 9, 13, 11, 1, 4, 3, 5, 8, 12, 7, 10, 6], ..."
1613,2018/07/15,797,"[[S417, V369, V258, A311, B137, T256, V302, T2...","[[13.0, 14.0, 8.0, 5.0, 9.0, 4.0, 1.0, 2.0, 10..."


## Parse table 

In [None]:
# Reload the scraped history from disk and parse the date column into
# datetimes (parse_id_table formats it with strftime).
df = pd.read_pickle(r"horse_id_history.pkl").assign(
    date = lambda history: pd.to_datetime(history["date"])
)

def parse_id_table(df):
    """
    Flatten the per-day nested lists into one row per horse per race.

    Parameters
    ----------
    df : pd.DataFrame
        Needs the columns ``date`` (datetime-like, it is formatted with
        ``strftime``), ``horse_id_list`` and ``horse_no_list``; the latter two
        hold one inner list per race, in race order.

    Returns
    -------
    pd.DataFrame
        Columns ``horse_id``, ``horse_no``, ``date`` and ``race_id``, where
        ``race_id`` is ``"<YYYY-MM-DD>-<n>"`` and ``n`` is the 1-based position
        of the race within its day. Each race keeps its own 0-based index, so
        callers that want a unique index should ``reset_index(drop=True)``.
        An empty, column-less frame is returned when there is nothing to
        flatten.
    """
    race_frames = []
    for _, rw in df.iterrows():
        day_prefix = rw["date"].strftime("%Y-%m-%d-")
        races = zip(rw["horse_id_list"], rw["horse_no_list"])
        for race_no, (horse_ids, horse_nos) in enumerate(races, start = 1):
            if len(horse_ids) == 0 and len(horse_nos) == 0:
                # An empty race adds no rows; it only consumes its race number.
                continue
            race_df = pd.DataFrame({"horse_id": horse_ids, "horse_no": horse_nos})
            race_df["date"] = rw["date"]
            race_df["race_id"] = day_prefix + str(race_no)
            race_frames.append(race_df)

    if not race_frames:
        return pd.DataFrame()

    # Concatenate once at the end: appending inside the loop copies the whole
    # accumulated frame for every race, and DataFrame.append was removed in
    # pandas 2.0.
    return pd.concat(race_frames)

In [10]:
# Build the long table explicitly so this cell also works after
# Restart & Run All; previously `df_consol` only existed as leftover kernel state.
df_consol = parse_id_table(df).reset_index(drop = True)
df_consol.to_pickle(r"horse_no_data.pkl")