In [1]:
import os
import re
from io import StringIO
from math import cos, pi, floor
from typing import Dict, List

import pandas as pd
import pyprind
import requests
import sqlalchemy
from bs4 import BeautifulSoup

In [39]:
def parse_challenge(page):
    """
    Extract the anti-bot challenge embedded in a page and compute its answer.

    mmi and mavat's web servers reply with a page whose first <script> block
    carries two assignments; the solved challenge has to be sent back as
    request headers before the real page is served.
    Logic adapted from https://github.com/R3dy/jigsaw-rails/blob/master/lib/breakbot.rb

    :param page: raw HTML text containing at least one '<script>' tag.
    :return: dict with keys 'challenge', 'challenge_id' and 'challenge_result'.
    :raises IndexError: if the page does not have the expected layout.
    """
    script_lines = page.split('<script>')[1].split('\n')

    def assigned_value(line):
        # Keep the text before the first ';', then take what follows the first '='.
        statement = line.split(';')[0]
        return statement.split('=')[1]

    # Lines 1 and 2 after the <script> tag hold the challenge and its id.
    challenge_value = assigned_value(script_lines[1])
    challenge_identifier = assigned_value(script_lines[2])

    return {
        'challenge': challenge_value,
        'challenge_id': challenge_identifier,
        'challenge_result': get_challenge_answer(challenge_value),
    }

def telegram_bot_sendtext(bot_message: str) -> None:
    """
    Send a Telegram message through the bot's `sendMessage` endpoint.

    The message is passed via `params=` so that `requests` URL-encodes it;
    building the query string by hand corrupted any text containing
    characters such as '&', '#' or '+'.

    The bot token and chat id can be supplied through the environment
    variables TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID. The previous literals
    remain as fallbacks so existing runs keep working.

    :param bot_message: text to send (interpreted by Telegram as Markdown).
    """
    # NOTE(review): the fallback token is a secret committed to the notebook;
    # rotate it and drop the default once the environment variable is set.
    bot_token = os.environ.get('TELEGRAM_BOT_TOKEN', '1172952527:AAGoM74Rx25DPBpmQhEwacs_AQ9GWI8Oybk')
    chat_id = os.environ.get('TELEGRAM_CHAT_ID', "839266998")
    requests.get(
        'https://api.telegram.org/bot' + bot_token + '/sendMessage',
        params={'chat_id': chat_id, 'parse_mode': 'Markdown', 'text': bot_message},
    )

def get_challenge_answer(challenge):
    """
    Compute the response string expected for a numeric challenge.

    The answer combines the challenge's integer value with its three smallest
    digits and its last digit, then appends a two-part digit suffix.

    :param challenge: the challenge as a string of decimal digits
                      (at least three characters).
    :return: the answer as a string.
    """
    final_digit = int(challenge[-1])

    # Digits in ascending order; the three smallest drive the computation.
    ordered = sorted(challenge)
    smallest = int(ordered[0])
    doubled_third = 2 * int(ordered[2])
    second = int(ordered[1])

    mix = doubled_third + second
    # Suffix is a string concatenation, not a sum.
    tail = f"{doubled_third}{ordered[1]}"
    exponentiated = (smallest + 2) ** second

    # cos(pi * k) flips the sign depending on the parity of `mix`.
    sign = cos(pi * mix)
    total = (int(challenge) * 3 + mix) * sign - exponentiated + (smallest - final_digit)

    return f"{int(floor(total))}{tail}"

def get_racing_days(start: str = "2000/01/01", end: str = "2020/09/24") -> List[str]:
    """
    List candidate racing days between `start` and `end` (both inclusive).

    Only Wednesdays, Saturdays and Sundays are kept, and every day in August
    is dropped.

    :param start: first date of the range, in any format `pd.date_range` accepts.
    :param end: last date of the range, in any format `pd.date_range` accepts.
    :return: dates formatted as "YYYY/MM/DD", in chronological order.
    """
    race_weekdays = (2, 5, 6)  # pandas dayofweek: Monday=0, so Wed / Sat / Sun
    return [
        day.strftime("%Y/%m/%d")
        for day in pd.date_range(start, end)
        if day.dayofweek in race_weekdays and day.month != 8
    ]

def make_hkjc_request(url: str) -> str:
    """
    Fetch `url`, solving the server's challenge first when one is presented.

    A GET is issued with a browser-like User-agent. If the response body
    mentions 'X-AA-Challenge', the challenge is parsed and a second GET is
    sent with the solution in the X-AA-Challenge* headers. The cookies from
    whichever GET came last are then passed to a final POST.

    :param url: page to retrieve.
    :return: body text of the final POST response.
    """
    browser_headers = {'User-agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'}

    with requests.Session() as session:
        first_response = session.get(url, headers = browser_headers)

        if 'X-AA-Challenge' not in first_response.text:
            # No challenge: reuse the cookies from the first response.
            jar = first_response.cookies
        else:
            solved = parse_challenge(first_response.text)
            challenge_headers = {
                'X-AA-Challenge': solved['challenge'],
                'X-AA-Challenge-ID': solved['challenge_id'],
                'X-AA-Challenge-Result': solved['challenge_result']
            }
            jar = session.get(url, headers=challenge_headers).cookies

        final_response = session.post(url, cookies = jar)

    return final_response.text

def evaluate_course(title):
    """
    Map a page title to a racecourse code.

    Spaces are removed and the text is lower-cased before matching, so
    "Sha Tin" and "SHATIN" are treated alike.

    :param title: text that may mention a racecourse name.
    :return: "ST" for Sha Tin, "HV" for Happy Valley, otherwise None.
    """
    normalized = title.replace(" ","").lower()

    # Checked in order: Sha Tin wins if both names appear.
    for needle, code in (("shatin", "ST"), ("happyvalley", "HV")):
        if needle in normalized:
            return code

    return None


def scrape_race_day(date) -> Dict:
    """
    Scrape every race result page of one meeting and collect horse ids/numbers.

    The first page is requested with only the race date; it yields the number
    of races, the racecourse and the season race number shown in the header.
    Pages for races 2..N are then requested explicitly.

    :param date: meeting date as used in the URL (callers pass "YYYY/MM/DD").
    :return: dict with keys
        "date"          - the `date` argument, unchanged,
        "first_race_no" - digits found in parentheses in the blue header row,
        "horse_id_list" - one list of horse ids per race that had a results table,
        "horse_no_list" - one list of horse numbers per race, aligned with the ids.
    :raises AttributeError: when an expected element is missing from the first
        page (a lookup returns None); callers treat this as "no meeting".
    """
    base_url = "https://racing.hkjc.com/racing/information/english/Racing/LocalResults.aspx"

    # first race
    first_race = make_hkjc_request(f"{base_url}?RaceDate={date}")
    soup = BeautifulSoup(first_race, "lxml")
    html_list = [soup]

    # extract info (raw strings keep the regex escapes valid Python)
    num_of_races = len(soup.find("table", attrs = {"class": re.compile(r"f_fs12.+racecard$")}).find_all("a"))
    course = evaluate_course(str(soup.find("span", attrs = {"class": "f_fl f_fs13"})))
    overhead_text = soup.find("tr", attrs = {"class": re.compile(r"bg_blue.+font_wb$")}).get_text()
    first_race_no = re.search(r"\((\d+)\)", overhead_text).group(1)

    # extract info for all other races and append to html_list
    for race_no in range(2, num_of_races + 1):
        url = f"{base_url}?RaceDate={date}&Racecourse={course}&RaceNo={race_no}"
        html_list.append(BeautifulSoup(make_hkjc_request(url), "lxml"))

    # get horse ids and horse numbers, one inner list per race
    horse_id_list = []
    horse_no_list = []
    for soup in html_list:
        try:
            performance_html = str(soup.find("div", attrs = {"class": "performance"}))
            # Wrap in StringIO: passing literal HTML to read_html is deprecated.
            df_rc = pd.read_html(StringIO(performance_html))[0]
            # The horse id is the bracketed code at the end of the "Horse" cell.
            single_race_id_list = [
                re.search(r"\(([\w\d]+)\)$", x.strip()).group(1) for x in df_rc["Horse"]
            ]
            single_horse_no_list = list(df_rc["Horse No."])
        except ValueError:
            # read_html raises ValueError when the page has no results table;
            # such races are skipped on purpose.
            continue

        # Append both lists together so ids and numbers stay aligned per race.
        horse_id_list.append(single_race_id_list)
        horse_no_list.append(single_horse_no_list)

    return {
        "date": date,
        "first_race_no": first_race_no,
        "horse_id_list": horse_id_list,
        "horse_no_list": horse_no_list
    }

def parse_id_table(df):
    """
    Flatten the per-meeting scrape results into one row per horse per race.

    Each input row holds, per race, a list of horse ids and a list of horse
    numbers. The output has the columns "horse_id", "horse_no", "date" and
    "race_id", where race_id is "<YYYY-MM-DD>-<n>" and n is the 1-based
    position of the race within its meeting.

    Frames are collected and concatenated once: `DataFrame.append` was removed
    in pandas 2.0 and was quadratic when called in a loop.

    Side effect: `df["date"]` is converted to datetime in place.

    :param df: frame with columns "date", "horse_id_list" and "horse_no_list".
    :return: the flattened frame; each race keeps its own 0-based index.
             An empty DataFrame is returned when there is nothing to flatten.
    """
    df["date"] = pd.to_datetime(df["date"])

    frames = []
    for _, row in df.iterrows():
        per_race = zip(row["horse_id_list"], row["horse_no_list"])
        for race_seq, (horse_ids, horse_nos) in enumerate(per_race, start=1):
            frames.append(pd.DataFrame({
                "horse_id": horse_ids,
                "horse_no": horse_nos,
                "date": row["date"],
                "race_id": row["date"].strftime("%Y-%m-%d-") + str(race_seq),
            }))

    if not frames:
        # pd.concat rejects an empty list; mirror the original empty result.
        return pd.DataFrame()

    return pd.concat(frames)

def parse_string_to_list(string):
    """
    Rebuild a list of lists from its string form as stored in SQL.

    Spaces are dropped, the text is cut at every "],[" boundary, brackets and
    single quotes are stripped, and each piece is split on commas. All
    elements come back as strings.

    :param string: e.g. "[['a', 'b'], ['c']]".
    :return: e.g. [['a', 'b'], ['c']].
    """
    # Deletes '[', ']' and "'" in one pass.
    strip_table = str.maketrans("", "", "[]'")
    compact = string.replace(" ", "")
    return [piece.translate(strip_table).split(",") for piece in compact.split("],[")]


In [46]:
# Example results page:
# https://racing.hkjc.com/racing/information/english/Racing/LocalResults.aspx?RaceDate=2000/06/04

# The connection URL can be supplied via HKJC_DB_URL; the previous literal is
# kept as a fallback so existing runs keep working.
# NOTE(review): the fallback embeds a password; remove it once the
# environment variable is configured.
engine = sqlalchemy.create_engine(
    os.environ.get('HKJC_DB_URL', 'postgresql://postgres:HKJC2020@localhost:9020/horse_racing')
)

# Dates to (re)scrape: meetings with more than 10 racecard rows that still
# lack a horse number.
# Alternative (brute force over all candidate racing days): date_list = get_racing_days()
query = """
select date from 
(select date, count(*) cnt from vw_racecard 
 where horse_no is null
	group by date
 ) a
 where cnt > 10
"""
dates_missing_horse_no = pd.read_sql(query, con = engine)
date_list = [x.strftime("%Y/%m/%d") for x in pd.to_datetime(dates_missing_horse_no["date"])]
print(f"{len(date_list)} days in total.")

# Collect one dict per meeting and build the frame once afterwards
# (DataFrame.append was removed in pandas 2.0).
records = []
skipped_dates = []
bar = pyprind.ProgBar(len(date_list))
for race_date in date_list:
    try:
        records.append(scrape_race_day(race_date))
    except (AttributeError, requests.ConnectionError):
        # Best effort: a page without the expected layout or a dropped
        # connection skips that day instead of aborting the whole run.
        skipped_dates.append(race_date)
    bar.update()

if skipped_dates:
    print(f"Skipped {len(skipped_dates)} day(s) that could not be scraped: {skipped_dates}")

df = pd.DataFrame(records, columns = ["date", "first_race_no", "horse_id_list", "horse_no_list"])

# parse_id_table also converts df["date"] to datetime in place.
df_horse_no = parse_id_table(df)

# Lists of lists are stored as their string representation in SQL.
df["horse_id_list"] = df["horse_id_list"].map(str)
df["horse_no_list"] = df["horse_no_list"].map(str)
df.to_sql("horse_id_history", con = engine, if_exists = 'append', method = 'multi', index = False)
# upload horse_no parsing table
df_horse_no.to_sql("horse_no_data", con = engine, if_exists = 'append', method = 'multi', index = False)

4 days in total.


0% [####] 100% | ETA: 00:00:00
Total time elapsed: 00:00:24


In [50]:
# Display the flattened per-horse table (one row per horse per race).
df_horse_no

Unnamed: 0,horse_id,horse_no,date,race_id
0,BV023,6,2001-11-06,2001-11-06-1
1,BV327,2,2001-11-06,2001-11-06-1
2,CA312,4,2001-11-06,2001-11-06-1
3,BT127,10,2001-11-06,2001-11-06-1
4,CA257,5,2001-11-06,2001-11-06-1
...,...,...,...,...
1,A314,2,2019-03-23,2019-03-23-1
2,A308,3,2019-03-23,2019-03-23-1
3,B374,1,2019-03-23,2019-03-23-1
4,A319,6,2019-03-23,2019-03-23-1
