In [1]:
import os
import re
from datetime import datetime
from io import StringIO
from math import cos, pi, floor
from typing import Dict, List

import pandas as pd
import pyprind
import requests
import sqlalchemy
from bs4 import BeautifulSoup

In [3]:
def parse_challenge(page):
    """
    Pull the anti-bot challenge values out of a challenge page.

    The text after the first '<script>' marker is split into lines; the
    challenge is read from the line at index 1 and the challenge id from the
    line at index 2. For each of those lines the value is whatever sits
    between the first '=' and the next '=' or ';'.

    Parsing approach follows
    https://github.com/R3dy/jigsaw-rails/blob/master/lib/breakbot.rb

    Returns a dict with the keys 'challenge', 'challenge_id' and
    'challenge_result' (the answer computed by get_challenge_answer).
    Raises IndexError when the page does not have the expected layout.
    """
    script_lines = page.split('<script>')[1].split('\n')

    def _assigned_value(line):
        # "name=value;rest" -> "value"
        statement = line.split(';')[0]
        return statement.split('=')[1]

    challenge_value = _assigned_value(script_lines[1])
    challenge_identifier = _assigned_value(script_lines[2])

    parsed = {
        'challenge': challenge_value,
        'challenge_id': challenge_identifier,
        'challenge_result': get_challenge_answer(challenge_value),
    }
    return parsed

def telegram_bot_sendtext(bot_message: str) -> None:
    """
    Send `bot_message` to a Telegram chat through the Bot API `sendMessage`
    endpoint, using Markdown parse mode.

    Args:
        bot_message: text to send. It is passed as a query parameter and
            URL-encoded by `requests`, so characters such as '&', '#', '+'
            or newlines no longer corrupt the request.

    The bot token and chat id can be overridden with the TELEGRAM_BOT_TOKEN
    and TELEGRAM_CHAT_ID environment variables; the previous hard-coded
    values remain as fallbacks so existing setups keep working.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    # SECURITY: this token is committed in source. It should be rotated and
    # supplied only through the environment; the literal is kept purely for
    # backward compatibility.
    bot_token = os.environ.get('TELEGRAM_BOT_TOKEN', '1172952527:AAGoM74Rx25DPBpmQhEwacs_AQ9GWI8Oybk')
    chat_id = os.environ.get('TELEGRAM_CHAT_ID', "839266998")

    endpoint = 'https://api.telegram.org/bot' + bot_token + '/sendMessage'
    query = {
        'chat_id': chat_id,
        'parse_mode': 'Markdown',
        'text': bot_message,
    }
    # A timeout keeps a stalled connection from blocking the caller forever.
    requests.get(endpoint, params=query, timeout=10)

def get_challenge_answer(challenge):
    """
    Compute the response string for a numeric challenge.

    `challenge` is a string of digits (at least three). With the characters
    sorted ascending as s0 <= s1 <= s2 <= ..., and `last` the final character
    of the unsorted challenge, the result is

        floor((int(challenge) * 3 + m) * cos(pi * m) - (s0 + 2) ** s1 + (s0 - last))

    where m = 2 * s2 + s1, rendered as an integer and followed by the text
    of `2 * s2` and the character s1.
    """
    final_digit = int(challenge[-1])

    ordered = sorted(challenge)
    smallest = int(ordered[0])
    doubled_third = 2 * int(ordered[2])
    second_char = ordered[1]

    mix = doubled_third + int(second_char)
    text_suffix = str(doubled_third) + second_char
    exponentiated = (smallest + 2) ** int(second_char)

    scaled = int(challenge) * 3 + mix
    sign = cos(pi * mix)

    # Same left-to-right arithmetic as a step-by-step accumulation.
    total = (scaled * sign - exponentiated) + (smallest - final_digit)
    return f"{int(floor(total))}{text_suffix}"

def make_hkjc_request(url: str, timeout: float = 30.0) -> str:
    """
    Fetch `url` and return the response body as text.

    Flow:
      1. GET the page with a browser-like User-agent.
      2. If the body mentions 'X-AA-Challenge', parse the challenge, GET the
         page again with the three X-AA-Challenge* headers, and keep the
         cookies from that response. Otherwise keep the cookies from step 1.
      3. POST to the same URL with those cookies and return that body.

    Args:
        url: page to fetch.
        timeout: seconds to wait on each HTTP call. Without a timeout
            `requests` can block indefinitely, which would hang the whole
            scrape on one stalled connection.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    headers = {'User-agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'}
    with requests.Session() as s:
        r = s.get(url, headers = headers, timeout = timeout)
        if 'X-AA-Challenge' in r.text:
            challenge = parse_challenge(r.text)
            # NOTE(review): the challenge reply and the final POST do not
            # send the custom User-agent; kept as-is to avoid changing what
            # the server sees.
            cookies = s.get(url, headers={
                'X-AA-Challenge': challenge['challenge'],
                'X-AA-Challenge-ID': challenge['challenge_id'],
                'X-AA-Challenge-Result': challenge['challenge_result']
            }, timeout = timeout).cookies
        else:
            cookies = r.cookies

        r = s.post(url, cookies = cookies, timeout = timeout)

    return r.text

def evaluate_course(title):
    """
    Map a title to a course code.

    Spaces are removed and the text is lower-cased before matching, so
    "Sha Tin" and "SHATIN" are treated alike. Returns "ST" when the title
    contains "shatin", otherwise "HV" when it contains "happyvalley",
    otherwise None.
    """
    normalized = title.replace(" ", "").lower()

    # Checked in order: "shatin" wins when both keywords are present.
    course_codes = (
        ("shatin", "ST"),
        ("happyvalley", "HV"),
    )
    for keyword, code in course_codes:
        if keyword in normalized:
            return code
    return None


def scrape_section_time(date, engine) -> pd.DataFrame:
    """
    Scrape the sectional times of every race on `date` and append them to
    the `sectional_time` SQL table.

    Args:
        date: race date string in %d/%m/%Y format (it is placed in the
            request URLs and parsed by parse_html_to_df).
        engine: SQLAlchemy engine/connection passed to `DataFrame.to_sql`.

    Returns:
        The DataFrame that was written (all successfully parsed races
        concatenated, each keeping its own row index), or an empty DataFrame
        when no race could be parsed; in that case nothing is written.

    Races whose page raises ValueError while parsing are reported with a
    printed message and skipped. Any other exception propagates.
    """
    # first race
    url = f"https://racing.hkjc.com/racing/information/english/Racing/LocalResults.aspx?RaceDate={date}"
    first_race = make_hkjc_request(url)
    soup = BeautifulSoup(first_race, "lxml")

    # extract info
    # The number of races is taken as the number of links in the first row
    # of the race-card table on the results page.
    num_of_races = len(soup.find("table", attrs = {"class": re.compile("f_fs12.+racecard$")}).find("tr").find_all("a"))
    #course = evaluate_course(str(soup.find("span", attrs = {"class": "f_fl f_fs13"})))
    #overhead_text = soup.find("tr", attrs = {"class": re.compile("bg_blue.+font_wb$")}).get_text()
    #first_race_no = re.search("\((\d+)\)", overhead_text).group(1)

    # Collect one frame per race and concatenate once at the end.
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    race_frames = []
    for i in range(1, num_of_races + 1):
        url = f"https://racing.hkjc.com/racing/information/english/Racing/DisplaySectionalTime.aspx?RaceDate={date}&RaceNo={str(i)}"
        race_info = make_hkjc_request(url)
        soup = BeautifulSoup(race_info, "lxml")
        try:
            race_frames.append(parse_html_to_df(soup, str(i), date))
        except ValueError:
            print(f"Error occurred with {date} Race {str(i)}")

    if not race_frames:
        # Nothing parsed: skip the write instead of sending a column-less
        # frame to the database.
        return pd.DataFrame()

    df = pd.concat(race_frames)
    df.to_sql("sectional_time", con = engine, if_exists = "append", method = "multi")
    return df

def parse_html_to_df(soup, race_no, date):
    """
    Turn one sectional-time page into a DataFrame.

    Args:
        soup: parsed page exposing `find_all` and `find` (a BeautifulSoup
            document in this notebook). It is modified in place: the text of
            every <span class="f_fl"> is blanked before the table is read.
        race_no: race number; stored as a string in the result.
        date: race date string in %d/%m/%Y format.

    Returns:
        DataFrame with the columns finishing_order, horse_no, horse_name,
        sectional_time_1..6, total_time, plus horse_id (the code in the
        trailing parentheses of horse_name), race_date (YYYY-MM-DD),
        race_no and race_id ("<race_date>-<race_no>").

    Raises:
        ValueError: if `date` does not match the format, if the table does
            not have exactly ten columns, or if a horse name does not end
            with "(ID)". The caller treats ValueError as "skip this race".
    """
    # date is in the format %d/%m/%Y
    date = pd.to_datetime(date, format = "%d/%m/%Y")

    for span in soup.find_all("span", attrs = {"class":"f_fl"}):
        span.string = ""
    table = soup.find("table", attrs = {"class": "table_bd f_tac race_table"})
    # Wrap the markup in StringIO: passing literal HTML to read_html is
    # deprecated in recent pandas.
    df = pd.read_html(StringIO(str(table)))[0]
    # Kept from the original: requires a two-level header and fails otherwise.
    df.columns = df.columns.get_level_values(1)
    df.columns = ["finishing_order", "horse_no", "horse_name", "sectional_time_1", "sectional_time_2", "sectional_time_3", "sectional_time_4", "sectional_time_5", "sectional_time_6", "total_time"]

    # Raw string avoids the invalid "\(" escape warning; extraction is
    # vectorised and a missing id is reported explicitly instead of failing
    # with AttributeError on a None match.
    horse_ids = df["horse_name"].astype(str).str.extract(r"\(([A-Z0-9]+)\)$", expand = False)
    if horse_ids.isna().any():
        unmatched = df.loc[horse_ids.isna(), "horse_name"].tolist()
        raise ValueError(f"horse name without a trailing (ID): {unmatched}")
    df["horse_id"] = horse_ids

    df["race_date"] = date.strftime("%Y-%m-%d")
    df["race_no"] = str(race_no)
    df["race_id"] = df["race_date"] + "-" + df["race_no"]

    return df
    
def main():
    """
    Scrape sectional times for every race date that is not yet stored.

    Dates come from `vw_racecard` (year after 2008) minus the dates already
    present in `sectional_time`. Each date is handed to scrape_section_time;
    a failure on one date is printed and the loop continues with the next.

    The database URL is read from the HKJC_DATABASE_URL environment variable
    when set, falling back to the previous hard-coded local URL.
    """
    # SECURITY: the fallback URL embeds a password committed in source;
    # prefer supplying HKJC_DATABASE_URL through the environment.
    engine = sqlalchemy.create_engine(
        os.environ.get('HKJC_DATABASE_URL', 'postgresql://postgres:HKJC2020@localhost:9020/horse_racing')
    )
    pending_dates = pd.read_sql(
        """
        select distinct vw_racecard.date 
        from vw_racecard 
        where vw_racecard.date not in (select distinct cast(race_date as date) from sectional_time)
        and date_part('year', vw_racecard.date) > 2008
        """, con = engine)["date"]
    # scrape_section_time expects dates as %d/%m/%Y strings.
    date_list = [x.strftime("%d/%m/%Y") for x in pending_dates]

    bar = pyprind.ProgBar(len(date_list))
    for date in date_list:
        try:
            scrape_section_time(date, engine)
        except Exception as e:
            # Best effort: report the failing date and keep going.
            print(f"{date}: {e}")
        bar.update()
        
# Entry point: start the full backfill when this code is executed directly.
# NOTE(review): in a notebook kernel __name__ is "__main__" as well, so
# running this cell triggers main() immediately.
if __name__ == "__main__":
    main()
    
        

In [4]:

# Manual backfill for an explicit list of race dates.
# The list is defined here so the cell runs on a fresh kernel; previously
# `date_list` was only available as leftover kernel state.
# NOTE: scrape_section_time appends rows unconditionally, so re-running this
# cell for dates already stored in `sectional_time` inserts duplicates.
engine = sqlalchemy.create_engine(
    os.environ.get('HKJC_DATABASE_URL', 'postgresql://postgres:HKJC2020@localhost:9020/horse_racing')
)
backfill_dates_iso = ["2020-10-11", "2020-10-04", "2020-10-01", "2020-10-07"]
# scrape_section_time expects dates as %d/%m/%Y strings.
date_list = [x.strftime("%d/%m/%Y") for x in pd.to_datetime(backfill_dates_iso, format = "%Y-%m-%d")]

bar = pyprind.ProgBar(len(date_list))
for date in date_list:
    try:
        scrape_section_time(date, engine)
    except Exception as e:
        # Best effort: report the failing date and keep going.
        print(f"{date}: {e}")
    bar.update()

0% [####] 100% | ETA: 00:00:00
Total time elapsed: 00:00:27
