In [1]:
import re
import time
import warnings
from datetime import datetime
from io import StringIO
from math import cos, pi, floor

import pandas as pd
import pyprind
import requests
import sqlalchemy
from bs4 import BeautifulSoup
from pandas import HDFStore

warnings.filterwarnings("ignore")

In [2]:
def fetch_horse_id(engine) -> tuple:
    """
    Fetch the race days that still need scraping and collect their horse ids.

    Reads the whole "horse_id_data" table (the name is fixed) through
    ``engine``, drops every row whose "date" is already listed in
    race_dates.txt, and flattens the "horse_id_list" values of the rows that
    remain.

    Args:
        engine: connection object handed to ``pd.read_sql`` as ``con``.

    Returns:
        A ``(dates, horse_ids)`` tuple. ``dates`` is the "date" column
        (a pandas Series) of the remaining rows. ``horse_ids`` is a list of
        the ids found in those rows with duplicates removed, keeping the
        order in which each id was first seen.
    """
    df = pd.read_sql("select * from horse_id_data", con=engine)

    # race_dates.txt only exists once update_status() has written to it, so
    # a missing file simply means no race date has been recorded yet.
    try:
        with open("race_dates.txt", "r", encoding="utf-8") as f:
            scraped_dates = {line.replace("\n", "") for line in f}
    except FileNotFoundError:
        scraped_dates = set()

    # keep only the rows whose date is NOT recorded as scraped ('~' negates the mask)
    pending = df[~df["date"].isin(scraped_dates)]

    # each "horse_id_list" value is iterated two levels deep:
    # row value -> inner sequence -> individual horse id
    horse_ids = [
        horse_id
        for race_day in pending["horse_id_list"]
        for race in race_day
        for horse_id in race
    ]
    # dict.fromkeys de-duplicates while preserving first-seen order
    return pending["date"], list(dict.fromkeys(horse_ids))

def update_status(date_list) -> None:
    """
    Append every race date in ``date_list`` to race_dates.txt, one per line.

    Args:
        date_list: iterable of date strings (a list or a pandas Series both
            work); each element is concatenated with a newline, so
            non-string elements raise TypeError.

    The file is opened once in append mode for the whole batch, so it is
    created if missing (even when ``date_list`` is empty) and existing
    content is preserved.
    """
    with open("race_dates.txt", "a+", encoding='utf-8') as txt:
        txt.writelines(dt + "\n" for dt in date_list)
        
def remove_scraped_id(horse_id_list: list) -> list:
    """
    Drop the horse ids that updater_status.txt lists as already scraped.

    Args:
        horse_id_list: candidate horse ids. They are looked up in a set, so
            they must be hashable (ids are expected to be strings, since
            they are compared against lines of a text file).

    Returns:
        A new list with the ids that do not appear in updater_status.txt,
        in their original order.

    Raises:
        FileNotFoundError: if updater_status.txt does not exist.
    """
    with open("updater_status.txt", "r", encoding='utf-8') as txt:
        # a set gives O(1) membership tests instead of scanning a list per id
        scraped = {line.replace('\n', '') for line in txt}
    return [horse_id for horse_id in horse_id_list if horse_id not in scraped]

def parse_challenge(page):
    """
    Pull the challenge values out of a challenge page's HTML.

    Looks at the text following the first '<script>' marker: the value
    assigned on its second line is the challenge, the value assigned on its
    third line is the challenge id. The answer is computed with
    get_challenge_answer().

    The logic is pretty much copied from
    https://github.com/R3dy/jigsaw-rails/blob/master/lib/breakbot.rb

    Returns a dict with keys 'challenge', 'challenge_id' and
    'challenge_result'.
    """
    script_lines = page.split('<script>')[1].split('\n')

    def assigned_value(line):
        # text between the first '=' and the first ';' of the statement
        statement = line.split(';')[0]
        return statement.split('=')[1]

    challenge = assigned_value(script_lines[1])
    challenge_id = assigned_value(script_lines[2])
    result = {'challenge': challenge, 'challenge_id': challenge_id}
    result['challenge_result'] = get_challenge_answer(challenge)
    return result

def telegram_bot_sendtext(bot_message: str) -> None:
    """
    Send ``bot_message`` to a fixed Telegram chat through the Bot API.

    The chat id, parse mode and text are passed via ``params`` so that
    requests URL-encodes them; building the query string by hand corrupts
    messages containing characters such as '&', '#' or '+'.

    Args:
        bot_message: text to send (interpreted by Telegram as Markdown).

    Raises:
        requests.exceptions.RequestException: on connection problems, or
            when the API does not respond within the timeout.
    """
    # NOTE(review): this bot token is a credential committed in source; it
    # should be rotated and loaded from configuration / the environment.
    bot_token = '1172952527:AAGoM74Rx25DPBpmQhEwacs_AQ9GWI8Oybk'
    chat_id = "839266998"
    requests.get(
        'https://api.telegram.org/bot' + bot_token + '/sendMessage',
        params={'chat_id': chat_id, 'parse_mode': 'Markdown', 'text': bot_message},
        # without a timeout a stalled connection would block the scraper forever
        timeout=30,
    )

def get_challenge_answer(challenge):
    """
    Work out the answer string for a numeric challenge.

    ``challenge`` is a string of digits. The answer is built from the last
    digit of the challenge and the three smallest digits (after sorting),
    and is returned as a string: the floored numeric part followed by a
    textual suffix.
    """
    final_digit = int(list(challenge)[-1])
    digits = sorted(challenge)  # characters in ascending order

    smallest = int(digits[0])
    weighted = 2 * int(digits[2]) + int(digits[1])
    # same two pieces as `weighted`, but concatenated as text instead of added
    suffix = str(2 * int(digits[2])) + digits[1]
    power = (smallest + 2) ** int(digits[1])

    # cos(pi * k) is +1 or -1 (up to float precision), so this flips the sign for odd k
    value = (int(challenge) * 3 + weighted) * cos(pi * weighted)
    value = value - power + (smallest - final_digit)
    return str(floor(value)) + suffix
    
def make_hkjc_request(url: str, timeout: float = 30) -> str:
    """
    Request ``url`` and return the text of the final response.

    Steps:
      1. GET the page with a browser-like User-agent.
      2. If the response text contains 'X-AA-Challenge', solve the challenge
         with parse_challenge() and GET the page again, sending the solution
         in the X-AA-Challenge* headers.
      3. POST to the same URL with the cookies of the last GET and return
         the body of that response.

    Args:
        url: page to fetch.
        timeout: seconds to wait for each individual request; without it a
            stalled connection would hang the whole scraping run.

    Raises:
        requests.exceptions.RequestException: on connection errors or when a
            request exceeds ``timeout``.
    """
    headers = {'User-agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'}
    with requests.Session() as s:
        r = s.get(url, headers=headers, timeout=timeout)
        if 'X-AA-Challenge' in r.text:
            challenge = parse_challenge(r.text)
            # NOTE(review): this request and the POST below do not pass the
            # browser User-agent defined above - confirm that is intended.
            challenge_headers = {
                'X-AA-Challenge': challenge['challenge'],
                'X-AA-Challenge-ID': challenge['challenge_id'],
                'X-AA-Challenge-Result': challenge['challenge_result'],
            }
            cookies = s.get(url, headers=challenge_headers, timeout=timeout).cookies
        else:
            cookies = r.cookies

        r = s.post(url, cookies=cookies, timeout=timeout)

    return r.text
    
def scrape_horse_data(horse_id):
    """
    Scrape everything available for one horse.

    Args:
        horse_id: value placed in the BrandNumber parameter of the HKJC
            horse search URL.

    Returns:
        A 4-tuple of DataFrames:
        ``(df_horse_profile, df_race_history, df_vet, df_trackwork)``.
        The vet and trackwork frames are empty (columns only) when their
        extractor raises TypeError or IndexError.

    Raises:
        AttributeError: if the page has no "subsubheader" cell to read the
            horse name from. Errors from the profile / racing-history
            extractors and from the HTTP layer propagate as well.
    """
    vet_columns = ['Date', 'Details', 'Passed Date', 'Horse_name', 'Horse_code']
    trackwork_columns = ['Date', 'Type', 'Racecourse_track', 'Workouts', 'Gear', 'Horse_name', 'Horse_code']

    url = f"https://racing.hkjc.com/racing/information/english/Horse/HorseSearch.aspx?HorseName=&SearchType=BrandNumber&BrandNumber={horse_id}"
    res = make_hkjc_request(url)
    soup = BeautifulSoup(res, "lxml")
    # the horse name is the header text up to (not including) the first "("
    header_text = soup.find("td", attrs={"class": "subsubheader"}).get_text()
    horse_nm = re.search(r"^[^\(]+", header_text).group().strip()

    # currently the response may only give records of 3 seasons - to solve this,
    # we make another request if there is the "all records option" available
    records_table = soup.find("table", attrs={"align": "right"})
    if records_table is not None:
        all_records_link = records_table.find("a", attrs={"href": re.compile(".+Option=1$")})
        # the table can exist without the link; then keep the page we already have
        if all_records_link is not None:
            res = make_hkjc_request("https://racing.hkjc.com" + all_records_link["href"])
            soup = BeautifulSoup(res, "lxml")

    df_horse_profile = extract_horse_profile(res, horse_nm, horse_id)
    df_race_history = extract_racing_history(soup, horse_nm, horse_id)

    # TypeError: the record link was not found on the page;
    # IndexError: the record page has fewer tables than expected
    try:
        df_vet = extract_vet_record(soup, horse_nm, horse_id)
    except (TypeError, IndexError):
        df_vet = pd.DataFrame(columns=vet_columns)

    try:
        df_trackwork = extract_trackwork_record(soup, horse_nm, horse_id)
    except (TypeError, IndexError):
        df_trackwork = pd.DataFrame(columns=trackwork_columns)

    return df_horse_profile, df_race_history, df_vet, df_trackwork


def extract_horse_profile(res: str, horse_nm, horse_id) -> pd.DataFrame:
    """
    Build a one-row horse profile from the source HTML of the horse page.

    Args:
        res: HTML of the horse page.
        horse_nm: horse name, stored in the "Horse_nm" column.
        horse_id: horse code, stored in the "Horse_code" column.

    Returns:
        A DataFrame restricted to the standard profile columns, with
        "snapshot_date" set to the current time.

    Raises:
        KeyError: if one of the standard columns (or "Colour / Sex") is
            missing from the page.
        IndexError: if the page contains fewer than four tables.
    """
    # read_html expects a path or file-like object; passing the markup as a
    # plain string is deprecated since pandas 2.1
    tables = pd.read_html(StringIO(res))
    # the third and fourth tables hold the profile: column 0 is the field
    # label and column 2 its value
    df = pd.concat([tables[2][[0, 2]], tables[3][[0, 2]]])
    # turn the label/value rows into a single row with the labels as columns
    df = df.transpose()
    df.columns = df.iloc[0]
    df = df.reindex(df.index.drop(0))

    df['Horse_nm'] = horse_nm
    df["Horse_code"] = horse_id
    df['snapshot_date'] = pd.to_datetime(datetime.now())
    try:
        # n=1 splits on the first "/" only, so the result can never have
        # more than the two columns it is assigned to
        df[["Country of Origin", "Age"]] = df["Country of Origin / Age"].str.split("/", n=1, expand=True)
    except KeyError:
        # the page has no combined "Country of Origin / Age" field
        df["Age"] = "Unknown"

    # "Colour / Sex" may hold several colours; the last part is the sex and
    # the remaining parts are joined with "and"
    colour_sex_parts = df["Colour / Sex"].iloc[0].split("/")
    df["Colour"] = "and".join(colour_sex_parts[:-1]).strip()
    df["Sex"] = colour_sex_parts[-1].strip()

    standard_col = ['snapshot_date', 'Country of Origin', 'Import Type', 'Owner', 'Sire', 'Dam',
           'Dam\'s Sire', 'Same Sire', 'Horse_nm', 'Horse_code', 'Colour', 'Sex', 'Age']
    df = df[standard_col]
    return df

# racing history
def extract_racing_history(soup, horse_nm, horse_id) -> pd.DataFrame:
    """
    Extract the racing history of one horse.

    Args:
        soup: parsed horse page; only its ``find`` method is used, to locate
            the table with class "bigborder".
        horse_nm: horse name, stored in the "Horse_name" column.
        horse_id: horse code, stored in the "Horse_code" column.

    Returns:
        One row per race. When season header rows are present, each race row
        gets a "Season" column holding the label of the header above it and
        the header rows themselves are dropped. "RC/Track/Course" is renamed
        to "RC_Track_Course".
    """
    horsedata = soup.find('table', attrs={"class": "bigborder"})
    # read_html expects a file-like object; literal HTML strings are
    # deprecated since pandas 2.1
    tables = pd.read_html(StringIO(str(horsedata)))
    df_horse = tables[0]
    # the first parsed row carries the column names
    df_horse.columns = df_horse.iloc[0]
    df_horse = df_horse.reindex(df_horse.index.drop(0)).reset_index(drop=True)

    # Season header rows: "RaceIndex" is present and "Dist." equals "G"
    # (presumably because one label is repeated across all cells - verify
    # against the page markup).
    is_season_header = df_horse["RaceIndex"].notna() & (df_horse["Dist."] == df_horse["G"])
    header_positions = list(df_horse.index[is_season_header])

    # each season's races run from just after its header row up to the next
    # header row (or the end of the table for the last season)
    block_ends = header_positions[1:] + [len(df_horse)]
    season_frames = []
    for start, end in zip(header_positions, block_ends):
        # copy so the new column is not written onto a slice of df_horse
        season_rows = df_horse.iloc[start + 1:end].copy()
        season_rows["Season"] = df_horse.iloc[start]["RaceIndex"]
        season_frames.append(season_rows)

    # without any season header the table is kept as parsed (no "Season" column)
    if season_frames:
        df_horse = pd.concat(season_frames).reset_index(drop=True)

    df_horse["RC_Track_Course"] = df_horse["RC/Track/Course"]
    df_horse = df_horse.drop(columns=["RC/Track/Course"])
    df_horse['Horse_name'] = horse_nm
    df_horse["Horse_code"] = horse_id

    return df_horse

def extract_vet_record(soup, horse_nm, horse_id):
    """
    Extract the veterinary records of one horse.

    Follows the "VeterinaryRecords" link found on the horse page and returns
    the fifth table of that page with "Horse_name" and "Horse_code" columns
    added. The link may be absent (the original note says it does not exist
    for retired horses).

    Raises:
        TypeError: if the page has no veterinary-records link.
        IndexError: if the records page has fewer than five tables.
        scrape_horse_data() treats both as "no records".
    """
    vet_record = soup.find("a", attrs={"class": "table_eng_text", "href": re.compile(".+VeterinaryRecords.+")})
    # vet_record is None when the link is missing; subscripting it raises
    # the TypeError the caller relies on
    vet_record_link = r"https://racing.hkjc.com" + vet_record["href"]
    vet_res = make_hkjc_request(vet_record_link)
    # read_html expects a file-like object; literal HTML strings are
    # deprecated since pandas 2.1
    tables = pd.read_html(StringIO(vet_res))
    df_vet = tables[4]
    df_vet["Horse_name"] = horse_nm
    df_vet["Horse_code"] = horse_id
    return df_vet

def extract_trackwork_record(soup, horse_nm, horse_id):
    """
    Extract the trackwork records of one horse.

    Follows the "Trackwork" link found on the horse page, takes the fifth
    table of that page, copies "Racecourse/Track" into "Racecourse_track",
    adds "Horse_name" / "Horse_code" and returns only the standard columns.
    The link may be absent (the original note says it does not exist for
    retired horses).

    Raises:
        TypeError: if the page has no trackwork link.
        IndexError: if the trackwork page has fewer than five tables.
        scrape_horse_data() treats both as "no records".
        KeyError: if an expected column is missing from the table.
    """
    tw_record = soup.find("a", attrs={"class": "table_eng_text", "href": re.compile(".+Trackwork.+")})
    # tw_record is None when the link is missing; subscripting it raises
    # the TypeError the caller relies on
    tw_record_link = r"https://racing.hkjc.com" + tw_record["href"]
    tw_res = make_hkjc_request(tw_record_link)
    # read_html expects a file-like object; literal HTML strings are
    # deprecated since pandas 2.1
    tables = pd.read_html(StringIO(tw_res))
    df_tw = tables[4]
    df_tw["Racecourse_track"] = df_tw["Racecourse/Track"]
    df_tw["Horse_name"] = horse_nm
    df_tw["Horse_code"] = horse_id

    standard_col = ['Date', 'Type', 'Racecourse_track', 'Workouts', 'Gear', 'Horse_name', 'Horse_code']
    return df_tw[standard_col]

In [3]:
if __name__ == "__main__":
    # Scrape every horse that ran on a not-yet-recorded race date and append
    # the four resulting tables to the database.

    # assuming port forwarded to 9001
    # NOTE(review): database credentials are hard-coded in this URL; consider
    # loading them from configuration / the environment.
    engine = sqlalchemy.create_engine('postgresql://postgres:postgres@localhost:9001/horse_racing')

    telegram_bot_sendtext(f"Horse scraper program started at {datetime.now()}.")
    error_list = []
    date_list, horse_id_list = fetch_horse_id(engine)

    # can put the below in a retry loop
    total_horses = len(horse_id_list)
    print(f"Total {total_horses} horses.")
    bar = pyprind.ProgBar(total_horses)
    status_int = 0
    for horse_id in horse_id_list:
        try:
            # scrape data for the horse_id
            df_horse_profile, df_race_history, df_vet, df_trackwork = scrape_horse_data(horse_id)

            # write to sql - append mode + multi
            frames_by_table = {
                "horse_profile": df_horse_profile,
                "race_history": df_race_history,
                "vet_record": df_vet,
                "trackwork_record": df_trackwork,
            }
            for table_name, frame in frames_by_table.items():
                frame.to_sql(table_name, if_exists='append', con=engine, method='multi')
        except Exception as e:
            # one failing horse must not abort the whole run; remember it instead
            print(f"{horse_id}: {e}")
            error_list.append((horse_id, str(e)))

        # a horse counts as processed whether it succeeded or failed, so the
        # counter and the progress bar advance on both paths
        status_int += 1
        bar.update()

        # report status to telegram every 500 processed horses
        if status_int % 500 == 0:
            try:
                telegram_bot_sendtext(f"Completed {status_int} out of {total_horses} at {datetime.now()} with {len(error_list)} errors.")
            except Exception as e:
                # a failed notification should not be recorded against a horse
                print(f"Telegram progress report failed: {e}")

    # update status if there are no errors
    # NOTE(review): a single failure leaves every date unrecorded, so the next
    # run re-scrapes and re-appends all of these horses.
    if not error_list:
        update_status(date_list)

    telegram_bot_sendtext(f"Horse scraper program completed at {datetime.now()}.")
    

Total 89 horses.


0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:11:04


NameError: name 'i' is not defined