In [93]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

def get_label(home_score, away_score):
    """Classify a score line as a draw, home win or away win.

    Returns:
        'D' when both scores are equal, 'H' when the home score is higher,
        'A' when the away score is higher.

    Raises:
        UnboundLocalError: when none of the three comparisons holds
            (for example when a score is NaN).
    """
    if home_score == away_score:
        return 'D'
    if home_score > away_score:
        outcome = 'H'
    elif home_score < away_score:
        outcome = 'A'
    return outcome

def get_match_info(soup, _id):
    """Extract the headline facts of one match from its parsed page.

    Args:
        soup: Parsed page; must contain a ``<table id="matchs_table">``.
        _id: Match identifier, stored unchanged in the result.

    Returns:
        dict with the keys ``id``, ``title``, ``match_time`` (string,
        prefixed with the hard-coded year 2017), ``match_result`` (raw
        result text), ``home_score``, ``away_score`` (both -1 when the
        result text holds no ``home:away`` score), ``label`` and
        ``match_table`` (the table element, reused by the odds parsers).

    Raises:
        IndexError: if the table or one of the expected spans is missing.
        ValueError: if a score part is not an integer.
    """
    match_table = soup.find_all('table', {"id": "matchs_table"})[0]
    title_span = match_table.find_all('span', {"class": "odds_second_title"})[0]
    title = title_span.contents[0].strip()
    match_result = match_table.find_all('span')[1].contents[-1].strip()
    time_raw = match_table.find_all('span', {"class": "odds_first_title"})[0].contents[0]
    # The page text carries no year; the last 12 characters are kept and the
    # year 2017 is prepended.
    match_time = '2017-' + time_raw[-12:].strip() + ':00'

    # Drop a parenthesised prefix (raw string: '\(' is not a valid str escape).
    final_result = re.sub(r'\(.*\) ', '', match_result)
    score_parts = final_result.split(':')
    if len(score_parts) > 1:
        home_score = int(score_parts[0].strip())
        away_score = int(score_parts[1].strip())
    else:
        # No score available; -1/-1 is the placeholder.
        home_score = -1
        away_score = -1
    # NOTE(review): with the -1/-1 placeholder the two scores are equal, so
    # get_label reports 'D' for matches without a result.
    label = get_label(home_score, away_score)

    return {
        'id': _id,
        'title': title,
        'match_time': match_time,
        'match_result': match_result,
        'home_score': home_score,
        'away_score': away_score,
        'label': label,
        'match_table': match_table,
    }

#########################################################################

def enhanced_df(df, match_info, df_type='hdw'):
    """Add per-match metadata and odds-movement columns to an odds frame.

    The frame is modified in place and also returned.

    Args:
        df: Frame with a ``TIME`` column plus ``HDW`` (3-tuples, for
            ``df_type='hdw'``) or ``HA`` (2-tuples, for ``df_type='handi'``).
        match_info: dict providing ``label``, ``id``, ``match_time``,
            ``title``, ``match_result``, ``home_score`` and ``away_score``.
        df_type: ``'hdw'`` or ``'handi'``; selects how the odds tuple is
            unpacked into ``HOME`` / ``DRAW`` / ``AWAY``.

    Returns:
        The same frame with the added columns ``HOME``, ``AWAY`` (and
        ``DRAW`` for hdw), ``LABEL``, ``ID``, ``MATCH_TIME``, ``TITLE``,
        ``MATCH_RESULT``, ``HOME_SCORE``, ``AWAY_SCORE``, ``DELTA``
        (``MATCH_TIME - TIME``), ``HOME_LAG`` / ``AWAY_LAG`` (previous row's
        odds) and ``HOME_CHANGE`` / ``AWAY_CHANGE`` (result of ``up_down``).

    Raises:
        AttributeError: if ``df_type`` is neither value, because no
            ``HOME`` / ``AWAY`` column exists for the lag step.
    """
    df_result = df
    if df_type == 'hdw':
        df_result['HOME'] = df_result['HDW'].apply(lambda x: x[0])
        df_result['DRAW'] = df_result['HDW'].apply(lambda x: x[1])
        df_result['AWAY'] = df_result['HDW'].apply(lambda x: x[2])
    elif df_type == 'handi':
        df_result['HOME'] = df_result['HA'].apply(lambda x: x[0])
        df_result['AWAY'] = df_result['HA'].apply(lambda x: x[1])

    df_result['LABEL'] = match_info['label']
    df_result['ID'] = match_info['id']
    df_result['MATCH_TIME'] = match_info['match_time']
    df_result['TITLE'] = match_info['title']
    # pd.to_datetime instead of .astype('datetime64'): the unit-less cast is
    # rejected by pandas 2.x.
    df_result['MATCH_TIME'] = pd.to_datetime(df_result['MATCH_TIME'])
    df_result['MATCH_RESULT'] = match_info['match_result']
    df_result['HOME_SCORE'] = match_info['home_score']
    df_result['AWAY_SCORE'] = match_info['away_score']
    df_result['DELTA'] = df_result['MATCH_TIME'] - df_result['TIME']

    # Previous row's odds; the first row has NaN, which up_down maps to '0'.
    df_result['HOME_LAG'] = df_result.HOME.shift(1)
    df_result['AWAY_LAG'] = df_result.AWAY.shift(1)
    df_result['HOME_CHANGE'] = df_result.apply(
        lambda rec: up_down(rec['HOME'], rec['HOME_LAG']), axis=1)
    df_result['AWAY_CHANGE'] = df_result.apply(
        lambda rec: up_down(rec['AWAY'], rec['AWAY_LAG']), axis=1)
    return df_result
#########################################################################

def get_hdw(match_info):
    """Build the home/draw/away odds history frame for one match.

    Args:
        match_info: dict from ``get_match_info``; ``match_table`` and ``id``
            are read here, the rest is consumed by ``enhanced_df``.

    Returns:
        ``None`` when the page has no ``collapse_odds_org_<id>`` div,
        otherwise the frame produced by ``enhanced_df(..., df_type='hdw')``
        with one row per odds timestamp.

    Raises:
        IndexError: if the div contains no table.
        ValueError: if an odds text is not a number, or if no rows were
            found (the two column names cannot be assigned).
    """
    # Look the container up once instead of twice.
    container = match_info['match_table'].find(
        'div', {'id': 'collapse_odds_org_' + str(match_info['id'])})
    if container is None:
        return None
    odds_table = container.find_all('table')[0]

    ### times array (the page shows no year; 2017 is prepended)
    time_arr = []
    for time_span in odds_table.find_all('span', {'class': 'odds_time'}):
        stamp = '2017-' + time_span.contents[0].strip().replace('\t', '').replace('\n', ' ') + ':00'
        time_arr.append(pd.to_datetime(stamp))

    ### odds array: every three consecutive anchors form one (home, draw, away) row
    bracket_note = re.compile(r'\(.*\)')  # raw string; compiled once for all anchors
    prices = [float(bracket_note.sub('', anchor.contents[0].strip()))
              for anchor in odds_table.find_all('a')]
    # An incomplete trailing group is ignored.
    complete = len(prices) - len(prices) % 3
    odds_arr = [tuple(prices[k:k + 3]) for k in range(0, complete, 3)]

    df_hdw = pd.DataFrame(list(zip(time_arr, odds_arr)))
    df_hdw.columns = ['TIME', 'HDW']
    return enhanced_df(df_hdw, match_info, df_type='hdw')

def get_handi(match_info):
    """Build the handicap odds history frame for one match.

    Args:
        match_info: dict from ``get_match_info``; ``match_table`` and ``id``
            are read here, the rest is consumed by ``enhanced_df``.

    Returns:
        ``None`` when the page has no ``collapse_odds_handicap_<id>`` div,
        otherwise the frame produced by ``enhanced_df(..., df_type='handi')``
        with the base columns ``TIME``, ``LINE`` and ``HA``.

    Raises:
        IndexError: if the div contains no table.
        ValueError: if an odds text is not a number, or if no rows were
            found (the three column names cannot be assigned).
    """
    # Look the container up once instead of twice.
    container = match_info['match_table'].find(
        'div', {'id': 'collapse_odds_handicap_' + str(match_info['id'])})
    if container is None:
        return None
    odds_handi_table = container.find_all('table')[0]

    ### times array (the page shows no year; 2017 is prepended)
    time_arr = []
    for time_span in odds_handi_table.find_all('span', {'class': 'odds_time'}):
        stamp = '2017-' + time_span.contents[0].strip().replace('\t', '').replace('\n', ' ') + ':00'
        time_arr.append(pd.to_datetime(stamp))

    ### odds line array
    line_arr = [
        line_span.contents[0].strip().replace('\t', '').replace('\n', ' ')
        for line_span in odds_handi_table.find_all('span', {'class': 'odds_line'})
    ]

    ### odds array: every two consecutive anchors form one (home, away) row
    bracket_note = re.compile(r'\(.*\)')  # raw string; compiled once for all anchors
    prices = [float(bracket_note.sub('', anchor.contents[0].strip()))
              for anchor in odds_handi_table.find_all('a')]
    # An incomplete trailing group is ignored.
    complete = len(prices) - len(prices) % 2
    odds_arr = [tuple(prices[k:k + 2]) for k in range(0, complete, 2)]

    df_handi = pd.DataFrame(list(zip(time_arr, line_arr, odds_arr)))
    df_handi.columns = ['TIME', 'LINE', 'HA']
    return enhanced_df(df_handi, match_info, df_type='handi')

import math
def up_down(HOME, HOME_LAG):
    """Describe how a value moved relative to its previous value.

    Returns:
        '0' when the previous value is NaN or equal to the current one,
        '+' when the current value is higher, '-' when it is lower, and
        None when no comparison holds (for example when HOME is NaN).
    """
    if math.isnan(HOME_LAG):
        return '0'
    if HOME_LAG == HOME:
        return '0'
    if HOME > HOME_LAG:
        return '+'
    if HOME < HOME_LAG:
        return '-'
    return None
    
def get_soup(_id, timeout=30):
    """Download the match-board page for one match and parse it.

    Args:
        _id: Match identifier appended to the match-board URL.
        timeout: Seconds passed to ``requests.get``. Without a timeout a
            single stalled connection would block the scrape indefinitely.

    Returns:
        BeautifulSoup tree of the page, parsed with ``html.parser``.

    Raises:
        requests.RequestException: on connection errors or timeout.
        UnicodeDecodeError: if the body cannot be decoded with the default
            codec of ``bytes.decode()``.
    """
    url = 'http://hkjcodds.com/match_board/' + str(_id)
    # NOTE(review): verify=False disables TLS certificate checks. The URL is
    # plain http, so it presumably only matters if the server redirects to
    # https — confirm whether it is still needed.
    r = requests.get(url, verify=False, timeout=timeout)
    text = r.content.decode()
    soup = BeautifulSoup(text, 'html.parser')
    return soup


In [8]:
# Accumulators for the scraped odds frames (home/draw/away odds and handicap odds).
# NOTE(review): the scraping loops in this notebook read df_all_handi.ID before
# entering their try block, so they cannot run while df_all_handi is still None
# — presumably a previously collected frame is assigned first; confirm.
df_all_hdw = None
df_all_handi = None
# NOTE(review): bare literal with no effect as a statement — looks like a
# leftover match-ID bookmark; confirm.
81000

In [55]:
# Print the current value of _id, the loop variable used by the scraping loops.
print(_id)

116886


In [109]:
# Back-fill: scrape the 1000 match IDs just below the lowest ID collected so
# far, highest ID first. Only handicap odds are collected in this pass.
lowest_id = min(df_all_handi.ID)
new_frames = []
try:
    for _id in reversed(range(lowest_id - 1000, lowest_id)):
        try:
            soup = get_soup(_id)
            match_info = get_match_info(soup, _id)
            df_handi = get_handi(match_info)
        except Exception:
            # Best-effort scrape: pages that are missing or do not have the
            # expected layout are skipped. Exception (not a bare except) so
            # that Ctrl-C / kernel interrupt still stops the loop.
            continue
        if df_handi is not None:
            new_frames.append(df_handi)
finally:
    # Concatenate once instead of re-copying the growing frame per match;
    # in `finally` so frames scraped before an interrupt are kept.
    df_all_handi = pd.concat([df_all_handi] + new_frames)

In [None]:
# Forward fill: scrape match IDs above the highest ID collected so far, up to
# the hard-coded bound 119037 (exclusive). Only handicap odds are collected.
highest_id = max(df_all_handi.ID)
new_frames = []
try:
    # Start at highest_id + 1: highest_id itself is already in the frame and
    # scraping it again would append duplicate rows for that match.
    for _id in range(highest_id + 1, 119037):
        try:
            soup = get_soup(_id)
            match_info = get_match_info(soup, _id)
            df_handi = get_handi(match_info)
        except Exception:
            # Best-effort scrape: pages that are missing or do not have the
            # expected layout are skipped. Exception (not a bare except) so
            # that Ctrl-C / kernel interrupt still stops the loop.
            continue
        if df_handi is not None:
            new_frames.append(df_handi)
finally:
    # Concatenate once instead of re-copying the growing frame per match;
    # in `finally` so frames scraped before an interrupt are kept.
    df_all_handi = pd.concat([df_all_handi] + new_frames)

In [110]:
# Keep only rows whose MATCH_RESULT still contains a 'home:away' score after
# the parenthesised prefix is removed (raw string: '\(' is not a valid str escape).
df_all_handi = df_all_handi[df_all_handi.MATCH_RESULT.apply(lambda x: len(re.sub(r'\(.*\) ','',x).split(':'))>1 )]

In [112]:
# Show the lowest and highest match ID currently in df_all_handi.
min(df_all_handi.ID), max(df_all_handi.ID)

(99754, 118967)

In [113]:
# Show the lowest and highest match ID currently in df_all_handi.
min(df_all_handi.ID), max(df_all_handi.ID)

(99754, 118967)

In [108]:
# Number of rows collected in df_all_handi.
len(df_all_handi)

74663

In [104]:
# Persist the accumulated handicap-odds frame to the file 'handi.pkl'.
df_all_handi.to_pickle('handi.pkl')