In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import urllib
import re
import time
from datetime import datetime, timedelta, date
from dateutil.relativedelta import relativedelta
from pathlib import Path
import csv
import os
import pprint

In [2]:
def get_data_url(target_month, place):
    year = target_month.strftime("%Y")
    month = target_month.strftime("%m")
    base = 'https://keirin.kdreams.jp'
    baseAdd1 = "/" + place + "/schedule/" + year + "/" + month
    url = base + baseAdd1
    return urllib.parse.quote_plus(url, "/:?=&")

In [3]:
def parse_racedetail_to_csv(racedetail_links, kaisaidate, place, filename):

    for i, racedetail_link in enumerate(racedetail_links):

        race_num = str(i + 1)
        
        headers = {'User-Agent': 'Mozilla/5.0'}
        time.sleep(sleep_time)
        try:
            response = requests.get(racedetail_link, headers=headers)
        except:
            print("Response error:", racedetail_link)
            continue
            
        soup = BeautifulSoup(response.text, 'html.parser')

        result_bs = soup.find("table", class_="result_table")
        if result_bs:
            continue

        print("RACE RECORD")
        racecard_bs = soup.find("table", class_="racecard_table")
        #print(racecard)
        if not racecard_bs:
            continue
        riders_bs = racecard_bs.find_all("tr", class_=re.compile("^n"))
            
        for rider_bs in riders_bs:
            rows = []
            for td in rider_bs.find_all("td"):
                row = td.text
                row = ''.join(row.split())
                rows.append(row)
            if len(rows) < 23:
                rows.insert(3, bracket)

            names = rider_bs.find("td", class_="rider bdr_r").get_text().strip().splitlines()
            name = names[0]
            homes = ''.join(names[1].split()).split("/")
            prefecture = homes[0]
            age = homes[1]
            period = homes[2]

            rows.pop(5)
            rows.insert(5, name)
            rows.insert(6, prefecture)
            rows.insert(7, age)
            rows.insert(8, period)

            #print(rows)

            rows[0] = rows[0].replace('◎', '9').replace('○', '8').replace('△', '7').replace('▲', '6').replace('×', '5').replace('注', '4')
            bracket = rows[3]
            rows[6] = rows[6].replace('北海道', '1').replace('青森', '2').replace('岩手', '3').replace('宮城', '4').replace('秋田', '5').replace('山形', '6').replace('福島', '7').replace('茨城', '8').replace('栃木', '9').replace('群馬', '10').replace('埼玉', '11').replace('千葉', '12').replace('東京', '13').replace('神奈川', '14').replace('新潟', '15').replace('富山', '16').replace('石川', '17').replace('福井', '18').replace('山梨', '19').replace('長野', '20').replace('岐阜', '21').replace('静岡', '22').replace('愛知', '23').replace('三重', '24').replace('滋賀', '25').replace('京都', '26').replace('大阪', '27').replace('兵庫', '28').replace('奈良', '29').replace('和歌山', '30').replace('鳥取', '31').replace('島根', '32').replace('岡山', '33').replace('広島', '34').replace('山口', '35').replace('徳島', '36').replace('香川', '37').replace('愛媛', '38').replace('高知', '39').replace('福岡', '40').replace('佐賀', '41').replace('長崎', '42').replace('熊本', '43').replace('大分', '44').replace('宮崎', '45').replace('鹿児島', '46').replace('沖縄', '47')
            rows[9] = rows[9].replace('S1', '5').replace('S2', '4').replace('A1', '3').replace('A2', '2').replace('A3', '1')
            rows[10] = rows[10].replace('両', '1').replace('逃', '2').replace('追', '3')

            car_num = rows[4]

            rows.insert(0, kaisaidate)
            rows.insert(1, place)
            rows.insert(2, race_num)

            #print(rows)

            with open(filename, "a") as f:
                    writer = csv.writer(f, lineterminator='\n') # 改行コード（\n）を指定しておく
                    writer.writerow(rows)     # list（1次元配列）の場合

In [4]:
# wait 1 seconds before access url
sleep_time = 1

In [5]:
def get_places(): # get places
    places = []
    home_url = 'https://keirin.kdreams.jp'
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(home_url, headers=headers)# <Response [200]>
    soup = BeautifulSoup(response.text, 'html.parser')
    stadiums = soup.find("div", class_="stadium_nav_list").find_all("dl", class_="stadium")
    for stadium in stadiums:
        links = stadium.find_all("a")
        for link in links:
            place = link.get("href").split("/")[-2]
            places.append(place)
            
    return places

In [6]:
def init_file(date):

    filename = "predict/" + date.strftime("%Y%m") + "_data.csv"
    
    if os.path.exists(filename):
        os.remove(filename)
    
    csv_header = [''] * 30
    csv_header[0] = 'date'
    csv_header[1] = 'place'
    csv_header[2] = 'race_num'
    csv_header[3] = 'predict'
    csv_header[4] = 'koukiai'
    csv_header[5] = 'evaluation'
    csv_header[6] = 'bracket'
    csv_header[7] = 'car_num'
    csv_header[8] = 'name'
    csv_header[9] = 'prefecture'
    csv_header[10] = 'age'
    csv_header[11] = 'period'
    csv_header[12] = 'rank'
    csv_header[13] = 'leg'
    csv_header[14] = 'gear'
    csv_header[15] = 'racing piont'
    csv_header[16] = 'S'
    csv_header[17] = 'B'
    csv_header[18] = 'Nige'
    csv_header[19] = 'Maki'
    csv_header[20] = 'Sashi'
    csv_header[21] = 'Ma'
    csv_header[22] = '1st'
    csv_header[23] = '2nd'
    csv_header[24] = '3rd'
    csv_header[25] = 'Chakugai'
    csv_header[26] = 'win'
    csv_header[27] = '2ren'
    csv_header[28] = '3ren'

    with open(filename, "a") as f:
            writer = csv.writer(f, lineterminator='\n') # 改行コード（\n）を指定しておく
            writer.writerow(csv_header)   # list（1次元配列）の場合
            
    return filename

In [7]:
places = get_places()
print(places)

['hakodate', 'aomori', 'iwakitaira', 'yahiko', 'maebashi', 'toride', 'utsunomiya', 'omiya', 'seibuen', 'keiokaku', 'tachikawa', 'matsudo', 'chiba', 'kawasaki', 'hiratsuka', 'odawara', 'ito', 'shizuoka', 'nagoya', 'gifu', 'ogaki', 'toyohashi', 'toyama', 'matsusaka', 'yokkaichi', 'fukui', 'nara', 'mukomachi', 'wakayama', 'kishiwada', 'tamano', 'hiroshima', 'hofu', 'takamatsu', 'komatsushima', 'kochi', 'matsuyama', 'kokura', 'kurume', 'takeo', 'sasebo', 'beppu', 'kumamoto']


In [8]:
today = date.today()
filename = init_file(today)

for place in places:

    target_url = get_data_url(today, place)
    print(target_url)
    headers = {'User-Agent': 'Mozilla/5.0'}
    time.sleep(sleep_time)
    try:
        response = requests.get(target_url, headers=headers)# <Response [200]>
    except:
        print("Response error:", target_url)
        continue

    # 結果があったら、処理を飛ばす
    soup = BeautifulSoup(response.text, 'html.parser')
    a_results = soup.find_all("a", string="結果")
    if not a_results:
        continue

    for a_result in a_results:
        raceresult_link = a_result.get("href")
        print(raceresult_link)
        time.sleep(sleep_time)
        try:
            response = requests.get(raceresult_link, headers=headers)
        except:
            print("Response error:", raceresult_link)
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        kaisaidate_tab_bs = soup.find("ul", id="JS_UL_KAISAI_DATE_TAB")
        if not kaisaidate_tab_bs:
            print("No races")
            continue

        lis = kaisaidate_tab_bs.find_all("li")
        kaisaidates = []
        for li in lis:
            kaisaidates.append(li.get("kaisaidate"))

        for kaisaidate in kaisaidates:
            #もし、日付が今日よりも小さければ処理をスキップ
            kaisaidate_dt = datetime.strptime(kaisaidate, "%Y%m%d")
            print(kaisaidate_dt)
            if datetime.now() > kaisaidate_dt:
                continue
                
            # get race detail base URL
            racedetail_id = "JS_DL_KAISAI_DETAIL_INFO_NAV_" + kaisaidate
            racedetail_a = soup.find("dl", id=racedetail_id).find("a", string="レース詳細")
            if not racedetail_a:
                print("No rance")
                continue
            else:
                racedetail_link_base = racedetail_a.get("href")
            print(racedetail_link_base)

            # get racedetail URLs of all races
            time.sleep(sleep_time)
            try:
                response = requests.get(racedetail_link_base, headers=headers)
            except:
                print("Response error:", racedetail_link_base)
                continue

            soup = BeautifulSoup(response.text, 'html.parser')

            dl_id = "JS_DL_KAISAI_DETAIL_RACE_NAV_" + kaisaidate
            a_races = soup.find("dl", id=dl_id).find_all("a")
            if not a_races:
                continue

            racedetail_links = []
            for a_race in a_races:
                href = a_race.get("href")
                racedetail_links.append(href)
            racedetail_links[0] = racedetail_link_base

            # call function - parse racedetail URL and save racer data and result to csv file
            parse_racedetail_to_csv(racedetail_links, kaisaidate, place, filename)


https://keirin.kdreams.jp/hakodate/schedule/2018/09
https://keirin.kdreams.jp/hakodate/raceresult/11201809030100/
2018-09-03 00:00:00
2018-09-04 00:00:00
2018-09-05 00:00:00
javascript:void(0)
Response error: javascript:void(0)
javascript:void(0)
Response error: javascript:void(0)
https://keirin.kdreams.jp/raceresult/
No races
https://keirin.kdreams.jp/aomori/schedule/2018/09
https://keirin.kdreams.jp/aomori/raceresult/12201809020100/
2018-09-02 00:00:00
2018-09-03 00:00:00
2018-09-04 00:00:00
https://keirin.kdreams.jp/aomori/raceresult/12201809180100/
2018-09-18 00:00:00
2018-09-19 00:00:00
2018-09-20 00:00:00
https://keirin.kdreams.jp/aomori/raceresult/12201809270100/
2018-09-27 00:00:00
https://keirin.kdreams.jp/aomori/racedetail/1220180927010001/?l-id=l-pc-srri-srdi-raceinfo_kaisai_detail_info_nav_btn
RACE RECORD
RACE RECORD
RACE RECORD
RACE RECORD
RACE RECORD
RACE RECORD
RACE RECORD
RACE RECORD
RACE RECORD
RACE RECORD
RACE RECORD
RACE RECORD
2018-09-28 00:00:00
No rance
2018-09-29

2018-09-26 00:00:00
2018-09-27 00:00:00
No rance
2018-09-28 00:00:00
No rance
https://keirin.kdreams.jp/raceresult/
No races
https://keirin.kdreams.jp/mukomachi/schedule/2018/09
https://keirin.kdreams.jp/mukomachi/raceresult/54201809220100/
2018-09-22 00:00:00
2018-09-23 00:00:00
2018-09-24 00:00:00
2018-09-25 00:00:00
https://keirin.kdreams.jp/raceresult/
No races
https://keirin.kdreams.jp/wakayama/schedule/2018/09
https://keirin.kdreams.jp/wakayama/raceresult/55201809010100/
2018-09-01 00:00:00
2018-09-02 00:00:00
2018-09-03 00:00:00
https://keirin.kdreams.jp/wakayama/raceresult/55201809180100/
2018-09-18 00:00:00
2018-09-19 00:00:00
2018-09-20 00:00:00
https://keirin.kdreams.jp/raceresult/
No races
https://keirin.kdreams.jp/kishiwada/schedule/2018/09
https://keirin.kdreams.jp/kishiwada/raceresult/56201809110100/
2018-09-11 00:00:00
2018-09-12 00:00:00
2018-09-13 00:00:00
https://keirin.kdreams.jp/raceresult/
No races
https://keirin.kdreams.jp/tamano/schedule/2018/09
https://keirin.k