In [21]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import urllib
import re
import time
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from pathlib import Path

In [22]:
def get_data_url(target_month, place):
    year = target_month.strftime("%Y")
    month = target_month.strftime("%m")
    base = 'https://keirin.kdreams.jp'
    baseAdd1 = "/" + place + "/schedule/" + year + "/" + month
    url = base + baseAdd1
    return urllib.parse.quote_plus(url, "/:?=&")

In [23]:
def month_span(start_month, end_month):
    """start_date、end_dateの期間に含まれる日毎のdatetimeオブジェクトを返すジェネレータ
    """
    i = 0
    while 1 :
        target_month = start_month + relativedelta(months=i)
        if target_month > end_month: break
        yield target_month
        i += 1
        

In [28]:
def get_tables(content, is_talkative=True):
    """table要素を取得する"""
    bs = BeautifulSoup(content, "lxml")
    tables = bs.find_all("table")
    n_tables = len(tables)
    if n_tables == 0:
        emsg = "table not found."
        raise Exception(emsg)
    if is_talkative:
        print("%d table tags found.." % n_tables)
    return tables

In [25]:
def parse_table(table):
    """table要素のデータを読み込んで二次元配列を返す"""

    ##### thead 要素をパースする #####

    # thead 要素を取得 (存在する場合)
    thead = table.find("thead")

    # thead が存在する場合
    if thead:
        tr = thead.find("tr")
        ths = tr.find_all("th")
        columns = [th.text for th in ths]    # pandas.DataFrame を意識
    
    # thead が存在しない場合
    else:
        columns = []

    ##### tbody 要素をパースする #####

    # tbody 要素を取得
    tbody = table.find("tbody")

    # tr 要素を取得
    trs = tbody.find_all("tr")

    # 出力したい行データ
    rows = [columns]

    # td (th) 要素の値を読み込む
    # tbody -- tr 直下に th が存在するパターンがあるので
    # find_all(["td", "th"]) とするのがコツ
    for tr in trs:
        row = [td.text for td in tr.find_all(["td", "th"])]
        rows.append(row)

    return rows

In [26]:
start_month = datetime.strptime('201708', '%Y%m')
end_month = datetime.strptime('201708', '%Y%m')

today = datetime.today()
date_prefix = today.strftime('%Y%m%d-%H%M%S')
file_origin = 'crawl_' + date_prefix + '_Crawler.csv'

my_file = Path(file_origin)
isFirst = True

if my_file.is_file():
    mainDf = pd.read_csv(file_origin, index_col=0, header=0)
    isFirst = False
else:
    mainDf = None
    

In [27]:
places = ['hakodate']

sleep_time = 5

In [20]:
for target_month in month_span(start_month, end_month):
    for place in places:
        target_url = get_data_url(target_month, place)
        print(target_url)
        headers = {'User-Agent': 'Mozilla/5.0'}
        time.sleep(sleep_time)
        response = requests.get(target_url, headers=headers)# <Response [200]>
        soup = BeautifulSoup(response.text, 'html.parser')
        dokantos = soup.find_all("p", class_="dokanto")
        for dokanto in dokantos:
            raceresult_link = dokanto.a.get("href")
            print(raceresult_link)
            time.sleep(sleep_time)
            response = requests.get(raceresult_link, headers=headers)
            soup = BeautifulSoup(response.text, 'html.parser')
            racedetails_bs = soup.find_all("p", class_="race")
            for racedetail_bs in racedetails_bs:
                racedetail_link = racedetail_bs.a.get('href')
                print(racedetail_link)
                time.sleep(sleep_time)
                response = requests.get(racedetail_link, headers=headers)
                soup = BeautifulSoup(response.text, 'html.parser')
                print("RACE RACORD")
                racecard_bs = soup.find("table", class_="racecard_table")
                #print(racecard)
                riders_bs = racecard_bs.find_all("tr", class_=re.compile("^n"))
                print(riders_bs)
                for rider_bs in riders_bs:
                    names = rider_bs.find("td", class_="rider bdr_r").get_text().strip().splitlines()
                    name = names[0]
                    homes = ''.join(names[1].split()).split("/")
                    home = homes[0]
                    age = homes[1]
                    period = homes[2]
                    print(name, home, age, period)
                #print("RACE RESULT")
                #print(soup.find("table", class_="result_table"))
                break
    

https://keirin.kdreams.jp/hakodate/schedule/2017/08
https://keirin.kdreams.jp/hakodate/raceresult/11201707310100/
https://keirin.kdreams.jp/hakodate/racedetail/1120170731010001/?pageType=KS_RACE_CARD_PAGE_TYPE_SHOW_RESULT
RACE RACORD
[<tr class="n1 ">
<td class="tip">
<span class="icon icon_t1">◎</span>
</td>
<td class="kiai">
</td>
<td class="evaluation bdr_r">
<span>17</span>
</td>
<td class="bracket ">1</td>
<td class="num"><span>1</span></td>
<td class="rider bdr_r">
									伊藤 司<br/>
<span class="home">福　島/39/83									</span>
</td>
<td>A3</td>
<td class="bdr_r">両</td>
<td>3.92</td>
<td class="bdr_r">
<span class="best">78.08</span>
</td>
<td>
<span class="best">1</span>
</td>
<td class="bdr_r">
<span class="best">2</span>
</td>
<td>
								1								</td>
<td>
<span class="best">2</span>
</td>
<td>
								2								</td>
<td class="bdr_r">
								0								</td>
<td>
<span class="best">2</span>
</td>
<td>
								3								</td>
<td>
								2								</td>
<td class="bdr_r">
			

https://keirin.kdreams.jp/hakodate/racedetail/1120170825010001/?pageType=KS_RACE_CARD_PAGE_TYPE_SHOW_RESULT
RACE RACORD
[<tr class="n1 ">
<td class="tip">
<span class="icon icon_t1">◎</span>
</td>
<td class="kiai">
</td>
<td class="evaluation bdr_r">
<span>10</span>
</td>
<td class="bracket ">1</td>
<td class="num"><span>1</span></td>
<td class="rider bdr_r">
									門脇 翼<br/>
<span class="home">秋　田/24/111									</span>
</td>
<td>A3</td>
<td class="bdr_r">逃</td>
<td>3.85</td>
<td class="bdr_r">
<span class="best">76.55</span>
</td>
<td>
								1								</td>
<td class="bdr_r">
								7								</td>
<td>
								5								</td>
<td>
<span class="best">1</span>
</td>
<td>
								0								</td>
<td class="bdr_r">
								0								</td>
<td>
<span class="best">6</span>
</td>
<td>
								0								</td>
<td>
								1								</td>
<td class="bdr_r">
								2								</td>
<td>
<span class="best">66.6</span>
</td>
<td>
<span class="best">66.6</span>
</td>
<td class="bdr_r">
<span class=

In [121]:
        
            target_url = get_data_url(target_month.strftime('%Y%m'), pid, rno)
            date = target_date.strftime('%Y%m%d')
            headers = {'User-Agent': 'Mozilla/5.0'}
            time.sleep(sleep_time)
            response = requests.get(target_url, headers=headers)# <Response [200]>
            soup = BeautifulSoup(response.text, 'html.parser')
            content1 = soup.find_all("h1", class_="h_content1")
            content2 = soup.find_all("div", class_="blocks")
            content3 = soup.find_all("table", id="detail_program")
            for content3_part in content3:
                tds1 = [td.find_all("p") for td in content3_part.find_all("td", class_="border_all name_kanji")]
                ID = [row[0].string for row in tds1]
                name = [row[1].string for row in tds1]
                prefecture = [row[2].string.split("/") for row in tds1]
                age = [row[3].string.split("/") for row in tds1]
                ages = [row[0].replace('歳', '') for row in age]
                term = [row[1].replace('期', '') for row in age]
                prefectures = [row[0].replace('北海道', '1').replace('青森', '2').replace('岩手', '3').replace('宮城', '4').replace('秋田', '5').replace('山形', '6').replace('福島', '7').replace('茨城', '8').replace('栃木', '9').replace('群馬', '10').replace('埼玉', '11').replace('千葉', '12').replace('東京', '13').replace('神奈川', '14').replace('新潟', '15').replace('富山', '16').replace('石川', '17').replace('福井', '18').replace('山梨', '19').replace('長野', '20').replace('岐阜', '21').replace('静岡', '22').replace('愛知', '23').replace('三重', '24').replace('滋賀', '25').replace('京都', '26').replace('大阪', '27').replace('兵庫', '28').replace('奈良', '29').replace('和歌山', '30').replace('鳥取', '31').replace('島根', '32').replace('岡山', '33').replace('広島', '34').replace('山口', '35').replace('徳島', '36').replace('香川', '37').replace('愛媛', '38').replace('高知', '39').replace('福岡', '40').replace('佐賀', '41').replace('長崎', '42').replace('熊本', '43').replace('大分', '44').replace('宮崎', '45').replace('鹿児島', '46').replace('沖縄', '47') for row in prefecture]
                rank = [row[1].replace('A1', '1').replace('A2', '2').replace('B1', '3').replace('B2', '4') for row in prefecture]
                tds2 = [td.text for td in content3_part.find_all("td", class_="border_all average")[2:][::5]]
                morter_num = [td[:-5] for td in tds2]
                for content2_part in content2:
                    race_number = [p.text.replace('【', '').replace('】', '').replace('予選', '').replace('R', '') for p in content2_part.find_all("p", class_="left")]
                    for num in race_number:
                        race_number = [num for i in range(6)]
                        race_number = [num[:3] for num in race_number]
                for content1_part in content1:
                    race_place = [n[:3] for n in content1_part]
                    race_place = race_place * 6
                raceDict = pd.DataFrame({"date": date,                                         
                                        "ID": ID,
                                        "name": name,
                                        "age": ages,
                                        "term": term,
                                        "prefecture": prefectures,
                                        "rank": rank,
                                        "morter_num": morter_num,
                                        "race_number": race_number,
                                        "race_place": race_place
                                         })
                print(raceDict)
                # 取得した後、追加していく
                if mainDf is None:
                    raceDict.to_csv(file_origin)
                    mainDf = raceDict
                else:
                    raceDict.to_csv(file_origin, mode='a', header=False)
                    mainDf = mainDf.append(raceDict)
                                    
    print("Completed.")

IndentationError: unexpected indent (<ipython-input-121-b3413e40a238>, line 2)