- sportsnaviプロ野球ページからのwebスクレイピングスクリプト

In [1]:
import os
import re
import requests
import datetime
import pandas as pd
from bs4 import BeautifulSoup

In [3]:
# dateで受け取った日に開催された各試合の出場成績ページ(stats)とスコア詳細ページ(score)のリンクを、それぞれリスト形式で返す
def fetch_game_links(date):
    """Collect the stats-page and score-page links listed on the schedule page.

    Args:
        date: Value sent as the ``date`` query parameter of the schedule page.

    Returns:
        A ``(game_links, detail_links)`` tuple of two lists. Both are derived
        from the ``href`` of every ``a.bb-score__content`` anchor, with the
        substring ``index`` replaced by ``stats`` and ``score`` respectively.
    """
    schedule_page = requests.get(
        'https://baseball.yahoo.co.jp/npb/schedule',
        params={'date': date},
    )
    soup_schedule = BeautifulSoup(schedule_page.text, 'html.parser')

    # Read every href once, then derive both link flavours from it.
    hrefs = [
        anchor['href']
        for anchor in soup_schedule.find_all('a', class_='bb-score__content')
    ]
    game_links = [href.replace('index', 'stats') for href in hrefs]
    detail_links = [href.replace('index', 'score') for href in hrefs]
    return game_links, detail_links

def fetch_score_stats(game_link, dat, id_):
    """Scrape the inning-by-inning score table of one game page.

    Args:
        game_link: URL of the game page to download.
        dat: Date label appended to every data row.
        id_: Game identifier appended to every data row.

    Returns:
        A list of rows. The first row is the header
        (``"チーム"``, ``"1"`` .. ``"12"``, ``"日付"``, ``"ID"``); it is
        followed by one row for the first team and one row for whatever was
        collected after the first non-numeric cell. Each data row is padded
        with ``"X"`` up to 13 entries before ``dat`` and ``id_`` are appended.

    Raises:
        IndexError: If the page contains no ``td.bb-gameScoreTable__data``
            cell at all.
    """
    game_page = requests.get(game_link)
    soup_game = BeautifulSoup(game_page.text, 'html.parser')

    score_stats_rows = soup_game.find_all('td', class_='bb-gameScoreTable__data')
    result_kind_rows = ["チーム"] + [str(i) for i in range(1, 13)] + ["日付", "ID"]

    def finish_row(row):
        # Pad missing innings with "X", then tag the row with date and game id.
        row += ["X"] * (13 - len(row))
        row += [dat, id_]
        return row

    stats_list = [result_kind_rows]
    # The very first cell is taken verbatim as the leading entry of row one.
    stats = [score_stats_rows[0].text.replace('\n', '')]
    on_first_row = True
    for cell in score_stats_rows[1:]:
        ele = cell.text.replace('\n', '')
        try:
            # A trailing "X" (e.g. "2X") is stripped before the conversion.
            runs = int(ele[:-1]) if ele[-1] == 'X' else int(ele)
        except (ValueError, IndexError):
            # Only "not a number" (ValueError) or "empty cell" (IndexError)
            # may steer the row logic; anything else must propagate.
            if not on_first_row:
                # Second non-numeric cell: nothing more to collect.
                break
            # First non-numeric cell: close row one and start row two with it.
            stats_list.append(finish_row(stats))
            stats = [ele]
            on_first_row = False
        else:
            stats.append(runs)

    stats_list.append(finish_row(stats))
    return stats_list

# game_linkで受け取ったリンク先の野手成績をリスト形式で返す
def fetch_batter_stats(game_link, dat, id_):
    """Scrape the batter stats tables of one game page into a list of rows.

    Args:
        game_link: URL of the game stats page to download.
        dat: Date label appended to every data row.
        id_: Game identifier appended to every data row.

    Returns:
        A list of rows. The first row is the header: the first half of the
        ``p.bb-statsTable__headLabel`` texts with ``"名"`` inserted at index 1,
        followed by ``"自"``, ``"相"``, ``"H/V"``, ``"日付"``, ``"ID"``.
        Every following row holds up to 14 cell values of one table row
        (values from index 3 on converted to ``int``) plus own team, opposing
        team, ``"V"``/``"H"`` flag, ``dat`` and ``id_``.
    """
    soup_game = BeautifulSoup(requests.get(game_link).text, 'html.parser')

    # The label set appears twice on the page (once per table); keep one copy.
    labels = [p.text for p in soup_game.find_all('p', class_='bb-statsTable__headLabel')]
    header = labels[:len(labels) // 2]
    header.insert(1, "名")
    header.extend(["自", "相", "H/V", "日付", "ID"])

    team_names = [h1.text for h1 in soup_game.find_all('h1', class_='bb-head03__title')]

    stats_list = [header]
    first_table_seen = False
    # Batter stats
    for row in soup_game.find_all('tr', class_='bb-statsTable__row'):
        # A row carrying header labels marks the start of a team's table:
        # the first such row is tagged "V", every later one "H".
        if row.find_all('p', class_='bb-statsTable__headLabel'):
            if first_table_seen:
                my, enemy, team = team_names[1][:2], team_names[0][:2], "H"
            else:
                my, enemy, team = team_names[0][:2], team_names[1][:2], "V"
                first_table_seen = True

        cells = row.find_all('td', class_='bb-statsTable__data')
        if not cells:
            continue

        stats = [td.text for td in cells][:14]
        # Strip parentheses and the 打/走 markers; an empty result becomes '指'.
        # NOTE(review): the original inline comment said only the first
        # position is kept, but the whole cleaned string is retained here,
        # exactly as the code has always done -- confirm which is intended.
        position = re.sub("[()打走]", '', stats[0])
        stats[0] = position if position else '指'
        # Convert the batter stats to int.
        stats[3:] = [int(value) for value in stats[3:]]
        stats.extend([my, enemy, team, dat, id_])
        stats_list.append(stats)
    return stats_list

# game_linkで受け取ったリンク先の投手成績をリスト形式で返す
def fetch_pitcher_stats(game_link, dat, id_):
    """Scrape the pitcher stats tables of one game page into a list of rows.

    Args:
        game_link: URL of the game stats page to download.
        dat: Date label appended to every data row.
        id_: Game identifier appended to every data row.

    Returns:
        A list of rows. The first row is the header: the first half of the
        ``th.bb-scoreTable__head`` texts with entry 0 replaced by ``"位"``,
        followed by ``"自"``, ``"相"``, ``"H/V"``, ``"日付"``, ``"ID"``.
        Every following row holds up to 14 cell values of one table row
        (entry 0 set to ``'先発'`` or ``'中継'``, entry 3 as ``float``, the
        rest from index 4 as ``int``) plus own team, opposing team,
        ``"V"``/``"H"`` flag, ``dat`` and ``id_``.
    """
    soup_game = BeautifulSoup(requests.get(game_link).text, 'html.parser')

    # The column heads appear twice on the page (once per table); keep one copy.
    header = [th.text for th in soup_game.find_all('th', class_='bb-scoreTable__head')]
    header = header[:len(header) // 2]
    header[0] = "位"
    header.extend(["自", "相", "H/V", "日付", "ID"])

    team_names = [h1.text for h1 in soup_game.find_all('h1', class_='bb-head03__title')]

    # Titles and table rows are fetched together so they come back in page order.
    page_rows = soup_game.find_all(['tr', 'h1'], class_=['bb-scoreTable__row', 'bb-head03__title'])
    words = ["北海道", "東北", "埼玉", "千葉ロッテ", "オリックス", "ソフトバンク", "横浜", "東京", "読売", "中日", "阪神", "広島"]

    stats_list = [header]
    first_team_seen = False
    appearances = 0
    # Pitcher stats
    for row in page_rows:
        # An element whose text contains one of the team keywords switches the
        # current team: the first hit is tagged "V", every later hit "H".
        if any(word in row.text for word in words):
            if first_team_seen:
                my, enemy, team = team_names[1][:2], team_names[0][:2], "H"
                # Restart the counter so this team's first pitcher is '先発'.
                appearances = 0
            else:
                my, enemy, team = team_names[0][:2], team_names[1][:2], "V"
                first_team_seen = True

        cells = row.find_all('td', class_='bb-scoreTable__data')
        if not cells:
            continue

        stats = [td.text for td in cells][:14]
        stats[0] = '先発' if appearances == 0 else '中継'
        stats[1] = stats[1].replace('\n', '')
        stats[2] = stats[2].replace('\n', '')
        # Innings pitched is kept as a float.
        stats[3] = float(stats[3])
        # The remaining pitcher stats are converted to int.
        stats[4:] = [int(value) for value in stats[4:]]
        stats.extend([my, enemy, team, dat, id_])
        stats_list.append(stats)
        appearances += 1
    return stats_list


# 各試合のイニングへのリンクをリストで取得
def get_inning_links(detail_link):
    """Enumerate the per-page score links of every inning of one game.

    For each score-table cell that links to an inning, the linked index is
    normalised (a trailing ``000`` becomes ``100``) and then the numeric suffix
    is advanced in steps of 100, requesting each candidate URL until the
    server answers 404. Every URL found is printed as progress output.

    Args:
        detail_link: URL of the game's score page.

    Returns:
        A sorted list of ``[inning, url]`` pairs.
    """
    base_url = "https://baseball.yahoo.co.jp"
    # The suffix arithmetic below only handles 3- and 4-digit counters, so
    # probing stops before 10000. This also guarantees termination when the
    # server never answers 404 (e.g. persistent 5xx or a soft error page).
    probe_limit = 10000

    game_page = requests.get(detail_link)
    soup_game = BeautifulSoup(game_page.text, 'html.parser')

    link_to_each_inning = []
    for cell in soup_game.find_all('td', class_="bb-gameScoreTable__data"):
        anchor = cell.find('a', class_='bb-gameScoreTable__score')
        if anchor is None:
            continue

        text = anchor.get('href')
        print(text.split("=")[1])
        # Characters -7..-6 of the href are the zero-padded inning number.
        inning = int(text[-7:-5])

        if text[-3:] == "000":
            text = text[:-3] + "100"
        url = base_url + text
        link_to_each_inning.append([inning, url])
        print(url)

        for count in range(200, probe_limit, 100):
            # A 4-digit counter overwrites one more character than a 3-digit one.
            width = 4 if count >= 1000 else 3
            text = text[:-width] + str(count)
            url = base_url + text
            res = requests.get(url)
            if res.status_code == 404:
                break
            link_to_each_inning.append([inning, url])
            print(url)

    return sorted(link_to_each_inning)

# 1球データを手に入れる
def fetch_details(inning_links, dat, id_):
    """Scrape the matchup and pitch-by-pitch cells of every given page.

    Args:
        inning_links: Iterable of ``(inning, url)`` pairs to download.
        dat: Date label stored with every matchup row.
        id_: Game identifier stored with every matchup row.

    Returns:
        A ``(people_list, balltype_list, speed_list, result_list)`` tuple.
        Each list starts with a header row and gets one row per page.
        Matchup rows end with ``"T"`` or ``"B"``, the inning, ``dat`` and
        ``id_``; the other three lists hold the raw cell texts of the
        ball-type, speed and result columns (result texts with newlines and
        spaces removed).
    """
    pitch_columns = ["pitch_" + str(i) for i in range(1, 20 + 1)]

    people_list = [["投手", "投", "打者", "打", "T/B", "イニング", "日付", "ID"]]
    # NOTE(review): these header rows end with the *values* of dat and id_,
    # while the data rows appended below carry no date/id entries -- confirm
    # that this is the intended layout.
    balltype_list = [pitch_columns + [dat, id_]]
    speed_list = [pitch_columns + [dat, id_]]
    result_list = [pitch_columns + [dat, id_]]

    for inning, link_ in inning_links:
        soup_game = BeautifulSoup(requests.get(link_).text, 'html.parser')

        # Flatten the first matchup table into space-separated tokens.
        tables = soup_game.findAll('table', {'class': 'bb-splitsTable tracked_mods'})
        flattened = re.sub(r" ", '', tables[0].text)
        flattened = re.sub(r"[¥\n]{2,}", " ", flattened)
        flattened = re.sub(r"[¥\n]{1,}", " ", flattened)
        tokens = flattened.split(" ")[1:-1]

        # When the table leads with "投手" the tokens after the two labels are
        # used as-is and the row is tagged "T"; otherwise the two token pairs
        # are swapped and the row is tagged "B".
        # (presumably the pairs are name/handedness of pitcher and batter --
        # TODO confirm against the page markup)
        if tokens[0] == "投手":
            people = tokens[2:] + ["T"]
        else:
            people = tokens[4:] + tokens[2:4] + ["B"]
        people += [inning, dat, id_]

        balltype = [
            td.text
            for td in soup_game.find_all('td', class_="bb-splitsTable__data--ballType")
        ]
        speed = [
            td.text
            for td in soup_game.find_all('td', class_="bb-splitsTable__data--speed")
        ]
        result = [
            td.text.replace('\n', '').replace(' ', '')
            for td in soup_game.find_all('td', class_="bb-splitsTable__data--result")
        ]

        people_list.append(people)
        balltype_list.append(balltype)
        speed_list.append(speed)
        result_list.append(result)

    return people_list, balltype_list, speed_list, result_list

def connect_data(list_):
    """Stack several header-first tables into a single DataFrame.

    Args:
        list_: Sequence of tables, each a list of rows whose first row is the
            header. Only the header of the first table is used; the header
            rows of all later tables are dropped.

    Returns:
        A DataFrame whose column labels are the first table's header row and
        whose rows are the data rows of all tables in order, re-indexed from
        0. An empty DataFrame is returned when there is nothing to stack.
    """
    if not list_:
        # No tables at all (e.g. no games on the requested date).
        return pd.DataFrame()

    # Build every frame first and concatenate once instead of re-copying the
    # accumulated frame on every iteration.
    frames = [pd.DataFrame(list_[0])]
    frames.extend(pd.DataFrame(table).iloc[1:] for table in list_[1:])
    main_df = pd.concat(frames, axis=0)
    if main_df.empty:
        return main_df

    # Promote the first row (the first table's header) to column labels.
    main_df.columns = main_df.iloc[0]
    return main_df.iloc[1:].reset_index(drop=True)

In [4]:
if __name__ == '__main__':
    # Scrape every game of one day and write the combined tables as CSV files
    # under ./data/<date>/.
    # The date is pinned; swap in datetime.date.today() to scrape the current day.
    d_today = datetime.date(2022, 3, 25)
    date_label = str(d_today)

    game_links, detail_links = fetch_game_links(d_today)
    batter_list = []
    pitcher_list = []
    score_list = []

    people = []
    balltype = []
    speed = []
    result = []

    # The position of a game on the schedule page doubles as its id.
    for id_, (game_link, detail_link) in enumerate(zip(game_links, detail_links)):
        print(id_)
        # Batter stats
        batter_stats = fetch_batter_stats(game_link, date_label, id_)
        # Pitcher stats
        pitcher_stats = fetch_pitcher_stats(game_link, date_label, id_)
        # Inning score table
        score_stats = fetch_score_stats(game_link, date_label, id_)
        # Pitch-by-pitch data
        inning_links = get_inning_links(detail_link)
        pe, ba, sp, res = fetch_details(inning_links, date_label, id_)

        batter_list.append(batter_stats)
        pitcher_list.append(pitcher_stats)
        score_list.append(score_stats)

        people.append(pe)
        balltype.append(ba)
        speed.append(sp)
        result.append(res)

    batter_df = connect_data(batter_list)
    pitcher_df = connect_data(pitcher_list)
    score_df = connect_data(score_list)

    people = connect_data(people)
    balltype = connect_data(balltype)
    speed = connect_data(speed)
    result = connect_data(result)

    # exist_ok=True creates ./data itself when it is missing and tolerates an
    # already existing output directory; listing ./data first would raise
    # FileNotFoundError on a fresh checkout.
    out_dir = os.path.join("./data", date_label)
    os.makedirs(out_dir, exist_ok=True)

    batter_df.to_csv(os.path.join(out_dir, "batter.csv"), index=False)
    pitcher_df.to_csv(os.path.join(out_dir, "pitcher.csv"), index=False)
    score_df.to_csv(os.path.join(out_dir, "score.csv"), index=False)

    people.to_csv(os.path.join(out_dir, "people.csv"), index=False)
    balltype.to_csv(os.path.join(out_dir, "balltype.csv"), index=False)
    speed.to_csv(os.path.join(out_dir, "speed.csv"), index=False)
    result.to_csv(os.path.join(out_dir, "result.csv"), index=False)

0
0110100
https://baseball.yahoo.co.jp/npb/game/2021005423/score?index=0110100
https://baseball.yahoo.co.jp/npb/game/2021005423/score?index=0110200
https://baseball.yahoo.co.jp/npb/game/2021005423/score?index=0110300
https://baseball.yahoo.co.jp/npb/game/2021005423/score?index=0110400
https://baseball.yahoo.co.jp/npb/game/2021005423/score?index=0110500
https://baseball.yahoo.co.jp/npb/game/2021005423/score?index=0110600
0210100
https://baseball.yahoo.co.jp/npb/game/2021005423/score?index=0210100
https://baseball.yahoo.co.jp/npb/game/2021005423/score?index=0210200
https://baseball.yahoo.co.jp/npb/game/2021005423/score?index=0210300
0310100
https://baseball.yahoo.co.jp/npb/game/2021005423/score?index=0310100
https://baseball.yahoo.co.jp/npb/game/2021005423/score?index=0310200
https://baseball.yahoo.co.jp/npb/game/2021005423/score?index=0310300
https://baseball.yahoo.co.jp/npb/game/2021005423/score?index=0310400
https://baseball.yahoo.co.jp/npb/game/2021005423/score?index=0310500
0410100


https://baseball.yahoo.co.jp/npb/game/2021005421/score?index=0710200
https://baseball.yahoo.co.jp/npb/game/2021005421/score?index=0710300
https://baseball.yahoo.co.jp/npb/game/2021005421/score?index=0710400
0810000
https://baseball.yahoo.co.jp/npb/game/2021005421/score?index=0810100
https://baseball.yahoo.co.jp/npb/game/2021005421/score?index=0810200
https://baseball.yahoo.co.jp/npb/game/2021005421/score?index=0810300
https://baseball.yahoo.co.jp/npb/game/2021005421/score?index=0810400
https://baseball.yahoo.co.jp/npb/game/2021005421/score?index=0810500
0910000
https://baseball.yahoo.co.jp/npb/game/2021005421/score?index=0910100
https://baseball.yahoo.co.jp/npb/game/2021005421/score?index=0910200
https://baseball.yahoo.co.jp/npb/game/2021005421/score?index=0910300
https://baseball.yahoo.co.jp/npb/game/2021005421/score?index=0910400
https://baseball.yahoo.co.jp/npb/game/2021005421/score?index=0910500
https://baseball.yahoo.co.jp/npb/game/2021005421/score?index=0910600
0120100
https://ba

0620000
https://baseball.yahoo.co.jp/npb/game/2021005422/score?index=0620100
https://baseball.yahoo.co.jp/npb/game/2021005422/score?index=0620200
https://baseball.yahoo.co.jp/npb/game/2021005422/score?index=0620300
https://baseball.yahoo.co.jp/npb/game/2021005422/score?index=0620400
0720100
https://baseball.yahoo.co.jp/npb/game/2021005422/score?index=0720100
https://baseball.yahoo.co.jp/npb/game/2021005422/score?index=0720200
https://baseball.yahoo.co.jp/npb/game/2021005422/score?index=0720300
https://baseball.yahoo.co.jp/npb/game/2021005422/score?index=0720400
https://baseball.yahoo.co.jp/npb/game/2021005422/score?index=0720500
https://baseball.yahoo.co.jp/npb/game/2021005422/score?index=0720600
0820000
https://baseball.yahoo.co.jp/npb/game/2021005422/score?index=0820100
https://baseball.yahoo.co.jp/npb/game/2021005422/score?index=0820200
https://baseball.yahoo.co.jp/npb/game/2021005422/score?index=0820300
0920000
https://baseball.yahoo.co.jp/npb/game/2021005422/score?index=0920100
ht

https://baseball.yahoo.co.jp/npb/game/2021005425/score?index=0810400
0910000
https://baseball.yahoo.co.jp/npb/game/2021005425/score?index=0910100
https://baseball.yahoo.co.jp/npb/game/2021005425/score?index=0910200
https://baseball.yahoo.co.jp/npb/game/2021005425/score?index=0910300
https://baseball.yahoo.co.jp/npb/game/2021005425/score?index=0910400
https://baseball.yahoo.co.jp/npb/game/2021005425/score?index=0910500
https://baseball.yahoo.co.jp/npb/game/2021005425/score?index=0910600
https://baseball.yahoo.co.jp/npb/game/2021005425/score?index=0910700
https://baseball.yahoo.co.jp/npb/game/2021005425/score?index=0910800
0120100
https://baseball.yahoo.co.jp/npb/game/2021005425/score?index=0120100
https://baseball.yahoo.co.jp/npb/game/2021005425/score?index=0120200
https://baseball.yahoo.co.jp/npb/game/2021005425/score?index=0120300
0220100
https://baseball.yahoo.co.jp/npb/game/2021005425/score?index=0220100
https://baseball.yahoo.co.jp/npb/game/2021005425/score?index=0220200
https://ba

参考
- https://qiita.com/Jun-T/items/6a641b6e28c487127484
- https://gammasoft.jp/support/solutions-of-requests-get-failed/
- https://tonari-it.com/python-response-status-code/