In [1]:
"""
pybaseballを使用して、一年分のMLBデータ(トラッキングデータ)をcsvで取得する
対象サイト: Baseball Reference, Baseball Savant, FanGraphs
"""

import calendar
import re
import traceback

import pandas as pd
from pybaseball import batting_stats, pitching_stats, statcast

# Player profile table, loaded once when the module/cell runs. It is passed to
# add_batter_name(), which reads its 'key_mlbam', 'name_first' and 'name_last'
# columns. low_memory=False makes pandas parse the file in one pass rather
# than in chunks.
# NOTE(review): presumably the Chadwick register "people" file - confirm.
players_info_df = pd.read_csv('people.csv', low_memory=False)

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

# Season to download; change this to the year you want to fetch.
SELECT_YEAR = 2020


def create_df_pitching_stats(year):
    """
    Fetch pitchers' season statistics (games pitched, wins, etc.).

    Parameters
    --------------
    year: int
        Target season.

    Returns
    --------------
    DataFrame
        Whatever ``pitching_stats`` returns for that season: the pitchers'
        season statistics.

    Examples
    --------------
    Usage of the underlying call: pitching_stats(start_season, end_season)
    """
    # qual is the minimum number of innings a pitcher needs to be included.
    return pitching_stats(year, qual=1)


def create_df_batting_stats(year):
    """
    Fetch batters' season statistics (plate appearances, batting average, etc.).

    Parameters
    --------------
    year: int
        Target season.

    Returns
    --------------
    DataFrame
        Whatever ``batting_stats`` returns for that season: the batters'
        season statistics.

    Examples
    --------------
    Usage of the underlying call: batting_stats(start_season, end_season)
    """
    # qual=1 is passed through as the qualification threshold.
    return batting_stats(year, qual=1)


def create_df_statcast(start_year):
    """
    Fetch tracking data (release point, launch angle, etc.) for one season.

    The season is downloaded one calendar month at a time, from March
    through October (July through October for 2020), and the monthly
    results are concatenated in chronological request order.

    Parameters
    --------------
    start_year: int
        Target season.

    Returns
    --------------
    DataFrame
        Tracking data for the season. The index is whatever ``pd.concat``
        produces from the monthly frames (it is not reset here).

    Examples
    --------------
    Usage of the underlying call:
    statcast(start_dt='YYYY-MM-DD', end_dt='YYYY-MM-DD')
    """
    monthly_frames = []
    # Single-season range: covers start_year only.
    for season in range(start_year, start_year + 1):
        # The 2020 season opened in July; every other season starts in March.
        first_month = 7 if season == 2020 else 3
        # Upper bound 11 is exclusive, so October is the last month requested.
        for month in range(first_month, 11):
            last_day = calendar.monthrange(season, month)[1]
            month_start = f'{season}-{month:02d}-01'
            month_end = f'{season}-{month:02d}-{last_day}'
            monthly_frames.append(statcast(start_dt=month_start, end_dt=month_end))
    return pd.concat(monthly_frames)


def add_batter_name(players_info_df, statcast_df):
    """
    Add the batter's full name to the tracking data.

    A lookup of ``key_mlbam`` -> ``"<name_first> <name_last>"`` is built from
    the player profiles and left-joined onto the tracking data via its
    ``batter`` column. The existing ``player_name`` column is renamed to
    ``pitcher_name``.

    Neither input frame is modified; the lookup is built on a new frame.

    Parameters
    --------------
    players_info_df: DataFrame
        Player profiles. Must contain ``key_mlbam``, ``name_first`` and
        ``name_last``.
    statcast_df: DataFrame
        Tracking data. Must contain ``batter``; ``player_name`` is renamed
        when present.

    Returns
    --------------
    output_df: DataFrame
        Tracking data with the extra columns ``key_mlbam`` and
        ``batter_name``. Rows whose ``batter`` has no match in the profiles
        get missing values in both columns.
    """
    # Name parts are stringified element-wise with str(), so a missing name
    # part becomes the literal text of that value (e.g. 'nan').
    first_names = players_info_df['name_first'].map(str)
    last_names = players_info_df['name_last'].map(str)

    # assign() returns a new frame, so the caller's players_info_df keeps its
    # original columns and values.
    batter_lookup = (
        players_info_df[['key_mlbam']]
        .assign(batter_name=first_names + ' ' + last_names)
        .dropna()  # profiles without an MLBAM id cannot be joined
    )

    merged_df = pd.merge(
        statcast_df,
        batter_lookup,
        left_on='batter',
        right_on='key_mlbam',
        how='left',
    )
    output_df = merged_df.rename(columns={'player_name': 'pitcher_name'})
    return output_df


def convert_units(df):
    """
    Convert selected tracking columns to metric units.

    Speed columns are multiplied by 1.609 (miles -> kilometres) and
    length/position columns by 0.3048 (feet -> metres). The input frame is
    not modified; a converted copy is returned.

    Parameters
    --------------
    df: DataFrame
        Tracking data. Must contain every column listed in the conversion
        table below; a missing column raises ``KeyError``.

    Returns
    --------------
    output_df: DataFrame
        Copy of ``df`` with the listed columns scaled. Other columns and
        the column order are unchanged.

    Examples
    --------------
    release_speed: miles to kilometres (* 1.609)
    release_pos_x: feet to metres (* 0.3048)
    """
    # Factors are kept exactly as the original code used them so results do
    # not change (note 1.609 is a rounded value of 1.609344).
    mile_to_km = 1.609
    foot_to_m = 0.3048

    conversion_factors = {
        'release_speed': mile_to_km,
        'release_pos_x': foot_to_m,
        'release_pos_y': foot_to_m,
        'release_pos_z': foot_to_m,
        'pfx_x': foot_to_m,
        'pfx_z': foot_to_m,
        'plate_x': foot_to_m,
        'plate_z': foot_to_m,
        'launch_speed': mile_to_km,
        'effective_speed': mile_to_km,
        'release_extension': foot_to_m,
    }

    output_df = df.copy()
    for column, factor in conversion_factors.items():
        # Vectorised multiply instead of a per-element Python lambda.
        output_df[column] = output_df[column] * factor
    return output_df

if __name__ == '__main__':
    # Download the season's pitching, batting and tracking data and write each
    # one to its own CSV in the working directory. Any failure is reported by
    # printing the traceback; nothing is re-raised.
    try:
        # Season pitching statistics -> pitching_<year>.csv
        pitching_df = create_df_pitching_stats(SELECT_YEAR)
        pitching_df = pitching_df.reset_index(drop=True)
        pitching_csv_name = f'pitching_{SELECT_YEAR}.csv'
        pitching_df.to_csv(pitching_csv_name)

        # Season batting statistics -> batting_<year>.csv
        batting_df = create_df_batting_stats(SELECT_YEAR)
        batting_df = batting_df.reset_index(drop=True)
        batting_csv_name = f'batting_{SELECT_YEAR}.csv'
        batting_df.to_csv(batting_csv_name)

        # Pitch-by-pitch tracking data, with batter names joined on.
        statcast_df = create_df_statcast(SELECT_YEAR)
        statcast_df = statcast_df.reset_index(drop=True)
        statcast_df = add_batter_name(players_info_df, statcast_df)

        # Metric unit conversion is currently switched off:
        # statcast_df = convert_units(statcast_df)

        statcast_csv_name = f'statcast_{SELECT_YEAR}.csv'
        statcast_df.to_csv(statcast_csv_name)

    except Exception:
        print(traceback.format_exc())


This is a large query, it may take a moment to complete
Query unsuccessful for data from 2020-07-01 to 2020-07-05. Skipping these dates.
Query unsuccessful for data from 2020-07-06 to 2020-07-06. Skipping these dates.
Query unsuccessful for data from 2020-07-07 to 2020-07-11. Skipping these dates.
Query unsuccessful for data from 2020-07-12 to 2020-07-12. Skipping these dates.
Query unsuccessful for data from 2020-07-13 to 2020-07-17. Skipping these dates.
Query unsuccessful for data from 2020-07-18 to 2020-07-18. Skipping these dates.
Completed sub-query from 2020-07-19 to 2020-07-24
Completed sub-query from 2020-07-25 to 2020-07-30
Completed sub-query from 2020-07-31 to 2020-07-31
This is a large query, it may take a moment to complete
Completed sub-query from 2020-08-01 to 2020-08-06
Completed sub-query from 2020-08-07 to 2020-08-12
Completed sub-query from 2020-08-13 to 2020-08-18
Completed sub-query from 2020-08-19 to 2020-08-24
Completed sub-query from 2020-08-25 to 2020-08-30
Co

In [2]:
# Quick sanity check of the downloaded tracking data: print (rows, columns),
# then display the first rows as the notebook cell's output.
print(statcast_df.shape)
statcast_df.head()

(279660, 92)


Unnamed: 0,index,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,pitcher_name,batter,pitcher,events,description,spin_dir,spin_rate_deprecated,break_angle_deprecated,break_length_deprecated,zone,des,game_type,stand,p_throws,home_team,away_team,type,hit_location,bb_type,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,hc_x,hc_y,tfs_deprecated,tfs_zulu_deprecated,fielder_2,umpire,sv_id,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,pitcher.1,fielder_2.1,fielder_3,fielder_4,fielder_5,fielder_6,fielder_7,fielder_8,fielder_9,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,key_mlbam,batter_name
0,89,FF,2020-07-31,96.5,-3.11,4.64,Craig Kimbrel,657061.0,518886.0,field_out,hit_into_play,,,,,8.0,"Cole Tucker grounds out, second baseman Nico H...",R,L,R,CHC,PIT,X,4.0,ground_ball,0.0,0.0,2020.0,-1.03,1.13,0.0,2.09,,,,2.0,9.0,Top,143.4,154.62,,,605170.0,,,10.691298,-140.122961,-3.804094,-15.984306,31.483245,-16.574634,3.49,1.6,48.0,102.4,0.0,97.2,2205.0,6.8,631647.0,518886.0,605170.0,519203.0,663538.0,592178.0,595879.0,664023.0,546991.0,518792.0,53.73,0.379,0.341,0.0,1.0,0.0,0.0,4.0,75.0,1.0,4-Seam Fastball,6.0,3.0,3.0,6.0,3.0,6.0,3.0,6.0,Standard,Standard,657061.0,Cole Tucker
1,92,FF,2020-07-31,95.8,-3.04,4.49,Craig Kimbrel,570256.0,518886.0,field_out,hit_into_play,,,,,7.0,Gregory Polanco lines out sharply to center fi...,R,L,R,CHC,PIT,X,8.0,line_drive,2.0,1.0,2020.0,-1.0,1.14,-0.52,2.17,,,,1.0,9.0,Top,126.14,34.96,,,605170.0,,,8.961111,-139.335972,-3.146013,-15.041709,29.344842,-16.796815,3.58,1.68,388.0,104.9,19.0,96.4,2194.0,6.5,631647.0,518886.0,605170.0,519203.0,663538.0,592178.0,595879.0,664023.0,546991.0,518792.0,53.96,0.671,0.76,0.0,1.0,0.0,0.0,6.0,74.0,4.0,4-Seam Fastball,6.0,3.0,3.0,6.0,3.0,6.0,3.0,6.0,Infield shift,Standard,570256.0,Gregory Polanco
2,96,FF,2020-07-31,96.7,-3.06,4.66,Craig Kimbrel,570256.0,518886.0,,ball,,,,,11.0,,R,L,R,CHC,PIT,B,,,1.0,1.0,2020.0,-0.93,1.06,-1.35,3.17,,,,1.0,9.0,Top,,,,,605170.0,,,6.675811,-140.737624,-0.867223,-13.853987,30.53271,-17.992128,3.85,1.86,,,,97.0,2155.0,6.4,631647.0,518886.0,605170.0,519203.0,663538.0,592178.0,595879.0,664023.0,546991.0,518792.0,54.14,,,,,,,,74.0,3.0,4-Seam Fastball,6.0,3.0,3.0,6.0,3.0,6.0,3.0,6.0,Infield shift,Standard,570256.0,Gregory Polanco
3,102,FF,2020-07-31,96.8,-3.05,4.53,Craig Kimbrel,570256.0,518886.0,,called_strike,,,,,7.0,,R,L,R,CHC,PIT,S,,,1.0,0.0,2020.0,-0.82,1.15,-0.72,1.93,,,,1.0,9.0,Top,,,,,605170.0,,,8.124308,-140.780372,-4.0431,-12.72611,31.116005,-16.114603,3.87,1.95,,,,97.4,2982.0,6.6,631647.0,518886.0,605170.0,519203.0,663538.0,592178.0,595879.0,664023.0,546991.0,518792.0,53.89,,,,,,,,74.0,2.0,4-Seam Fastball,6.0,3.0,3.0,6.0,3.0,6.0,3.0,6.0,Infield shift,Standard,570256.0,Gregory Polanco
4,108,KC,2020-07-31,85.1,-2.85,4.94,Craig Kimbrel,570256.0,518886.0,,ball,,,,,14.0,,R,L,R,CHC,PIT,B,,,0.0,0.0,2020.0,0.88,-0.45,2.37,1.34,,,,1.0,9.0,Top,,,,,605170.0,,,10.500907,-123.570303,-1.197401,6.938228,24.629923,-36.731829,3.78,1.88,,,,85.3,2227.0,6.5,631647.0,518886.0,605170.0,519203.0,663538.0,592178.0,595879.0,664023.0,546991.0,518792.0,53.98,,,,,,,,74.0,1.0,Knuckle Curve,6.0,3.0,3.0,6.0,3.0,6.0,3.0,6.0,Infield shift,Standard,570256.0,Gregory Polanco
