# MLB Pitcher Arsenal Evolution (2020-2025)

投手の球種構成（Arsenal）の年次変化を追跡するデータセット

**データ形式**: Wide format（1行 = 投手×シーズン、球種を横展開）

**対象期間**: 2020-2025シーズン（6シーズン）

**対象投手**: 各シーズンで100球以上投球した投手

**主要球種**: FF, SI, FC, SL, CU, CH, FS, KC, FO, EP, KN

**メトリクス（各球種ごと）**:
- usage_pct: 使用率 (%)
- avg_speed: 平均球速 (mph)
- avg_spin: 平均回転数 (rpm)
- whiff_rate: 空振り率
- avg_pfx_x: 平均横変化量 (inch)
- avg_pfx_z: 平均縦変化量 (inch)

In [5]:
# 必要なパッケージをインストール
!pip install -q pybaseball duckdb

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/426.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m419.8/426.1 kB[0m [31m22.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m426.1/426.1 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/432.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m432.7/432.7 kB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m57.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [6]:
import pandas as pd
import numpy as np
from datetime import date
from pybaseball import statcast, playerid_reverse_lookup
import duckdb
import warnings
warnings.filterwarnings('ignore')

print(f"Data collection date: {date.today()}")

Data collection date: 2026-02-08


## Step 1: データ取得（2020-2025）

**注意**: この処理は時間がかかります（30分〜1時間）

In [7]:
# 各シーズンごとにデータ取得
seasons = [2020, 2021, 2022, 2023, 2024, 2025]
all_data = []

for season in seasons:
    print(f"\nFetching {season} season data...")
    start_date = f"{season}-03-01"
    end_date = f"{season}-11-30" if season < 2025 else date.today().strftime("%Y-%m-%d")

    df = statcast(start_dt=start_date, end_dt=end_date)
    df['season'] = season
    all_data.append(df)
    print(f"  {season}: {len(df):,} pitches")

# 全シーズン結合
df_all = pd.concat(all_data, ignore_index=True)
print(f"\nTotal pitches: {len(df_all):,}")
print(f"Columns: {len(df_all.columns)}")


Fetching 2020 season data...
This is a large query, it may take a moment to complete
Skipping offseason dates
Skipping offseason dates


100%|██████████| 97/97 [01:20<00:00,  1.21it/s]


  2020: 280,398 pitches

Fetching 2021 season data...
This is a large query, it may take a moment to complete
Skipping offseason dates
Skipping offseason dates


100%|██████████| 246/246 [03:58<00:00,  1.03it/s]


  2021: 765,733 pitches

Fetching 2022 season data...
This is a large query, it may take a moment to complete
Skipping offseason dates
Skipping offseason dates


100%|██████████| 246/246 [04:05<00:00,  1.00it/s]


  2022: 775,330 pitches

Fetching 2023 season data...
This is a large query, it may take a moment to complete
Skipping offseason dates
Skipping offseason dates


100%|██████████| 246/246 [04:05<00:00,  1.00it/s]


  2023: 774,038 pitches

Fetching 2024 season data...
This is a large query, it may take a moment to complete
Skipping offseason dates
Skipping offseason dates


100%|██████████| 246/246 [03:45<00:00,  1.09it/s]


  2024: 760,248 pitches

Fetching 2025 season data...
This is a large query, it may take a moment to complete
Skipping offseason dates
Skipping offseason dates


100%|██████████| 246/246 [04:03<00:00,  1.01it/s]


  2025: 770,795 pitches

Total pitches: 4,126,542
Columns: 119


## Step 2: DuckDBで集計

投手×シーズン×球種でグループ化して統計量を計算

In [8]:
con = duckdb.connect()

# 投手×シーズン×球種で集計
query = """
WITH pitcher_stats AS (
    SELECT
        pitcher,
        season,
        pitch_type,
        COUNT(*) as n_pitches,
        AVG(release_speed) as avg_speed,
        AVG(release_spin_rate) as avg_spin,
        AVG(pfx_x) as avg_pfx_x,
        AVG(pfx_z) as avg_pfx_z,
        SUM(CASE WHEN description IN ('swinging_strike', 'swinging_strike_blocked') THEN 1 ELSE 0 END)::FLOAT /
            NULLIF(SUM(CASE WHEN description LIKE '%strike%' OR description LIKE '%foul%' OR description IN ('hit_into_play') THEN 1 ELSE 0 END), 0) as whiff_rate
    FROM df_all
    WHERE pitch_type IS NOT NULL
        AND pitcher IS NOT NULL
    GROUP BY pitcher, season, pitch_type
),
pitcher_totals AS (
    SELECT
        pitcher,
        season,
        SUM(n_pitches) as total_pitches
    FROM pitcher_stats
    GROUP BY pitcher, season
    HAVING total_pitches >= 100  -- 最低100球以上
)
SELECT
    ps.pitcher,
    ps.season,
    ps.pitch_type,
    ps.n_pitches,
    ROUND(100.0 * ps.n_pitches / pt.total_pitches, 2) as usage_pct,
    ROUND(ps.avg_speed, 2) as avg_speed,
    ROUND(ps.avg_spin, 0) as avg_spin,
    ROUND(ps.whiff_rate, 4) as whiff_rate,
    ROUND(ps.avg_pfx_x, 2) as avg_pfx_x,
    ROUND(ps.avg_pfx_z, 2) as avg_pfx_z
FROM pitcher_stats ps
INNER JOIN pitcher_totals pt
    ON ps.pitcher = pt.pitcher AND ps.season = pt.season
ORDER BY ps.pitcher, ps.season, ps.pitch_type
"""

df_long = con.execute(query).df()
print(f"Long format: {len(df_long):,} rows (投手×シーズン×球種)")
print(f"Unique pitchers: {df_long['pitcher'].nunique():,}")
print(f"\nPitch types: {sorted(df_long['pitch_type'].unique())}")

Long format: 19,530 rows (投手×シーズン×球種)
Unique pitchers: 1,458

Pitch types: ['CH', 'CS', 'CU', 'EP', 'FA', 'FC', 'FF', 'FO', 'FS', 'KC', 'KN', 'PO', 'SC', 'SI', 'SL', 'ST', 'SV', 'UN']


In [9]:
# サンプル確認
df_long.head(20)

Unnamed: 0,pitcher,season,pitch_type,n_pitches,usage_pct,avg_speed,avg_spin,whiff_rate,avg_pfx_x,avg_pfx_z
0,424144,2020,FF,39,13.64,89.52,2152.0,0.0968,0.97,1.29
1,424144,2020,SI,123,43.01,89.94,2102.0,0.1,1.48,0.62
2,424144,2020,SL,124,43.36,76.05,2192.0,0.1392,-0.82,-0.1
3,425794,2020,CH,24,2.4,81.84,1679.0,0.1,-1.14,0.58
4,425794,2020,CU,385,38.42,73.56,2788.0,0.2325,1.4,-1.18
5,425794,2020,FC,219,21.86,85.04,2344.0,0.1699,0.5,0.51
6,425794,2020,FF,99,9.88,89.12,2222.0,0.069,-0.17,1.2
7,425794,2020,SI,275,27.45,89.37,2189.0,0.0647,-1.02,0.95
8,425794,2021,CH,223,6.79,82.76,1729.0,0.1203,-1.15,0.64
9,425794,2021,CS,13,0.4,65.84,2554.0,0.3333,1.25,-1.25


## Step 3: Wide formatに変換

各球種を横展開（FF_usage_pct, FF_avg_speed, ...）

In [10]:
# メトリクスリスト
metrics = ['usage_pct', 'avg_speed', 'avg_spin', 'whiff_rate', 'avg_pfx_x', 'avg_pfx_z']

# 各メトリクスごとにpivot
pivoted_dfs = []

for metric in metrics:
    pivot = df_long.pivot_table(
        index=['pitcher', 'season'],
        columns='pitch_type',
        values=metric,
        aggfunc='first'
    )
    # カラム名を "PITCH_metric" 形式に変更
    pivot.columns = [f"{col}_{metric}" for col in pivot.columns]
    pivoted_dfs.append(pivot)

# 全メトリクスを結合
df_wide = pd.concat(pivoted_dfs, axis=1).reset_index()

print(f"Wide format: {len(df_wide):,} rows (投手×シーズン)")
print(f"Columns: {len(df_wide.columns)}")

Wide format: 4,253 rows (投手×シーズン)
Columns: 110


In [11]:
# カラム一覧確認
print("\nColumn names:")
for i, col in enumerate(df_wide.columns, 1):
    print(f"{i:3d}. {col}")


Column names:
  1. pitcher
  2. season
  3. CH_usage_pct
  4. CS_usage_pct
  5. CU_usage_pct
  6. EP_usage_pct
  7. FA_usage_pct
  8. FC_usage_pct
  9. FF_usage_pct
 10. FO_usage_pct
 11. FS_usage_pct
 12. KC_usage_pct
 13. KN_usage_pct
 14. PO_usage_pct
 15. SC_usage_pct
 16. SI_usage_pct
 17. SL_usage_pct
 18. ST_usage_pct
 19. SV_usage_pct
 20. UN_usage_pct
 21. CH_avg_speed
 22. CS_avg_speed
 23. CU_avg_speed
 24. EP_avg_speed
 25. FA_avg_speed
 26. FC_avg_speed
 27. FF_avg_speed
 28. FO_avg_speed
 29. FS_avg_speed
 30. KC_avg_speed
 31. KN_avg_speed
 32. PO_avg_speed
 33. SC_avg_speed
 34. SI_avg_speed
 35. SL_avg_speed
 36. ST_avg_speed
 37. SV_avg_speed
 38. UN_avg_speed
 39. CH_avg_spin
 40. CS_avg_spin
 41. CU_avg_spin
 42. EP_avg_spin
 43. FA_avg_spin
 44. FC_avg_spin
 45. FF_avg_spin
 46. FO_avg_spin
 47. FS_avg_spin
 48. KC_avg_spin
 49. KN_avg_spin
 50. PO_avg_spin
 51. SC_avg_spin
 52. SI_avg_spin
 53. SL_avg_spin
 54. ST_avg_spin
 55. SV_avg_spin
 56. UN_avg_spin
 57. C

## Step 4: 投手名を追加

In [12]:
# ユニークな投手IDリスト
unique_pitchers = df_wide['pitcher'].unique()
print(f"Looking up names for {len(unique_pitchers):,} pitchers...")

# 投手名取得（バッチ処理）
name_dict = {}
batch_size = 100

for i in range(0, len(unique_pitchers), batch_size):
    batch = unique_pitchers[i:i+batch_size]
    for player_id in batch:
        try:
            result = playerid_reverse_lookup([player_id], key_type='mlbam')
            if not result.empty:
                name_dict[player_id] = f"{result.iloc[0]['name_first']} {result.iloc[0]['name_last']}"
        except:
            name_dict[player_id] = f"Player_{player_id}"

    if (i + batch_size) % 500 == 0:
        print(f"  {i + batch_size:,} / {len(unique_pitchers):,}")

# 投手名を追加
df_wide.insert(1, 'player_name', df_wide['pitcher'].map(name_dict))

print(f"\nCompleted. {df_wide['player_name'].notna().sum()} names found.")

Looking up names for 1,458 pitchers...
Gathering player lookup table. This may take a moment.
  500 / 1,458
  1,000 / 1,458
  1,500 / 1,458

Completed. 4252 names found.


In [13]:
# サンプル確認
df_wide.head(10)

Unnamed: 0,pitcher,player_name,season,CH_usage_pct,CS_usage_pct,CU_usage_pct,EP_usage_pct,FA_usage_pct,FC_usage_pct,FF_usage_pct,...,FS_avg_pfx_z,KC_avg_pfx_z,KN_avg_pfx_z,PO_avg_pfx_z,SC_avg_pfx_z,SI_avg_pfx_z,SL_avg_pfx_z,ST_avg_pfx_z,SV_avg_pfx_z,UN_avg_pfx_z
0,424144,óliver pérez,2020,,,,,,,13.64,...,,,,,,0.62,-0.1,,,
1,425794,adam wainwright,2020,2.4,,38.42,,,21.86,9.88,...,,,,,,0.95,,,,
2,425794,adam wainwright,2021,6.79,0.4,33.48,,,21.33,10.13,...,,,,,,1.02,,,,
3,425794,adam wainwright,2022,6.32,1.02,30.37,,,24.88,9.2,...,,,,,,1.01,-0.42,,,
4,425794,adam wainwright,2023,4.99,0.17,30.59,,,22.63,9.97,...,,,,,,0.93,-0.24,,,
5,425844,zack greinke,2020,21.79,0.38,14.76,0.15,4.89,,41.67,...,,,,,,1.19,0.46,,,
6,425844,zack greinke,2021,22.6,0.22,15.86,0.07,0.33,0.07,40.02,...,,,,,,0.92,0.21,,,
7,425844,zack greinke,2022,16.43,,20.19,,,15.0,38.39,...,,,,,,0.84,0.49,,,
8,425844,zack greinke,2023,16.41,0.04,14.0,,,6.85,27.03,...,,,,,,0.8,0.21,,,
9,429722,ervin santana,2021,3.29,,,,,,45.06,...,,,,,,,0.15,,,


## Step 5: CSV出力

In [14]:
# カラム名をpitcherからplayer_idに変更
df_wide.rename(columns={'pitcher': 'player_id'}, inplace=True)

# CSV出力
output_file = "pitcher_arsenal_evolution_2020_2025.csv"
df_wide.to_csv(output_file, index=False)

print(f"\n=== Dataset Summary ===")
print(f"File: {output_file}")
print(f"Rows: {len(df_wide):,}")
print(f"Columns: {len(df_wide.columns)}")
print(f"Pitchers: {df_wide['player_id'].nunique():,}")
print(f"Seasons: {sorted(df_wide['season'].unique())}")
print(f"\nFile size: {df_wide.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB (in memory)")


=== Dataset Summary ===
File: pitcher_arsenal_evolution_2020_2025.csv
Rows: 4,253
Columns: 111
Pitchers: 1,458
Seasons: [np.int64(2020), np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024), np.int64(2025)]

File size: 3.53 MB (in memory)


In [15]:
# 基本統計
print("\n=== Basic Statistics ===")
print(f"\nPitchers per season:")
print(df_wide.groupby('season')['player_id'].nunique())

print(f"\nMost common pitch types (by usage):")
usage_cols = [col for col in df_wide.columns if col.endswith('_usage_pct')]
for col in usage_cols:
    mean_usage = df_wide[col].mean()
    if pd.notna(mean_usage) and mean_usage > 1.0:  # 1%以上の球種のみ
        pitch_type = col.replace('_usage_pct', '')
        print(f"  {pitch_type}: {mean_usage:.2f}%")


=== Basic Statistics ===

Pitchers per season:
season
2020    582
2021    745
2022    734
2023    727
2024    727
2025    738
Name: player_id, dtype: int64

Most common pitch types (by usage):
  CH: 14.02%
  CS: 1.80%
  CU: 13.48%
  EP: 8.98%
  FA: 17.13%
  FC: 18.61%
  FF: 35.32%
  FO: 22.13%
  FS: 15.89%
  KC: 18.93%
  KN: 15.44%
  SC: 8.73%
  SI: 21.99%
  SL: 23.57%
  ST: 18.34%
  SV: 14.18%
