# Japanese MLB Players Statcast Data Generator

Downloads pitch-by-pitch Statcast data (2015-2025 seasons) for the Japanese MLB players listed in `PLAYERS` and saves it as CSV files.

**Output files:**
- `japanese_mlb_pitching.csv` - Pitching Statcast data
- `japanese_mlb_batting.csv` - Batting Statcast data
- `players.csv` - Player metadata

In [None]:
!pip install pybaseball -q

In [None]:
import pandas as pd
import time
from pybaseball import statcast_pitcher, statcast_batter, playerid_lookup

# Japanese MLB players in the Statcast era (2015-2025)
# MLBAM IDs: verified via Baseball Savant or playerid_lookup
# IDs marked with # VERIFY should be confirmed with playerid_lookup() on first run
#
# Each entry describes one (player, role) pair:
#   name      - English display name, attached to every fetched row
#   mlbam_id  - MLBAM player ID passed to the Statcast query
#   role      - "pitcher" or "batter"; selects which Statcast query is used
#   seasons   - "YYYY" or an inclusive "YYYY-YYYY" range of seasons to fetch
#   team      - MLB teams, "/"-separated
#   npb_team  - NPB club ("" when not applicable)
#   note      - free-text remark copied into players.csv
# A two-way player appears once per role with the same mlbam_id.
#
# NOTE(review): Ohtani's pitching range stops at 2023 and the note says 2025 was
# DH only; if he made pitching appearances in 2025, extend the range - confirm
# against Baseball Savant.
PLAYERS = [
    # === Pitchers ===
    # -- 2025 Active --
    {"name": "Shohei Ohtani",           "mlbam_id": 660271, "role": "pitcher", "seasons": "2018-2023", "team": "LAA/LAD",           "npb_team": "Nippon-Ham Fighters",             "note": "Also batter. No pitching 2019 (TJ), 2024-2025 (DH only)"},
    {"name": "Yu Darvish",              "mlbam_id": 506433, "role": "pitcher", "seasons": "2016-2025", "team": "TEX/LAD/CHC/SD",    "npb_team": "Nippon-Ham Fighters",             "note": "2012-2014 pre-Statcast excluded"},  # missed 2015 (TJ), pitched again from 2016
    {"name": "Yoshinobu Yamamoto",      "mlbam_id": 808967, "role": "pitcher", "seasons": "2024-2025", "team": "LAD",               "npb_team": "Orix Buffaloes",                  "note": ""},
    {"name": "Shota Imanaga",           "mlbam_id": 705838, "role": "pitcher", "seasons": "2024-2025", "team": "CHC",               "npb_team": "Yokohama DeNA BayStars",          "note": ""},
    {"name": "Kodai Senga",             "mlbam_id": 694973, "role": "pitcher", "seasons": "2023-2025", "team": "NYM",               "npb_team": "SoftBank Hawks",                  "note": "Limited innings in 2024 (injury)"},
    {"name": "Yusei Kikuchi",           "mlbam_id": 579328, "role": "pitcher", "seasons": "2019-2025", "team": "SEA/TOR/HOU/LAA",   "npb_team": "Seibu Lions",                     "note": ""},
    {"name": "Yuki Matsui",             "mlbam_id": 680686, "role": "pitcher", "seasons": "2024-2025", "team": "SD",                "npb_team": "Tohoku Rakuten Golden Eagles",    "note": "Reliever"},
    {"name": "Kenta Maeda",             "mlbam_id": 628317, "role": "pitcher", "seasons": "2016-2025", "team": "LAD/MIN/DET",       "npb_team": "Hiroshima Toyo Carp",             "note": ""},
    {"name": "Tomoyuki Sugano",         "mlbam_id": 807185, "role": "pitcher", "seasons": "2025",      "team": "BAL",               "npb_team": "Yomiuri Giants",                  "note": "MLB debut 2025"},
    {"name": "Roki Sasaki",             "mlbam_id": 811521, "role": "pitcher", "seasons": "2025",      "team": "LAD",               "npb_team": "Chiba Lotte Marines",             "note": "MLB debut 2025"},
    {"name": "Shinnosuke Ogasawara",    "mlbam_id": 700247, "role": "pitcher", "seasons": "2025",      "team": "WSH",               "npb_team": "Chunichi Dragons",                "note": "MLB debut 2025, reliever"},  # VERIFY ID

    # -- Past (returned to NPB / inactive) --
    {"name": "Masahiro Tanaka",         "mlbam_id": 547888, "role": "pitcher", "seasons": "2015-2020", "team": "NYY",               "npb_team": "Tohoku Rakuten Golden Eagles",    "note": "Returned to NPB 2021"},
    {"name": "Hisashi Iwakuma",         "mlbam_id": 461325, "role": "pitcher", "seasons": "2015-2017", "team": "SEA",               "npb_team": "Tohoku Rakuten Golden Eagles",    "note": "Retired after 2017"},
    {"name": "Koji Uehara",             "mlbam_id": 493157, "role": "pitcher", "seasons": "2015-2017", "team": "BOS/CHC",           "npb_team": "Yomiuri Giants",                  "note": "Closer. Returned to NPB 2018"},
    {"name": "Junichi Tazawa",          "mlbam_id": 547749, "role": "pitcher", "seasons": "2015-2019", "team": "BOS/MIA/LAA",       "npb_team": "",                                "note": "Signed directly from Japan amateur. NPB debut after MLB"},
    {"name": "Shintaro Fujinami",       "mlbam_id": 692006, "role": "pitcher", "seasons": "2023",      "team": "OAK",               "npb_team": "Hanshin Tigers",                  "note": "1 MLB season. 102.1 mph fastest by Japanese pitcher"},
    {"name": "Kohei Arihara",           "mlbam_id": 685503, "role": "pitcher", "seasons": "2021",      "team": "TEX",               "npb_team": "Nippon-Ham Fighters",             "note": "1 MLB season"},
    {"name": "Tsuyoshi Wada",           "mlbam_id": 647098, "role": "pitcher", "seasons": "2015",      "team": "CHC",               "npb_team": "SoftBank Hawks",                  "note": "Last MLB year 2015"},  # VERIFY ID

    # === Batters ===
    {"name": "Shohei Ohtani",           "mlbam_id": 660271, "role": "batter",  "seasons": "2018-2025", "team": "LAA/LAD",           "npb_team": "Nippon-Ham Fighters",             "note": "Two-way player"},
    {"name": "Seiya Suzuki",            "mlbam_id": 673548, "role": "batter",  "seasons": "2022-2025", "team": "CHC",               "npb_team": "Hiroshima Toyo Carp",             "note": ""},
    {"name": "Masataka Yoshida",        "mlbam_id": 807799, "role": "batter",  "seasons": "2023-2025", "team": "BOS",               "npb_team": "Orix Buffaloes",                  "note": ""},
    {"name": "Norichika Aoki",          "mlbam_id": 493114, "role": "batter",  "seasons": "2015-2017", "team": "SF/SEA/HOU/TOR/NYM","npb_team": "Yakult Swallows",                 "note": "Returned to NPB 2018"},
    {"name": "Munenori Kawasaki",       "mlbam_id": 493128, "role": "batter",  "seasons": "2015",      "team": "TOR/CHC",           "npb_team": "SoftBank Hawks",                  "note": "Last MLB year 2015"},  # VERIFY ID
    {"name": "Lars Nootbaar",           "mlbam_id": 663457, "role": "batter",  "seasons": "2021-2025", "team": "STL",               "npb_team": "",                                "note": "Japanese-American, WBC 2023 Japan team"},
]

# Report how many (player, role) entries exist per role and how many distinct
# MLBAM IDs they cover (a two-way player contributes two entries but one ID).
n_pitching = sum(1 for entry in PLAYERS if entry["role"] == "pitcher")
n_batting = sum(1 for entry in PLAYERS if entry["role"] == "batter")
unique_ids = {entry["mlbam_id"] for entry in PLAYERS}
print(f"Total entries: {len(PLAYERS)} ({n_pitching} pitching, {n_batting} batting)")
print(f"Unique players: {len(unique_ids)}")

In [None]:
# Look up the MLBAM IDs of the players flagged "VERIFY" in PLAYERS.
# Only needed once to confirm the IDs; later runs can skip this cell.
verify_players = [
    ("Ogasawara", "Shinnosuke"),
    ("Wada", "Tsuyoshi"),
    ("Kawasaki", "Munenori"),
]
id_columns = ["name_last", "name_first", "key_mlbam"]
for surname, given_name in verify_players:
    print(f"\n--- {given_name} {surname} ---")
    try:
        matches = playerid_lookup(surname, given_name)
        print(matches[id_columns].to_string())
    except Exception as exc:
        # Best effort: a failed lookup is reported and the next player is tried.
        print(f"Error: {exc}")

In [None]:
# Generate players.csv metadata: one row per unique MLBAM ID.
# A two-way player has one PLAYERS entry per role, so each row merges all of
# that player's entries rather than copying only the first one (which would
# report just the first role's seasons and note).
entries_by_id = {}
for p in PLAYERS:
    entries_by_id.setdefault(p["mlbam_id"], []).append(p)

players_meta = []
for mlbam_id, entries in entries_by_id.items():
    # Every "-"/","-separated token of a seasons string is a year; the merged
    # span runs from the earliest to the latest year over all of the entries.
    years = [int(tok) for e in entries for tok in e["seasons"].replace(",", "-").split("-")]
    first_year, last_year = min(years), max(years)
    # dict.fromkeys de-duplicates while keeping first-seen order.
    teams = dict.fromkeys(t for e in entries for t in e["team"].split("/"))
    notes = dict.fromkeys(e["note"] for e in entries if e["note"])
    players_meta.append({
        "mlbam_id": mlbam_id,
        "name": entries[0]["name"],
        "roles": ", ".join(sorted({e["role"] for e in entries})),
        "mlb_teams": "/".join(teams),
        "npb_team": next((e["npb_team"] for e in entries if e["npb_team"]), ""),
        "mlb_seasons": str(first_year) if first_year == last_year else f"{first_year}-{last_year}",
        "note": "; ".join(notes),
    })

df_players = pd.DataFrame(players_meta)
df_players.to_csv("players.csv", index=False)
print(f"players.csv: {len(df_players)} players")
df_players

In [None]:
def parse_seasons(seasons_str):
    """Parse a seasons specification into a sorted list of unique years.

    Accepted forms (surrounding whitespace is ignored):
      - a single year:                  "2025"            -> [2025]
      - an inclusive range:             "2018-2023"       -> [2018, ..., 2023]
      - a comma-separated mix of both:  "2018,2020-2023"  -> [2018, 2020, ..., 2023]

    Raises:
        ValueError: if the string is empty, a token is not a year or a
            "start-end" pair, or a range ends before it starts.
    """
    years = set()
    for token in str(seasons_str).split(","):
        token = token.strip()
        if not token:
            continue
        start_text, dash, end_text = token.partition("-")
        # int() rejects anything that is not a plain number, so a token with
        # more than one dash (e.g. "2018-2020-2023") fails here instead of
        # being silently truncated to its first year.
        start = int(start_text)
        end = int(end_text) if dash else start
        if end < start:
            raise ValueError(f"Season range ends before it starts: {token!r}")
        years.update(range(start, end + 1))
    if not years:
        raise ValueError(f"No seasons found in {seasons_str!r}")
    return sorted(years)

def fetch_statcast_data(player_id, role, seasons, name):
    """Download one player's Statcast rows, season by season, as one DataFrame.

    Args:
        player_id: MLBAM ID handed to the Statcast query.
        role: "pitcher" selects statcast_pitcher; any other value selects
            statcast_batter.
        seasons: Seasons string understood by parse_seasons.
        name: Display name; used in progress output and stored in the
            "player_name_eng" column of every returned row.

    Returns:
        The concatenation of all non-empty seasons, or an empty DataFrame when
        nothing was fetched. A season whose request raises is reported on
        stdout and skipped.
    """
    if role == "pitcher":
        query = statcast_pitcher
    else:
        query = statcast_batter

    frames = []
    for season in parse_seasons(seasons):
        # Feb 1 - Nov 30 is deliberately wide so Spring Training and the
        # Postseason fall inside the window.
        window_start, window_end = f"{season}-02-01", f"{season}-11-30"
        print(f"  Fetching {name} ({role}) {season}...", end=" ")
        try:
            season_df = query(window_start, window_end, player_id)
            if season_df is None or len(season_df) == 0:
                print("0 pitches")
            else:
                season_df["player_name_eng"] = name
                frames.append(season_df)
                print(f"{len(season_df)} pitches")
        except Exception as exc:
            print(f"Error: {exc}")
        time.sleep(1)  # Rate limit: pause between requests, even after a failure

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [None]:
# Download Statcast data for every pitcher entry and stack it into one frame.
pitchers = [entry for entry in PLAYERS if entry["role"] == "pitcher"]
pitching_dfs = []
n_pitchers = len(pitchers)

for position, pitcher in enumerate(pitchers, start=1):
    print(f"\n[{position}/{n_pitchers}] {pitcher['name']} ({pitcher['seasons']})")
    player_df = fetch_statcast_data(pitcher["mlbam_id"], "pitcher", pitcher["seasons"], pitcher["name"])
    if len(player_df) == 0:
        # Nothing came back for this player; leave them out of the result.
        continue
    pitching_dfs.append(player_df)
    print(f"  Total: {len(player_df)} pitches")

if pitching_dfs:
    df_pitching = pd.concat(pitching_dfs, ignore_index=True)
else:
    df_pitching = pd.DataFrame()
print(f"\n=== Pitching total: {len(df_pitching)} pitches ===")

In [None]:
# Download Statcast data for every batter entry and stack it into one frame.
batters = [entry for entry in PLAYERS if entry["role"] == "batter"]
batting_dfs = []
n_batters = len(batters)

for position, batter in enumerate(batters, start=1):
    print(f"\n[{position}/{n_batters}] {batter['name']} ({batter['seasons']})")
    player_df = fetch_statcast_data(batter["mlbam_id"], "batter", batter["seasons"], batter["name"])
    if len(player_df) == 0:
        # Nothing came back for this player; leave them out of the result.
        continue
    batting_dfs.append(player_df)
    print(f"  Total: {len(player_df)} pitches")

if batting_dfs:
    df_batting = pd.concat(batting_dfs, ignore_index=True)
else:
    df_batting = pd.DataFrame()
print(f"\n=== Batting total: {len(df_batting)} pitches ===")

In [None]:
# Print an overview of both datasets: row counts and, when a dataset is not
# empty, its game-type breakdown, the seasons present and the rows per player.
n_pitching_rows = len(df_pitching)
print("=== Pitching Data ===")
print(f"Total pitches: {n_pitching_rows:,}")
print(f"Columns: {len(df_pitching.columns)}")
if n_pitching_rows > 0:
    pitching_game_types = df_pitching["game_type"].value_counts().to_dict()
    print(f"\nGame types: {pitching_game_types}")
    pitching_seasons = sorted(df_pitching["game_year"].unique())
    print(f"Seasons: {pitching_seasons}")
    print("\nPitches per player:")
    pitching_per_player = df_pitching.groupby("player_name_eng").size()
    print(pitching_per_player.sort_values(ascending=False).to_string())

n_batting_rows = len(df_batting)
print("\n=== Batting Data ===")
print(f"Total pitches: {n_batting_rows:,}")
if n_batting_rows > 0:
    batting_game_types = df_batting["game_type"].value_counts().to_dict()
    print(f"\nGame types: {batting_game_types}")
    batting_seasons = sorted(df_batting["game_year"].unique())
    print(f"Seasons: {batting_seasons}")
    print("\nPitches per player:")
    batting_per_player = df_batting.groupby("player_name_eng").size()
    print(batting_per_player.sort_values(ascending=False).to_string())

In [None]:
# Write both Statcast tables to disk, then report the size of every output
# file (players.csv is expected to have been written earlier).
df_pitching.to_csv("japanese_mlb_pitching.csv", index=False)
df_batting.to_csv("japanese_mlb_batting.csv", index=False)

import os
bytes_per_mb = 1024 * 1024
for csv_name in ("japanese_mlb_pitching.csv", "japanese_mlb_batting.csv", "players.csv"):
    size_mb = os.stat(csv_name).st_size / bytes_per_mb
    print(f"{csv_name}: {size_mb:.1f} MB")

In [None]:
# Offer the generated CSVs as browser downloads when running on Google Colab;
# anywhere else the import fails and the files simply stay on disk.
output_files = ("japanese_mlb_pitching.csv", "japanese_mlb_batting.csv", "players.csv")
try:
    from google.colab import files
    for csv_name in output_files:
        files.download(csv_name)
except ImportError:
    print("Not running in Colab - files saved to current directory")