**Mount Google Drive**

In [None]:
# Colab-only step: expose Google Drive under /content/drive so that the
# later cells can read and write files below /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Load Match Metadata and Event JSON Files（試合メタデータとイベントJSONファイルの取得）**

In [None]:
from google.colab import drive
import os
import json
import requests

# Mount Google Drive so the Drive paths below resolve.
drive.mount('/content/drive')

# Folder that receives one event JSON file per match.
events_dir = '/content/drive/MyDrive/MIT_Sloan/fifa_wc_2022_events'
os.makedirs(events_dir, exist_ok=True)

# Load the match list from JSON.
matches_path = '/content/drive/MyDrive/MIT_Sloan/fifa_wc_2022_matches.json'
with open(matches_path, 'r', encoding='utf-8') as f:
    matches = json.load(f)

# Extract the match IDs.
match_ids = [match['match_id'] for match in matches]

# Download and save the event data of every match.  A failure for one match
# is reported and remembered, but does not abort the remaining downloads.
failed_ids = []
for match_id in match_ids:
    url = f"https://raw.githubusercontent.com/statsbomb/open-data/master/data/events/{match_id}.json"
    try:
        # Without a timeout a stalled connection would hang the cell forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"❌ Failed: {match_id}.json ({exc})")
        failed_ids.append(match_id)
        continue

    if response.status_code == 200:
        save_path = os.path.join(events_dir, f"{match_id}.json")
        with open(save_path, 'w', encoding='utf-8') as f:
            json.dump(response.json(), f, ensure_ascii=False, indent=2)
        print(f"✅ Saved: {match_id}.json")
    else:
        print(f"❌ Failed: {match_id}.json (HTTP {response.status_code})")
        failed_ids.append(match_id)

if failed_ids:
    print(f"⚠️ {len(failed_ids)} match file(s) could not be downloaded: {failed_ids}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Saved: 3857256.json
✅ Saved: 3869151.json
✅ Saved: 3857257.json
✅ Saved: 3857258.json
✅ Saved: 3857288.json
✅ Saved: 3857267.json
✅ Saved: 3869321.json
✅ Saved: 3857287.json
✅ Saved: 3869486.json
✅ Saved: 3869685.json
✅ Saved: 3857260.json
✅ Saved: 3857264.json
✅ Saved: 3857266.json
✅ Saved: 3857289.json
✅ Saved: 3857269.json
✅ Saved: 3857294.json
✅ Saved: 3869254.json
✅ Saved: 3869118.json
✅ Saved: 3869684.json
✅ Saved: 3869519.json
✅ Saved: 3869354.json
✅ Saved: 3869552.json
✅ Saved: 3869420.json
✅ Saved: 3869220.json
✅ Saved: 3869219.json
✅ Saved: 3869253.json
✅ Saved: 3869152.json
✅ Saved: 3869117.json
✅ Saved: 3857270.json
✅ Saved: 3857263.json
✅ Saved: 3857259.json
✅ Saved: 3857295.json
✅ Saved: 3857283.json
✅ Saved: 3857284.json
✅ Saved: 3857282.json
✅ Saved: 3857286.json
✅ Saved: 3857301.json
✅ Saved: 3857300.json
✅ Saved: 3857299.json
✅ Saved: 3857

**Generate Player Appearance and Minutes List per Match（試合ごとの出場選手と出場時間リストの作成）**

In [None]:
import pandas as pd
import os
import json
from datetime import timedelta

# Folder holding one event JSON file per match.
folder_path = '/content/drive/MyDrive/MIT_Sloan/fifa_wc_2022_events'

# Columns of a per-player record, in output order.  Passing them to the
# DataFrame constructor keeps the columns present even when no match file
# produced any rows.
PLAYER_COLUMNS = ['match_id', 'player_id', 'player_name', 'team', 'opponent',
                  'in_time', 'out_time', 'duration']

# Card names that end a player's match.
# NOTE(review): names and the 'foul_committed' / 'bad_behaviour' nesting used
# below follow the StatsBomb open-data event specification — confirm against
# the downloaded files.
DISMISSAL_CARDS = {'Red Card', 'Second Yellow'}


def _event_clock(event):
    """Return the match clock of *event* (its 'minute' and 'second') as a timedelta."""
    return timedelta(minutes=event['minute'], seconds=event['second'])


def _is_dismissal(event):
    """Return True when *event* carries a card listed in DISMISSAL_CARDS."""
    for section in ('foul_committed', 'bad_behaviour'):
        card_name = event.get(section, {}).get('card', {}).get('name')
        if card_name in DISMISSAL_CARDS:
            return True
    return False


def _match_player_minutes(data, match_id):
    """Build one record per player who appeared in a match.

    Args:
        data: List of event dicts of a single match, in file order.
        match_id: Identifier stored on every record.

    Returns:
        A list of dicts with the keys in PLAYER_COLUMNS.  The list is empty
        when the match has fewer than two 'Starting XI' events.
    """
    starting_xi = [e for e in data if e['type']['name'] == 'Starting XI']
    if len(starting_xi) < 2:
        return []  # Line-up information is incomplete; skip the match.

    team1 = starting_xi[0]['team']['name']
    team2 = starting_xi[1]['team']['name']

    def new_record(player_id, player_name, team, in_time):
        return {
            'match_id': match_id,
            'player_id': player_id,
            'player_name': player_name,
            'team': team,
            'opponent': team1 if team == team2 else team2,
            'in_time': in_time,
            'out_time': None,  # Filled in below.
        }

    # Starters are on the pitch from minute 0.
    players = [
        new_record(p['player']['id'], p['player']['name'],
                   event['team']['name'], timedelta(minutes=0))
        for event in starting_xi
        for p in event['tactics']['lineup']
    ]

    # Substitutions: close the outgoing player's spell, open the replacement's.
    for sub in (e for e in data if e['type']['name'] == 'Substitution'):
        swap_time = _event_clock(sub)
        out_id = sub['player']['id']
        for p in players:
            if p['player_id'] == out_id:
                p['out_time'] = swap_time
        replacement = sub['substitution']['replacement']
        players.append(new_record(replacement['id'], replacement['name'],
                                  sub['team']['name'], swap_time))

    # Dismissals: a sent-off player stops playing at the time of the card.
    # Only players currently on the pitch are affected (no exit time yet and
    # already entered), so cards shown to substituted players change nothing.
    for event in data:
        if 'player' not in event or not _is_dismissal(event):
            continue
        sent_off_at = _event_clock(event)
        for p in players:
            if (p['player_id'] == event['player']['id']
                    and p['out_time'] is None
                    and p['in_time'] <= sent_off_at):
                p['out_time'] = sent_off_at

    # Everybody still on the pitch plays until the clock of the last event.
    end_time = _event_clock(data[-1])
    for p in players:
        if p['out_time'] is None:
            p['out_time'] = end_time
        p['duration'] = p['out_time'] - p['in_time']

    return players


# Process every event file; sorting makes the row order reproducible
# (os.listdir returns entries in arbitrary order).
all_players = []
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.json'):
        continue
    match_id = filename.replace('.json', '')  # match_id is the file name without extension
    with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as f:
        data = json.load(f)
    all_players.extend(_match_player_minutes(data, match_id))

# Convert to a DataFrame.
df = pd.DataFrame(all_players, columns=PLAYER_COLUMNS)

# Render the time columns as text.
# NOTE(review): 'minutes_played' is the duration rendered as text
# (e.g. '0 days 01:40:18'), not a number of minutes; the format is kept so
# files already produced by this cell stay comparable.
df['in_time'] = df['in_time'].astype(str)
df['out_time'] = df['out_time'].astype(str)
df['minutes_played'] = df['duration'].astype(str)

# Save to CSV.
df.to_csv('/content/drive/MyDrive/all_player_minutes_wc2022.csv', index=False)

# Show a short summary of the result.
print("📋 DataFrame columns:", df.columns.tolist())
print("🔢 Number of rows:", len(df))
print("🔍 First 5 rows:")
print(df.head())

📋 DataFrame columns: ['match_id', 'player_id', 'player_name', 'team', 'opponent', 'in_time', 'out_time', 'duration', 'minutes_played']
🔢 Number of rows: 1995
🔍 First 5 rows:
  match_id  player_id             player_name    team     opponent  \
0  3857256      20600  Vanja Milinković Savić  Serbia  Switzerland   
1  3857256       5603       Nikola Milenković  Serbia  Switzerland   
2  3857256       6321         Milos Veljkovic  Serbia  Switzerland   
3  3857256      27719      Strahinja Pavlović  Serbia  Switzerland   
4  3857256       6318        Andrija Živković  Serbia  Switzerland   

           in_time         out_time        duration   minutes_played  
0  0 days 00:00:00  0 days 01:40:18 0 days 01:40:18  0 days 01:40:18  
1  0 days 00:00:00  0 days 01:40:18 0 days 01:40:18  0 days 01:40:18  
2  0 days 00:00:00  0 days 00:54:02 0 days 00:54:02  0 days 00:54:02  
3  0 days 00:00:00  0 days 01:40:18 0 days 01:40:18  0 days 01:40:18  
4  0 days 00:00:00  0 days 01:17:40 0 days 01:17:4

**Aggregate Player Stats for All 64 Matches and Export to CSV（全64試合の選手スタッツを集計しCSV出力）**

In [None]:
# @title
# Import necessary libraries (skip if already imported)
# 必要なライブラリをインポート（すでにインポート済みならスキップ）
import json
import pandas as pd
from collections import defaultdict
import os
import numpy as np
from datetime import timedelta

# Where the per-match event files live and where the merged CSV is written.
events_folder = '/content/drive/MyDrive/MIT_Sloan/fifa_wc_2022_events'
output_path = '/content/drive/MyDrive/MIT_Sloan/wc2022_player_stats_newKPI3.csv'

# Per-player appearance table, read back from CSV.
df_minutes = pd.read_csv('/content/drive/MyDrive/all_player_minutes_wc2022.csv')

# Match metadata, read from JSON.
with open('/content/drive/MyDrive/MIT_Sloan/fifa_wc_2022_matches.json', encoding='utf-8') as f:
    match_metadata = json.load(f)

# Index the home/away team ids and names by match_id.  The id is stored as a
# string because the ids derived from the event file names are strings too.
match_info_dict = {}
for match_record in match_metadata:
    home = match_record['home_team']
    away = match_record['away_team']
    match_info_dict[str(match_record['match_id'])] = {
        'home_team_id': home['home_team_id'],
        'away_team_id': away['away_team_id'],
        'home_team_name': home['home_team_name'],
        'away_team_name': away['away_team_name'],
    }

# Per-match aggregation of basic player counts (simple version).
def aggregate_player_stats(events, match_id, match_info):
    """Tally basic per-player counts for the events of one match.

    Args:
        events: Iterable of event dicts.  Events lacking a 'player' or a
            'type' key are ignored.
        match_id: Identifier stored on every record and used in result keys.
        match_info: Mapping read for 'home_team_id', 'away_team_id',
            'home_team_name' and 'away_team_name'.

    Returns:
        A defaultdict keyed by (match_id, player_id).  Each value is a
        defaultdict(int) holding the identity fields, 'team_name' and
        'opponent_name' ('Unknown' when the team id matches neither side),
        'pass_count' / 'carry_count' / 'shot_count', 'xG_total' and
        'position_name'.
    """
    counter_by_type = {
        'Pass': 'pass_count',
        'Carry': 'carry_count',
        'Shot': 'shot_count',
    }
    player_stats = defaultdict(lambda: defaultdict(int))

    for event in events:
        if not ('player' in event and 'type' in event):
            continue

        player = event['player']
        player_id = player.get('id')
        team_id = event.get('team', {}).get('id')
        record = player_stats[(match_id, player_id)]

        # Identity fields are rewritten on every event of the player.
        record['match_id'] = match_id
        record['player_id'] = player_id
        record['player_name'] = player['name']
        record['team_id'] = team_id

        # Decide which side of the fixture the player's team is on.
        if team_id == match_info.get('home_team_id'):
            name_keys = ('home_team_name', 'away_team_name')
        elif team_id == match_info.get('away_team_id'):
            name_keys = ('away_team_name', 'home_team_name')
        else:
            name_keys = None

        if name_keys is None:
            record['team_name'] = 'Unknown'
            record['opponent_name'] = 'Unknown'
        else:
            own_key, other_key = name_keys
            record['team_name'] = match_info.get(own_key)
            record['opponent_name'] = match_info.get(other_key)

        # Count the event under its type; shots also accumulate their xG.
        counter = counter_by_type.get(event['type']['name'])
        if counter is not None:
            record[counter] += 1
            if counter == 'shot_count':
                record['xG_total'] += event.get('shot', {}).get('statsbomb_xg', 0)

        # Keep the first non-empty position name seen for the player.
        if not record.get('position_name'):
            record['position_name'] = event.get('position', {}).get('name', '')

    return player_stats

# Collect one stats record per (match, player) across all event files.
all_stats = []

event_files = [name for name in os.listdir(events_folder) if name.endswith('.json')]
for filename in event_files:
    match_id = filename.replace('.json', '')  # file name without extension
    file_path = os.path.join(events_folder, filename)

    with open(file_path, 'r', encoding='utf-8') as f:
        events = json.load(f)

    # Matches missing from the metadata fall back to an empty mapping.
    match_info = match_info_dict.get(match_id, {})
    stats_dict = aggregate_player_stats(events, match_id, match_info)
    all_stats.extend(stats_dict.values())

# One row per record.
df_stats = pd.DataFrame(all_stats)

# Both frames need match_id as a string for the merge keys to line up.
for frame in (df_minutes, df_stats):
    frame['match_id'] = frame['match_id'].astype(str)

# Left merge: every appearance row is kept, with stats attached when present.
df_merged = df_minutes.merge(df_stats, on=['match_id', 'player_id'], how='left')

# Write the merged table and confirm.
df_merged.to_csv(output_path, index=False)
print(f"✅ Merged player minutes and stats saved to: {output_path}")

✅ Merged player minutes and stats saved to: /content/drive/MyDrive/MIT_Sloan/wc2022_player_stats_newKPI3.csv


**Specialized KPI Aggregation for Positional Classification（ポジション判別の専門的KPI集計）**

In [None]:
# Detailed per-match aggregation of player KPIs used for positional
# classification.
def _in_penalty_area(point):
    """Return True when *point* ([x, y]) satisfies 102 <= x <= 120 and 18 <= y <= 62."""
    return bool(point) and 102 <= point[0] <= 120 and 18 <= point[1] <= 62


def _lateral_zone(y):
    """Classify a y coordinate into one of three lateral bands.

    'HS' covers 18..30 and 50..62 (inclusive), 'CA' covers the open interval
    between 30 and 50, and 'SA' is everything outside 18..62.  The bands are
    contiguous, so every coordinate belongs to exactly one of them.
    """
    if 18 <= y <= 30 or 50 <= y <= 62:
        return 'HS'
    if 30 < y < 50:
        return 'CA'
    return 'SA'


def aggregate_player_stats(events, match_id, match_info):
    """Aggregate detailed per-player KPIs for the events of one match.

    Args:
        events: Iterable of event dicts.  Events lacking a 'player' or a
            'type' key are ignored.
        match_id: Identifier stored on every record and used in result keys.
        match_info: Mapping read for 'home_team_id', 'away_team_id',
            'home_team_name' and 'away_team_name'.

    Returns:
        A defaultdict keyed by (match_id, player_id).  Each value is a
        defaultdict(int) with identity fields, team/opponent names, event
        counts (pass, carry, shot, ball receipt, key pass, dribble, cross,
        long pass, vertical pass, switch pass), xG_total, penalty-area and
        pocket counts, carry distance and pass counts per lateral band
        (CA / HS / SA), short-pass attempts and successes, and four
        success-rate fields (NaN when there were no attempts).

    NOTE(review): distance thresholds (10, 20, 30) are compared against the
    raw coordinate/length values of the data; earlier comments called them
    metres — confirm the unit against the data provider's specification.
    """
    import numpy as np  # local import keeps the function self-contained

    player_stats = defaultdict(lambda: defaultdict(int))

    for event in events:
        if 'player' not in event or 'type' not in event:
            continue

        player_id = event['player'].get('id')
        team_id = event.get('team', {}).get('id')
        stats = player_stats[(match_id, player_id)]

        # Identity fields.
        stats['match_id'] = match_id
        stats['player_id'] = player_id
        stats['player_name'] = event['player']['name']
        stats['team_id'] = team_id

        # Team and opponent names.
        if team_id == match_info.get('home_team_id'):
            stats['team_name'] = match_info.get('home_team_name')
            stats['opponent_name'] = match_info.get('away_team_name')
        elif team_id == match_info.get('away_team_id'):
            stats['team_name'] = match_info.get('away_team_name')
            stats['opponent_name'] = match_info.get('home_team_name')
        else:
            stats['team_name'] = 'Unknown'
            stats['opponent_name'] = 'Unknown'

        # Values shared by several KPIs below.
        event_type = event['type']['name']
        is_pass = event_type == 'Pass'
        is_carry = event_type == 'Carry'
        pass_info = event.get('pass', {}) if is_pass else {}
        pass_completed = pass_info.get('outcome') is None  # no outcome recorded => completed
        pass_length = pass_info.get('length', 0)
        location = event.get('location')
        carry_end = event.get('carry', {}).get('end_location') if is_carry else None

        # Counts per event type.
        if is_pass:
            stats['pass_count'] += 1
        if is_carry:
            stats['carry_count'] += 1
        if event_type == 'Shot':
            stats['shot_count'] += 1
            stats['xG_total'] += event.get('shot', {}).get('statsbomb_xg', 0)
        if event_type == 'Ball Receipt*':
            stats['ball_receipt_count'] += 1

        # Key passes: passes flagged as having set up a shot or a goal.
        # In the StatsBomb schema 'key_pass_id' is an attribute of the shot,
        # not of the pass, so testing only that field on a pass never matched;
        # it is still accepted here for backward compatibility.
        if is_pass and (pass_info.get('shot_assist')
                        or pass_info.get('goal_assist')
                        or pass_info.get('key_pass_id')):
            stats['key_pass_count'] += 1

        # Dribble attempts and completed dribbles.
        if event_type == 'Dribble':
            stats['dribble_attempts'] += 1
            if event.get('dribble', {}).get('outcome', {}).get('name') == 'Complete':
                stats['dribble_success'] += 1

        # Crosses.
        if is_pass and pass_info.get('cross'):
            stats['cross_count'] += 1

        # Long passes (length >= 30): attempts and completions.
        if is_pass and pass_length >= 30:
            stats['long_pass_attempts'] += 1
            if pass_completed:
                stats['long_pass_success'] += 1

        # Keep the first non-empty position name seen for the player.
        if 'position_name' not in stats or not stats['position_name']:
            stats['position_name'] = event.get('position', {}).get('name', '')

        # Any event located inside the penalty area.
        if _in_penalty_area(location):
            stats['action_in_PA'] += 1

        # Carries that end inside the penalty area.
        # NOTE(review): carries that also start inside the area are counted;
        # confirm whether only entries from outside are intended.
        if is_carry and _in_penalty_area(carry_end):
            stats['carry_into_PA'] += 1

        # Carries that end in one of the two pockets of the penalty area.
        if (is_carry and carry_end and 102 <= carry_end[0] <= 120
                and (18 <= carry_end[1] <= 30 or 50 <= carry_end[1] <= 62)):
            stats['pocket_entry'] += 1

        if is_pass:
            pass_end = pass_info.get('end_location')
            if location and pass_end:
                # Completed vertical passes (gain of more than 10 along x).
                if (pass_end[0] - location[0]) > 10 and pass_completed:
                    stats['vertical_pass_success'] += 1
                # Switch passes: end at x >= 60, lateral shift >= 20, length >= 30.
                if (pass_end[0] >= 60
                        and abs(pass_end[1] - location[1]) >= 20
                        and pass_length >= 30):
                    stats['switch_pass_count'] += 1

        # Carry distance, attributed to the lateral band where the carry starts.
        if is_carry and location and carry_end:
            x_start, y_start = location
            x_end, y_end = carry_end
            carry_length = ((x_end - x_start) ** 2 + (y_end - y_start) ** 2) ** 0.5
            stats[f'carry_{_lateral_zone(y_start)}'] += carry_length

        if is_pass and location:
            x, y = location
            zone = _lateral_zone(y)

            # Pass count per lateral band.
            stats[f'pass_{zone}'] += 1

            # Short passes (length < 20) starting at x <= 60, split into the
            # central band and everything else.  Using the same band
            # definition as above means a pass at y == 30 or y == 50 is
            # counted once (as a side pass) rather than in both groups.
            if x <= 60 and pass_length < 20:
                prefix = 'short_pass_central' if zone == 'CA' else 'short_pass_side'
                stats[f'{prefix}_attempt'] += 1
                if pass_completed:
                    stats[f'{prefix}_success'] += 1

    # Success rates; NaN when the player had no attempts of that kind.
    rate_specs = (
        ('dribble_success', 'dribble_attempts', 'dribble_success_rate'),
        ('long_pass_success', 'long_pass_attempts', 'long_pass_success_rate'),
        ('short_pass_central_success', 'short_pass_central_attempt',
         'short_pass_central_success_rate'),
        ('short_pass_side_success', 'short_pass_side_attempt',
         'short_pass_side_success_rate'),
    )
    for stats in player_stats.values():
        for made, tried, rate in rate_specs:
            stats[rate] = stats[made] / stats[tried] if stats[tried] > 0 else np.nan

    return player_stats

# Collect one detailed stats record per (match, player) across all event files.
all_stats = []

event_files = [name for name in os.listdir(events_folder) if name.endswith('.json')]
for filename in event_files:
    match_id = filename.replace('.json', '')  # file name without extension
    file_path = os.path.join(events_folder, filename)

    with open(file_path, 'r', encoding='utf-8') as f:
        events = json.load(f)

    # Matches missing from the metadata fall back to an empty mapping.
    match_info = match_info_dict.get(match_id, {})
    stats_dict = aggregate_player_stats(events, match_id, match_info)
    all_stats.extend(stats_dict.values())

# One row per record.
df_stats = pd.DataFrame(all_stats)

# Both frames need match_id as a string for the merge keys to line up.
for frame in (df_minutes, df_stats):
    frame['match_id'] = frame['match_id'].astype(str)

# Left merge on match_id and player_id: every appearance row is kept.
df_merged = df_minutes.merge(df_stats, on=['match_id', 'player_id'], how='left')

# Write the merged table and confirm.
df_merged.to_csv('/content/drive/MyDrive/MIT_Sloan/wc2022_player_stats_newKPI3.csv', index=False)
print("✅ Merged player minutes and stats saved to CSV.")

✅ Merged player minutes and stats saved to CSV.
