In [1]:
import json
import pandas as pd
import os

interested_ids = [1701, 1702, 101]

directory = 'data/events'

data_list = []

for file_name in os.listdir(directory):
    if file_name.endswith('.json'):  # 确保只处理JSON文件
        file_path = os.path.join(directory, file_name)
        with open(file_path, 'r') as file:
            # 加载JSON数据
            data = json.load(file)
            for item in data:
                for tags_ids in item['tags']:
                    if int(tags_ids['id']) in interested_ids:
                        # 提取需要的字段并添加到列表中
                        data_list.append({
                            'teamId': item['teamId'],
                            'eventSec': item['eventSec'],
                            'matchPeriod': item['matchPeriod'],
                            'tag_id':tags_ids['id']
                        })

df = pd.DataFrame(data_list)

df.head()

Unnamed: 0,teamId,eventSec,matchPeriod,tag_id
0,695,2026.86306,1H,1702
1,682,2196.267257,1H,1702
2,682,1025.147516,2H,1702
3,682,2497.622568,2H,1702
4,695,2571.818575,2H,101


In [2]:
df.shape

(18005, 4)

In [3]:
import numpy as np
df['minute'] = np.where(df['matchPeriod'] == '2H', df['eventSec'] / 60 + 45, df['eventSec'] / 60)
df.drop('eventSec', axis=1, inplace=True)
df.head()

Unnamed: 0,teamId,matchPeriod,tag_id,minute
0,695,1H,1702,33.781051
1,682,1H,1702,36.604454
2,682,2H,1702,62.085792
3,682,2H,1702,86.627043
4,695,2H,101,87.863643


In [4]:
conditions = [
    df['tag_id'] == 101,
    df['tag_id'] == 1701,
    df['tag_id'] == 1702
]

choices = ['Goal', 'Red_Card', 'Yellow_Card']

df['eventName'] = np.select(conditions, choices, default='Other')

df.drop('tag_id', axis=1, inplace=True)

df.head()

Unnamed: 0,teamId,matchPeriod,minute,eventName
0,695,1H,33.781051,Yellow_Card
1,682,1H,36.604454,Yellow_Card
2,682,2H,62.085792,Yellow_Card
3,682,2H,86.627043,Yellow_Card
4,695,2H,87.863643,Goal


In [5]:
data_for_df = []
with open('data/teams.json', 'r') as file:
    data2 = json.load(file)
    for item in data2:
        data_for_df.append({
            'TeamName': item['name'],
            'teamId': item['wyId'],
        })

df_team = pd.DataFrame(data_for_df)
df_team.head()

Unnamed: 0,TeamName,teamId
0,Newcastle United,1613
1,Celta de Vigo,692
2,Espanyol,691
3,Deportivo Alav\u00e9s,696
4,Levante,695


In [6]:
df_merged = pd.merge(df, df_team, on='teamId', how='left')

print(df_merged.head(20))

    teamId matchPeriod     minute    eventName       TeamName
0      695          1H  33.781051  Yellow_Card        Levante
1      682          1H  36.604454  Yellow_Card     Villarreal
2      682          2H  62.085792  Yellow_Card     Villarreal
3      682          2H  86.627043  Yellow_Card     Villarreal
4      695          2H  87.863643         Goal        Levante
5      682          2H  87.904443         Goal     Villarreal
6      692          1H  21.676380         Goal  Celta de Vigo
7      687          1H  21.725120         Goal  Real Sociedad
8      687          1H  32.235932         Goal  Real Sociedad
9      692          1H  32.287470         Goal  Celta de Vigo
10     692          2H  47.854804  Yellow_Card  Celta de Vigo
11     692          2H  49.823315         Goal  Celta de Vigo
12     687          2H  49.867597         Goal  Real Sociedad
13     687          2H  79.731994         Goal  Real Sociedad
14     692          2H  79.767527         Goal  Celta de Vigo
15     6

In [7]:
df_merged.drop('teamId', axis=1, inplace=True)

In [8]:
df_merged.to_csv('final_data.csv', index=False, encoding='utf-8-sig')