In [1]:
import pandas as pd
import json
import logging
import os

In [2]:
# League identifiers; each one is used as the filename prefix of that league's data files.
leagues = [
    "Bundesliga",
    "EPL",
    "La_Liga",
    "Ligue_1",
    "Serie_A",
    "Liga_Nos",
    "Eredivisie",
    "Jupiler_Pro_League",
]

In [3]:
def load_json(folder, league):
    """Load the 2024 match-data JSON file of one league.

    Args:
        folder: Directory containing the JSON files.
        league: League identifier; the file read is
            ``<folder>/<league>_2024_match_data.json``.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    json_file = "{}/{}_2024_match_data.json".format(folder, league)
    # Decode explicitly as UTF-8: without it open() falls back to the
    # platform's default encoding, which corrupts non-ASCII text on systems
    # whose locale is not UTF-8.
    with open(json_file, "r", encoding="utf-8") as json_data:
        return json.load(json_data)

In [4]:
# Raw match JSON of every league, keyed by league name.
data_dict = {league: load_json("json_data", league) for league in leagues}

In [3]:
import pandas as pd
import logging
import os

def _event_to_row(event, game, game_id, score, date, player_names):
    """Flatten one raw event dict into a list ordered like ``cols`` in ``json_to_df``."""
    player_id = event.get("playerId", None)
    return [
        game,
        game_id,
        score,
        event.get("id", None),
        # Nested lookups tolerate a missing/None parent, like every other field.
        (event.get("period") or {}).get("value", None),
        event.get("teamId", None),
        player_id,
        # The name dictionary is looked up with the player id as a string.
        player_names.get(str(player_id), None),
        event.get("eventId", None),
        date,
        event.get("minute", None),
        event.get("second", None),
        # True only when the outcome value equals 1.
        (event.get("outcomeType") or {}).get("value", None) == 1,
        event.get("x", None),
        event.get("y", None),
        event.get("endX", None),
        event.get("endY", None),
        event.get("qualifiers", None),
        None,  # related_player_id is never populated here
        event.get("isTouch", None),
        event.get("isShot", False),
        event.get("isGoal", False),
        (event.get("type") or {}).get("displayName", None),
    ]


def _append_and_save(df, rows, cols, csv_file):
    """Append buffered rows to ``df``, write the whole frame to ``csv_file``, return the frame."""
    if rows:
        df_new = pd.DataFrame(rows, columns=cols)
        if df.empty and list(df.columns) == cols:
            # Concatenating with an empty frame is deprecated in recent pandas
            # (its dtypes will stop being ignored); the new rows are the result.
            df = df_new
        else:
            df = pd.concat([df, df_new], ignore_index=True)
    # Make sure the output directory exists so the checkpoint cannot fail on it.
    os.makedirs(os.path.dirname(csv_file) or ".", exist_ok=True)
    df.to_csv(csv_file, index=False)
    return df


def json_to_df(data, league):
    """Convert the raw match data of one league into an events DataFrame.

    Events already stored in ``csv_data/<league>_events.csv`` are loaded first
    and matches whose ``matchId`` is already present are skipped. Newly
    processed matches are appended and the CSV is rewritten after every
    ``batch_size`` processed matches and once more at the end, so that no
    buffered rows are lost when the last matches are skipped.

    Args:
        data: Mapping of match key -> match dict. Each match dict is read for
            ``matchId`` and ``matchCentreData`` (with ``playerIdNameDictionary``,
            ``startDate``, ``score`` and ``events``). Match keys must contain
            ``"2024-"``; the text after it is stored in the ``game`` column.
        league: League identifier, used for logging and for the CSV filename.

    Returns:
        pandas.DataFrame with the previously saved events (if any) followed by
        the events of the newly processed matches.

    Raises:
        IndexError: If a match key does not contain ``"2024-"``.
        KeyError: If a match dict has no ``matchCentreData`` key.
    """
    logging.info("Traitement de la ligue : {}".format(league))
    print("Traitement de la ligue : {}".format(league))

    cols = ["game", "game_id", "score", "event_id", "period_id", "team_id", "player_id", "player_name", "type_id", "date", "minute", "second", "outcome", "start_x", "start_y", "end_x", "end_y", "qualifiers", "related_player_id", "touch", "shot", "goal", "type_name"]

    # Resume from the existing CSV when there is one.
    csv_file = "csv_data/{}_events.csv".format(league)
    if os.path.exists(csv_file):
        df = pd.read_csv(csv_file)
    else:
        df = pd.DataFrame(columns=cols)

    # Matches already present in the DataFrame.
    matches_in_df = set(df['game_id'])

    batch_size = 30
    match_count = len(data)
    rows = []
    pending_matches = 0  # matches processed since the last save

    for i, (match_key, match) in enumerate(data.items(), 1):
        game_id = match.get("matchId", None)
        # Skip matches that are already in the DataFrame.
        if game_id in matches_in_df:
            logging.info("Le match {} existe déjà dans le DataFrame. Passage au suivant.".format(game_id))
            continue

        logging.info("Traitement du match {}/{}".format(i, match_count))
        print("Traitement du match {}/{}".format(i, match_count))

        game = match_key.split("2024-")[1]
        try:
            # A match whose matchCentreData is None raises AttributeError here.
            playerIdNameDictionary = match["matchCentreData"].get("playerIdNameDictionary", {})
        except AttributeError as e:
            logging.error("Une erreur AttributeError est survenue pour le match {}: {}".format(game, e))
            continue

        match_centre = match["matchCentreData"]
        date = match_centre.get("startDate", None)
        score = match_centre.get("score", None)

        for event in match_centre.get("events", []):
            rows.append(_event_to_row(event, game, game_id, score, date, playerIdNameDictionary))

        # Checkpoint by number of processed matches, not by loop index: the
        # index-based check was bypassed whenever a match was skipped above.
        pending_matches += 1
        if pending_matches >= batch_size:
            df = _append_and_save(df, rows, cols, csv_file)
            rows = []
            pending_matches = 0

    # Final flush for whatever is still buffered after the last match.
    if pending_matches:
        df = _append_and_save(df, rows, cols, csv_file)

    logging.info("Traitement terminé pour la ligue : {}".format(league))
    print("Traitement terminé pour la ligue : {}".format(league))

    return df


In [4]:
# Events DataFrame of every league, keyed by league name.
events_data_dict = {league: json_to_df(data_dict[league], league) for league in leagues}