In [1]:
import pandas as pd
import json
import logging
import os

In [2]:
# League identifiers; each one is used as the file-name prefix of its JSON input
# ("<league>_2024_match_data.json") and of its CSV output ("<league>_events.csv").
leagues = ["Bundesliga", "EPL", "La_Liga", "Ligue_1", "Serie_A"]

In [3]:
def load_json(folder, league):
    """Load the 2024 match-data JSON file of one league.

    Args:
        folder: Directory that contains the ``<league>_2024_match_data.json`` files.
        league: League name, used as the file-name prefix.

    Returns:
        The deserialized JSON document.

    Raises:
        FileNotFoundError: If the expected file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    json_file = "{}/{}_2024_match_data.json".format(folder, league)
    # Read raw bytes and let the json module detect the encoding (UTF-8, with or
    # without BOM, UTF-16, UTF-32) instead of relying on the platform's default
    # text encoding, which breaks on accented player names on some systems.
    with open(json_file, "rb") as json_data:
        return json.load(json_data)

In [4]:
# Raw match JSON of every league, keyed by league name.
data_dict = {league: load_json("json_data", league) for league in leagues}

In [28]:
import pandas as pd
import logging
import os

def _append_and_save(df, rows, cols, csv_file):
    """Append ``rows`` to ``df``, write the result to ``csv_file`` and return it.

    Does nothing (and does not touch the file) when ``rows`` is empty.
    """
    if not rows:
        return df
    df_new = pd.DataFrame(rows, columns=cols)
    # Skip the concat while nothing has been loaded yet: recent pandas versions
    # emit a FutureWarning when an empty frame takes part in a concatenation.
    df = df_new if df.empty else pd.concat([df, df_new], ignore_index=True)
    # Make sure the output folder exists so the save cannot fail after the
    # parsing work has already been done.
    os.makedirs(os.path.dirname(csv_file) or ".", exist_ok=True)
    df.to_csv(csv_file, index=False)
    return df


def json_to_df(data, league, batch_size=30):
    """Flatten one league's match JSON into an events DataFrame and save it as CSV.

    The function is resumable: when ``csv_data/<league>_events.csv`` already
    exists it is loaded first, and every match whose ``matchId`` is already
    present in its ``game_id`` column is skipped.

    NOTE(review): this notebook defines a second ``json_to_df`` in a later cell;
    on "Restart & Run All" that later definition replaces this one.

    Args:
        data: Mapping of match key -> match dict. Each match dict is read for
            ``matchId`` and ``matchCentreData`` (``playerIdNameDictionary``,
            ``startDate``, ``score`` and ``events``).
        league: League name, used in log messages and in the CSV file name.
        batch_size: Number of newly processed matches between two CSV saves.
            Values below 1 are treated as 1.

    Returns:
        DataFrame holding the previously saved events (if any) followed by the
        newly extracted ones, one row per event.
    """
    logging.info("Traitement de la ligue : {}".format(league))
    print("Traitement de la ligue : {}".format(league))

    cols = ["game", "game_id", "score", "event_id", "period_id", "team_id", "player_id", "player_name", "type_id", "date", "minute", "second", "outcome", "start_x", "start_y", "end_x", "end_y", "qualifiers", "related_player_id", "touch", "shot", "goal", "type_name"]

    # Resume from the CSV of a previous run when there is one.
    csv_file = "csv_data/{}_events.csv".format(league)
    if os.path.exists(csv_file):
        df = pd.read_csv(csv_file)
    else:
        df = pd.DataFrame(columns=cols)

    # Matches already present in the DataFrame.
    matches_in_df = set(df["game_id"])

    batch_size = max(1, int(batch_size))
    rows = []
    pending_matches = 0  # matches parsed since the last save
    match_count = len(data)

    for i, (match_key, match) in enumerate(data.items(), 1):
        game_id = match.get("matchId", None)
        # Skip matches that were exported by a previous run.
        if game_id in matches_in_df:
            logging.info("Le match {} existe déjà dans le DataFrame. Passage au suivant.".format(game_id))
            continue

        logging.info("Traitement du match {}/{}".format(i, match_count))
        print("Traitement du match {}/{}".format(i, match_count))

        # The game label is whatever follows "2024-" in the key; fall back to
        # the whole key instead of raising IndexError when the marker is absent.
        key_parts = match_key.split("2024-")
        game = key_parts[1] if len(key_parts) > 1 else match_key
        try:
            centre = match["matchCentreData"]
            # matchCentreData can be None (match without data): .get then raises.
            playerIdNameDictionary = centre.get("playerIdNameDictionary", {})
        except AttributeError as e:
            logging.error("Une erreur AttributeError est survenue pour le match {}: {}".format(game, e))
            continue

        date = centre.get("startDate", None)
        score = centre.get("score", None)

        for event in centre.get("events", []):
            event_id = event.get("id", None)
            period_id = (event.get("period") or {}).get("value", None)
            team_id = event.get("teamId", None)
            player_id = event.get("playerId", None)
            # The name dictionary is looked up with the id as a string.
            player_name = playerIdNameDictionary.get(str(player_id), None)
            type_id = event.get("eventId", None)
            minute = event.get("minute", None)
            second = event.get("second", None)
            outcome = (event.get("outcomeType") or {}).get("value", None) == 1
            start_x = event.get("x", None)
            start_y = event.get("y", None)
            end_x = event.get("endX", None)
            end_y = event.get("endY", None)
            qualifiers = event.get("qualifiers", None)
            related_player_id = None
            touch = event.get("isTouch", None)
            shot = event.get("isShot", False)
            goal = event.get("isGoal", False)
            type_name = (event.get("type") or {}).get("displayName", None)

            rows.append([game, game_id, score, event_id, period_id, team_id, player_id, player_name, type_id, date, minute, second, outcome, start_x, start_y, end_x, end_y, qualifiers, related_player_id, touch, shot, goal, type_name])

        # Save by batch. The counter only advances for matches that were really
        # parsed, so a skipped match can never swallow a save point.
        pending_matches += 1
        if pending_matches >= batch_size:
            df = _append_and_save(df, rows, cols, csv_file)
            rows = []
            pending_matches = 0

    # Final save: covers the last partial batch even when the last matches of
    # `data` were skipped (already exported, or without matchCentreData).
    df = _append_and_save(df, rows, cols, csv_file)

    logging.info("Traitement terminé pour la ligue : {}".format(league))
    print("Traitement terminé pour la ligue : {}".format(league))

    return df


In [30]:
# Events DataFrame of every league, keyed by league name.
events_data_dict = {
    league: json_to_df(data_dict[league], league)
    for league in leagues
}

Traitement de la ligue : Bundesliga
Traitement terminé pour la ligue : Bundesliga
Traitement de la ligue : EPL
Traitement terminé pour la ligue : EPL
Traitement de la ligue : La_Liga
Traitement terminé pour la ligue : La_Liga
Traitement de la ligue : Ligue_1


ERROR:root:Une erreur AttributeError est survenue pour le match Rennes-Metz: 'NoneType' object has no attribute 'get'


Traitement du match 198/234
Traitement du match 205/234
Traitement du match 206/234
Traitement du match 207/234
Traitement du match 208/234
Traitement du match 209/234
Traitement du match 210/234
Traitement du match 211/234
Traitement du match 212/234
Traitement du match 213/234
Traitement du match 214/234
Traitement du match 215/234
Traitement du match 216/234
Traitement du match 217/234
Traitement du match 218/234
Traitement du match 219/234
Traitement du match 220/234
Traitement du match 221/234
Traitement du match 222/234
Traitement du match 223/234
Traitement du match 224/234
Traitement du match 225/234
Traitement du match 226/234
Traitement du match 227/234
Traitement du match 228/234
Traitement du match 229/234
Traitement du match 230/234
Traitement du match 231/234
Traitement du match 232/234
Traitement du match 233/234
Traitement du match 234/234
Traitement terminé pour la ligue : Ligue_1
Traitement de la ligue : Serie_A
Traitement du match 1/289
Traitement du match 2/289
Trai

In [25]:
# Inspect which fields matchCentreData exposes for the Ligue_1 match at position 195.
data_dict["Ligue_1"][list(data_dict["Ligue_1"].keys())[195]]["matchCentreData"].keys()

dict_keys(['playerIdNameDictionary', 'periodMinuteLimits', 'timeStamp', 'attendance', 'venueName', 'referee', 'weatherCode', 'elapsed', 'startTime', 'startDate', 'score', 'htScore', 'ftScore', 'etScore', 'pkScore', 'statusCode', 'periodCode', 'home', 'away', 'maxMinute', 'minuteExpanded', 'maxPeriod', 'expandedMinutes', 'expandedMaxMinute', 'periodEndMinutes', 'commonEvents', 'events', 'timeoutInSeconds'])

In [None]:
# Scratch value: the player id that the next cell looks up in playerIdNameDictionary.
65901

In [27]:
# Look up a player name by id; the id is converted to str before the dictionary lookup.
data_dict["Ligue_1"]['https://www.whoscored.com/Matches/1741226/Live/France-Ligue-1-2023-2024-Paris-Saint-Germain-Lens']["matchCentreData"].get("playerIdNameDictionary", {}).get(str(65901))

'Keylor Navas'

In [31]:
# Reload the Ligue_1 events CSV (the path json_to_df writes to for that league).
league_df = pd.read_csv("csv_data/Ligue_1_events.csv")

In [21]:
def json_to_df(data, league):
    """Flatten one league's match JSON into an events DataFrame and save it as CSV.

    Every match is processed on every call and the CSV
    ``csv_data/<league>_events.csv`` is rewritten from scratch.

    NOTE(review): ``json_to_df`` is also defined in an earlier cell of this
    notebook (a resumable version with different columns); on "Restart & Run All"
    this later definition is the one that stays bound to the name.

    Args:
        data: Mapping of match key -> match dict holding ``matchId`` and
            ``matchCentreData`` (``playerIdNameDictionary``, ``timeStamp``,
            ``events``).
        league: League name, used in log messages and in the CSV file name.

    Returns:
        DataFrame with one row per event (all columns kept as ``object`` dtype).

    Raises:
        KeyError: If a match or an event lacks one of the mandatory fields.
        IndexError: If a match key does not contain ``"2024-"``.
    """
    logging.info("Process league: {}".format(league))
    print("Process league: {}".format(league))
    cols = ["game", "game_id", "event_id", "period_id", "team_id", "player_id", "player_name", "type_id", "timestamp", "minute", "second", "outcome", "start_x", "start_y", "end_x", "end_y", "qualifiers", "related_player_id", "touch", "shot", "goal", "type_name"]
    matchs = list(data.keys())

    # Collect plain rows and build the frame once: appending with
    # df.loc[len(df)] re-allocates the frame for every single event.
    rows = []
    for iterator, match_key in enumerate(matchs, 1):
        logging.info("Process match {}/{}".format(iterator, len(matchs)))
        print("Process match {}/{}".format(iterator, len(matchs)))
        match = data[match_key]
        game = match_key.split("2024-")[1]
        game_id = match["matchId"]
        centre = match["matchCentreData"]
        playerIdNameDictionary = centre["playerIdNameDictionary"]
        timestamp = centre["timeStamp"]
        for event in centre["events"]:
            player_id = event.get("playerId")
            # The name dictionary is keyed by the id as a string; looking it up
            # with the raw id always returned None.
            player_name = playerIdNameDictionary.get(str(player_id)) if "playerId" in event else None
            rows.append([
                game,
                game_id,
                event["id"],
                event["period"]["value"],
                event["teamId"],
                player_id,
                player_name,
                event["eventId"],
                timestamp,
                event["minute"],
                event.get("second"),
                event["outcomeType"]["value"] == 1,
                event["x"],
                event["y"],
                event.get("endX"),
                event.get("endY"),
                event["qualifiers"],
                None,  # related_player_id: not extracted
                event["isTouch"],
                "isShot" in event,  # the mere presence of the key marks a shot
                "isGoal" in event,  # the mere presence of the key marks a goal
                event["type"]["displayName"],
            ])

    # dtype=object keeps the values exactly as extracted (e.g. integer ids are
    # not turned into floats by missing values), as the row-by-row version did.
    df = pd.DataFrame(rows, columns=cols, dtype=object)

    os.makedirs("csv_data", exist_ok=True)
    df.to_csv("csv_data/{}_events.csv".format(league), index=False)

    return df


In [17]:
# List the leagues that were loaded into data_dict.
data_dict.keys()

dict_keys(['Bundesliga', 'EPL', 'La_Liga', 'Ligue_1', 'Serie_A'])

In [35]:
# Column names of the scratch events DataFrame built in the following cells.
cols = ["game", "game_id", "event_id", "period_id", "team_id", "player_id", "player_name", "type_id", "timestamp", "minute", "second", "outcome", "start_x", "start_y", "end_x", "end_y", "qualifiers", "related_player_id", "touch", "shot", "goal", "type_name"]

In [36]:
# Empty frame that the exploratory loop below fills row by row.
df = pd.DataFrame(columns=cols)

In [26]:
# Match keys of the league under inspection.
# NOTE(review): `data` is not assigned in any visible cell, so this fails on a
# fresh kernel. Presumably it was data_dict["Ligue_1"] (the later cells index it
# with a Ligue 1 match URL) — confirm and define it explicitly.
matchs = list(data.keys())

In [38]:
# Exploratory run of the flattening logic on the first three matches.
# Rows are appended to the global `df`, so re-running this cell without
# re-creating `df` adds the same events again.
for match_key in matchs[:3]:
    match = data[match_key]
    game = match_key.split("2024-")[1]
    game_id = match["matchId"]
    playerIdNameDictionary = match["matchCentreData"]["playerIdNameDictionary"]
    timestamp = match["matchCentreData"]["timeStamp"]
    for event in match["matchCentreData"]["events"]:
        event_id = event["id"]
        period_id = event["period"]["value"]
        team_id = event["teamId"]
        player_id = event.get("playerId")
        # The name dictionary is keyed by the id as a string; looking it up with
        # the raw id always returned None.
        player_name = playerIdNameDictionary.get(str(player_id)) if "playerId" in event else None
        type_id = event["eventId"]
        minute = event["minute"]
        second = event.get("second")
        outcome = event["outcomeType"]["value"] == 1
        start_x = event["x"]
        start_y = event["y"]
        end_x = event.get("endX")
        end_y = event.get("endY")
        qualifiers = event["qualifiers"]
        related_player_id = None
        touch = event["isTouch"]
        # The mere presence of the key marks a shot / a goal.
        shot = "isShot" in event
        goal = "isGoal" in event
        type_name = event["type"]["displayName"]
        df.loc[len(df)] = [game, game_id, event_id, period_id, team_id, player_id, player_name, type_id, timestamp, minute, second, outcome, start_x, start_y, end_x, end_y, qualifiers, related_player_id, touch, shot, goal, type_name]

In [15]:
# Pull out a single match record (Paris-Saint-Germain vs Lens) for inspection.
test = data["https://www.whoscored.com/Matches/1741226/Live/France-Ligue-1-2023-2024-Paris-Saint-Germain-Lens"]

In [16]:
# Top-level keys of a single match record.
test.keys()

dict_keys(['matchId', 'matchCentreData', 'matchCentreEventTypeJson', 'formationIdNameMappings'])

In [40]:
# Games currently present in the scratch frame.
df.game.unique()

array(['Paris-Saint-Germain-Lens', 'Brest-Nice', 'Toulouse-Le-Havre'],
      dtype=object)

In [41]:
# Events of one game only; reset_index() keeps the previous index as an "index" column.
df1 = df[df["game"] == "Paris-Saint-Germain-Lens"].reset_index()

In [43]:
# Number of events per event type for that game.
df1.type_name.value_counts()

Pass               1724
BallRecovery        170
BallTouch            89
TakeOn               75
Foul                 68
Tackle               65
CornerAwarded        32
Aerial               32
Dispossessed         31
Clearance            30
Challenge            28
BlockedPass          28
MissedShots          19
Interception         18
Save                 16
SavedShot            16
Card                  9
SubstitutionOn        8
KeeperPickup          8
SubstitutionOff       8
End                   6
Start                 6
Goal                  5
OffsidePass           3
OffsideProvoked       3
OffsideGiven          3
ShieldBallOpp         2
FormationSet          2
Claim                 1
FormationChange       1
Error                 1
Name: type_name, dtype: int64