In [1]:
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd

from pandas import json_normalize
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
pd.set_option("display.max_columns", None)
pd.set_option('display.max_rows', 50)


In [3]:
# Define constants variables
PATH_OPEN_DATA = "../open-data/data/"

### 1. **<ins>Extract competitions name</ins>**

In [4]:
# Load competitions data
with open(PATH_OPEN_DATA + "competitions.json") as f:
    data = json.load(f)

In [5]:
df = json_normalize(data, sep="_")
df.sort_values(by="season_id", inplace=True)

df.head()

Unnamed: 0,competition_id,season_id,country_name,competition_name,competition_gender,competition_youth,competition_international,season_name,match_updated,match_updated_360,match_available_360,match_available
39,11,1,Spain,La Liga,male,False,False,2017/2018,2023-07-24T13:03:48.574627,2021-06-13T16:17:31.694,,2023-07-24T13:03:48.574627
3,16,1,Europe,Champions League,male,False,False,2017/2018,2021-08-27T11:26:39.802832,2021-06-13T16:17:31.694,,2021-01-23T21:55:30.425330
40,11,2,Spain,La Liga,male,False,False,2016/2017,2023-11-14T18:26:44.671413,2021-06-13T16:17:31.694,,2023-11-14T18:26:44.671413
4,16,2,Europe,Champions League,male,False,False,2016/2017,2021-08-27T11:26:39.802832,2021-06-13T16:17:31.694,,2020-07-29T05:00
61,49,3,United States of America,NWSL,female,False,False,2018,2023-07-24T13:01:22.094587,2021-06-13T16:17:31.694,,2023-07-24T13:01:22.094587


In [6]:
# Create a new column with the competition and season
df["competition_season"] = df["competition_name"] + "_" + df["season_name"]

print("The number of competitions is", df["competition_season"].nunique())

The number of competitions is 71


### 2. **<ins>Extract matches name</ins>**

In [7]:
# Create a dataframe containing all the matches
matches = pd.DataFrame()

# Load and iterate over all the matches
for subdir, dirs, files in os.walk(PATH_OPEN_DATA + "matches"):
    for file in files:
        
        if file.endswith(".json"):
            filepath = subdir + os.sep + file

            with open(filepath, "r") as f:
                data = json.load(f)

            flattened_data = pd.json_normalize(data)
            matches = pd.concat([matches, flattened_data], ignore_index=True)


In [8]:
matches["match_date"] = pd.to_datetime(matches["match_date"])
matches["competition_season"] = matches["competition.competition_name"] + "_" + matches["season.season_name"]
matches["match_outcome"] = matches["home_score"] - matches["away_score"]


### 3. **<ins>Extract events name</ins>**

In [9]:
events = pd.read_csv("events_WC22.csv", low_memory=False)

# Keep only the events that happened during the match (not overtime)
events = events[events["period"] <= 4]

# Keep last match
eventsfinal = events[events["match_date"] == "2022-12-18"]

In [10]:
# Store the number of events played by each player
events["player.name"].value_counts().to_csv('player_values_counts.csv', index=True)

In [11]:
def extract_player_names(tactics_lineup):
    """
    Extract the names of the players from the tactics lineup
    
    Parameters:
    ----------
    tactics_lineup: list of dictionaries
        The list of players in the lineup

    Returns:
    -------
    list
        The list of players' names
    """
    return [player["player"]["name"] for player in tactics_lineup]

def extract_player_position(tactics_lineup):
    """
    Extract the position of the players from the tactics lineup

    Parameters:
    ----------
    tactics_lineup: list of dictionaries
        The list of players in the lineup

    Returns:
    -------
    list
        The list of players' positions
    """
    return [player["position"]["name"] for player in tactics_lineup]

## **<ins>Creation of the base dataset</ins>**

In [12]:
# Extract the names of the players from the tactics lineup
lineup = events[events["type.name"] == "Starting XI"]

# Define the defensive players positions
defensive_players = ["Right Center Back", 
                     "Left Center Back", 
                     "Right Back", 
                     "Left Back",
                     "Center Back",
                     "Left Defensive Midfield", 
                     "Right Defensive Midfield", 
                     "Right Wing Back","Left Wing Back"]


# Create a cache to store player statistics
player_cache_appearances = {}
player_cache_clearance = {}
player_cache_keypass = {}
player_cache_assist = {}
player_cache_goals = {}

# Define the columns of the dataframe
columns = ["match_id", "competition_season", "match_date", "match_outcome"]
columns += [f"player_{i}" for i in range(1, 23)]

# Create a dataframe to store the starting XI
startingXI = pd.DataFrame(columns=columns)

In [13]:
# Iterate over each match date in the lineup dataframe
for match_date in lineup['match_date'].unique():
    # Get all events before the match date
    event_prev = events[events['match_date'] < match_date]

    # Get the lineup for the match date
    lineups_for_date = lineup[lineup["match_date"] == match_date]["tactics.lineup"]
    
    # Get the goals conceded by the goalkeepers
    goals_conceded = event_prev[(event_prev["goalkeeper.type.name"] == "Goal Conceded") | (event_prev["goalkeeper.type.name"] == "Penalty Conceded")]

    # Get the number of key passes and assists
    pass_ = (event_prev["type.name"] == "Pass")
    match_keypass = event_prev[pass_ & (event_prev["pass.assisted_shot_id"].notnull())]
    match_assists = event_prev[pass_ & (event_prev["pass.goal_assist"] == True)]

    # Get the number of Ball Recovery, Clearance and Blocks
    match_recovers = event_prev[(event_prev["type.name"] == "Ball Recovery")]
    match_clearance = event_prev[(event_prev["type.name"] == "Clearance")]
    match_blocks = event_prev[(event_prev["type.name"] == "Block")]

    # Get the number of duels won
    match_duels = event_prev[(event_prev["type.name"] == "Duel") & (event_prev["duel.outcome.name"] == "Won")]

    # Get the number of goals scored
    match_goals = event_prev[event_prev["shot.outcome.name"] == "Goal"]

    for tactics_lineup in lineups_for_date:
        tactics_lineup = eval(tactics_lineup)
        player_names = extract_player_names(tactics_lineup)
        player_positions = extract_player_position(tactics_lineup)

        # Map the players to their positions
        player_to_position = dict(zip(player_names, player_positions))

        for player, position in player_to_position.items():
            # Get the number of appearances by the players
            player_cache_appearances[(player, match_date)] = event_prev[event_prev["player.name"] == player].groupby('match_id').first().reset_index().shape[0]
            
            # Get the number of assists and key passes by the players
            player_cache_assist[(player, match_date)] = match_assists[match_assists["player.name"] == player].shape[0]
            
            # Get the number of key passes by the players
            player_cache_keypass[(player, match_date)] = match_keypass[match_keypass["player.name"] == player].shape[0]
            
            # Store the number of goals conceded by the goalkeepers
            if position == "Goalkeeper":
                player_cache_goals[(player, match_date)] = goals_conceded[goals_conceded["player.name"] == player].shape[0]
            
            # Store the number of defensive actions by the players
            elif position in defensive_players:
                # Assign a weight to each defensive action
                clearance = match_clearance[match_clearance["player.name"] == player].shape[0] * .2
                blocks = match_blocks[match_blocks["player.name"] == player].shape[0] * .2
                recovers = match_recovers[match_recovers["player.name"] == player].shape[0] * .3
                duels = match_duels[match_duels["player.name"] == player].shape[0] * .3

                # Store the sum of all defensive actions
                player_cache_goals[(player, match_date)] = clearance + recovers + duels + blocks
            
            # Store the number of goals scored by the players
            elif not position in defensive_players:
                player_cache_goals[(player, match_date)] = match_goals[match_goals["player.name"] == player].shape[0]


In [14]:
# Reset the index of the starting XI dataframe
lineup.reset_index(drop=True, inplace=True)


In [15]:
for i in range(0, len(lineup), 2):
    # Extract the lineup for each team
    team1_lineup = eval(lineup["tactics.lineup"][i])
    team2_lineup = eval(lineup["tactics.lineup"][i + 1])

    # Check if the lineup is complete
    if len(team1_lineup) < 11 or len(team2_lineup) < 11:
        continue

    # Extract the names of the players from the tactics lineup
    player_names = extract_player_names(team1_lineup) + extract_player_names(team2_lineup)
    player_positions = extract_player_position(team1_lineup) + extract_player_position(team2_lineup)
    
    lineup_data = {
        "match_id": lineup["match_id"][i],
        "competition_season": lineup["competition_season"][i],
        "match_date": lineup["match_date"][i],
        "match_outcome": lineup["match_outcome"][i],

        **{f"player_{j+1}": player_names[j] for j in range(22)},

        **{f"player_{j+1}_position": player_positions[j] for j in range(22)},

        **{
            f"player_{j+1}_goals": (
                player_cache_goals.get((player_names[j], lineup["match_date"][i]), 0) /
                player_cache_appearances.get((player_names[j], lineup["match_date"][i]), 1)
                if player_cache_appearances.get((player_names[j], lineup["match_date"][i]), 1) != 0 else 0
            ) for j in range(22)
        },

        **{
            f"player_{j+1}_assists": (
                player_cache_assist.get((player_names[j], lineup["match_date"][i]), 0) /
                player_cache_appearances.get(
                    (player_names[j], lineup["match_date"][i]), 1)
                if player_cache_appearances.get((player_names[j], lineup["match_date"][i]), 1) != 0 else 0
            ) for j in range(22)
        },

        **{
            f"player_{j+1}_keypass": (
                player_cache_keypass.get((player_names[j], lineup["match_date"][i]), 0) /
                player_cache_appearances.get(
                    (player_names[j], lineup["match_date"][i]), 1)
                if player_cache_appearances.get((player_names[j], lineup["match_date"][i]), 1) != 0 else 0
            ) for j in range(22)
        }
    }

    startingXI = pd.concat([startingXI, pd.DataFrame([lineup_data])], ignore_index=True)

In [16]:
# Encode the categorical columns
enc = LabelEncoder()
for col in startingXI.columns:
    if col.endswith("position"):
        startingXI[col] = enc.fit_transform(startingXI[col])
    
    elif col.startswith("player"):
        startingXI[col] = enc.fit_transform(startingXI[col])

# Convert the match outcome to a binary variable
startingXI["match_outcome"] = startingXI["match_outcome"].astype(int).apply(np.sign)

# Store the starting XI dataframe
startingXI.to_csv("startingXI.csv", index=False)
startingXI.head()

Unnamed: 0,match_id,competition_season,match_date,match_outcome,player_1,player_2,player_3,player_4,player_5,player_6,player_7,player_8,player_9,player_10,player_11,player_12,player_13,player_14,player_15,player_16,player_17,player_18,player_19,player_20,player_21,player_22,player_1_position,player_2_position,player_3_position,player_4_position,player_5_position,player_6_position,player_7_position,player_8_position,player_9_position,player_10_position,player_11_position,player_12_position,player_13_position,player_14_position,player_15_position,player_16_position,player_17_position,player_18_position,player_19_position,player_20_position,player_21_position,player_22_position,player_1_goals,player_2_goals,player_3_goals,player_4_goals,player_5_goals,player_6_goals,player_7_goals,player_8_goals,player_9_goals,player_10_goals,player_11_goals,player_12_goals,player_13_goals,player_14_goals,player_15_goals,player_16_goals,player_17_goals,player_18_goals,player_19_goals,player_20_goals,player_21_goals,player_22_goals,player_1_assists,player_2_assists,player_3_assists,player_4_assists,player_5_assists,player_6_assists,player_7_assists,player_8_assists,player_9_assists,player_10_assists,player_11_assists,player_12_assists,player_13_assists,player_14_assists,player_15_assists,player_16_assists,player_17_assists,player_18_assists,player_19_assists,player_20_assists,player_21_assists,player_22_assists,player_1_keypass,player_2_keypass,player_3_keypass,player_4_keypass,player_5_keypass,player_6_keypass,player_7_keypass,player_8_keypass,player_9_keypass,player_10_keypass,player_11_keypass,player_12_keypass,player_13_keypass,player_14_keypass,player_15_keypass,player_16_keypass,player_17_keypass,player_18_keypass,player_19_keypass,player_20_keypass,player_21_keypass,player_22_keypass
0,3857256,FIFA World Cup_2022,2022-12-02,-1,29,34,26,34,4,15,37,39,16,14,1,11,41,10,22,41,40,19,51,16,39,6,0,1,0,0,1,1,3,1,0,4,1,0,0,1,0,0,2,1,4,0,3,0,9,23,16,29,22,19,21,16,0,0,5,0,23,0,17,17,16,8,0,0,0,3,0,0,0,0,4,0,0,0,2,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,9,0,0,0,7,0,4,0,3,0,0,1,0,0,4,5,6,5
1,3869151,FIFA World Cup_2022,2022-12-03,1,4,30,5,28,23,14,33,1,29,0,24,19,28,12,20,7,25,0,32,45,35,23,0,0,1,0,0,0,2,0,4,3,0,0,0,1,0,0,2,1,3,2,4,1,3,9,6,11,26,2,0,5,2,0,6,9,8,33,11,27,3,25,1,0,2,0,0,4,0,0,0,3,0,0,0,0,1,0,0,0,0,0,0,0,2,4,0,0,0,4,0,3,5,3,2,7,0,0,9,0,0,2,0,0,0,2,2,5,1,3
2,3857257,FIFA World Cup_2022,2022-11-30,1,18,28,11,18,6,0,23,25,13,33,31,15,38,16,4,24,39,31,8,4,20,32,0,0,1,0,0,2,1,3,2,4,1,0,0,1,0,0,0,2,0,4,3,0,8,3,34,7,31,26,13,0,3,0,5,7,9,31,22,24,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,1,3,2,0,0,3,0,0,1,2,9,10,0,0,0
3,3857258,FIFA World Cup_2022,2022-11-24,1,1,9,24,35,2,24,8,34,35,41,35,32,34,27,38,4,15,41,35,18,42,1,0,0,1,0,0,2,1,4,0,3,0,0,1,0,0,1,1,3,1,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,3857288,FIFA World Cup_2022,2022-11-26,-1,3,10,40,22,26,2,11,6,34,42,13,19,12,12,20,7,20,0,32,14,35,39,0,1,0,0,1,1,3,1,3,1,0,0,0,1,0,0,2,1,3,2,4,1,0,18,32,38,15,23,26,15,0,0,0,13,0,39,15,30,9,24,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,5,6,7,0,0,13,5,0,0,0,0,0,0,0,4,0,0,7
