In [5]:
import pickle
import sys
from io import BytesIO, StringIO
from pathlib import Path
from urllib.request import urlopen, urlretrieve, urlparse
from zipfile import ZipFile, is_zipfile

import pandas as pd

# The project root must be on sys.path before the project-local imports below.
ROOT = Path.cwd().parent
sys.path.insert(0, str(ROOT))

import socceraction.atomic.spadl as atomicspadl
from socceraction.data.wyscout import PublicWyscoutLoader
from playstyle_utils import EventToAtomic

In [6]:
# Local folder handed to the Wyscout loader as its data root; created on first run.
PROJECT_ROOT = ROOT
DATA_ROOT = PROJECT_ROOT.joinpath("data", "wyscout")
DATA_ROOT.mkdir(parents=True, exist_ok=True)

WSL = PublicWyscoutLoader(root=DATA_ROOT)

# Table of every competition/season the loader exposes
df_competitions = WSL.competitions()
df_competitions

Unnamed: 0,competition_id,season_id,country_name,competition_name,competition_gender,season_name
0,524,181248,Italy,Italian first division,male,2017/2018
1,364,181150,England,English first division,male,2017/2018
2,795,181144,Spain,Spanish first division,male,2017/2018
3,412,181189,France,French first division,male,2017/2018
4,426,181137,Germany,German first division,male,2017/2018
5,102,9291,International,European Championship,male,2016
6,28,10078,International,World Cup,male,2018


In [7]:
# Get all games for the chosen competitions.
# Each entry is (season_id, competition_id); these are the five domestic
# leagues from the competitions table (the two international tournaments are left out).
seasons_and_competitions = [(181248,524) , (181150,364), (181144,795), (181189,412), (181137,426)]

frames = [
    WSL.games(competition_id=competition_id, season_id=season_id)
    for season_id, competition_id in seasons_and_competitions
]

# Stack the per-league tables and renumber the rows 0..n-1.
df_matches = pd.concat(frames).reset_index(drop=True)
df_matches.head()

Unnamed: 0,game_id,competition_id,season_id,game_date,game_day,home_team_id,away_team_id
0,2576335,524,181248,2018-05-20 18:45:00,38,3162,3161
1,2576336,524,181248,2018-05-20 18:45:00,38,3315,3158
2,2576329,524,181248,2018-05-20 16:00:00,38,3173,3172
3,2576330,524,181248,2018-05-20 16:00:00,38,3165,3219
4,2576331,524,181248,2018-05-20 16:00:00,38,3163,3166


In [8]:
# Link every team ID with the name of the club


def _unescape(text):
    """Encode ``text`` as latin-1 and decode it again with ``unicode_escape`` (errors ignored)."""
    return text.encode("latin-1").decode("unicode_escape","ignore")


# One teams table per match, stacked into a single frame; the same team
# therefore shows up once for every match it played.
dfs_teams = [WSL.teams(game_id=game_id) for game_id in df_matches["game_id"]]
df_teams = pd.concat(dfs_teams)

# NOTE(review): presumably the raw names carry literal \uXXXX sequences - confirm against the loader output.
for name_column in ("team_name", "team_name_short"):
    df_teams[name_column] = df_teams[name_column].apply(_unescape)

In [9]:
# Remove duplicate teams.
# drop_duplicates() returns a new frame, so its result has to be assigned -
# the bare call left df_teams unchanged. keep="last" matches the dict built
# just below, where the last row wins for a repeated team_id.
df_teams = df_teams.drop_duplicates(subset="team_id", keep="last").reset_index(drop=True)

# Map club ID to club name
team_name_mapping = df_teams.set_index('team_id')['team_name_short'].to_dict()

# Add home and away team name in the df_matches table
df_matches['home_team_name'] = df_matches['home_team_id'].map(team_name_mapping)
df_matches['away_team_name'] = df_matches['away_team_id'].map(team_name_mapping)
df_matches.head()

Unnamed: 0,game_id,competition_id,season_id,game_date,game_day,home_team_id,away_team_id,home_team_name,away_team_name
0,2576335,524,181248,2018-05-20 18:45:00,38,3162,3161,Lazio,Internazionale
1,2576336,524,181248,2018-05-20 18:45:00,38,3315,3158,Sassuolo,Roma
2,2576329,524,181248,2018-05-20 16:00:00,38,3173,3172,Cagliari,Atalanta
3,2576330,524,181248,2018-05-20 16:00:00,38,3165,3219,Chievo,Benevento
4,2576331,524,181248,2018-05-20 16:00:00,38,3163,3166,Udinese,Bologna


In [10]:
player_data_url = dict(
    players = "https://ndownloader.figshare.com/files/15073721"
)

DATA_DIR = PROJECT_ROOT / "data" / "players"
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Load player data set
for url in player_data_url.values():
    # geturl() reports the final URL after any redirect; the file name is taken
    # from it. The probe response is closed right away instead of being leaked.
    with urlopen(url) as response:
        url_s3 = response.geturl()
    path = Path(urlparse(url_s3).path)
    file_name = path.name
    # local_path is read again by the cell that parses the player file.
    local_path = DATA_DIR / file_name
    file_local, _ = urlretrieve(url_s3, local_path)
    if is_zipfile(file_local):
        with ZipFile(file_local) as zip_file:
            # Unpack next to the download; extractall() without a target
            # writes into the current working directory.
            zip_file.extractall(DATA_DIR)

In [11]:
def read_json_file(filename: str):
    r"""Return the content of ``filename`` as text decoded with ``unicode_escape``.

    The file is opened in binary mode and its bytes are decoded in one go, so
    an escape sequence stored literally in the file (such as ``\u00e9``) comes
    back as the character it denotes.
    """
    with open(filename, 'rb') as json_file:
        raw_bytes = json_file.read()
    return raw_bytes.decode('unicode_escape')
    
#Get all player names with their corresponding ID as a dataframe
json_players = read_json_file(local_path)
# Hand read_json a text buffer: passing the JSON document itself as a string
# is deprecated in recent pandas releases.
df_players = pd.read_json(StringIO(json_players))

# Build new frames instead of mutating in place so the cell can be re-run safely.
df_players = df_players.rename(columns={'wyId': 'player_id'})
df_players["player_name"] = df_players["firstName"] + " " + df_players["lastName"]

# Only the ID and the display name are kept.
df_players = df_players[["player_id", "player_name"]]
df_players.head()

Unnamed: 0,player_id,player_name
0,32777,Harun Tekin
1,393228,Malang Sarr
2,393230,Over Mandanda
3,32793,Alfred John Momar N'Diaye
4,393247,Ibrahima Konaté


In [None]:
# Atomic-SPADL action types: type_id -> type_name
atomic_type_df = atomicspadl.actiontypes_df()
type_name_by_id = atomic_type_df.set_index('type_id')['type_name']
atomic_type_mapping = type_name_by_id.to_dict()
print(f"Actions mapped to their ID: \n{atomic_type_mapping}")

# Players: player_id -> player_name
player_name_by_id = df_players.set_index('player_id')['player_name']
player_name_mapping = player_name_by_id.to_dict()
print(f"\nPlayers mapped to their ID: \n{player_name_mapping}")

# Clubs: the team_id -> short name dict built in an earlier cell
print(f"\nTeams mapped to their ID: \n{team_name_mapping}")

In [None]:
# For each match in df_matches, convert Wyscout events → SPADL → Atomic-SPADL
match_events = {}
# game_id -> "ErrorType: message" for every match that could not be converted,
# so skipped matches can be inspected after the run instead of only scrolling the log.
skipped_games = {}
n_matches = len(df_matches)

for idx, row in df_matches.iterrows():
    # Read the identifiers outside the try block so the error message below
    # always refers to the current match (never an unbound or stale game_id).
    game_id = row["game_id"]
    home_id = row["home_team_id"]
    print(f"Processing game {idx + 1}/{n_matches}")
    try:
        converter = EventToAtomic(game_id, home_id, team_name_mapping, player_name_mapping, atomic_type_mapping, WSL)
        match_events[game_id] = converter.complete_atomic_events()
    except Exception as e:
        # Best effort on purpose: one broken match must not abort the whole run.
        skipped_games[game_id] = f"{type(e).__name__}: {e}"
        print(f"Skipped game {idx+1} with game ID {game_id} due to error: {type(e).__name__}: {e}")

In [15]:
OUT = PROJECT_ROOT / "data" / "derived"
OUT.mkdir(parents=True, exist_ok=True)

# File name -> object to pickle; the files are written in this order.
pickle_targets = {
    "match_events.pkl": match_events,
    "mappings.pkl": {
        "team_name_mapping": team_name_mapping,
        "player_name_mapping": player_name_mapping,
        "atomic_type_mapping": atomic_type_mapping,
    },
    "df_matches.pkl": df_matches,
    "team_mapping.pkl": team_name_mapping,
}

for pkl_name, payload in pickle_targets.items():
    with open(OUT / pkl_name, "wb") as f:
        pickle.dump(payload, f)
