# English Premier League 2017-2018 Season
Data from:
- Pappalardo et al. (2019) A public data set of spatio-temporal match events in soccer competitions. Nature Scientific Data 6:236, https://www.nature.com/articles/s41597-019-0247-7
- Pappalardo et al. (2019) PlayeRank: Data-driven Performance Evaluation and Player Ranking in Soccer via a Machine Learning Approach. ACM Transactions on Intelligent Systems and Technology (TIST) 10, 5, Article 59 (September 2019), 27 pages. DOI: https://doi.org/10.1145/3343172

In [1]:
import json
import re
import pandas as pd

In [2]:
# Load the raw JSON exports used by the rest of the notebook.
def _load_json(path):
    """Return the parsed contents of the JSON file at *path*.

    The file is opened explicitly as UTF-8 so that decoding does not depend
    on the platform's default text encoding.
    """
    with open(path, encoding='utf-8') as file:
        return json.load(file)


# Individual events across all matches
events = _load_json('./events/events_England.json')

# Matches
matches = _load_json('./matches/matches_England.json')

# Players
players = _load_json('./players.json')

# Teams
teams = _load_json('./teams.json')

In [3]:
# Only keep EPL teams: clubs whose area id is the string '0'
def _is_epl_club(candidate):
    """Return True when *candidate* has area id '0' and type 'club'."""
    return candidate['area']['id'] == '0' and candidate['type'] == 'club'


epl_teams = list(filter(_is_epl_club, teams))
epl_team_ids = [club['wyId'] for club in epl_teams]

# Only keep EPL players: those whose current team is one of the clubs above
epl_players = list(
    filter(lambda person: person['currentTeamId'] in epl_team_ids, players)
)
epl_player_ids = [person['wyId'] for person in epl_players]

## Formatting JSON Data

In [4]:
# Convert team ids to names (keys are the ids rendered as strings)
teams_dict = {str(club['wyId']): club['name'] for club in epl_teams}

In [5]:
# Action by player: map each value of the 'Tag' column to its 'Description'
event_tags = pd.read_csv('tags2name.csv')
tag_descriptions = event_tags.set_index('Tag')['Description']
tag_dict = tag_descriptions.to_dict()

In [44]:
# Function to fix incorrect encoding: some names carry the literal text of a
# unicode escape (a backslash, 'u' and four hex digits) instead of the
# character that escape stands for.
_ESCAPED_CODE_POINT = re.compile(r'\\u([0-9a-fA-F]{4})')


def _decode_escape(match):
    r"""Return the replacement text for one matched ``\uXXXX`` escape."""
    code_point = int(match.group(1), 16)
    if code_point == 0x00AD:
        # Soft hyphens are dropped rather than decoded.
        return ''
    if 0xD800 <= code_point <= 0xDFFF:
        # A lone surrogate is not a valid character; leave the text as is.
        return match.group(0)
    return chr(code_point)


def word_replace(string):
    r"""Replace literal ``\uXXXX`` escape text in *string* with real characters.

    Every backslash-u escape with four hex digits is decoded in a single
    pass, so characters beyond a fixed list are handled as well. The escaped
    soft hyphen (``\u00ad``) is removed. Text that contains no such escapes,
    including text that already holds non-ASCII characters, is returned
    unchanged.

    Args:
        string: The text to repair.

    Returns:
        The repaired text.
    """
    return _ESCAPED_CODE_POINT.sub(_decode_escape, string)

In [45]:
# Convert player ids to names (keys are the ids rendered as strings)
players_dict = {
    str(person['wyId']): word_replace(person['shortName'])
    for person in epl_players
}

In [46]:
# Function to format matches
def format_match(match):
    """Flatten a raw match record into a small summary dict.

    Reads the first two keys of ``match['teamsData']`` as the two team ids
    and uses the first entry's ``'side'`` field to decide which is home.

    Returns:
        A dict with 'match_id', 'date', 'home_team', 'away_team' (names
        looked up in ``teams_dict``) and 'score' as a (home, away) tuple.
    """
    sides = match['teamsData']
    team_ids = list(sides)
    first_id, second_id = team_ids[0], team_ids[1]

    # Match identifiers
    identifier = match['wyId']
    played_on = match['dateutc']

    # If the first listed team is not the home side, the second is taken to be.
    first_is_home = sides[first_id]['side'] == 'home'
    home_id = first_id if first_is_home else second_id
    away_id = second_id if first_is_home else first_id

    # Score, ordered home then away
    result = (sides[home_id]['score'], sides[away_id]['score'])

    return {
        'match_id': identifier,
        'date': played_on,
        'home_team': teams_dict[home_id],
        'away_team': teams_dict[away_id],
        'score': result,
    }

In [47]:
# Summarise every raw match record
formatted_matches = [format_match(raw_match) for raw_match in matches]

In [48]:
# Function to format players
def format_player(player):
    """Flatten a raw player record into a small summary dict.

    Name fields are passed through ``word_replace``; only the first name is
    stripped of surrounding whitespace. The team name is looked up in
    ``teams_dict`` by the player's current team id.

    Returns:
        A dict with 'id', 'abb_name', 'first_name', 'last_name', 'team',
        'role' and 'nationality'.
    """
    identifier = player['wyId']
    nationality = player['passportArea']['name']
    short_name = word_replace(player['shortName'])
    given_name = word_replace(player['firstName'].strip())
    family_name = word_replace(player['lastName'])
    club_name = teams_dict[str(player['currentTeamId'])]
    position = player['role']['name']

    return {
        'id': identifier,
        'abb_name': short_name,
        'first_name': given_name,
        'last_name': family_name,
        'team': club_name,
        'role': position,
        'nationality': nationality,
    }

In [49]:
# Summarise every EPL player record
formatted_players = [format_player(raw_player) for raw_player in epl_players]

In [50]:
# Function to format events
def _position_xy(positions, index):
    """Return (x, y) of ``positions[index]``, or ('null', 'null') if absent."""
    if index < len(positions):
        point = positions[index]
        return point['x'], point['y']
    return 'null', 'null'


def formatting_event(event):
    """Flatten a raw event record into a small summary dict.

    The player name is looked up in ``players_dict``; when the event has no
    usable player id, or the id is not in that dict, the player is recorded
    as the string 'null'. Team and tag labels are looked up in
    ``teams_dict`` and ``tag_dict``. Missing start or end positions are
    recorded as 'null' coordinates.

    Returns:
        A dict with 'match_id', 'time', 'period', 'player', 'team', 'tags',
        'event', 'subevent', 'start_pos_x', 'start_pos_y', 'end_pos_x' and
        'end_pos_y'.

    Raises:
        KeyError: If the team id or a tag id is unknown, or a required
            field is missing from the event.
    """
    match_id = event['matchId']
    time = event['eventSec']
    period = event['matchPeriod']

    # Only a failed lookup means "unknown player"; any other error should
    # surface instead of being silently swallowed.
    try:
        player = players_dict[f"{event['playerId']}"]
    except KeyError:
        player = 'null'

    team = teams_dict[f"{event['teamId']}"]
    tag_labels = [tag_dict[tag['id']] for tag in event['tags']]
    event_label = event['eventName']
    subevent_label = event['subEventName']

    positions = event['positions']
    start_pos_x, start_pos_y = _position_xy(positions, 0)
    end_pos_x, end_pos_y = _position_xy(positions, 1)

    new_event = {
        'match_id': match_id,
        'time': time,
        'period': period,
        'player': player,
        'team': team,
        'tags': tag_labels,
        'event': event_label,
        'subevent': subevent_label,
        'start_pos_x': start_pos_x,
        'start_pos_y': start_pos_y,
        'end_pos_x': end_pos_x,
        'end_pos_y': end_pos_y
    }

    return new_event

In [51]:
# Summarise every raw event record
formatted_events = [formatting_event(raw_event) for raw_event in events]

In [52]:
# Display each player's full name as "<first_name> <last_name>"
[' '.join((entry['first_name'], entry['last_name'])) for entry in formatted_players]

['Toby Alderweireld',
 'Jan Vertonghen',
 'Christian Dannemann Eriksen',
 'Johann Berg Guðmundsson',
 'Nacer Chadli',
 'Leon-Aderemi Balogun',
 'Alex Iwobi',
 'Maya Yoshida',
 'Onyinye Wilfred Ndidi',
 'Mesut Özil',
 'Kepa Arrizabalaga Revuelta',
 'Bernardo Mota Veiga de Carvalho e Silva',
 'Wilfredo Daniel Caballero',
 'Mateo Kovačić',
 'Marcus Rashford',
 'Andreas Christensen',
 'Danilo Luiz da Silva',
 'Nicolás Hernán Otamendi',
 'Nemanja Matić',
 'Ricardo Domingos Barbosa Pereira',
 'Cédric Ricardo Alves Soares',
 'Adrien Sebastian Perruchet Silva',
 'Ramadan Sobhi',
 'Cheikhou Kouyaté',
 'Kevin De Bruyne',
 'Lucas Torreira Di Pascua',
 'Ederson Santana de Moraes',
 'Victor Nilsson Lindelöf',
 'Ruben Loftus-Cheek',
 'Willian Borges da Silva',
 'Fernando Luiz Rosa',
 'Łukasz Fabiański',
 'Gary Cahill',
 'Marko Grujić',
 'Romelu Lukaku Menama',
 'David de Gea Quintana',
 'Phil Jones',
 'Jesse Lingard',
 'Paul Pogba',
 'Ashley Young',
 'Javier Hernández Balcázar',
 'Daniel Nii Tackie 