# Scientific Visualization Project

Replication of visualizations from the following paper:

- Pappalardo, L., Cintia, P., Rossi, A. et al. A public data set of spatio-temporal match events in soccer competitions. Sci Data 6, 236 (2019) doi:10.1038/s41597-019-0247-7, https://www.nature.com/articles/s41597-019-0247-7


- Pappalardo, L., Cintia, P., Ferragina, P., Massucco, E., Pedreschi, D., Giannotti, F. (2019) PlayeRank: Data-driven Performance Evaluation and Player Ranking in Soccer via a Machine Learning Approach. ACM Transactions on Intelligent Systems and Technology 10(5) Article 59, DOI: https://doi.org/10.1145/3343172, https://dl.acm.org/citation.cfm?id=3343172

and the data collection on figshare:

- Pappalardo, Luca; Massucco, Emanuele (2019): Soccer match event dataset. figshare. Collection. https://doi.org/10.6084/m9.figshare.c.4415000

### Import Libraries

In [1]:
import json
from collections import Counter
import numpy as np
import operator
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from matplotlib.patches import Ellipse
import pandas as pd
import base64
from collections import defaultdict
import sys,os
import operator
import csv
import matplotlib.pylab as pyl
import itertools
import scipy as sp
from scipy import stats
from scipy import optimize
from scipy.integrate import quad
import altair as alt
import vega_datasets as vega_data

import warnings
warnings.filterwarnings('ignore')

### Import data

In [2]:
# Load the raw event records of every competition into a dict keyed by
# competition name; each file is read from ./data/events/events_<name>.json.
nations = ['Italy','England','Germany','France','Spain','European_Championship','World_Cup']
events = {}
for nation in nations:
    events_path = './data/events/events_%s.json' % nation
    with open(events_path) as events_file:
        events[nation] = json.load(events_file)

In [7]:
# Display the first record loaded for 'Italy' to inspect the fields of an event.
events['Italy'][0]

{'eventId': 8,
 'subEventName': 'Simple pass',
 'tags': [{'id': 1801}],
 'playerId': 8327,
 'positions': [{'y': 52, 'x': 49}, {'y': 44, 'x': 43}],
 'matchId': 2575959,
 'eventName': 'Pass',
 'teamId': 3158,
 'matchPeriod': '1H',
 'eventSec': 2.5305359999999837,
 'subEventId': 85,
 'id': 180423957}

In [56]:
# Collect one [matchId, matchPeriod, eventSec, tag id] row for every tag with
# id 1701 (labelled 'Red Card' further down in this notebook), scanning all
# events of all competitions.
event_time = [
    [ev['matchId'], ev['matchPeriod'], ev['eventSec'], tag['id']]
    for nation in nations
    for ev in events[nation]
    for tag in ev['tags']
    if tag['id'] == 1701
]

In [60]:
# Tabulate the collected rows. Note that the 'eventId' column holds the tag id
# appended by the extraction loop, not the event's own 'eventId' field.
df_event = pd.DataFrame(event_time, columns=['matchId','matchPeriod','eventSec', 'eventId'])
# Minute within the period: seconds / 60, rounded up to a whole minute.
df_event['eventMin'] = np.ceil(df_event['eventSec'] / 60).astype(int)
df_event.head(10)

Unnamed: 0,matchId,matchPeriod,eventSec,eventId,eventMin
0,2575961,1H,129.395092,1701,3
1,2575962,2H,2173.210069,1701,37
2,2575988,1H,1939.780229,1701,33
3,2575988,2H,1986.80236,1701,34
4,2576013,2H,2693.626235,1701,45
5,2576013,2H,2865.704088,1701,48
6,2576033,1H,2188.869838,1701,37
7,2576043,1H,1304.643303,1701,22
8,2576046,2H,2633.574198,1701,44
9,2576053,2H,1653.838085,1701,28


### Data Processing

In [63]:
# Flag events whose minute within the period is past 45 (1 = flagged, 0 = not).
event_minute = df_event['eventMin']
df_event['if_injury_time'] = (event_minute > 45).astype(int)

# Match clock: minutes of '1H' are kept as they are, every other period is shifted by 45.
df_event['time'] = event_minute.where(df_event['matchPeriod'] == '1H', event_minute + 45)

# Five-minute bins over edges 0, 5, ..., 95; the last bin (90-95] is labelled '>90'.
time_bin_labels = [
    '0-5', '5-10', '10-15', '15-20', '20-25', '25-30', '30-35', '35-40', '40-45',
    '45-50', '50-55', '55-60', '60-65', '65-70', '70-75', '75-80', '80-85', '85-90',
    '>90',
]
df_event['binned_time'] = pd.cut(df_event['time'], bins=range(0, 100, 5), labels=time_bin_labels)

# Pick the time-axis label of an event. The full axis order used by the chart is
# ['0-5','5-10','10-15','15-20','20-25','25-30','30-35','35-40','40-45','>45',
#  '45-50','50-55','55-60','60-65','65-70','70-75','75-80','80-85','85-90','>90'].
def convert_to_lable(time, if_injury_time, matchPeriod):
    """Return the time-axis label for a single event.

    Parameters
    ----------
    time
        The event's regular bin label; returned unchanged when the event is
        not flagged as injury time.
    if_injury_time
        1 when the event is flagged as injury time; any other value is
        treated as regular time.
    matchPeriod
        Period code; '1H' selects the first-half injury-time label.

    Returns
    -------
    '>45' for a flagged event in '1H', '>90' for a flagged event in any other
    period, otherwise `time`.
    """
    if if_injury_time != 1:
        return time
    return '>45' if matchPeriod == '1H' else '>90'
    
# Final axis label per event: the regular bin, overridden by '>45' / '>90' for flagged rows.
df_event['binned_label'] = [
    convert_to_lable(bin_label, injury_flag, period)
    for bin_label, injury_flag, period in zip(
        df_event['binned_time'], df_event['if_injury_time'], df_event['matchPeriod']
    )
]
df_event.head(30)

Unnamed: 0,matchId,matchPeriod,eventSec,eventId,eventMin,if_injury_time,time,binned_time,binned_label
0,2575961,1H,129.395092,1701,3,0,3,0-5,0-5
1,2575962,2H,2173.210069,1701,37,0,82,80-85,80-85
2,2575988,1H,1939.780229,1701,33,0,33,30-35,30-35
3,2575988,2H,1986.80236,1701,34,0,79,75-80,75-80
4,2576013,2H,2693.626235,1701,45,0,90,85-90,85-90
5,2576013,2H,2865.704088,1701,48,1,93,>90,>90
6,2576033,1H,2188.869838,1701,37,0,37,35-40,35-40
7,2576043,1H,1304.643303,1701,22,0,22,20-25,20-25
8,2576046,2H,2633.574198,1701,44,0,89,85-90,85-90
9,2576053,2H,1653.838085,1701,28,0,73,70-75,70-75


In [68]:
# Same extraction as for tag 1701, now for tags 1702 and 101 (labelled
# 'Yellow Card' and 'Goal' further down in this notebook). The name
# `event_time` is reused, so the earlier list is replaced.
event_time = [
    [ev['matchId'], ev['matchPeriod'], ev['eventSec'], tag['id']]
    for nation in nations
    for ev in events[nation]
    for tag in ev['tags']
    if tag['id'] in (1702, 101)
]

In [69]:
# Tabulate the 1702 / 101 rows; as before, 'eventId' holds the tag id.
df_event_other = pd.DataFrame(event_time, columns=['matchId','matchPeriod','eventSec', 'eventId'])
# Minute within the period: seconds / 60, rounded up to a whole minute.
df_event_other['eventMin'] = np.ceil(df_event_other['eventSec'] / 60).astype(int)
df_event_other.head(10)

Unnamed: 0,matchId,matchPeriod,eventSec,eventId,eventMin
0,2575959,1H,1844.168147,101,31
1,2575959,1H,1847.978228,101,31
2,2575959,2H,571.363862,1702,10
3,2575959,2H,1377.286953,1702,23
4,2575959,2H,2656.659101,1702,45
5,2575960,1H,1592.832659,101,27
6,2575960,1H,1595.14595,101,27
7,2575960,1H,1979.741913,101,33
8,2575960,1H,1981.933238,101,34
9,2575960,2H,120.208421,1702,3


In [72]:
# Flag events whose minute within the period is past 45 (1 = flagged, 0 = not).
other_minute = df_event_other['eventMin']
df_event_other['if_injury_time'] = (other_minute > 45).astype(int)

# Match clock: minutes of '1H' are kept as they are, every other period is shifted by 45.
df_event_other['time'] = other_minute.where(df_event_other['matchPeriod'] == '1H', other_minute + 45)

# Five-minute bins over edges 0, 5, ..., 95; the last bin (90-95] is labelled '>90'.
other_bin_labels = [
    '0-5', '5-10', '10-15', '15-20', '20-25', '25-30', '30-35', '35-40', '40-45',
    '45-50', '50-55', '55-60', '60-65', '65-70', '70-75', '75-80', '80-85', '85-90',
    '>90',
]
df_event_other['binned_time'] = pd.cut(
    df_event_other['time'], bins=range(0, 100, 5), labels=other_bin_labels
)

# Final axis label per event: the regular bin, overridden by '>45' / '>90' for flagged rows.
df_event_other['binned_label'] = [
    convert_to_lable(bin_label, injury_flag, period)
    for bin_label, injury_flag, period in zip(
        df_event_other['binned_time'],
        df_event_other['if_injury_time'],
        df_event_other['matchPeriod'],
    )
]

# Keep a fixed-seed random subset of 2000 rows.
# NOTE(review): counts plotted for these tags come from this sample, not from
# the full data — confirm that is intended.
reduced_df = df_event_other.sample(n=2000, random_state=1)
reduced_df.shape

(2000, 9)

In [73]:
# Stack the tag-1701 rows and the sampled 1702/101 rows, then discard every
# column except matchPeriod, eventId and binned_label.
# NOTE: this cell reassigns df_event from itself, so running it a second time
# appends reduced_df again.
columns_to_discard = ['matchId','eventSec','eventMin','if_injury_time','time','binned_time']
stacked_events = pd.concat([df_event, reduced_df])
df_event = stacked_events.drop(columns=columns_to_discard)
df_event.head(10)

Unnamed: 0,matchPeriod,eventId,binned_label
0,1H,1701,0-5
1,2H,1701,80-85
2,1H,1701,30-35
3,2H,1701,75-80
4,2H,1701,85-90
5,2H,1701,>90
6,1H,1701,35-40
7,1H,1701,20-25
8,2H,1701,85-90
9,2H,1701,70-75


### New data frame for plot

In [95]:
# Count events per (match period, time label, tag id).
df_plot = df_event.groupby(['matchPeriod','binned_label','eventId']).size().reset_index(name='count')
# only keep the matchPeriod == 1H and 2H
df_plot = df_plot[df_plot['matchPeriod'].isin(['1H', '2H'])]

# Add an explicit zero-count row for a combination that has no events, so the
# bin still exists in the plotted data.
new_data = {'matchPeriod':'1H','binned_label':'15-20','eventId':1701,'count':0}
# Only add it when the combination is really absent: an extra zero row next to
# a real count would distort the per-bin mean drawn on the chart.
already_counted = (
    (df_plot['matchPeriod'] == new_data['matchPeriod'])
    & (df_plot['binned_label'] == new_data['binned_label'])
    & (df_plot['eventId'] == new_data['eventId'])
).any()
if not already_counted:
    # The filter above leaves a non-contiguous index, so `df_plot.loc[len(df_plot)] = ...`
    # could overwrite an existing row; concatenating with a fresh index cannot.
    df_plot = pd.concat([df_plot, pd.DataFrame([new_data])], ignore_index=True)

# NOTE: 'binned_label' is sorted as text here ('10-15' before '5-10'); the
# chronological order is imposed later through the chart's explicit x-axis sort.
df_plot = df_plot.sort_values(by=['matchPeriod','binned_label','eventId']).reset_index(drop=True)
df_plot

Unnamed: 0,matchPeriod,binned_label,eventId,count
0,1H,0-5,101,49
1,1H,0-5,1701,3
2,1H,0-5,1702,9
3,1H,10-15,101,53
4,1H,10-15,1701,7
5,1H,10-15,1702,27
6,1H,15-20,101,61
7,1H,15-20,1701,0
8,1H,15-20,1702,22
9,1H,20-25,101,61


In [139]:
def add_color(matchPeriod, binned_label):
    """Return the legend category used to colour a bar.

    Parameters
    ----------
    matchPeriod
        Period code; '1H' is the first half, anything else is treated as the
        second half.
    binned_label
        Time-axis label of the bar; '>45' (first half) and '>90' (second half)
        mark injury time.

    Returns
    -------
    One of '1st Half', '1st Half Injury Time', '2nd Half',
    '2nd Half Injury Time'.
    """
    if matchPeriod == '1H':
        return '1st Half Injury Time' if binned_label == '>45' else '1st Half'
    return '2nd Half Injury Time' if binned_label == '>90' else '2nd Half'
        
# Tag every row with the legend category of its bar.
df_plot['color'] = [
    add_color(period, label)
    for period, label in zip(df_plot['matchPeriod'], df_plot['binned_label'])
]
df_plot.head(10)

Unnamed: 0,matchPeriod,binned_label,eventId,count,color
0,1H,0-5,101,49,1st Half
1,1H,0-5,1701,3,1st Half
2,1H,0-5,1702,9,1st Half
3,1H,10-15,101,53,1st Half
4,1H,10-15,1701,7,1st Half
5,1H,10-15,1702,27,1st Half
6,1H,15-20,101,61,1st Half
7,1H,15-20,1701,0,1st Half
8,1H,15-20,1702,22,1st Half
9,1H,20-25,101,61,1st Half


In [143]:
def map_eventId(eventId):
    """Translate a tag id into its display name.

    Returns 'Goal' for 101, 'Red Card' for 1701 and 'Yellow Card' for every
    other value.
    """
    named_tags = {101: 'Goal', 1701: 'Red Card'}
    return named_tags.get(eventId, 'Yellow Card')
    
# Human-readable event name for each row.
df_plot['Event Type'] = df_plot['eventId'].map(map_eventId)
df_plot.head(10)

Unnamed: 0,matchPeriod,binned_label,eventId,count,color,Event Type
0,1H,0-5,101,49,1st Half,Goal
1,1H,0-5,1701,3,1st Half,Red Card
2,1H,0-5,1702,9,1st Half,Yellow Card
3,1H,10-15,101,53,1st Half,Goal
4,1H,10-15,1701,7,1st Half,Red Card
5,1H,10-15,1702,27,1st Half,Yellow Card
6,1H,15-20,101,61,1st Half,Goal
7,1H,15-20,1701,0,1st Half,Red Card
8,1H,15-20,1702,22,1st Half,Yellow Card
9,1H,20-25,101,61,1st Half,Goal


In [145]:
# Interactive bar chart of event counts per time bin.
# - A dropdown bound to 'Event Type' filters which event is shown.
# - Brushing along the x axis highlights the selected bins and restricts the
#   mean rule and its text label to them.

# create a dropdown for the event type
event_list = list(df_plot['Event Type'].unique())
event_list.sort()
event_dropdown = alt.binding_select(options=event_list)

# create selection objects
# `selection_point` replaces `selection_single`, which is deprecated in Altair 5;
# this cell already requires Altair 5 because it calls `add_params` below.
selection = alt.selection_point(fields=['Event Type'], bind=event_dropdown, name='Select')

# interval (brush) selection along the x axis
selection_time = alt.selection_interval(encodings=['x'])

# full opacity for marks inside the brush, dimmed otherwise
opacity = alt.condition(selection_time,
                        alt.value(1),
                        alt.value(0.6))

# create a base chart
base = alt.Chart(df_plot).transform_filter(selection).encode(
    x=alt.X(
        'binned_label:O', 
        title='Match Time (min)',
        # explicit chronological order; the labels are strings and would otherwise sort as text
        sort=['0-5','5-10','10-15','15-20','20-25','25-30','30-35','35-40','40-45','>45',
              '45-50','50-55','55-60','60-65','65-70','70-75','75-80','80-85','85-90','>90']
    ),
    y=alt.Y('count:Q', title='Number of Events'),
    color=alt.Color(
        'color:N', 
        scale=alt.Scale(
            domain=['1st Half','1st Half Injury Time','2nd Half','2nd Half Injury Time'], 
            range=['#1f77b4','#aec7e8','#38761d','#93c47d']
            ),
    ),
    tooltip=['count']
)

# create a bar chart
bar = base.mark_bar(size=20)

# horizontal rule at the mean count of the rows passing both the dropdown and the brush
line = alt.Chart(df_plot).transform_filter(selection).mark_rule(color='firebrick').encode(
    y='mean(count):Q',
    size=alt.value(3)
).transform_filter(selection_time)

# text label showing the same mean value next to the rule
text = alt.Chart(df_plot).transform_filter(selection).mark_text(
    align='left',
    baseline='bottom',
    dx=7,
    color='firebrick',
    size=12
).encode(
    y='mean(count):Q',
    text=alt.Text('mean(count):Q', format='.2f')
).transform_filter(selection_time)

# layer the three charts, register both selections and apply the brush-driven opacity
chart = (bar + line + text).properties(
    width=600,
    height=400,
    title='Event Distribution Based on Match Time'
).add_params(
    selection,
    selection_time
).encode(
    opacity=opacity
)
chart