## Soccer Networks

**Yuval Berman**

Making passing networks out of the events logs stream and evaluating metrics for temporal windows.

In [105]:
# Import necessary libraries
import pandas as pd
import networkx as nx
import os
import matplotlib.pyplot as plt
import numpy as np
import math
import matplotlib.cm as mplcm
import matplotlib.colors as colors

In [106]:
## Import up sound alert dependencies
from IPython.display import Audio, display

def allDone():
    display(Audio(url='https://sound.peal.io/ps/audios/000/000/537/original/woo_vu_luvub_dub_dub.wav', autoplay=True))
## Insert whatever audio file you want above

In [107]:
# Navigate to the relevant directory
os.chdir('/Users/Yuval/Desktop/Pawsey/data/')

In [108]:
# Read in the players (not necessary for global entropy, but helps if wanting to find individual node
# entropy or visualise the passing network)
players = pd.read_json('players.json')
players = players.rename(columns ={'wyId': 'playerId'}) # rename for convenience

In [109]:
cleaned_events = os.listdir('cleaned_events/')

In [110]:
def initial_df_processing(name):
    
    df = pd.read_json('cleaned_events/'+name)
    
    # Merge the two databases together
    df = df.merge(players, how = 'left', on = 'playerId')
    
    # Let's clean it up a bit and get rid of some unnecessary info so we have just what we need
    df.drop(['passportArea', 'weight', 'firstName','middleName','lastName','currentTeamId',
         'birthDate','height','role','foot','currentNationalTeamId','birthArea','subEventId',
         'id','xStart', 'xEnd', 'yStart', 'yEnd', 'attackMetres'],inplace = True, axis = 1)
    
    return df

In [111]:
def get_game(df, matchId):
    # Finds the match we're after.
    df = df.loc[df['matchId'] == matchId]
    return df

In [112]:
def continuous_time(df):
    # Converts to a continuous 90 min (ie 90*60 = 5400 second) game
    firsthalf = df.loc[df['matchPeriod'] == '1H']
    first_half_length = list(firsthalf['eventSec'])[-1]
    df.loc[ df.matchPeriod == '2H' , 'eventSec'] += first_half_length
    
    #Take out extra time / penalties for the cup games
    df = df.loc[df['matchPeriod'] != 'E1']
    df = df.loc[df['matchPeriod'] != 'E2']
    df = df.loc[df['matchPeriod'] != 'P']
    
    return df

In [113]:
def event_handling(df):
    # Cleans things up based on our interpretation of the events and how they relate to passes. Note the meaning of
    # the events:
    # 1 - duels (this is a hard one to define as it means many different things)
    # 2 - fouls
    # 3 - free kicks / corners (hence we add a node so that it picks these up as the first play in the chain)
    # 4 - keeper leaving their line
    # 7 - Touch, acceletation or clearance (also vague- get rid of touch/acc but keep clearance if its accurate)
    # 8 - Pass - simple/smart, cross (main data we're after for this)
    # 9 - Reflexes / save (however if tagged as inaccurate it means there was a goal)
    # 10- Shot (add a shot node)
    
    df = df.loc[ df['eventId'] != 9]
    df = df.loc[ df['eventId'] != 1]
    df = df.loc[ df['eventId'] != 2]
    df = df.loc[ df['eventId'] != 4]
    df = df.loc[ df['eventId'] != 7]
    df = df.loc[ df['subEventName'] != 'Touch']
    df = df.loc[ df['subEventName'] != 'Acceleration']
    df = df.loc[ (df['subEventName'] != 'Clearance') | (df['Result'] == 'Inaccurate')]
    df = df.drop(df[(df.subEventName == 'Clearance') & (df.Result =='Inaccurate')].index)

    # Create some new columns to see where passes ended up etc
    df["next_event"] = df['eventId'].shift(-1)
    df["next_teamId"] = df["teamId"].shift(-1)
    df["next_player"] = df["playerId"].shift(-1)
    df.loc[ df["next_event"] == 3, 'next_player'] = 0.5
    df.loc[ df["eventId"] == 10, 'next_player'] = 0
    
    return df

In [114]:
def passeslist(df, teamIds):
    
    # Create a Passes column which only has successive events between the same team 
    df = df.drop( df[(df.teamId != df.next_teamId) & (df.next_player != 0)].index)
    df['Passes'] = list(zip(df['playerId'], df['next_player']))
    
    # Get a passes list for each team
    passes = []
    for team in teamIds:    
        temp = df.loc[df['teamId'] == team]
        passes.append(list(temp['Passes']))

    return passes[0], passes[1]

In [115]:
def weighted_passes(passes):
    # Builds a network for the team based on the passes as edges
    n = nx.DiGraph()
    for (i,j) in passes:
        if n.has_edge(i,j):
            n[i][j]['weight'] += 1
        else:
            n.add_edge(i,j, weight = 1)
    
    # Remove node 0.5 as only used to signify start of plays from free kicks / corners
    if 0.5 in n.nodes():
        n.remove_node(0.5)
        
    return n

## Entropy

In [116]:
# Defines a function to calculate the entropy of each node from the transition matrix.
def entropy(row):
    enode = 0
    for item in row:
        if item > 0:
            enode -= item * math.log10(item) # Entropy formula for each node.
    return enode

In [194]:
def find_entropy(net,playerlist):
    # This function attempts to calculate a team's entropy- i.e. their unpredictability.
    
    n = net
    n = nx.relabel_nodes(n, {0:10000000000})
    
    # A stochastic graph gives the probabilities of connections between certain platers       
    prob = nx.stochastic_graph(n)
    
    # Allows us to sort nodes alphabetically so that we can easily remove shot node later
    nodelist = sorted(list(n.nodes()))
    
    # Generates transition matrix
    probmat = nx.adjacency_matrix(prob, nodelist = nodelist)
    probmat = probmat.todense()
    matrix = np.array(probmat) # turns it into numpy array so that we can operate on it

    #entnode =  np.apply_along_axis( entropy, axis=1, arr = matrix ) # calculates entropy for each node
    
    entnode = []
    for row in matrix:
        e = 0
        for item in row:
            if item >0:
                e -= item * math.log10(item)
        entnode.append(e)
    
    entnode = entnode[:-1] # Gets rid of entropy of shot node
    
#     matchid = []
#     teamid = []
#     for i in range(len(entnode)):
#         matchid.append(match)
#         teamid.append(team)

#     entrop = sum(entnode)/len(n.nodes()) # Gets the average
        
    return entnode

### Betweenness

In [195]:
#betweenness
def find_betweenness(net,playerlist):
    
    n = net

    dic = nx.betweenness_centrality(n, normalized = False, weight = 'weight') #Just uses nx built ins. # weight?
    
    met = []
    for player in playerlist:
        met.append(dic[player])
    
#     playerlist = list(bet.keys())
#     betweenness = list(bet.values())
    
#     matchid = []
#     teamid = []
#     for i in range(len(playerlist)):
#         matchid.append(match)
#         teamid.append(team)
    
    return met

### Closeness 

In [196]:
def find_closeness(net,playerlist):
    
    n = net

    out_close = nx.algorithms.centrality.closeness_centrality(n.reverse())
    in_close  = nx.algorithms.centrality.closeness_centrality(n)
    
    met = []
    for player in playerlist:
        met.append((out_close[player]+in_close[player])/2)
    
#     co = list(out_close.values())
#     ci = list(in_close.values())
#     summ = [(co[i] + ci[i])/2 for i in range(len(co))]
#     close = {}
#     keys = list(out_close.keys())
#     for i in range(len( summ ) ):
#         close[keys[i]] = summ[i]
        
#     playerlist = list(close.keys())
#     closeness = list(close.values())
    
#     matchid = []
#     teamid = []
#     for i in range(len(playerlist)):
#         matchid.append(match)
#         teamid.append(team)
    
    # The closeness centrality uses inward distance to a node, not outward. If you want to use outward 
    # distances apply the function to G.reverse(). We want both, with equal weighting, so we add them up.
    #close =  np.mean( sum(list( out_close.values() ))/11 + sum(list(in_close.values()))/11 )
    
    return met

### Clustering 

In [197]:
def find_clustering(net, playerlist):
    n = net
    
    dic = nx.clustering(n)
    #clus = sum( list(dict(cluplayers).values())) /11
    # clus = nx.algorithms.transitivity(n)
    # Transitivity and clustering are similarly defined but slightly different. Transitivity gives more 
    # weight to higher degree nodes, and clustering gives more weight to lower degree nodes.
    
    met = []
    for player in playerlist:
        met.append(dic[player])
    
#     playerlist = list(cluplayers.keys())
#     clustering = list(cluplayers.values())
    
#     matchid = []
#     teamid = []
#     for i in range(len(playerlist)):
#         matchid.append(match)
#         teamid.append(team)
    
    
    return met

## eVector centrality

In [198]:
def find_evec(net, playerlist):
    
    n = net
    
    # Gets list of every player's eigenvector centrality and takes the standard deviation of that list.
    dic = nx.algorithms.centrality.eigenvector_centrality(n,max_iter = 10000)
    
    evec_cent = []
    for player in playerlist:
        evec_cent.append(dic[player])
    
#     playerlist = list(evec.keys())
#     #evecstd = np.std(evec)
#     evec_cent = list(evec.values())
    
#     matchid = []
#     teamid = []
#     for i in range(len(playerlist)):
#         matchid.append(match)
#         teamid.append(team)
    
    return evec_cent

### Outdegree

In [199]:
# out degree
def find_outdeg(net,playerlist):
    n = net
    # Gives dictionaries of the metrics for each node.
    dic = dict ( n.out_degree() )
    
    degree = []
    for player in playerlist:
        degree.append(dic[player])
    
    return degree

## Metrics Function

In [200]:
def metric_calc(net,metric, playerlist):
    
    if metric == 'passes':
        met = 0
        for item in list(net.edges(data=True)):
            met += item[2]['weight']
    elif metric == 'entropy':
        met = find_entropy(net, playerlist)
    elif metric == 'outdegree':
        met = find_outdeg(net, playerlist)
    elif metric == 'indegree':
        met = find_indeg(net)
    elif metric == 'outstrength':
        met = find_outstr(net)
    elif metric == 'instrength':
        met = find_instr(net)
    elif metric == 'betweenness':
        met = find_betweenness(net, playerlist)
    elif metric == 'nonzero_betweenness':
        met = find_nonzero(net)
    elif metric == 'closeness':
        met = find_closeness(net, playerlist)
    elif metric == 'clustering':
        met = find_clustering(net, playerlist)
    elif metric == 'eigenvector':
        met = find_evec(net, playerlist)

    return met

In [262]:
def time_window(df,time,teams, metric, match):
    # This function iterates sliding 10 minute windows, moving forward by 30 seconds each time and calculate a 
    # team's entropy during that 30 second window

    metrics1 = []
    metrics2 = []

    # locate the window in the database
    temp_df = df.loc[df['eventSec'] < time]
    #temp_df = temp_df.loc[temp_df['eventSec'] > time-600]

    # Get pass list, create a network, and find that network's entropy
    pass1, pass2 = passeslist(temp_df, teams)

    net1 = weighted_passes(pass1)
    net2 = weighted_passes(pass2)

    # The team's total amount of passes during that 10 minute window 
    tot1 = 0
    tot2 = 0
    for item in list(net1.edges(data=True)):
        tot1 += item[2]['weight']
    for item in list(net2.edges(data=True)):
        tot2 += item[2]['weight']

    metric_names = ['entropy','outdegree','betweenness','closeness','clustering','eigenvector']
    
    playerlist1 = sorted(list(net1.nodes()))
    playerlist2 = sorted(list(net2.nodes()))

    if 0 in playerlist1:
        playerlist1 = playerlist1[1:]

    if 0 in playerlist2:
        playerlist2 = playerlist2[1:]

    matchid = []
    for i in range(len(playerlist1)+len(playerlist2)):
        matchid.append(match)

    teamid = []
    for i in range(len(playerlist1)):
        teamid.append(teams[0])
    for i in range(len(playerlist2)):
        teamid.append(teams[1])
    
    mets = []
    for metric in metric_names:
        
        if tot1 > 0:
            met1 = metric_calc(net1, metric, playerlist1)
        else:
            met1 = 0
        if tot2 > 0:
            met2 = metric_calc(net2, metric, playerlist2)
        else:
            met2 = 0
            
        mets.append(met1 + met2)
    
    players = playerlist1 + playerlist2
    
    results = list(zip(matchid,teamid,players,mets[0],mets[1],mets[2],mets[3],mets[4],mets[5]))
    
    metric_df = pd.DataFrame(results, columns = ['matchId','teamId','playerId','entropy','outdegree','betweenness',
                                                 'closeness','clustering','eigenvector'])
    
    return metric_df

In [263]:
def game_iterator(master_df,league, metric,time):
    
    #Shows what the longest half is and converts to longest possible match
    #longest_game = math.ceil(max(master_df['eventSec'])*2 / 60)
    
    matches = list(set(master_df['matchId']))
    
    mins = []
    for i in range(10,170):
        mins.append(i/2)
        
    results = []
    
    metric_df = pd.DataFrame()
    for m in range(len(matches)):

        #print(m)
        match = matches[m]

        df = get_game(master_df,match)
        df = continuous_time(df)
        df = event_handling(df)

        teams = list(set(df['teamId']))
        
        metric_df = pd.concat([metric_df, time_window(df,time, teams, metric, match)],ignore_index = True )

#         met1, met2 = time_window(df,time, teams, metric, match)
        
#         for item in met1:
#             results.append(item)
#         for item in met2:
#             results.append(item)
    
#     metric_df = pd.DataFrame(results, columns = ['matchId','teamId','playerId','outdegree'])
    
    return metric_df

In [264]:
# Read in teams df
Teams = pd.read_json('teams.json')
Teams = Teams.rename(columns = {'wyId': 'teamId'})

In [269]:
def main(cleaned_events):
    
    #metric_names = ['passes','entropy','outdegree','indegree','outstrength','instrength','betweenness',
    #             'nonzero_betweenness','closeness','clustering','eigenvector_std']
    metric='outdegree'
    
    times = []
    for time in range(600, 6000, 600):
        times.append(time)
    
    #for metric in metric_names:
    for t in times:
        print(t/60)
        
        live_df = pd.DataFrame()
        for name in cleaned_events:
            df = initial_df_processing(name)
        
            met_df = game_iterator(df, name, metric, t)
            
            live_df = pd.concat([live_df,met_df],ignore_index = True)
        
        live_df.to_json('MetAggs/met_live_'+str(int(t/60)))

In [270]:
cleaned_events

['cleaned_events_England.json',
 'cleaned_events_France.json',
 'cleaned_events_Spain.json',
 'cleaned_events_European_Championship.json',
 'cleaned_events_World_Cup.json',
 'cleaned_events_Germany.json',
 'cleaned_events_Italy.json']

In [271]:
main(cleaned_events)

10.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


20.0
30.0
40.0
50.0
60.0
70.0
80.0
90.0


In [272]:
allDone()

In [210]:
os.mkdir('MetAggs/')

In [160]:
agg = pd.read_json('/Users/Yuval/Desktop/Pawsey/data/ClusteringAggs/clustering_live_90')

In [161]:
agg

Unnamed: 0,matchId,teamId,playerId,clustering
0,2499719,1609,25413,0.732759
1,2499719,1609,370224,0.783951
2,2499719,1609,3319,0.743781
3,2499719,1609,120339,0.779006
4,2499719,1609,167145,0.731707
...,...,...,...,...
56198,2576338,3185,21234,0.515337
56199,2576338,3185,0,1.000000
56200,2576338,3185,354552,0.297619
56201,2576338,3185,14745,0.200000


In [55]:
# I forgot to remove shot node!
def remove_shot_node(df):
    df = df.loc[df['playerId']!=0]
    return df

In [163]:
for file in os.listdir('EigenvectorAggs/'):
    df = pd.read_json('EigenvectorAggs/'+file)
    df = remove_shot_node(df)
    df.to_json('EigenvectorAggs/'+file)

In [173]:
name = 'cleaned_events_Italy.json'
df = initial_df_processing(name)
matches = list(set(df['matchId']))
metric = 'entropy'
match = matches[0]
df = get_game(df,match)
df = continuous_time(df)
df = event_handling(df)
teams = list(set(df['teamId']))
pass1, pass2 = passeslist(df, teams)
net1 = weighted_passes(pass1)
net2 = weighted_passes(pass2)
met1 = metric_calc(net1, metric)
met2 = metric_calc(net2, metric)

TypeError: '<' not supported between instances of 'float' and 'str'