# Understanding Network Analysis using NetworkX with an example from the 2022 FIFA World Cup: South Korea vs Portugal Match

* This notebook is adapted from https://github.com/indrag49/football-analysis-project, with some errors fixed.

## 1. Import required Python libraries

In [None]:
from statsbombpy import sb
import matplotlib.pyplot as plt
from mplsoccer.pitch import Pitch
import seaborn as sns
import numpy as np
import pandas as pd
import networkx as nx
import math

## 2. Get Competitions data from https://statsbomb.com/

In [None]:
sb.competitions()

## 3. Extract 2022 FIFA World Cup data from Competitions data

In [None]:
# List the matches for competition_id=43 / season_id=106 (the 2022 FIFA World Cup,
# per this section's heading); head(60) limits the display to the first 60 rows.
sb.matches(competition_id=43, season_id=106).head(60)

## 4. Extract South Korea vs Portugal Match data

In [None]:
# Load every event recorded for match 3857262 (South Korea vs Portugal, per this
# section's heading).
events=sb.events(match_id=3857262)
# Preview the first rows.
events.head()

## 5. Analyze South Korea vs Portugal Match

### 5.1 Rearrange Dataframe using several column names

In [None]:
# Keep only the columns that the pass analysis below relies on.
pass_columns = ['team', 'type', 'minute', 'location', 'pass_end_location', 'pass_outcome', 'player']
events_1 = events[pass_columns]
events_1

### 5.2 Check all players

In [None]:
events_1.player.unique()

### 5.3 Select "Heung-Min Son" to analyze his performance

In [None]:
# Restrict the frame to Heung-Min Son's events. reset_index() renumbers the rows
# from 0 and keeps the previous row labels in a new 'index' column.
is_son = events_1['player'] == 'Heung-Min Son'
events_1 = events_1.loc[is_son].reset_index()
events_1

### 5.4 Check types of Performance

In [None]:
events_1.type.unique()

### 5.5 "Pass" Analysis

#### 5.5.1 Select "Pass" 

In [None]:
# Keep only the rows whose event type is 'Pass'.
is_pass = events_1['type'] == 'Pass'
events_1 = events_1[is_pass]
events_1

#### 5.5.2 Display his passes during the match

In [None]:
# Start coordinates of every pass, used for the density (KDE) layer.
Loc = pd.DataFrame(events_1['location'].to_list(), columns=['x', 'y'])

pitch = Pitch(pitch_type='statsbomb',
              pitch_color='grass', line_color='#c7d5cc',
              stripe=True)
fig, ax = pitch.draw(figsize=(13.5, 8), constrained_layout=True, tight_layout=False)

kde = pitch.kdeplot(
    Loc['x'], Loc['y'], ax=ax,
    fill=True, levels=100,
    # thresh=0 shades the lowest density level too, so the whole pitch gets
    # some color. It replaces the deprecated shade_lowest=True, which already
    # forced thresh to 0 and made the former thresh=0.05 ineffective.
    thresh=0,
    alpha=0.5,
    cut=4,  # extended the cut so it reaches the bottom edge
    cmap='gnuplot')

# Line/marker color per pass outcome; any other outcome (including NaN, which
# marks a successful pass) is drawn in black.
outcome_colors = {
    'Incomplete': 'red',
    'Unknown': 'red',
    'Pass Offside': 'blue',
    'Out': 'yellow',
}

# Iterate over the values directly instead of looking rows up by label: after
# the 'Pass' filter the index is no longer 0..n-1, so events_1.pass_outcome[i]
# would raise KeyError (or pick the wrong row).
for start, end, outcome in zip(events_1['location'],
                               events_1['pass_end_location'],
                               events_1['pass_outcome']):
    color = outcome_colors.get(outcome, 'black')
    plt.plot((start[0], end[0]), (start[1], end[1]), color=color)
    plt.scatter(start[0], start[1], color=color)

#### 5.5.3 Plot the Ratio of Successful Passes

In [None]:
# Label every pass: a missing pass_outcome (NaN) marks a successful pass, any
# recorded outcome marks an unsuccessful one.
# isna() works row by row, so it does not depend on the index labels; the
# filtered frame no longer has a 0..n-1 index, which made the former
# events_1.pass_outcome[i] lookups fail.
data = np.where(events_1['pass_outcome'].isna(), 'successful', 'unsuccessful').tolist()

In [None]:
from matplotlib.ticker import PercentFormatter

# Give each pass a weight of 1/N so that the bar heights are fractions of all passes.
share_per_pass = np.ones(len(data)) / len(data)
plt.hist(data, weights=share_per_pass)

# Show the y axis as percentages (1 corresponds to 100%).
y_axis = plt.gca().yaxis
y_axis.set_major_formatter(PercentFormatter(1))

### 5.6 "Shot" Analysis

#### 5.6.1 Select Columns for Shot

In [None]:
# Keep only the columns that the shot analysis below relies on.
shot_columns = ['team', 'type', 'minute', 'location', 'shot_end_location', 'shot_outcome', 'player']
events_2 = events[shot_columns]
events_2

In [None]:
# Keep only the rows whose event type is 'Shot'.
is_shot = events_2['type'] == 'Shot'
events_2 = events_2[is_shot]
events_2

#### 5.6.2 Check Shot Outcome types

In [None]:
events_2.shot_outcome.unique()

#### 5.6.3 Extract X, Y location for visualization

In [None]:
# Split the shots by team. reset_index() renumbers each frame from 0 (keeping the
# old labels in an 'index' column) so rows can later be addressed as 0..n-1.
by_south_korea = events_2['team'] == 'South Korea'
by_portugal = events_2['team'] == 'Portugal'
shots_South_Korea = events_2.loc[by_south_korea].reset_index()
shots_Portugal = events_2.loc[by_portugal].reset_index()

In [None]:
# Expand South Korea's shot 'location' lists into separate x / y columns.
shots_Loc_South_Korea = pd.DataFrame(
    shots_South_Korea['location'].to_list(),
    columns=['x', 'y'],
)
shots_Loc_South_Korea

In [None]:
# Expand Portugal's shot 'location' lists into separate x / y columns.
shots_Loc_Portugal = pd.DataFrame(
    shots_Portugal['location'].to_list(),
    columns=['x', 'y'],
)
shots_Loc_Portugal

#### 5.6.4 South Korea's Attack to Portugal

In [None]:
pitch = Pitch(pitch_type='statsbomb', pitch_color='#22312b', line_color='#c7d5cc')
fig, ax = pitch.draw(figsize=(13.5, 8), constrained_layout=True, tight_layout=False)

kde = pitch.kdeplot(
    shots_Loc_South_Korea['x'],
    shots_Loc_South_Korea['y'], ax=ax,
    fill=True, levels=100,
    # thresh=0 shades the lowest density level too, so the whole pitch gets
    # some color. It replaces the deprecated shade_lowest=True, which already
    # forced thresh to 0 and made the former thresh=0.05 ineffective.
    thresh=0,
    alpha=0.7,
    cut=4,  # extended the cut so it reaches the bottom edge
    cmap='Pastel1')

# Arrow/marker color per shot outcome: goals in green, blocked or saved shots in
# red, every other outcome in orange.
shot_colors = {'Goal': 'green', 'Blocked': 'red', 'Saved': 'red'}

# Only the first two coordinates (x, y) of each location are used.
for start, end, outcome in zip(shots_South_Korea['location'],
                               shots_South_Korea['shot_end_location'],
                               shots_South_Korea['shot_outcome']):
    color = shot_colors.get(outcome, 'orange')
    pitch.arrows(start[0], start[1], end[0], end[1], ax=ax, color=color, width=3)
    pitch.scatter(start[0], start[1], ax=ax, color=color, alpha=1)

#### 5.6.5 Portugal's Attack to South Korea

In [None]:
pitch = Pitch(pitch_type='statsbomb', pitch_color='#22312b', line_color='#c7d5cc')
fig, ax = pitch.draw(figsize=(13.5, 8), constrained_layout=True, tight_layout=False)

kde = pitch.kdeplot(
    shots_Loc_Portugal['x'],
    shots_Loc_Portugal['y'], ax=ax,
    fill=True, levels=100,
    # thresh=0 shades the lowest density level too, so the whole pitch gets
    # some color. It replaces the deprecated shade_lowest=True, which already
    # forced thresh to 0 and made the former thresh=0.05 ineffective.
    thresh=0,
    alpha=0.7,
    cut=4,  # extended the cut so it reaches the bottom edge
    cmap='Pastel1')

# Arrow/marker color per shot outcome: goals in green, blocked or saved shots in
# red, every other outcome in orange.
shot_colors = {'Goal': 'green', 'Blocked': 'red', 'Saved': 'red'}

# Only the first two coordinates (x, y) of each location are used.
for start, end, outcome in zip(shots_Portugal['location'],
                               shots_Portugal['shot_end_location'],
                               shots_Portugal['shot_outcome']):
    color = shot_colors.get(outcome, 'orange')
    pitch.arrows(start[0], start[1], end[0], end[1], ax=ax, color=color, width=3)
    pitch.scatter(start[0], start[1], ax=ax, color=color, alpha=1)

### 5.7 "Pass" Network Analysis

#### 5.7.1 Select Columns for Pass

In [None]:
# Keep only the columns that the pass-network analysis below relies on.
network_columns = [
    'minute', 'second', 'team', 'location', 'period',
    'type', 'pass_outcome', 'player', 'position', 'pass_end_location',
]
events_3 = events[network_columns]
events_3

#### 5.7.2 Check Pass Outcome types

In [None]:
events_3.pass_outcome.unique()

#### 5.7.3 Select South Korea Team

In [None]:
# Keep only the events attributed to South Korea.
is_south_korea = events_3['team'] == 'South Korea'
events_South_Korea = events_3.loc[is_south_korea]
events_South_Korea

#### 5.7.4 Create new Columns: "passer" and "recipient" considering time

In [None]:
# Add the passer and the recipient columns.
# assign() builds a new frame instead of writing into a slice of events_3, which
# avoids pandas' SettingWithCopyWarning.
# NOTE(review): the recipient is taken to be the player of the next South Korea
# row (shift(-1)), which presumes the rows are in chronological order - confirm
# the ordering returned by sb.events().
events_South_Korea = events_South_Korea.assign(
    passer=events_South_Korea['player'],
    recipient=events_South_Korea['player'].shift(-1),
)

In [None]:
# Check added columns
events_South_Korea

#### 5.7.5 Select "Pass" Type from the DataFrame

In [None]:
# South Korea events whose type is 'Pass'.
pass_rows = events_South_Korea['type'] == 'Pass'
passes_South_Korea = events_South_Korea.loc[pass_rows]
passes_South_Korea

#### 5.7.6 Extract "Successful Passes" Type which has "nan" in the DataFrame

In [None]:
# Successful passes are the ones without a recorded pass_outcome (NaN).
no_outcome = passes_South_Korea['pass_outcome'].isna()
successful_South_Korea = passes_South_Korea.loc[no_outcome]
successful_South_Korea

In [None]:
# Number of successful passes without a recipient; 0 means no NaNs are present.
# (The former sum(isnull() + 1) returned the row count plus the NaN count, which
# is never 0 for a non-empty frame and so could not confirm the absence of NaNs.)
successful_South_Korea['recipient'].isnull().sum()

#### 5.7.7 Select data before the first substitution took place for Pass Network

In [None]:
# South Korea events whose type is 'Substitution'.
is_substitution = events_South_Korea['type'] == 'Substitution'
subs_South_Korea = events_South_Korea.loc[is_substitution]
subs_South_Korea

In [None]:
# Time when the first substitution took place: the earliest minute, then the
# earliest second among the substitutions made in that minute.
first_sub_South_Korea_minute = subs_South_Korea['minute'].min()
in_first_sub_minute = subs_South_Korea['minute'] == first_sub_South_Korea_minute
first_sub_South_Korea_minute_df = subs_South_Korea[in_first_sub_minute]
first_sub_South_Korea_second = first_sub_South_Korea_minute_df['second'].min()
(first_sub_South_Korea_minute, first_sub_South_Korea_second)

In [None]:
# Keep only the passes played before the first substitution took place.
# A pass is earlier when its minute is smaller, or when it is in the same minute
# with a smaller second. The former test (minute <= m) & (second < s) wrongly
# dropped every pass of an earlier minute whose second was >= s.
pass_minute = successful_South_Korea['minute']
pass_second = successful_South_Korea['second']
before_first_sub = (pass_minute < first_sub_South_Korea_minute) | (
    (pass_minute == first_sub_South_Korea_minute)
    & (pass_second < first_sub_South_Korea_second)
)
successful_South_Korea = successful_South_Korea[before_first_sub]
successful_South_Korea

#### 5.7.8 Create new columns to add passer locations (x, y) and receiver locations (end_x, end_y)

In [None]:
# Expand the pass start locations into x / y columns ...
pass_loc_South_Korea = pd.DataFrame(
    successful_South_Korea['location'].to_list(),
    columns=['x', 'y'],
)
# ... and the pass end locations into end_x / end_y columns.
pass_end_loc_South_Korea = pd.DataFrame(
    successful_South_Korea['pass_end_location'].to_list(),
    columns=['end_x', 'end_y'],
)
pass_end_loc_South_Korea

In [None]:
# Renumber the rows from 0 so they line up with the freshly built coordinate
# frames (the previous labels are kept in an 'index' column), then copy the
# coordinate columns over.
successful_South_Korea = successful_South_Korea.reset_index()
for column in ('x', 'y'):
    successful_South_Korea[column] = pass_loc_South_Korea[column]
for column in ('end_x', 'end_y'):
    successful_South_Korea[column] = pass_end_loc_South_Korea[column]
successful_South_Korea

In [None]:
# The list-valued location columns are no longer needed once x/y/end_x/end_y exist.
successful_South_Korea.drop(columns=['location', 'pass_end_location'], inplace=True)
successful_South_Korea

In [None]:
# Every remaining row is a successful pass (its outcome was NaN), so replace the
# NaN with an explicit label.
successful_South_Korea['pass_outcome'] = 'successful'
successful_South_Korea

#### 5.7.9  Calculate the average locations of the passer

In [None]:
# Per passer: mean x, mean y and the number of successful passes (count of y).
passes_by_passer = successful_South_Korea.groupby('passer')
avg_loc_South_Korea = passes_by_passer.agg({'x': ['mean'], 'y': ['mean', 'count']})
avg_loc_South_Korea

In [None]:
# Replace the two-level column labels produced by agg() with flat names, in the
# order (x mean, y mean, y count).
avg_loc_South_Korea.columns=['x', 'y', 'count']
avg_loc_South_Korea

#### 5.7.10  Create Pass Network Plot between Each Player

In [None]:
# Number of passes for each (passer, recipient) pair, obtained by counting the
# 'index' column that reset_index() added earlier.
pair_groups = successful_South_Korea.groupby(['passer', 'recipient'])
pass_bet_South_Korea = pair_groups['index'].count().reset_index()
pass_bet_South_Korea

In [None]:
# The counted 'index' column holds the number of passes per pair; name it accordingly.
pass_bet_South_Korea.rename(columns={'index': 'pass_count'}, inplace=True)
pass_bet_South_Korea

In [None]:
# Attach each passer's average position (x, y) and pass count by matching the
# 'passer' column against the index of avg_loc_South_Korea (the passer names).
pass_bet_South_Korea = pass_bet_South_Korea.merge(avg_loc_South_Korea, left_on = 'passer', right_index=True)
pass_bet_South_Korea

In [None]:
# Attach each recipient's average position the same way; the clashing columns
# from this second merge get the '_end' suffix (x_end, y_end, count_end).
# merge() defaults to an inner join, so pairs whose recipient has no entry in
# avg_loc_South_Korea are dropped.
pass_bet_South_Korea = pass_bet_South_Korea.merge(avg_loc_South_Korea, left_on = 'recipient', right_index=True, suffixes=['', '_end'])
pass_bet_South_Korea

In [None]:
pitch = Pitch(pitch_type='statsbomb', pitch_color='grass', line_color='#c7d5cc')
fig, ax = pitch.draw(figsize=(13.5, 8), constrained_layout=True, tight_layout=False)

# One arrow per (passer, recipient) pair, from the passer's average position to
# the recipient's average position.
links = pass_bet_South_Korea
arrows = pitch.arrows(
    links.x, links.y, links.x_end, links.y_end,
    ax=ax, width=5, headwidth=3, color='white', zorder=1, alpha=0.5,
)

# One marker per passer at that passer's average position.
positions = avg_loc_South_Korea
nodes = pitch.scatter(
    positions.x, positions.y,
    s=400, color='red', edgecolors='black', linewidth=2.5, alpha=1, zorder=1, ax=ax,
)

ax.set_title('Pass Network of South Korea against Portugal on 2022 FIFA World Cup (South Korea 2-1 Portugal)', size=20)

## 6. Apply Networkx 

### 6.1  Create a DataFrame 

In [None]:
# Edge list for the graph: who passed to whom, and how often.
edge_columns = ['passer', 'recipient', 'pass_count']
graph_South_Korea = pass_bet_South_Korea[edge_columns]
graph_South_Korea

In [None]:
graph_South_Korea.passer.unique()

In [None]:
# Jersey number of each South Korea player appearing in the pass network.
player_number_dict = {'Gue-Sung Cho': 9, 'Heung-Min Son': 7, 'In-Beom Hwang': 6, 'Jae-Sung Lee': 10,
                      'Jin-Su Kim': 3, 'Kang-In Lee': 18, 'Kyung-Won Kwon': 20, 'Moon-Hwan Kim': 15,
                      'Seung-Gyu Kim': 1, 'Woo-Young Jung': 5, 'Young-Gwon Kim': 19}

# Replace the player names with jersey numbers (names missing from the dict
# become NaN). assign() builds a new frame instead of writing into a slice of
# pass_bet_South_Korea, which avoids pandas' SettingWithCopyWarning.
graph_South_Korea = graph_South_Korea.assign(
    passer=graph_South_Korea['passer'].map(player_number_dict),
    recipient=graph_South_Korea['recipient'].map(player_number_dict),
)

In [None]:
graph_South_Korea

In [None]:
# convert the dataframe to a list of tuples, one (passer, recipient, pass_count)
# tuple per row
L = graph_South_Korea.apply(tuple, axis=1).tolist()
L

### 6.2 Draw the NetworkX graph using Pass Data

In [None]:
# Create Object using DiGraph (Directed Network)
G = nx.DiGraph()
# Each tuple is (source, target, weight); networkx stores the third element
# under the edge attribute named 'weight' by default.
G.add_weighted_edges_from(L)
nx.draw(G, node_size=800, with_labels=True, node_color='red')

### 6.3 Count Degree (the number of link, the number of pass) for each player

In [None]:
# Vertex degrees, the number of link
dict(nx.degree(G))

### 6.4 Count Total Degree (the number of link, the number of pass) for each player

In [None]:
# Degree of every node (jersey number). Called without a weight, nx.degree counts
# the links attached to a node (incoming plus outgoing), not the pass counts
# stored on the edges.
dic = dict(nx.degree(G))
player = list(dic.keys())
degrees = list(dic.values())
degree_South_Korea = pd.DataFrame({'player': player, 'degrees': degrees})

ordered = degree_South_Korea.sort_values(by='degrees')
x_range = range(len(degree_South_Korea.index))
# +1 so that the tick for the largest degree itself is drawn (range() excludes
# its stop value).
y_range = range(math.ceil(max(degree_South_Korea.degrees)) + 1)


plt.stem(ordered['degrees'])
plt.xticks(x_range, ordered['player'])
plt.yticks(y_range)
plt.xlabel("player jersey number")
plt.ylabel("degree (total number of passes played)")
plt.title("Successful passes (degrees) of each player (vertex)", size=15)

### 6.5 Count Passes received (the number of link, the number of pass) by each player

In [None]:
# In-degree of every node (jersey number): the number of incoming links, i.e.
# how many distinct teammates passed to the player (edge weights are not used).
dic = dict(G.in_degree())
player = list(dic.keys())
in_degrees = list(dic.values())
in_degree_South_Korea = pd.DataFrame({'player': player, 'in_degrees': in_degrees})

ordered = in_degree_South_Korea.sort_values(by='in_degrees')
x_range = range(len(in_degree_South_Korea.index))
# +1 so that the tick for the largest in-degree itself is drawn (range()
# excludes its stop value).
y_range = range(math.ceil(max(in_degree_South_Korea.in_degrees)) + 1)


plt.stem(ordered['in_degrees'])
plt.xticks(x_range, ordered['player'])
plt.yticks(y_range)
plt.xlabel("player jersey number")
plt.ylabel("in degree (total number of passes received)")
plt.title("Successful passes received (indegrees) for each player (vertex)", size=15)

### 6.6 Count Passes given (the number of link, the number of pass) by each player

In [None]:
# Out-degree of every node (jersey number): the number of outgoing links, i.e.
# how many distinct teammates the player passed to (edge weights are not used).
dic = dict(G.out_degree())
player = list(dic.keys())
out_degrees = list(dic.values())
out_degree_South_Korea = pd.DataFrame({'player': player, 'out_degrees': out_degrees})

ordered = out_degree_South_Korea.sort_values(by='out_degrees')
x_range = range(len(out_degree_South_Korea.index))
# +1 so that the tick for the largest out-degree itself is drawn (range()
# excludes its stop value).
y_range = range(math.ceil(max(out_degree_South_Korea.out_degrees)) + 1)


plt.stem(ordered['out_degrees'])
plt.xticks(x_range, ordered['player'])
plt.yticks(y_range)
plt.xlabel("player jersey number")
plt.ylabel("out degree (total number of passes given)")
# The title previously said "(indegrees)" although this plot shows out-degrees.
plt.title("Successful passes given (outdegrees) by each player (vertex)", size=15)

### 6.7 Adjacency matrix using Pass Network Data 

In [None]:
nx.draw(G, node_size=800, with_labels=True, node_color='red')

In [None]:
# Adjacency matrix of the pass network (returned in sparse form); todense()
# expands it for display.
# NOTE(review): rows/columns presumably follow the order of G.nodes() and the
# entries are the edge 'weight' values - confirm against the networkx docs.
A = nx.adjacency_matrix(G)
A.todense()

### 6.8 degree correlation coefficient

In [None]:
# Degree correlation (assortativity) coefficient of the graph.
r_degree = nx.degree_pearson_correlation_coefficient(G)
r_degree

# Node degrees in the order networkx reports them.
dic = dict(nx.degree(G)).values()
D = list(dic)
n = np.shape(A)[0]

# Sum d_i * d_j * A[i, j] over every pair with i < j (upper triangle of A).
S = 0
for i, d_i in enumerate(D[:n]):
    for j in range(i + 1, n):
        S += d_i * D[j] * A[i, j]
print(S)

### Some distance statistics metrics

### 6.9 Shortest Path

In [None]:
def inv(x):
    """Return the reciprocal of x (turns a pass count into a distance)."""
    return 1/x

# Work on an explicit copy so the inverted weights never leak into
# graph_South_Korea and pandas does not emit SettingWithCopyWarning.
GR = graph_South_Korea.copy()
GR['pass_count'] = GR['pass_count'].apply(inv)

# itertuples() keeps each column's own dtype, so the jersey numbers stay
# integers; apply(tuple, axis=1) would upcast them to floats (e.g. 9.0) because
# pass_count is now a float column.
LR = list(GR.itertuples(index=False, name=None))

# Directed graph whose edge 'weight' is the inverse pass count.
G_inw = nx.DiGraph()
G_inw.add_weighted_edges_from(LR)
nx.draw(G_inw, node_size=800, with_labels=True, node_color='green')

In [None]:
# Shortest paths between all pairs of players, using the inverse pass counts as
# edge lengths. add_weighted_edges_from() stores them under the 'weight'
# attribute; the former weight='pass_count' named an attribute that does not
# exist, so every edge silently counted as length 1.
dis = nx.shortest_path(G_inw, weight='weight')
dis

### 6.10 Clustering Coefficient

In [None]:
# Average weighted clustering coefficient. The pass counts live under the
# 'weight' edge attribute (set by add_weighted_edges_from); the former
# weight='pass_count' named a missing attribute, so the result was effectively
# unweighted.
cc = nx.average_clustering(G, weight='weight')
cc

### 6.11 betweenness centrality

In [None]:
# Betweenness centrality treats the edge weight as a distance, so frequent
# passing links must be short: store the inverse pass count on each edge as
# 'distance' and use that. The former weight='pass_count' named an attribute
# that does not exist on the edges, so every edge silently counted as length 1.
for _, _, edge_data in G.edges(data=True):
    edge_data['distance'] = 1 / edge_data['weight']
bc = nx.betweenness_centrality(G, weight='distance')
bc

In [None]:
# Node (jersey number) with the highest betweenness centrality.
max_bc = max(bc, key=bc.get)
max_bc

## Jersey number 19, Young-Gwon Kim, is the most important player in the pass network