# Live Match Data Scraping and Analysis
### Prepared by- Uday Suri

#### Variables:
* In [4]:  Match_id | Match_id_fbref
* In [11]:  hfbref_element- Xpath | afbref_element- Xpath
* In [12]:  hgkfbref_element- Xpath | agkfbref_element- Xpath

## I. Importing Libraries

In [1]:
#importing the necessary libraries
import pandas as pd, numpy as np
#importing json package to read the data
import json
#importing drivers
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
import time
#importing beautiful soup
from bs4 import BeautifulSoup
#importing visualizing libraries
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Arc
import plotly.graph_objects as go
#importing warnings
import warnings
warnings.filterwarnings("ignore")
#importing os module
import os
from os.path import basename
#importing time
import time
#importing pathlib
import pathlib
#importing string
import string

import unicodedata

import difflib

## II. Defining Functions

In [2]:
# defining flatten_nested_json_df to extarct data from dictionaries and put it in different columns
def flatten_nested_json_df(df):

    df = df.reset_index()

    print(f"original shape: {df.shape}")
    print(f"original columns: {df.columns}")


    # search for columns to explode/flatten
    s = (df.applymap(type) == list).all()
    list_columns = s[s].index.tolist()

    s = (df.applymap(type) == dict).all()
    dict_columns = s[s].index.tolist()

    print(f"lists: {list_columns}, dicts: {dict_columns}")
    while len(list_columns) > 0 or len(dict_columns) > 0:
        new_columns = []

        for col in dict_columns:
            print(f"flattening: {col}")
            # explode dictionaries horizontally, adding new columns
            horiz_exploded = pd.json_normalize(df[col]).add_prefix(f'{col}.')
            horiz_exploded.index = df.index
            df = pd.concat([df, horiz_exploded], axis=1).drop(columns=[col])
            new_columns.extend(horiz_exploded.columns) # inplace

        for col in list_columns:
            print(f"exploding: {col}")
            # explode lists vertically, adding new columns
            df = df.drop(columns=[col]).join(df[col].explode().to_frame())
            new_columns.append(col)

        # check if there are still dict o list fields to flatten
        s = (df[new_columns].applymap(type) == list).all()
        list_columns = s[s].index.tolist()

        s = (df[new_columns].applymap(type) == dict).all()
        dict_columns = s[s].index.tolist()

        print(f"lists: {list_columns}, dicts: {dict_columns}")

    print(f"final shape: {df.shape}")
    print(f"final columns: {df.columns}")
    return df

In [3]:
def draw_pitch():
    #Create figure
    fig=plt.figure()
    fig.set_size_inches(20, 13.846)
    fig.patch.set_facecolor('xkcd:black')
    ax=fig.add_subplot(1,1,1)

    #Pitch Outline & Centre Line
    plt.plot([0,0],[0,90], color="white")
    plt.plot([0,130],[90,90], color="white")
    plt.plot([130,130],[90,0], color="white")
    plt.plot([130,0],[0,0], color="white")
    plt.plot([65,65],[0,90], color="white")

    #Left Penalty Area
    plt.plot([16.5,16.5],[65,25],color="white")
    plt.plot([0,16.5],[65,65],color="white")
    plt.plot([16.5,0],[25,25],color="white")

    #Right Penalty Area
    plt.plot([130,113.5],[65,65],color="white")
    plt.plot([113.5,113.5],[65,25],color="white")
    plt.plot([113.5,130],[25,25],color="white")

    #Left 6-yard Box
    plt.plot([0,5.5],[54,54],color="white")
    plt.plot([5.5,5.5],[54,36],color="white")
    plt.plot([5.5,0.5],[36,36],color="white")

    #Right 6-yard Box
    plt.plot([130,124.5],[54,54],color="white")
    plt.plot([124.5,124.5],[54,36],color="white")
    plt.plot([124.5,130],[36,36],color="white")

    #Prepare Circles
    centreCircle = plt.Circle((65,45),9.15,color="white",fill=False)
    centreSpot = plt.Circle((65,45),0.8,color="white")
    leftPenSpot = plt.Circle((11,45),0.8,color="white")
    rightPenSpot = plt.Circle((119,45),0.8,color="white")

    #Draw Circles
    ax.add_patch(centreCircle)
    ax.add_patch(centreSpot)
    ax.add_patch(leftPenSpot)
    ax.add_patch(rightPenSpot)

    #Prepare Arcs
    leftArc = Arc((11,45),height=18.3,width=18.3,angle=0,theta1=310,theta2=50,color="white")
    rightArc = Arc((119,45),height=18.3,width=18.3,angle=0,theta1=130,theta2=230,color="white")

    #Draw Arcs
    ax.add_patch(leftArc)
    ax.add_patch(rightArc)
    
    #Tidy Axes
    plt.axis('off')

## III. Defining Variables And Assigning Necessary Values

In [4]:
# Variable Values [To be changed for each match]
Match_id= 1485382
Match_id_fbref= "28ebbc70"

In [5]:
# Required Column names for each df
position_cols_order1= ['playerId', 'teamId','type.displayName', 'minute', 'second','x', 'y', 'endX', 'endY', 'blockedX', 'blockedY','goalMouthZ', 'goalMouthY','period.displayName','outcomeType.displayName']
position_cols_rename= ["Player","Team","Event Type","Minute","Second","x","y","x_end","y_end","x_block","y_block","z_GoalMouth","y_GoalMouth","Phase","Outcome"]
position_cols_integer= ["Player","Second"]
position_cols_order2=['Event ID','Player', 'Team','Match Details','Date',
       'Time', 'Stadium', 'Event Type',"Next Player", 'Minute',
       'Second', 'x', 'y', 'x_end', 'y_end', 'x_block',
       'y_block', 'z_GoalMouth', 'y_GoalMouth', 'Phase', 'Outcome']
summary_cols_rename= ["Shots","Shots on Target","Key Passes","Passing Accuracy","Aerials Won","Touches","Rating","Team","Player","Position"]
summary_cols_order= ["Player","Team","Position","Shots","Shots on Target","Key Passes","Passing Accuracy","Aerials Won","Touches","Rating"]
offensive_cols_rename= ["Shots","Shots on Target","Key Passes","Dribbles","Fouled","Offside","Dispossessed","Bad Touches","Rating","Team","Player","Position"]
offensive_cols_order= ["Player","Team","Position","Shots","Shots on Target","Key Passes","Dribbles","Fouled","Offside","Dispossessed","Bad Touches","Rating"]
defensive_cols_rename= ["Total Tackles","Interceptions","Clearances","Blocked Shots","Fouls","Rating","Team","Player","Position"]
defensive_cols_order= ["Player","Team","Position","Total Tackles","Interceptions","Clearances","Blocked Shots","Fouls","Rating"]
passing_cols_rename= ["Key Passes","Passes","Passing Accuracy","Crosses","Accurate Crosses","LongBalls","Accurate LongBalls","ThroughBalls","Accurate ThroughBalls","Rating","Team","Player","Position"]
passing_cols_order= ["Player","Team","Position","Key Passes","Passes","Passing Accuracy","Crosses","Accurate Crosses","LongBalls","Accurate LongBalls","ThroughBalls","Accurate ThroughBalls","Rating"]
Fbref_cols_rename= ['Player', 'Jersey No.', 'Nation', 'Position', 'Age', 'Minutes', 'Goals', 'Assists', 'Penalty Scored',
       'Penalty Attempted', 'Shots', 'Shots on Target', 'Yellow Card', 'Red Card', 'Touches', 'Press', 'Tackles', 'Interceptions',
       'Blocks', 'xG', 'non-penalty xG', 'xA', 'Shot Creating Actions', 'Goal Creating Actions', 'Completed Passes', 'Attempted Passes', 'Passing Accuracy',
       'Progressive Pass', 'Carries', 'Progressive Carry', 'Successful Dribbles', 'Dribbles Attempted']
Fbref_cols_order= ['Player','Position', 'Age', 'Minutes', 'Goals',
       'Assists','Yellow Card', 'Red Card', 'Press',
       'Blocks', 'xG', 'non-penalty xG', 'xA',
       'Shot Creating Actions', 'Goal Creating Actions',
       'Dribbles Attempted', 'Progressive Pass',
       'Carries', 'Progressive Carry']
gkfbref_cols_rename= ['Player', 'Nation', 'Age', 'Minutes', 'Shots on Target Against', 'Goals Against', 'Saves', 'Save%',
       'Post Shot xG', 'Completed LongBalls', 'Attempted LongBalls', 'LongBall Accuracy', 'Completed Passes', 'Attempted Passes', 'LongBall/Pass %', 'Average Pass Length(Yds)',
       'GoalKicks Attempted','GK/LongBall %', 'Average GK Length(Yds)', 'Crosses Against', 'Crosses Stopped', 'Cross Stop %', 'Defensive Action(outside Pen Area)', 'Average distance from Goal']
gkfbref_cols_order= ['Player', 'Nation', 'Age', 'Minutes', 'Shots on Target Against', 'Goals Against', 'Saves', 'Save%',
       'Post Shot xG', 'Completed Passes', 'Attempted Passes', 'LongBall/Pass %', 'Average Pass Length(Yds)',
       'GoalKicks Attempted','GK/LongBall %', 'Average GK Length(Yds)', 'Crosses Against', 'Crosses Stopped', 'Cross Stop %', 'Defensive Action(outside Pen Area)', 'Average distance from Goal']
Outfield_stats_cols_order= ['Player', 'Team','Age','Position','Minutes', 'Goals', 'Assists','xG', 'non-penalty xG',
       'xA','Yellow Card', 'Red Card', 'Shots', 'Shots on Target', 'Key Passes',
       'Passing Accuracy', 'Aerials Won', 'Touches', 'Rating', 'Total Tackles',
       'Interceptions', 'Clearances', 'Blocked Shots', 'Fouls','Fouled', 'Offside', 'Dispossessed', 'Bad Touches', 'Passes', 'Crosses',
       'Accurate Crosses', 'LongBalls', 'Accurate LongBalls', 'ThroughBalls',
       'Accurate ThroughBalls', 'Press', 'Blocks', 'Shot Creating Actions', 'Goal Creating Actions',
       'Dribbles Attempted', 'Dribbles','Progressive Pass', 'Carries','Progressive Carry']

## IV. Scraping, Cleaning & Transforming Live Match Data From WhoScored.com & Fbref.com

#### A. Positioning Data

In [6]:
# Using Selenium to define driver
driver = webdriver.Chrome(executable_path ="C:\Program Files (x86)\Google\Chrome\chromedriver.exe")
url= "https://www.whoscored.com/Matches/"+str(Match_id)+"/Live/" # Defining the url
driver.get(url) # Opening the browser window
p_element = driver.find_element_by_xpath('//*[@id="layout-wrapper"]/script[1]') # Finding the desired element using xpath
p_db=p_element.get_attribute('innerHTML') # Getting attributes of an element
driver.close() # Closing the browser window
p_db=p_db.strip() # Removing any space at the start or end.
p_db=p_db[p_db.index(": {")+2:p_db.index(";")] # Using .index() to get the required json data
p_db= p_db[0:p_db.index("\n")-1] # Using .index() to get the required json data
p= json.loads(p_db) # Parse and convert the json data into dictionary
Player= p["playerIdNameDictionary"] # Extracting the Player ID: Player Name data from the main dictionary
Players= dict(Player) # Converting the Player ID: Player Name data into a dictionary
Teams= {p["home"]["teamId"]:p["home"]["name"],p["away"]["teamId"]:p["away"]["name"]} # Creating the Team ID: Team Name dictionary
events= p["events"] # Extracting the events data from the main dictionary

# Converting the events data to a pandas DataFrame
events_df= pd.DataFrame(events)
events_df_flat= flatten_nested_json_df(events_df) # Using the defined function to extarct data from dictionaries and put it in different columns
events_df_flat= events_df_flat[position_cols_order1] # Selecting a sub-set of the original df using the pre-defined column list
events_df_flat= events_df_flat[events_df_flat["period.displayName"]!='PreMatch'] # Dropping the pre-match event details
events_df_flat= events_df_flat[events_df_flat["period.displayName"]!='PostGame'] # Dropping the post-match event details
events_df_flat= events_df_flat.drop_duplicates(subset=None, keep='first') # Dropping the duplicate rows
events_df_flat= events_df_flat.dropna(subset=['playerId']) # Dropping the rows where player details are missing
events_df_flat= events_df_flat.rename(columns=dict(zip(events_df_flat.columns,position_cols_rename))) # Renaming the columns
events_df_flat["Second"]= events_df_flat["Second"].fillna(0) # Filling null values in 'Second' column by 0.
events_df_flat[position_cols_integer]= events_df_flat[position_cols_integer].astype(int) # Converting select columns to integers
events_df_flat["Player"]= events_df_flat["Player"].astype(str) # Converting select columns to string
events_df_flat.replace({"Player": Players}, inplace=True) # Adding player names in place of ID using the players dictionary
events_df_flat.replace({"Team": Teams}, inplace=True) # Adding team names in place of ID using the teams dictionary
events_df_flat["Stadium"]= p["venueName"] # Adding Stadium name column from the original dictionary
events_df_flat["Date-Time"]= p["startTime"] # Adding Date-Time column from the original dictionary
events_df_flat["Date"]= "" # Creating a Date column
events_df_flat["Time"]= "" # Creating a Time column
events_df_flat[["Date","Time"]]=events_df_flat["Date-Time"].str.split("T",expand=True) # Adding values in Date,Time columns
events_df_flat["Date"]= events_df_flat["Date"].astype("datetime64[ns]") # Converting Date column to Datetime
events_df_flat["Match Details"]= p["home"]["name"]+" vs "+p["away"]["name"] # Adding Match Details column from the original dictionary
events_df_flat["Event ID"]= events_df_flat.index # Adding Event ID column from the index values
events_df_flat["Next Player"]= ""
events_df_flat.reset_index(inplace = True, drop = True) #Resetting index values
for i in range(events_df_flat.shape[0]):
    try:
        events_df_flat["Next Player"][i]= events_df_flat["Player"][i+1]
    except:
        pass
events_df_flat=events_df_flat[position_cols_order2] # Selecting a sub-set of the original df using the pre-defined column list
events_df_flat.reset_index(inplace = True, drop = True) #Resetting index values

# Converting all the x,y coordinate values from 100x100 to 130x90
events_df_flat["x"]= events_df_flat["x"]/100*130 
events_df_flat["y"]= events_df_flat["y"]/100*90
events_df_flat["x_end"]= np.where(events_df_flat['x_end'].isna(), events_df_flat['x_end'],events_df_flat['x_end']/100*130)
events_df_flat["y_end"]= np.where(events_df_flat['y_end'].isna(), events_df_flat['y_end'],events_df_flat['y_end']/100*90)
events_df_flat["x_block"]= np.where(events_df_flat['x_block'].isna(), events_df_flat['x_block'],events_df_flat['x_block']/100*130)
events_df_flat["y_block"]= np.where(events_df_flat['y_block'].isna(), events_df_flat['y_block'],events_df_flat['y_block']/100*90)
events_df_flat["y_GoalMouth"]= np.where(events_df_flat['y_GoalMouth'].isna(), events_df_flat['y_GoalMouth'],events_df_flat['y_GoalMouth']/100*90)

original shape: (1706, 27)
original columns: Index(['index', 'id', 'eventId', 'minute', 'second', 'teamId', 'x', 'y',
       'expandedMinute', 'period', 'type', 'outcomeType', 'qualifiers',
       'satisfiedEventsTypes', 'isTouch', 'playerId', 'endX', 'endY',
       'goalMouthZ', 'goalMouthY', 'isGoal', 'isShot', 'blockedX', 'blockedY',
       'relatedEventId', 'relatedPlayerId', 'cardType'],
      dtype='object')
lists: ['qualifiers', 'satisfiedEventsTypes'], dicts: ['period', 'type', 'outcomeType']
flattening: period
flattening: type
flattening: outcomeType
exploding: qualifiers
exploding: satisfiedEventsTypes
lists: [], dicts: []
final shape: (423101, 30)
final columns: Index(['index', 'id', 'eventId', 'minute', 'second', 'teamId', 'x', 'y',
       'expandedMinute', 'isTouch', 'playerId', 'endX', 'endY', 'goalMouthZ',
       'goalMouthY', 'isGoal', 'isShot', 'blockedX', 'blockedY',
       'relatedEventId', 'relatedPlayerId', 'cardType', 'period.value',
       'period.displayName', '

#### B. Match Summary Stats

In [7]:
driver = webdriver.Chrome(executable_path ="C:\Program Files (x86)\Google\Chrome\chromedriver.exe") 
url= "https://www.whoscored.com/Matches/"+str(Match_id)+"/LiveStatistics/" # Defining the url
driver.get(url)
time.sleep(8)
s_element= driver.find_elements_by_xpath('//*[@id="top-player-stats-summary-grid"]')

hsummary_db=s_element[0].get_attribute('innerHTML')
hsummary_db='<table>'+hsummary_db+'</table>'
hsummary_df=pd.read_html(hsummary_db)[0]
hsummary_df.drop(hsummary_df.columns[0], axis=1,inplace=True)
hsummary_df["Team"]= p["home"]["name"]

asummary_db=s_element[1].get_attribute('innerHTML')
asummary_db='<table>'+asummary_db+'</table>'
asummary_df=pd.read_html(asummary_db)[0]
asummary_df.drop(asummary_df.columns[0], axis=1,inplace=True)
asummary_df["Team"]= p["away"]["name"]

driver.close() # Closing the browser window

summary_df= pd.concat([hsummary_df,asummary_df],axis=0)
summary_df.drop("Key Events", axis=1,inplace=True)
summary_df.reset_index(inplace = True, drop = True)
summary_df= summary_df[summary_df["Rating"] != "-"]
summary_df= pd.concat([summary_df,summary_df["Player.1"].str.split(",", 1, expand=True)],axis=1)
summary_df[0]=summary_df[0].str.replace('\d+', '')
summary_df[0]=summary_df[0].str.replace("′", "")
summary_df[0]=summary_df[0].str.replace("(", "")
summary_df[0]=summary_df[0].str.replace(")", "")
summary_df[0]=summary_df[0].str.strip(" ")
summary_df.drop(summary_df.columns[0], axis=1,inplace=True)
summary_df= summary_df.rename(columns=dict(zip(summary_df.columns,summary_cols_rename)))
summary_df= summary_df[summary_cols_order]
summary_df.reset_index(inplace = True, drop = True)

#### C. Match Offensive Stats

In [8]:
driver = webdriver.Chrome(executable_path ="C:\Program Files (x86)\Google\Chrome\chromedriver.exe") 
url= "https://www.whoscored.com/Matches/"+str(Match_id)+"/LiveStatistics/" # Defining the url
driver.get(url)
links= driver.find_elements_by_link_text("Offensive")
links[0].click()
links[1].click()
time.sleep(8)
o_element= driver.find_elements_by_xpath('//*[@id="top-player-stats-summary-grid"]')
hoffensive_db=o_element[1].get_attribute('innerHTML')
hoffensive_db='<table>'+hoffensive_db+'</table>'
hoffensive_df=pd.read_html(hoffensive_db)[0]
hoffensive_df.drop(hoffensive_df.columns[0], axis=1,inplace=True)
hoffensive_df["Team"]= p["home"]["name"]

aoffensive_db=o_element[3].get_attribute('innerHTML')
aoffensive_db='<table>'+aoffensive_db+'</table>'
aoffensive_df=pd.read_html(aoffensive_db)[0]
aoffensive_df.drop(aoffensive_df.columns[0], axis=1,inplace=True)
aoffensive_df["Team"]= p["away"]["name"]

driver.close() # Closing the browser window

offensive_df= pd.concat([hoffensive_df,aoffensive_df],axis=0)
offensive_df.drop("Key Events", axis=1,inplace=True)
offensive_df.reset_index(inplace = True, drop = True)
offensive_df= offensive_df[offensive_df["Rating"] != "-"]
offensive_df= pd.concat([offensive_df,offensive_df["Player.1"].str.split(",", 1, expand=True)],axis=1)
offensive_df[0]=offensive_df[0].str.replace('\d+', '')
offensive_df[0]=offensive_df[0].str.replace("′", "")
offensive_df[0]=offensive_df[0].str.replace("(", "")
offensive_df[0]=offensive_df[0].str.replace(")", "")
offensive_df[0]=offensive_df[0].str.strip(" ")
offensive_df.drop(offensive_df.columns[0], axis=1,inplace=True)
offensive_df= offensive_df.rename(columns=dict(zip(offensive_df.columns,offensive_cols_rename)))
offensive_df= offensive_df[offensive_cols_order]
offensive_df.reset_index(inplace = True, drop = True)

#### D. Match Defensive Stats

In [9]:
driver = webdriver.Chrome(executable_path ="C:\Program Files (x86)\Google\Chrome\chromedriver.exe") 
url= "https://www.whoscored.com/Matches/"+str(Match_id)+"/LiveStatistics/" # Defining the url
driver.get(url)
links= driver.find_elements_by_link_text("Defensive")
links[0].click()
links[1].click()
time.sleep(8)
d_element= driver.find_elements_by_xpath('//*[@id="top-player-stats-summary-grid"]')
hdefensive_db=d_element[1].get_attribute('innerHTML')
hdefensive_db='<table>'+hdefensive_db+'</table>'
hdefensive_df=pd.read_html(hdefensive_db)[0]
hdefensive_df.drop(hdefensive_df.columns[0], axis=1,inplace=True)
hdefensive_df["Team"]= p["home"]["name"]

adefensive_db=d_element[3].get_attribute('innerHTML')
adefensive_db='<table>'+adefensive_db+'</table>'
adefensive_df=pd.read_html(adefensive_db)[0]
adefensive_df.drop(adefensive_df.columns[0], axis=1,inplace=True)
adefensive_df["Team"]= p["away"]["name"]

driver.close() # Closing the browser window

defensive_df= pd.concat([hdefensive_df,adefensive_df],axis=0)
defensive_df.drop("Key Events", axis=1,inplace=True)
defensive_df.reset_index(inplace = True, drop = True)
defensive_df= defensive_df[defensive_df["Rating"] != "-"]
defensive_df= pd.concat([defensive_df,defensive_df["Player.1"].str.split(",", 1, expand=True)],axis=1)
defensive_df[0]=defensive_df[0].str.replace('\d+', '')
defensive_df[0]=defensive_df[0].str.replace("′", "")
defensive_df[0]=defensive_df[0].str.replace("(", "")
defensive_df[0]=defensive_df[0].str.replace(")", "")
defensive_df[0]=defensive_df[0].str.strip(" ")
defensive_df.drop(defensive_df.columns[0], axis=1,inplace=True)
defensive_df= defensive_df.rename(columns=dict(zip(defensive_df.columns,defensive_cols_rename)))
defensive_df= defensive_df[defensive_cols_order]
defensive_df.reset_index(inplace = True, drop = True)

#### E. Match Passing Stats

In [10]:
driver = webdriver.Chrome(executable_path ="C:\Program Files (x86)\Google\Chrome\chromedriver.exe") 
url= "https://www.whoscored.com/Matches/"+str(Match_id)+"/LiveStatistics/" # Defining the url
driver.get(url)
links= driver.find_elements_by_link_text("Passing")
links[0].click()
links[1].click()
time.sleep(8)
pa_element= driver.find_elements_by_xpath('//*[@id="top-player-stats-summary-grid"]')
hpassing_db=pa_element[1].get_attribute('innerHTML')
hpassing_db='<table>'+hpassing_db+'</table>'
hpassing_df=pd.read_html(hpassing_db)[0]
hpassing_df.drop(hpassing_df.columns[0], axis=1,inplace=True)
hpassing_df["Team"]= p["home"]["name"]

apassing_db=pa_element[3].get_attribute('innerHTML')
apassing_db='<table>'+apassing_db+'</table>'
apassing_df=pd.read_html(apassing_db)[0]
apassing_df.drop(apassing_df.columns[0], axis=1,inplace=True)
apassing_df["Team"]= p["away"]["name"]

driver.close() # Closing the browser window

passing_df= pd.concat([hpassing_df,apassing_df],axis=0)
passing_df.drop("Key Events", axis=1,inplace=True)
passing_df.reset_index(inplace = True, drop = True)
passing_df= passing_df[passing_df["Rating"] != "-"]
passing_df= pd.concat([passing_df,passing_df["Player.1"].str.split(",", 1, expand=True)],axis=1)
passing_df[0]=passing_df[0].str.replace('\d+', '')
passing_df[0]=passing_df[0].str.replace("′", "")
passing_df[0]=passing_df[0].str.replace("(", "")
passing_df[0]=passing_df[0].str.replace(")", "")
passing_df[0]=passing_df[0].str.strip(" ")
passing_df.drop(passing_df.columns[0], axis=1,inplace=True)
passing_df= passing_df.rename(columns=dict(zip(passing_df.columns,passing_cols_rename)))
passing_df= passing_df[passing_cols_order]
passing_df.reset_index(inplace = True, drop = True)

#### E. FbRef Outfield Match Stats

In [11]:
driver = webdriver.Chrome(executable_path ="C:\Program Files (x86)\Google\Chrome\chromedriver.exe") 
url= "https://fbref.com/en/matches/"+ Match_id_fbref +str("/") # Defining the url
driver.get(url)
time.sleep(8)

hfbref_element= driver.find_elements_by_xpath('//*[@id="stats_b8fd03ef_summary"]')
hfbref_db= hfbref_element[0].get_attribute('innerHTML')
hfbref_db= '<table>'+hfbref_db+'</table>'
hfbref_df= pd.read_html(hfbref_db)[0]
hfbref_df.columns=hfbref_df.columns.droplevel()
hfbref_df.drop(hfbref_df.tail(1).index,inplace=True)

afbref_element= driver.find_elements_by_xpath('//*[@id="stats_19538871_summary"]')
afbref_db= afbref_element[0].get_attribute('innerHTML')
afbref_db= '<table>'+afbref_db+'</table>'
afbref_df= pd.read_html(afbref_db)[0]
afbref_df.columns=afbref_df.columns.droplevel()
afbref_df.drop(afbref_df.tail(1).index,inplace=True)

driver.close() # Closing the browser window

fbref_df= pd.concat([hfbref_df,afbref_df],axis=0)
fbref_df.reset_index(inplace = True, drop = True)
fbref_df.columns= Fbref_cols_rename
table = str.maketrans('', '', string.ascii_lowercase)
fbref_df["Nation"]= fbref_df["Nation"].apply(lambda x: x.translate(table).strip(" "))
fbref_df["Age"]=fbref_df["Age"].apply(lambda x: x[0:x.index("-")])
fbref_df= fbref_df[Fbref_cols_order]

#### F. FbRef GK Match Stats

In [12]:
driver = webdriver.Chrome(executable_path ="C:\Program Files (x86)\Google\Chrome\chromedriver.exe") 
url= "https://fbref.com/en/matches/"+ Match_id_fbref +str("/") # Defining the url
driver.get(url)
time.sleep(8)

hgkfbref_element= driver.find_elements_by_xpath('//*[@id="keeper_stats_b8fd03ef"]')
hgkfbref_db= hgkfbref_element[0].get_attribute('innerHTML')
hgkfbref_db= '<table>'+hgkfbref_db+'</table>'
hgkfbref_df= pd.read_html(hgkfbref_db)[0]
hgkfbref_df.columns= hgkfbref_df.columns.droplevel()

agkfbref_element= driver.find_elements_by_xpath('//*[@id="keeper_stats_19538871"]')
agkfbref_db= agkfbref_element[0].get_attribute('innerHTML')
agkfbref_db= '<table>'+agkfbref_db+'</table>'
agkfbref_df= pd.read_html(agkfbref_db)[0]
agkfbref_df.columns= agkfbref_df.columns.droplevel()

driver.close() # Closing the browser window

gkfbref_df= pd.concat([hgkfbref_df,agkfbref_df],axis=0)
gkfbref_df.reset_index(inplace = True, drop = True)
gkfbref_df.columns= gkfbref_cols_rename
table = str.maketrans('', '', string.ascii_lowercase)
gkfbref_df["Nation"]= gkfbref_df["Nation"].apply(lambda x: x.translate(table).strip(" "))
gkfbref_df["Age"]=gkfbref_df["Age"].apply(lambda x: x[0:x.index("-")])
gkfbref_df= gkfbref_df[gkfbref_cols_order]

## V. Merging Stats DFs

In [13]:
Outfield_stats_df= pd.merge(summary_df,defensive_df, on=["Player"],how='outer',suffixes=('', '_y'))
Outfield_stats_df.drop(Outfield_stats_df.filter(regex='_y$').columns.tolist(),axis=1, inplace=True)
Outfield_stats_df= pd.merge(Outfield_stats_df,offensive_df, on=["Player"],how='outer',suffixes=('', '_y'))
Outfield_stats_df.drop(Outfield_stats_df.filter(regex='_y$').columns.tolist(),axis=1, inplace=True)
Outfield_stats_df= pd.merge(Outfield_stats_df,passing_df, on=["Player"],how='outer',suffixes=('', '_y'))
Outfield_stats_df.drop(Outfield_stats_df.filter(regex='_y$').columns.tolist(),axis=1, inplace=True)

for i in range(Outfield_stats_df.shape[0]):
    Outfield_stats_df["Player"][i]= str(unicodedata.normalize('NFD', str(Outfield_stats_df["Player"][i])).encode('ascii', 'ignore')).strip("b").strip("'")
for i in range(fbref_df.shape[0]):
    fbref_df["Player"][i]= str(unicodedata.normalize('NFD', str(fbref_df["Player"][i])).encode('ascii', 'ignore')).strip("b").strip("'")

fbref_df["Player"]= fbref_df["Player"].apply(lambda x: difflib.get_close_matches(x, Outfield_stats_df['Player'])[0])
Outfield_stats_df= pd.merge(Outfield_stats_df,fbref_df, on=["Player"],how='outer',suffixes=('', '_y'))
Outfield_stats_df.drop(Outfield_stats_df.filter(regex='_y$').columns.tolist(),axis=1, inplace=True)
Outfield_stats_df= Outfield_stats_df[Outfield_stats_cols_order]
Outfield_stats_df= Outfield_stats_df[Outfield_stats_df["Position"] != "GK"]

In [14]:
Outfield_stats_df.shape[0]== summary_df.shape[0]

True

## VI. Visualizing Data

### Creating Directories

In [15]:
# Parent Directory
parent_dir = "C:/Users/UDAY SURI/Desktop/Live Match Coding/"

# Directory
directory = p["home"]["name"]+" vs "+p["away"]["name"]
d1= 'Heatmaps/'
d2= 'Pass maps/'
d3= 'xgnxgxa Polar Charts/'
d4= 'Final Third Polar Charts/'
d5= 'Bad Actions Polar Charts/'
d6= 'Out of Possession Polar Charts/'
d7= 'Defensive Polar Charts/'
d8= 'Progressive Polar Charts/'
d9= 'Passes Received Heatmap/'

# Directory paths
path = os.path.join(parent_dir, directory)
path1= os.path.join(path, d1)
path2= os.path.join(path, d2)
path3= os.path.join(path, d3)
path4= os.path.join(path, d4)
path5= os.path.join(path, d5)
path6= os.path.join(path, d6)
path7= os.path.join(path, d7)
path8= os.path.join(path, d8)
path9= os.path.join(path, d9)

# Create the directory
if not os.path.exists(path):
    os.mkdir(path)
else:
    pass
if not os.path.exists(path1):
    os.mkdir(path1)
else:
    pass
if not os.path.exists(path2):
    os.mkdir(path2)
else:
    pass
if not os.path.exists(path3):
    os.mkdir(path3)
else:
    pass
if not os.path.exists(path4):
    os.mkdir(path4)
else:
    pass
if not os.path.exists(path5):
    os.mkdir(path5)
else:
    pass
if not os.path.exists(path6):
    os.mkdir(path6)
else:
    pass
if not os.path.exists(path7):
    os.mkdir(path7)
else:
    pass
if not os.path.exists(path8):
    os.mkdir(path8)
else:
    pass
if not os.path.exists(path9):
    os.mkdir(path9)
else:
    pass

### A. Heatmaps

In [16]:
Players= list(events_df_flat["Player"].unique())

In [17]:
for i in range(len(Players)):
    try:
        draw_pitch()
        ax= sns.kdeplot(events_df_flat[events_df_flat["Player"]==str(Players[i])]["x"],events_df_flat[events_df_flat["Player"]==str(Players[i])]["y"],shade=True,cmap="mako",n_levels=100)
        plt.ylim(0,90)
        plt.xlim(0,130)
        fig= ax.get_figure()
        plt.title(str(Players[i])+" : "+str(events_df_flat["Match Details"].unique()[0]), fontsize= 30, color="White")
        plt.savefig(str(path1)+str(events_df_flat[events_df_flat["Player"]==str(Players[i])]["Team"].unique()[0])+" "+str(Players[i])+".png" ,orientation='landscape',facecolor ="black",transparent = True)
    except:
        pass
        print("error")
plt.close('all')

### B. Pass Maps

In [18]:
passmap_df= events_df_flat[events_df_flat["Event Type"]=="Pass"]
passmap_df.reset_index(inplace = True, drop = True)

In [19]:
for n in range(len(Players)):
    try:
        passes= passmap_df[passmap_df["Player"]==str(Players[n])]
        draw_pitch()
        for i in list(passes.index):
            plt.plot([int(passes["x"][i]),int(passes["x_end"][i])],
                     [int(passes["y"][i]),int(passes["y_end"][i])], 
                     color="darkturquoise",alpha=1)
            plt.plot(int(passes["x"][i]),int(passes["y"][i]),"o", color="firebrick",markersize=6,alpha=1)
            plt.title(str(passes["Player"].unique()[0])+" : "+str(passes["Match Details"].unique()[0]), color="white",fontsize= 30)
        plt.ylim(0,90)
        plt.xlim(0,130)
        plt.savefig(str(path2)+str(passes["Team"].unique()[0])+" "+str(Players[n])+".png" ,orientation='landscape',transparent = True, facecolor="black")
    except:
        pass
        print("error")
plt.close('all')

### C. Polar Charts

In [20]:
h_players= list(Outfield_stats_df[Outfield_stats_df["Team"]==p["home"]["name"]]["Player"].unique())

In [21]:
a_players= list(Outfield_stats_df[Outfield_stats_df["Team"]==p["away"]["name"]]["Player"].unique())

#### 1. XG-NXG-XA

In [22]:
Theta1= ['xG', 'non-penalty xG', 'xA']

In [46]:
for i in h_players:
    for j in a_players:
        fig = go.Figure()
        fig.add_trace(go.Scatterpolar(r=Outfield_stats_df[Outfield_stats_df["Player"]==i][Theta1].iloc[0].values.tolist(),theta=Theta1,fill='toself'))
        fig.add_trace(go.Scatterpolar(r=Outfield_stats_df[Outfield_stats_df["Player"]==j][Theta1].iloc[0].values.tolist(),theta=Theta1,fill='toself'))
        fig.update_layout(title={
                'text': str(i)+" vs "+str(j),
                'y':0.98,
                'x':0.5,
                'xanchor': 'center',
                'yanchor': 'top'},template="plotly_dark",polar=dict(radialaxis=dict(visible=True,range=[0, Outfield_stats_df[(Outfield_stats_df["Player"]==i) | (Outfield_stats_df["Player"]==j)][Theta1].max().max()])),showlegend=False,width=700, height=700)
        fig.write_image(str(path3)+str(i)+" vs "+str(j)+".png")

#### 2. Final Third

In [47]:
Theta2= ['Shots on Target','Key Passes','Accurate Crosses','Accurate LongBalls','Accurate ThroughBalls']

In [48]:
for i in h_players:
    for j in a_players:
        fig = go.Figure()
        fig.add_trace(go.Scatterpolar(r=Outfield_stats_df[Outfield_stats_df["Player"]==i][Theta2].iloc[0].tolist(),theta=Theta2,fill='toself'))
        fig.add_trace(go.Scatterpolar(r=Outfield_stats_df[Outfield_stats_df["Player"]==j][Theta2].iloc[0].tolist(),theta=Theta2,fill='toself'))
        fig.update_layout(title={
                'text': str(i)+" vs "+str(j),
                'y':0.98,
                'x':0.5,
                'xanchor': 'center',
                'yanchor': 'top'},template="plotly_dark",polar=dict(radialaxis=dict(visible=True,range=[0, Outfield_stats_df[(Outfield_stats_df["Player"]==i) | (Outfield_stats_df["Player"]==j)][Theta2].max().max()])),showlegend=False,width=700, height=700)
        fig.write_image(str(path4)+str(i)+" vs "+str(j)+".png")

#### 3. Bad Actions

In [49]:
Theta3= ['Yellow Card', 'Red Card','Dispossessed','Bad Touches','Fouls']

In [50]:
for i in h_players:
    for j in a_players:
        fig = go.Figure()
        fig.add_trace(go.Scatterpolar(r=Outfield_stats_df[Outfield_stats_df["Player"]==i][Theta3].iloc[0].tolist(),theta=Theta3,fill='toself'))
        fig.add_trace(go.Scatterpolar(r=Outfield_stats_df[Outfield_stats_df["Player"]==j][Theta3].iloc[0].tolist(),theta=Theta3,fill='toself'))
        fig.update_layout(title={
                'text': str(i)+" vs "+str(j),
                'y':0.98,
                'x':0.5,
                'xanchor': 'center',
                'yanchor': 'top'},template="plotly_dark",polar=dict(radialaxis=dict(visible=True,range=[0, Outfield_stats_df[(Outfield_stats_df["Player"]==i) | (Outfield_stats_df["Player"]==j)][Theta3].max().max()])),showlegend=False,width=700, height=700)
        fig.write_image(str(path5)+str(i)+" vs "+str(j)+".png")

#### 4. Out-of possession

In [51]:
Theta4= ['Interceptions','Press','Blocked Shots']

In [52]:
for i in h_players:
    for j in a_players:
        fig = go.Figure()
        fig.add_trace(go.Scatterpolar(r=Outfield_stats_df[Outfield_stats_df["Player"]==i][Theta4].iloc[0].tolist(),theta=Theta4,fill='toself'))
        fig.add_trace(go.Scatterpolar(r=Outfield_stats_df[Outfield_stats_df["Player"]==j][Theta4].iloc[0].tolist(),theta=Theta4,fill='toself'))
        fig.update_layout(title={
                'text': str(i)+" vs "+str(j),
                'y':0.98,
                'x':0.5,
                'xanchor': 'center',
                'yanchor': 'top'},template="plotly_dark",polar=dict(radialaxis=dict(visible=True,range=[0, Outfield_stats_df[(Outfield_stats_df["Player"]==i) | (Outfield_stats_df["Player"]==j)][Theta4].max().max()])),showlegend=False,width=700, height=700)
        fig.write_image(str(path6)+str(i)+" vs "+str(j)+".png")

#### 5. Defensive

In [53]:
Theta5= ['Clearances','Blocks','Total Tackles']

In [54]:
for i in h_players:
    for j in a_players:
        fig = go.Figure()
        fig.add_trace(go.Scatterpolar(r=Outfield_stats_df[Outfield_stats_df["Player"]==i][Theta5].iloc[0].tolist(),theta=Theta5,fill='toself'))
        fig.add_trace(go.Scatterpolar(r=Outfield_stats_df[Outfield_stats_df["Player"]==j][Theta5].iloc[0].tolist(),theta=Theta5,fill='toself'))
        fig.update_layout(title={
                'text': str(i)+" vs "+str(j),
                'y':0.98,
                'x':0.5,
                'xanchor': 'center',
                'yanchor': 'top'},template="plotly_dark",polar=dict(radialaxis=dict(visible=True,range=[0, Outfield_stats_df[(Outfield_stats_df["Player"]==i) | (Outfield_stats_df["Player"]==j)][Theta5].max().max()])),showlegend=False,width=700, height=700)
        fig.write_image(str(path7)+str(i)+" vs "+str(j)+".png")

#### 6. Progessive Actions

In [55]:
Theta6= ['Shot Creating Actions','Goal Creating Actions','Dribbles','Progressive Pass','Progressive Carry']

In [57]:
for i in h_players:
    for j in a_players:
        fig = go.Figure()
        fig.add_trace(go.Scatterpolar(r=Outfield_stats_df[Outfield_stats_df["Player"]==i][Theta6].iloc[0].tolist(),theta=Theta6,fill='toself'))
        fig.add_trace(go.Scatterpolar(r=Outfield_stats_df[Outfield_stats_df["Player"]==j][Theta6].iloc[0].tolist(),theta=Theta6,fill='toself'))
        fig.update_layout(title={
                'text': str(i)+" vs "+str(j),
                'y':0.98,
                'x':0.5,
                'xanchor': 'center',
                'yanchor': 'top'},template="plotly_dark",polar=dict(radialaxis=dict(visible=True,range=[0, Outfield_stats_df[(Outfield_stats_df["Player"]==i) | (Outfield_stats_df["Player"]==j)][Theta6].max().max()])),showlegend=False,width=700, height=700)
        fig.write_image(str(path8)+str(i)+" vs "+str(j)+".png")

### D. Passes Received Heatmap

In [34]:
successful_events_df= events_df_flat[events_df_flat["Outcome"]=="Successful"]
successfulpasses_events_df= successful_events_df[successful_events_df["Event Type"]=="Pass"]
Players_received= list(successfulpasses_events_df["Next Player"].unique())

In [35]:
for i in range(len(Players_received)):
    try:
        draw_pitch()
        ax= sns.kdeplot(successfulpasses_events_df[successfulpasses_events_df["Next Player"]==str(Players_received[i])]["x_end"],successfulpasses_events_df[successfulpasses_events_df["Next Player"]==str(Players_received[i])]["y_end"],shade=True,cmap="mako",n_levels=100)
        plt.ylim(0,90)
        plt.xlim(0,130)
        fig= ax.get_figure()
        fig.patch.set_facecolor('xkcd:black')
        plt.title("Passes Received by "+str(Players_received[i])+" : "+str(successfulpasses_events_df["Match Details"].unique()[0]), fontsize= 30, color="White")
        plt.savefig(str(path9)+str(successfulpasses_events_df[successfulpasses_events_df["Next Player"]==str(Players_received[i])]["Team"].unique()[0])+" "+str(Players_received[i])+".png" ,orientation='landscape',facecolor ="black",transparent = True)
    except:
        pass
plt.close('all')