# Scraping of the live commentaries from LiveScore

This notebook aims to scrape the live commentaries of a sample of football games from livescore.com and evaluate the performance of our model on this unseen dataset. This evaluation will allows to test if our model can generalize easily to different semantic structures.

## Web scraping of the dataset

In [3]:
#Essential modules for web scraping
import pandas as pd 
import numpy as np 
from selenium import webdriver
import selenium.webdriver.support.ui as ui
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import random
import numpy as np
import time

#For text pre-processing
import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

#For model evaluation
from sklearn.metrics import accuracy_score, confusion_matrix

#To vectorize our textual data
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yanis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\yanis\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yanis\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yanis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\yanis\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
#Get the hrefs (links) of the results of all the previous 100 English Premier League games 
options = Options()
options.add_argument("--start-maximized")
driver = webdriver.Chrome('C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe', options=options)

driver.get('https://www.livescore.com/en/football/england/premier-league/results/')
store = []
for i in range(1,11):
    driver.execute_script(f"window.scrollTo(0, {1080*i})") 
    time.sleep(5)
    elems = driver.find_elements_by_xpath("//a[@href]")
    for elem in elems:
        if "en/football/england/premier-league" in str(elem.get_attribute("href")):
            store.append(elem.get_attribute("href"))
            
store = [i for i in store if "fixtures" not in i and "table" not in i and "results" not in i and i != "https://www.livescore.com/en/football/england/premier-league/"]

In [None]:
#Extract the live commentaries of these games
from tqdm.notebook import tqdm
store_text =[]
options = Options()
options.add_argument("--start-maximized")
driver = webdriver.Chrome('C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe', options=options)

for href in tqdm(store):
    try:
        driver.get(href + "?tab=summary-commentary")
        button = driver.find_element_by_xpath("//button")
        button.click()
        elem = driver.find_element_by_xpath("//div[@class='Summary_blockWrapper__1P4fu']")
        store_text.append(elem.text)
    except NoSuchElementException :
        continue


In [None]:
# Compute the number of uncomplete scraped webpages
# Count must be equal to 0
count = 0
for i in range(len(store_text)):
    splitted = store_text[i].split("\n")
    if "More commentary" in splitted: 
       count +=1
print("Count: ", count)

In [None]:
#Cleaning of our extracted data
store_rows = []
row = {}
for game in store_text:
    commentaries = game.split("\n")
    for comment in commentaries:
        if comment[0].isdigit():
            store_rows.append(row)
            row = {}
            row["minutes"] = comment
            row["text"] = ""
        else:
            row["text"] = row["text"] + comment
store_rows = [i for i in store_rows if i]
df = pd.DataFrame(store_rows)
df.head(5)

In [None]:
#Save the dataset in a csv file
df.to_csv("livescore30000.csv")
df = pd.read_csv("livescore30000.csv", index_col=0)

In [None]:
#Clean the minutes column by converting string object to int
def cleaning_minutes(minutes):
    cleaned_minutes = int(eval(minutes.replace("'", "")))
    return cleaned_minutes

df["time"] = df["minutes"].apply(cleaning_minutes)
df = df[["time", "text"]]
df.head(5)

In [None]:
#Save the dataset in a csv file
df.to_csv("livescore30000.csv")

In [4]:
df.head(5)

Unnamed: 0,time,text
0,97,Plenty of chances in this game but neither tea...
1,97,That's it! The referee blows the final whistle
2,97,"Ball possession: Tottenham: 44%, Liverpool: 56%."
3,96,James Milner relieves the pressure with a clea...
4,96,Poor play by Trent Alexander-Arnold as his wea...
...,...,...
31702,2,Bernardo Silva puts in a cross...
31703,1,Manchester City is in control of the ball.
31704,1,Manchester City take a throw-in at the right s...
31705,1,The pitch is in fantastic condition today and ...


## Model evaluation on this new unseen dataset

Please note that the dataset obtained at this point was manually annotated with the correct labels to be able to truly evaluate our model

In [1]:
#Load the model previously trained on a large dataset of live commentaries
import pickle
model = pickle.load(open("pipeline_model.pickle", "rb"))

In [48]:
df = pd.read_csv("pred_livescore30000_true.csv", index_col=0)
df.dropna(subset=["true"], inplace=True)
df["true"] = df["true"].astype(int)
df.head(5)

Unnamed: 0,time,text,true
0,97,Plenty of chances in this game but neither tea...,0
1,97,That's it! The referee blows the final whistle,0
2,97,"Ball possession: Tottenham: 44%, Liverpool: 56%.",0
3,96,James Milner relieves the pressure with a clea...,0
4,96,Poor play by Trent Alexander-Arnold as his wea...,1


In [49]:
#Convert to lowercase, strip and remove punctuations
def preprocess(text):
    text = text.lower() 
    text = text.strip()  
    text = re.compile('<.*?>').sub('', text) 
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)  
    text = re.sub('\s+', ' ', text)  
    text = re.sub(r'\[[0-9]*\]',' ',text) 
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r'\d',' ',text) 
    text = re.sub(r'\s+',' ',text) 
    return text

 
#Stopword removal
def stopword(string):
    a = [i for i in string.split() if i not in stopwords.words('english')]
    return ' '.join(a)

#Lemmatization
wl = WordNetLemmatizer()

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
# Tokenize the sentence
def lemmatizer(string):
    word_pos_tags = nltk.pos_tag(word_tokenize(string)) # Get position tags
    a=[wl.lemmatize(tag[0], get_wordnet_pos(tag[1])) for idx, tag in enumerate(word_pos_tags)] # Map the position tag and lemmatize the word/token
    return " ".join(a)

def finalpreprocess(string):
    return lemmatizer(stopword(preprocess(string)))
from tqdm import tqdm, tqdm_notebook
tqdm_notebook().pandas()
df['clean_text'] = df['text'].progress_apply(lambda x: finalpreprocess(x))
df.head(5)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  tqdm_notebook().pandas()


0it [00:00, ?it/s]

  0%|          | 0/299 [00:00<?, ?it/s]

Unnamed: 0,time,text,true,clean_text
0,97,Plenty of chances in this game but neither tea...,0,plenty chance game neither team could score de...
1,97,That's it! The referee blows the final whistle,0,referee blow final whistle
2,97,"Ball possession: Tottenham: 44%, Liverpool: 56%.",0,ball possession tottenham liverpool
3,96,James Milner relieves the pressure with a clea...,0,james milner relieve pressure clearance
4,96,Poor play by Trent Alexander-Arnold as his wea...,1,poor play trent alexander arnold weak attempt ...


In [51]:
#Predict the classes on the new scraped dataset
pred= model.predict(df["clean_text"])
df["prediction"] = pred

In [52]:
#Replace the numerical labels by their real categories
df["cat"] = df["prediction"].replace({  1:"Attempt", 
                                        2:"Corner",
                                        3:"Foul", 
                                        4:"Yellow card", 
                                        5:"Second yellow card", 
                                        6:"Red card",
                                        7:"Substitution", 
                                        8:"Free kick won", 
                                        9:"Offside", 
                                        10:"Hand ball", 
                                        11:"Penalty conceded"})

In [35]:
#Save the full dataframe with the predictions
df.to_csv("pred_livescore30000.csv")

In [11]:
df

Unnamed: 0,time,text,true,clean_text,prediction,cat
0,97,Plenty of chances in this game but neither tea...,0,plenty chance game neither team could score de...,1,Attempt
1,97,That's it! The referee blows the final whistle,0,referee blow final whistle,3,Foul
2,97,"Ball possession: Tottenham: 44%, Liverpool: 56%.",0,ball possession tottenham liverpool,10,Hand ball
3,96,James Milner relieves the pressure with a clea...,0,james milner relieve pressure clearance,3,Foul
4,96,Poor play by Trent Alexander-Arnold as his wea...,1,poor play trent alexander arnold weak attempt ...,3,Foul
...,...,...,...,...,...,...
294,18,Could be a good chance here as Harry Kane from...,0,could good chance harry kane tottenham cut opp...,3,Foul
295,18,Tottenham start a counter attack.,0,tottenham start counter attack,8,Free kick won
296,17,Great defending by Davinson Sanchez from Totte...,0,great defend davinson sanchez tottenham preven...,3,Foul
297,17,Danger averted there as Davinson Sanchez from ...,0,danger avert davinson sanchez tottenham clear ...,3,Foul


In [5]:
#Evaluate the predictions made by computing the accuracy of our model and plotting the confusion matrix
df=pd.read_csv("pred_livescore30000.csv")
df= df[df["true"]!=0]
print(df["true"].unique())
print(df["prediction"].unique())
print(len(df))

[ 1  4  9 10  7  8  2  3  6]
[10  4  1  9  7  8  2  3]
113


In [6]:
accuracy = accuracy_score(df["true"], df["prediction"])
print(accuracy)

import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
labels = ["Attempt", "Corner", "Foul", "Yellow Card", "Red Card", "Substitution", "Free kick won", "Offside", "Handball", "Penalty conceded"]
cm = confusion_matrix(df["true"], df["prediction"]) 
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
df_cm = pd.DataFrame(cm, index = [i for i in labels],
                  columns = [i for i in labels])
plt.figure(figsize = (10,7))
sn.heatmap(df_cm, annot=True, cmap='Blues')

NameError: name 'accuracy_score' is not defined