#CMT309 – Data Science Portfolio (Spring)


In [1]:
import numpy as np
import string
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import math
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# the code block below is directly downloading commentary.txt and superheros.csv into your drive folder. Please just run it and do not comment out.
from urllib import request
module_url = [f"https://drive.google.com/uc?export=view&id=18y6hLv2bqAyJsIXwVCty58lF0u7yimVq"]
name = ['commentary.txt']
for i in range(len(name)):
    with request.urlopen(module_url[i]) as f, open(name[i],'w') as outf:
        a = f.read()
        outf.write(a.decode('ISO-8859-1'))
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import nltk
import re
from tqdm import tqdm
tqdm.pandas()
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

#Text Analysis

interrogate the football commentary dataset

In [2]:
df = pd.read_csv('commentary.txt', sep='\t')
df.head()

Unnamed: 0,Minute,Commentary
0,97,Plenty of chances in this game but neither tea...
1,97,That's it! The referee blows the final whistle
2,97,"Ball possession: Tottenham: 44%, Liverpool: 56%."
3,96,James Milner relieves the pressure with a clea...
4,96,Poor play by Trent Alexander-Arnold as his wea...


## Preprocessing

method for obtaining tokenized, PoS-tagged and PoS-tagged and lemmatized versions of the Commentary column. with use of only `nltk` libraries. it creates 3 new columns: `Tokenized`, `PoS_tagged` and `PoS_lemmatized`, in order:

1.- New `Tokenized` column, by lower casing and tokenizing the `Commentary` column.

2.- New `PoS_tagged` column, by pos_tagging the `Tokenized` column.

3.- New `PoS_lemmatized` column, by lemmatizing only the words in the `PoS_tagged` column. The reason for doing it in this order is to present to the tagging function the original text.

An example outcome:

```python
>>print(df['Tokenized'][:3])
0    [plenty, of, chances, in, this, game, but, nei...
1    [that, 's, it, !, the, referee, blows, the, fi...
2    [ball, possession, :, tottenham, :, 44, %, ,, ...
Name: Tokenized, dtype: object

>>print(df['PoS_tagged'][:3])
0    [(plenty, NN), (of, IN), (chances, NNS), (in, ...
1    [(that, DT), ('s, VBZ), (it, PRP), (!, .), (th...
2    [(ball, DT), (possession, NN), (:, :), (totten...
Name: PoS_tagged, dtype: object

>>print(df['PoS_lemmatized'][:3])
0    [(plenty, NN), (of, IN), (chance, NNS), (in, I...
1    [(that, DT), ('s, VBZ), (it, PRP), (!, .), (th...
2    [(ball, DT), (possession, NN), (:, :), (totten...
Name: PoS_lemmatized, dtype: object
```

In [4]:
"""
Method of obtaining tokenized, PoS-tagged and PoS-tagged and lemmatized
versions of the Commentary column.

Takes the data frame Commentary column, creates relevant columns ie  Tokenized, PoS_tagged, PoS_lemmatized



"""
# Data cleaning
#Pattern initialization
pattern = re.compile(r'[^a-zA-Z\s*%;:\n]')

# Tokenize Comentary column

token_comments = []

for text in df['Commentary']:
  clean_text = re.sub(pattern, '', text)
  clean_text = clean_text.replace(':', '')
  clean_text = clean_text.replace('%', '')
  tokens = word_tokenize(text.lower())
  token_comments.append(tokens)

df['Tokenized'] = token_comments

# Pos Tagging the Tokens

pos_tagged_tokens = []

for tokens in df['Tokenized']:
  pos_taggs = pos_tag(tokens)
  pos_tagged_tokens.append(pos_taggs)

df['PoS_tagged'] = pos_tagged_tokens

# Lemmatizing the Pos

lemmatizer = WordNetLemmatizer()

lemma_words = []

for comments in df['PoS_tagged']:
  lemma = []
  for word, pos_tags in comments :
    if pos_tags.startswith('N'):
      lemmatized_word = lemmatizer.lemmatize(word, pos='n') # lemmatize nouns
    elif pos_tags.startswith('V'):
      lemmatized_word = lemmatizer.lemmatize(word, pos='v') # lemmatize verbs
    else:
      lemmatized_word = lemmatizer.lemmatize(word) # lemmatize other parts of speech
    lemma.append((lemmatized_word, pos_tags))
  lemma_words.append(lemma)

df['PoS_lemmatized'] = lemma_words


df.head()


Unnamed: 0,Minute,Commentary,Tokenized,PoS_tagged,PoS_lemmatized
0,97,Plenty of chances in this game but neither tea...,"[plenty, of, chances, in, this, game, but, nei...","[(plenty, NN), (of, IN), (chances, NNS), (in, ...","[(plenty, NN), (of, IN), (chance, NNS), (in, I..."
1,97,That's it! The referee blows the final whistle,"[that, 's, it, !, the, referee, blows, the, fi...","[(that, DT), ('s, VBZ), (it, PRP), (!, .), (th...","[(that, DT), ('s, VBZ), (it, PRP), (!, .), (th..."
2,97,"Ball possession: Tottenham: 44%, Liverpool: 56%.","[ball, possession, :, tottenham, :, 44, %, ,, ...","[(ball, DT), (possession, NN), (:, :), (totten...","[(ball, DT), (possession, NN), (:, :), (totten..."
3,96,James Milner relieves the pressure with a clea...,"[james, milner, relieves, the, pressure, with,...","[(james, NNS), (milner, VBP), (relieves, VBZ),...","[(james, NNS), (milner, VBP), (relieve, VBZ), ..."
4,96,Poor play by Trent Alexander-Arnold as his wea...,"[poor, play, by, trent, alexander-arnold, as, ...","[(poor, JJ), (play, NN), (by, IN), (trent, JJ)...","[(poor, JJ), (play, NN), (by, IN), (trent, JJ)..."


In [6]:
print(df['PoS_lemmatized'][:3])

0    [(plenty, NN), (of, IN), (chance, NNS), (in, I...
1    [(that, DT), ('s, VBZ), (it, PRP), (!, .), (th...
2    [(ball, DT), (possession, NN), (:, :), (totten...
Name: PoS_lemmatized, dtype: object


In [7]:
print(df['PoS_tagged'][:3])

0    [(plenty, NN), (of, IN), (chances, NNS), (in, ...
1    [(that, DT), ('s, VBZ), (it, PRP), (!, .), (th...
2    [(ball, DT), (possession, NN), (:, :), (totten...
Name: PoS_tagged, dtype: object


In [8]:
print(df['Tokenized'][:3])

0    [plenty, of, chances, in, this, game, but, nei...
1    [that, 's, it, !, the, referee, blows, the, fi...
2    [ball, possession, :, tottenham, :, 44, %, ,, ...
Name: Tokenized, dtype: object


## Basic search engine

implementation of a basic search engine in a function called `retrieve_similar_commentaries(df, query, k)`, which takes as input the following arguments:

- `df` the previously enriched (tokenized, pos tagged, etc) commentary dataframe.
- `query` a string of any type, which will be the query we will be using to retrieve similar commentaries.
- `k` and integer denoting the top `k` commentaries to be returned (by similarity).

function must perform the following steps:

1 - Tokenize and lemmatize the input query.

2 - For each commentary in the df, compute how similar it is to the query as the number of shared tokens between query and commentary.

3 - We will prioritize noun matches, so our similarity score will receive +1 if at least one of the matching tokens in the commentary is a noun (i.e., its part of speech starts with `N`). This means that, for example, if your query has 2 tokens, the maximum similarity a commentary can have is 4: 2 for 2 overlapping tokens, and 2 for both tokens being nouns.

4 - The function must return a list of tuples of the form `[(commentary1, sim), (commentary2, sim) ... (commentaryk, sim)]`, where commentaries are ranked by `sim` value in descending order.

An example test case is given below:

0 ('Manchester United is in control of the ball.', 5)
1 ('Manchester United is in control of the ball.', 5)
2 ('Jadon Sancho from Manchester United crosses the ball, but it goes out for a goal kick.', 5)
```

In [5]:

def retrieve_similar_commentaries(df, query, k):

    """
    Retrieve similar commentaries from a DataFrame based on a query.

    Args:
        df (DataFrame): DataFrame containing commentaries and their attributes.
        query (str): Query string to search for similar commentaries.
        k (int): Number of top similar commentaries to return.

    Returns:
        list: List of tuples containing top k similar commentaries and their similarity scores.
    """
    # Initialize WordNet lemmatizer

    lemmatizer = WordNetLemmatizer()

    query_token = word_tokenize(query.lower())
    query_pos_tags = pos_tag(query_token)

    #print(query_pos_tags)

    query_pos_token = []

    # Mapping from POS tags to lemmatization tags
    tag_map = {'N': 'n', 'V': 'v', 'J': 'a', 'R': 'r'}  # adjectives and adverbs added

    # Lemmatize query tokens and store with original POS tags

    for word, pos_tags in query_pos_tags:
        pos = pos_tags[0]  # get the first character of the POS tag
        if pos in tag_map:
            lemmatized_word = lemmatizer.lemmatize(word, pos=tag_map[pos])
        else:
            lemmatized_word = lemmatizer.lemmatize(word)

        query_pos_token.append((lemmatized_word, pos_tags))

    #print(query_pos_token)

    # collect results

    results = []

    # Iterate through each row in the DataFrame

    for index, row in df.iterrows():
        commentary_tokens = set(row['Tokenized']) # Get tokens from commentary
        lemmatized_commentary = set([token[0] for token in row['PoS_lemmatized']]) # Get lemmatized tokens from commentary

        # Find common tokens between query and commentary
        common_tokens = set([token[0] for token in query_pos_token]).intersection(lemmatized_commentary)

        # Calculate similarity score based on common tokens
        sim_score = len(common_tokens)

        # Check for noun matches
        for lemma_token, pos_tags in row['PoS_lemmatized']:
            if lemma_token in common_tokens and pos_tags.startswith('N'):
                sim_score += 1
                break  # Add 1 only once per commentary if any noun matches


        # Append commentary and similarity score to results
        results.append((row['Commentary'], sim_score))

    results.sort(key=lambda x: x[1], reverse=True)

    #Return top k similar commentaries without similarity score
    return [(commentary, sim_value) for commentary, sim_value in results[:k]]

# Example usage
result = retrieve_similar_commentaries(df, "Harry Kane from Tottenham", 3)
for idx, r in enumerate(result):
    print(idx, r)

0 ('Harry Kane from Tottenham is ruled offside.', 5)
1 ("Harry Kane from Tottenham directs a ball squarely in the box, but it's intercepted by an opponent player.", 5)
2 ("Harry Kane from Tottenham directs the ball behind the defence, but it's intercepted by an opponent player.", 5)


## PMI 

implementation of the pointwise mutual information (PMI) metric, a word association metric introduced in 1992, to the football commentaries. The purpose of PMI is to extract, from free text, pairs of words or phrases than tend to co-occur together more often than expected by chance. For example, PMI(`new`, `york`) would give a higher score than PMI(`new`, `car`) because the chance of finding `new` and `york` together in text is higher than `new` and `car`, despite `new` being a more frequent word than `york`.

The formula for PMI (where `x` and `y` are two words) is:

$PMI(x,y) = log(\frac{p(x,y)}{p(x)p(y)})$

logic:

- **Phrase Extraction**: The first step is to extract noun phrases (NPs) and verb phrases (VPs) from the lemmatized data. To do this, write a function that goes through each entry and groups words into noun phrases or verb phrases based on their part-of-speech tags. reward cases where NPs and VPs go beyond single word matching.

- **Phrase Counting**: Once NPs and VPs have been extracted, count how many times each phrase occurs in the dataset. write a function that iterates through the NPs and VPs and keeps track of the counts in dictionaries.

- **Total Counts**: The next step is to compute the total count of all NPs and VPs. This is simply the sum of all the counts in the dictionaries you created in the previous step.

- **Identifying Top Phrases**: To reduce computational complexity, we only want to compute PMI for the top occurring NPs and VPs. So, you will need to write a function that sorts the phrases by their counts and selects the top 100 phrases.

- **Creating the PMI Matrix**: Finally, create a PMI matrix using the top NPs and VPs, their counts, and the total counts of NPs and VPs. This matrix will be a pandas DataFrame, which will have rows corresponding to the top VPs, columns corresponding to the top NPs, and each cell will contain the PMI value between the corresponding NP and VP. This part of your solution will return 0 when there is no co-occurrence between an NP and a VP, and apply smoothing only to the final PMI value (refer to the video).

implement all the functionality in a function called `compute_pmi_dataframe(df)` that takes as input the enriched `df` you created in `part 1`.

In [12]:
# Phase extraction helper function

def extract_phrases(pos_tagged_text):
    """
    Function to extract noun phrases (NPs) and verb phrases (VPs) from the lemmatized data.

    Parameters:
    pos_tagged_text (list): List of tuples containing word and POS tag.

    Returns:
    list: List of extracted phrases.
    """
    # Initialize an empty list to store phrases
    phrases = []

    # Iterate through the POS-tagged text
    for i in range(len(pos_tagged_text)):
        # Check if the current word is a noun or verb
        if pos_tagged_text[i][1] in ['NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
            # Initialize an empty string to store the phrase
            phrase = ''
            # Iterate through subsequent words until a non-noun/verb is encountered
            while i < len(pos_tagged_text) and pos_tagged_text[i][1] in ['NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
                # Concatenate words to form the phrase
                phrase += pos_tagged_text[i][0] + ' '
                i += 1
            # Append the extracted phrase to the list
            phrases.append(phrase.strip())

    return phrases

In [13]:
# Phrases counter Helper function

def count_phrases(phrases):
    """
    Function to count the occurrences of each phrase.

    Parameters:
    phrases (list): List of extracted phrases.

    Returns:
    dict: Dictionary containing phrase counts.
    """
    # Initialize an empty dictionary to store counts
    phrase_counts = {}

    # Iterate through the list of phrases
    for phrase in phrases:
        # Increment count for the current phrase or initialize it to 1 if it's the first occurrence
        phrase_counts[phrase] = phrase_counts.get(phrase, 0) + 1

    return phrase_counts

In [22]:
def compute_pmi_dataframe(df):
    """
    Function to compute the PMI matrix.

    Parameters:
    df (DataFrame): DataFrame containing the enriched data.

    Returns:
    DataFrame: PMI matrix.
    """
    # Extract noun phrases (NPs) and verb phrases (VPs) from the lemmatized data
    df['Phrases'] = df['PoS_lemmatized'].apply(extract_phrases) # Use Helper function extract phrases

    # Flatten the list of phrases
    all_phrases = [phrase for sublist in df['Phrases'] for phrase in sublist]

    # Count occurrences of each phrase
    phrase_counts = count_phrases(all_phrases) # Use Helper function count_phrases

    #print('phrase counts', phrase_counts)

    # Total count of all phrases
    total_count = sum(phrase_counts.values())


    # Identify top 100 phrases
    top_phrases = sorted(phrase_counts, key=phrase_counts.get, reverse=True)[:10]

    #print('top 10 pharses', top_phrases)


    # Initialize a DataFrame to store the PMI values
    pmi_df = pd.DataFrame(index=top_phrases, columns=top_phrases)


    # Compute PMI values
    for i in range(len(top_phrases)):
        for j in range(len(top_phrases)):
            if i != j:
                # Co-occurrence count of the phrases
                co_occurrence_count = sum(1 for phrase in all_phrases if top_phrases[i] in phrase and top_phrases[j] in phrase)

                # Probability of co-occurrence
                p_xy = co_occurrence_count / total_count

                # Individual probabilities
                p_x = phrase_counts[top_phrases[i]] / total_count
                p_y = phrase_counts[top_phrases[j]] / total_count

                # Calculate PMI
                pmi = math.log(p_xy / (p_x * p_y)) if p_xy > 0 else 0

                # Apply smoothing
                pmi_smoothed = max(0, pmi)

                # Store the PMI value in the DataFrame
                pmi_df.iloc[i, j] = pmi_smoothed
            else:
                # For diagonal entries, set PMI to 0
                pmi_df.iloc[i, j] = 0

    return pmi_df


# Example usage:
pmidf = compute_pmi_dataframe(df)

pmidf.head()

Unnamed: 0,ball,field,take,pitch,be,%,side,half,throw-in,kick
ball,0,0,0.0,0.0,0.0,0,0,0.0,0,0
field,0,0,0.0,0.0,0.0,0,0,0.0,0,0
take,0,0,0.0,0.0,0.850785,0,0,0.0,0,0
pitch,0,0,0.0,0.0,0.109448,0,0,0.0,0,0
be,0,0,0.850785,0.109448,0.0,0,0,0.212482,0,0


In [24]:
# you can test your resulting matrix
def top_k_vps(pmi_matrix, np, k):
    # Check if the NP exists in the matrix
    if np in pmi_matrix.T.index:
        top_vps = pmi_matrix.T.loc[np].nlargest(k)
        return top_vps.index.tolist()
    else:
        print(f"Noun phrase '{np}' not found in PMI matrix.")
        return []
top_k_vps(pmidf, 'joao cancelo', 3)

Noun phrase 'joao cancelo' not found in PMI matrix.


[]