In [1]:
import re
import string
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
def docs_and_clean():

  r = requests.get('https://sports.ndtv.com/fifa-world-cup-2022/news')
  soup = BeautifulSoup(r.content, 'html.parser')

  #THE FOLLOWING CODE NEED TO BE MODIFIED TO SUITE FOR THE ABOVE URL
  link = []
  for i in soup.find('div', {'class':'lst-pg_hd'}).find_all('a',{'class':'lst-pg_ttl'}):
      i['href'] ='https://sports.ndtv.com/'+ i['href'] + '?page=all'
      link.append(i['href'])
  

  # Retrieve Paragraphs
  documents = []
  for i in link:
      r = requests.get(i)
      soup = BeautifulSoup(r.content, 'html.parser')

      sen = []
      for i in soup.find('div', {'class':'sp-cn pg-str-com js-ad-section'}).find_all('p'):
          sen.append(i.text)
      documents.append(' '.join(sen))

  # Clean Paragraphs
  documents_clean = []
  for d in documents:
      document_test = re.sub(r'[^\x00-\x7F]+', ' ', d)
      document_test = re.sub(r'@\w+', '', document_test)
      document_test = document_test.lower()
      document_test = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', document_test)
      document_test = re.sub(r'[0-9]', '', document_test)
      document_test = re.sub(r'\s{2,}', ' ', document_test)
      documents_clean.append(document_test)

  return documents_clean

In [5]:
docs = docs_and_clean()
# Create Term-Document Matrix with TF-IDF weighting
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)

# Create a DataFrame
data = pd.DataFrame(X.T.toarray(), index=vectorizer.get_feature_names_out())
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
abemahttps,0.0,0.0163,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ability,0.0,0.0,0.0,0.076124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.064171,0.0,0.0,0.0,0.0,0.0
about,0.026663,0.0,0.0,0.043187,0.017707,0.0,0.0,0.0,0.0,0.0,0.0,0.050024,0.0,0.047819,0.0,0.0,0.0,0.0
above,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025098,0.0,0.0,0.0
absent,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00239,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
def get_similar_articles(q, data):
  print("query:", q)
  print("The following are articles with the highest cosine similarity values: ")
  print()
  q = [q]
  q_vec = vectorizer.transform(q).toarray().reshape(data.shape[0],)
  sim = {}
  for i in range(10):
    sim[i] = np.dot(data.loc[:, i].values, q_vec) / np.linalg.norm(df.loc[:, i]) * np.linalg.norm(q_vec)
  
  sim_sorted = sorted(sim.items(), key=lambda x: x[1], reverse=True)
  
  for k, v in sim_sorted:
    if v != 0.0:
      print("Similarity Values:", v)
      print(docs[k])
      print()


q1 = 'barcelona'
q2 = 'spain'
q3 = 'argentina'

get_similar_articles(q1, data)
print('-'*100)
get_similar_articles(q2, data)
print('-'*100)
get_similar_articles(q3, data)

query: barcelona
The following are articles with the highest cosine similarity values: 

----------------------------------------------------------------------------------------------------
query: spain
The following are articles with the highest cosine similarity values: 

Similarity Values: 0.16208127138200307

Similarity Values: 0.022566445312628348
neymar could make his return to the world cup stage on monday as brazil continue their bid to be crowned kings for a record extending sixth time against south korea superstar forward neymar has been absent for brazil since spraining his ankle in his team s opening group g win against serbia last month and their supporters have been sweating on his fitness ever since coach tite said neymar would be assessed in brazil s final pre game training session on sunday but gave a heavy hint that the paris saint germain attacker would start window rrcode window rrcode rrcode push function function v d o ai ai d createelement script ai defer true ai

In [7]:
from gensim.summarization.bm25 import BM25

def simple_tok(sent:str):
    return sent.split()

def bm25_similar_articles(query):
  print("query:", query)
  print("The following are articles with the highest BM25 scores: ")
  print()
  tok_corpus = [simple_tok(s) for s in docs]
  query = simple_tok(query)
  bm25 = BM25(tok_corpus)
  scores = bm25.get_scores(query, average_idf = 100)
  best_docs = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:3]
  for i, b in enumerate(best_docs):
      print(f"rank {i+1}: {docs[b]}")
      print()


q1 = 'barcelona'
q2 = 'spain'
q3 = 'argentina'


bm25_similar_articles(q1)
print('-'*100)
bm25_similar_articles(q2)
print('-'*100)
bm25_similar_articles(q3)
print('-'*100)

query: barcelona
The following are articles with the highest BM25 scores: 

rank 1: poland captain robert lewandowski refused to confirm if he had played his last ever game at the world cup after his side were knocked out of the tournament in qatar in a last defeat by france on sunday barcelona striker lewandowski scored a late consolation from the penalty spot for a poland side who were outclassed by the fearsome french attack in doha he will be almost by the time the next world cup comes around in north america in but he suggested that issues beyond his physical condition were more likely to see him end his international career window rrcode window rrcode rrcode push function function v d o ai ai d createelement script ai defer true ai async true ai src v location protocol o d head appendchild ai window document a vdo ai core v ndtv vdo ai js physically i m not afraid of this but we have so many different things outside of football whether your happiness is still there and what s goi