In [None]:
%pip install Sentence-transformers
%pip install re
%pip install nltk

In [5]:
import os
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

## Data Preprocessing 

### __Procedure__:
  #### 1. Removal of Stopwords
  #### 2. Lemmatization
  #### 3. Removing non-meaningful bracketed text (`[ ... ]`) with the help of the `re` module.
  
 Note: Stemming was not performed because it was observed to damage much of the information that is needed to measure the similarity between sentences.

In [6]:
# NLTK resources needed further down:
#   - 'punkt' / 'punkt_tab' : tokenizer data for nltk.word_tokenize
#     (newer NLTK releases look up 'punkt_tab'; older ones use 'punkt')
#   - 'wordnet'             : dictionary used by WordNetLemmatizer
#   - 'stopwords'           : word list read by stopwords.words('english')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')


# The file has no header row and uses '|' as separator, so the column names are given here
PATH='Collection//euro_news//para.csv'
df=pd.read_csv(PATH,sep='|',names=['Channel','Date','Time','Place','News'])

[nltk_data] Downloading package punkt to /home/udayb/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/udayb/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
stop_words=set(stopwords.words('english'))

# Creating  the object for lemmatisation
lemt = WordNetLemmatizer()

# Text inside square brackets (letters, digits, commas, apostrophes and spaces only).
# Written as a raw string so that `\[` and `\]` reach the regex engine unchanged
# instead of being flagged as invalid string escapes.
BRACKETED = re.compile(r"\[[a-zA-Z0-9,' ]+\]")


def _clean_headline(text):
	"""
	Return `text` with stop words dropped, the remaining tokens lemmatized and
	bracketed text replaced by a space.

	Every kept token is preceded by a single space, so a non-empty result
	starts with a space. The stop-word check is case-sensitive.
	"""
	kept = ''.join(
		' ' + lemt.lemmatize(word)
		for word in nltk.word_tokenize(text)
		if word not in stop_words
	)

	# The tokenizer separates '[' and ']' from the words, hence the pattern allows spaces
	return BRACKETED.sub(' ', kept)


# Clean the whole column in one pass instead of writing back row by row
df['News']=df['News'].map(_clean_headline)

df

Unnamed: 0,Channel,Date,Time,Place,News
0,Euro News,01/01/23,Evening,Outside India,10 year joining European Union Croatia swit...
1,Euro News,01/01/23,Midday,Outside India,half hour 2023 Air Raid Sirens ring across Uk...
2,Euro News,01/01/23,Morning,Outside India,Noble kind word used Pope Francis describe pr...
3,Euro News,01/02/23,Evening,Outside India,sign slight easing cost living crisis inflati...
4,Euro News,01/02/23,Midday,Outside India,foreign half million worker expected strike U...
...,...,...,...,...,...
1531,Euro News,31/10/23,Midday,Outside India,Israeli air strike continue dark hundred thou...
1532,Euro News,31/10/23,Morning,Outside India,defiant Benjamin Netanyahu ruled ceasefire Ga...
1533,Euro News,31/12/22,Evening,Outside India,European leader commemorate former Pope Ben...
1534,Euro News,31/12/22,Midday,Outside India,former Pope Benedict XVI first pontiff 600 ...


## Checking for similarity in the News Headlines

### In the cell below, we define 4 main functions which are later used to measure the similarity between the news items released at different times of the same day.

### A Brief description of the methods is given below:
---
  - ####  _EMBED\_COS_
    - This embeds the news headlines of Morning, Midday and Evening with the help of the model [all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 "Click to know more").
    - It gives the embeddings in the form of a numpy array containing the values for Morning, Midday and Evening.
  ---
  
  - ####  _EMBED\_SEM_
    - This embeds the news headlines of Morning, Midday and Evening with the help of the model [clips/mfaq](https://huggingface.co/clips/mfaq "Click to know more").
    - Similar to the previous one, this gives Embeddings in the form of a numpy array consisting of the values for Morning, Midday and Evening.
  - ####  _COS\_SIM_
    - Here, we compute the _Cosine Similarity_ with the help of the `util` module from the Sentence Transformers library.
  - #### _SEM\_SIM_
    - Here, we compute the _Semantic Similarity_ with the help of the `util` module from the Sentence Transformers library.

In [8]:
from sentence_transformers import SentenceTransformer as sen_tr, util

_COS_MODEL = None   # all-MiniLM-L6-v2 encoder, created on first use and then reused


def EMBED_COS(news_list:dict):
  """
    - The model which we have used here for Embedding is all-MiniLM-L6-v2.
    - `news_list` must contain the keys 'Morning', 'Midday' and 'Evening'.
    - This gives an nd array of Embeddings in the format of:
        [  Morning , Midday , Evening ]
  """
  global _COS_MODEL

  # Building the model is the expensive part of this function and does not
  # depend on the input, so it is done once instead of once per call
  if _COS_MODEL is None:
    _COS_MODEL = sen_tr('all-MiniLM-L6-v2')

  # Creating Embeddings for different timings of the day
  return np.array([ _COS_MODEL.encode(news_list[slot]) for slot in ('Morning', 'Midday', 'Evening') ])


_SEM_MODEL = None   # clips/mfaq encoder, created on first use and then reused


def EMBED_SEM(  news_list:dict ):
  """
    - The model which we have used here for Embedding is clips/mfaq.
    - `news_list` must contain the keys 'Morning', 'Midday' and 'Evening'.
    - This gives an nd array of Embeddings in the format of:
      [ Morning , Midday  , Evening ]
  """
  global _SEM_MODEL

  # Building the model is the expensive part of this function and does not
  # depend on the input, so it is done once instead of once per call
  if _SEM_MODEL is None:
    _SEM_MODEL = sen_tr('clips/mfaq')

  # NOTE(review): the clips/mfaq model card describes prefixing inputs with
  # question/answer markers; the text is passed as-is here - confirm this is intended.
  return np.array([ _SEM_MODEL.encode(news_list[slot]) for slot in ('Morning', 'Midday', 'Evening') ])


def COS_SIM(embed: np.ndarray):
  """
    This function returns the cosine-similarity of different timings, scaled to
    a percentage (score * 100), in the format:

        { 'Morning-Midday': .. , 'Midday-Evening': .. , 'Morning-Evening': .. }

    - `embed` is indexed by position as [ Morning , Midday , Evening ],
      which is the layout returned by EMBED_COS.
  """
  def pct(first, second):
    # util.cos_sim returns a matrix of scores; [0,0] is the single pair compared here
    return util.cos_sim(embed[first], embed[second])[0,0].item() * 100

  return {
    'Morning-Midday':  pct(0, 1),
    'Midday-Evening':  pct(1, 2),
    'Morning-Evening': pct(0, 2),
  }


def SEM_SIM(embed: np.ndarray):
  """
    This function returns the semantic-search score of different timings, scaled
    to a percentage (score * 100), in the format:

        { 'Morning-Midday': .. , 'Midday-Evening': .. , 'Morning-Evening': .. }

    - `embed` is indexed by position as [ Morning , Midday , Evening ],
      which is the layout returned by EMBED_SEM.
    - For every pair the first embedding is used as the query and the second
      one as the corpus.
  """
  def pct(query, corpus):
    # semantic_search returns one list of hits per query; take the first hit of the first query
    first_hit = util.semantic_search(embed[query], embed[corpus])[0][0]
    return first_hit['score'] * 100

  return {
    'Morning-Midday':  pct(0, 1),
    'Midday-Evening':  pct(1, 2),
    'Morning-Evening': pct(0, 2),
  }

The cell below takes about 20 minutes to run.

In [10]:
un_date=df['Date'].unique()

# One row per (date, technique); the three pair columns hold the scores in percent
data_similar={'Date':[],'Technique':[],'Morning-Midday':[],'Midday-Evening':[],'Morning-Evening':[]}


for done, DATE in enumerate(un_date, start=1):

  df_day=df[df['Date']==DATE]

  # A time slot without a row for this date stays an empty string
  news=dict.fromkeys(('Morning','Midday','Evening'), '')
  news.update(zip(df_day['Time'], df_day['News']))

  embed_cos = EMBED_COS(news)
  dt_cos  = COS_SIM(embed_cos)

  embed_sem = EMBED_SEM(news)
  dt_sem  = SEM_SIM(embed_sem)

  for technique, scores in (('Cosine Similarity', dt_cos), ('Semantic Similarity', dt_sem)):
    data_similar['Technique'].append(technique)
    data_similar['Date'].append(DATE)
    for pair in ('Morning-Midday','Midday-Evening','Morning-Evening'):
      data_similar[pair].append(scores[pair])

  # Progress on a single, continuously overwritten line instead of one line per date
  print(f'\r{done}/{len(un_date)} dates processed', end='')



1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


In [13]:
# Turn the collected per-date scores into a table
per_df = pd.DataFrame(data=data_similar)

# Lowest Morning-Evening score found in the table
per_df['Morning-Evening'].min()

2.7643445879220963

In [12]:
# Display the similarity scores: one row per date and technique
per_df

Unnamed: 0,Date,Technique,Morning-Midday,Midday-Evening,Morning-Evening
0,01/01/23,Cosine Similarity,73.351413,44.613236,34.173316
1,01/01/23,Semantic Similarity,96.062630,97.445554,95.561874
2,01/02/23,Cosine Similarity,57.157195,59.162945,42.400160
3,01/02/23,Semantic Similarity,98.220742,98.225093,98.085129
4,01/03/23,Cosine Similarity,73.402280,48.875538,48.308423
...,...,...,...,...,...
1019,31/10/22,Semantic Similarity,99.828959,99.141967,99.205130
1020,31/10/23,Cosine Similarity,86.921757,95.947510,85.745031
1021,31/10/23,Semantic Similarity,99.000019,99.553859,99.041295
1022,31/12/22,Cosine Similarity,14.287755,73.257643,22.974724


In [14]:
# DataFrame.to_csv does not create missing folders, so make sure 'result' exists first
os.makedirs('result', exist_ok=True)
per_df.to_csv('result/similarity.csv')