TFID  Mode

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [2]:
# Load data
data = pd.read_csv("data\war-news.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,Headlines,Summary,Press,Date,Keyword
0,0,I served in Iraq and Afghanistan but the horro...,A WAR hero traumatised by the horrors of comba...,The Sun,1 day ago,Afghanistan
1,1,The forever war in Afghanistan is nowhere near...,Islamic State is seeking to overthrow the Tali...,ThePrint,2 weeks ago,Afghanistan
2,2,"Hell at Abbey Gate: Chaos, Confusion and Death...","In firsthand accounts, Afghan civilians and U....",ProPublica,1 month ago,Afghanistan
3,3,‘A second Afghanistan’: Doubts over Russia’s w...,Russia's lack of progress in its war against U...,Al Jazeera,5 days ago,Afghanistan
4,4,Afghanistan: Former army general vows new war ...,Lt Gen Sami Sadat tells the BBC of planned ope...,BBC,1 week ago,Afghanistan


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5654 entries, 0 to 5653
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5634 non-null   object
 1   Headlines   5653 non-null   object
 2   Summary     5653 non-null   object
 3   Press       5653 non-null   object
 4   Date        5653 non-null   object
 5   Keyword     5653 non-null   object
dtypes: object(6)
memory usage: 265.2+ KB


In [4]:
# Drop rows with missing values in 'title' and 'summary' columns
data = data.dropna(subset=['Headlines', 'Summary'])
data.info()


<class 'pandas.core.frame.DataFrame'>
Index: 5653 entries, 0 to 5652
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5633 non-null   object
 1   Headlines   5653 non-null   object
 2   Summary     5653 non-null   object
 3   Press       5653 non-null   object
 4   Date        5653 non-null   object
 5   Keyword     5653 non-null   object
dtypes: object(6)
memory usage: 309.1+ KB


In [5]:

# Extract titles and summaries from the DataFrame
titles = data['Headlines'].tolist()
summaries = data['Summary'].tolist()

In [6]:
def preprocess_text(text):
    if isinstance(text, str):  # Check if the input is a string
        text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
        text = text.lower()
        tokens = word_tokenize(text)
        stop_words = set(stopwords.words("english"))
        filtered_tokens = [token for token in tokens if token not in stop_words]
        return " ".join(filtered_tokens)
    else:
        return ""

In [7]:
preprocessed_titles = [preprocess_text(title) for title in titles]
preprocessed_summaries = [preprocess_text(summary) for summary in summaries]

<b>TFID </b>
<ol>
<li> Representation: TF-IDF represents each document as a vector, where each dimension corresponds to a unique term in the entire corpus. </li>
<li>Term Importance: It assigns weights to terms based on their frequency in a document relative to their frequency across all documents in the corpus. High weight is given to terms that are frequent in the document but not common across all documents. </li>
<li>Document Comparison: TF-IDF vectors are used to calculate the similarity between documents. The similarity is based on the overlap of terms and their weights. </li>

In [8]:
# Feature representation
vectorizer = TfidfVectorizer()
title_vectors = vectorizer.fit_transform(preprocessed_titles)
summary_vectors = vectorizer.transform(preprocessed_summaries)

In [9]:
# Similarity calculation
similarity_scores = cosine_similarity(title_vectors, summary_vectors)

In [10]:
# Print top 5 most similar pairs
most_similar = [(i, j, similarity_scores[i][j]) for i, j in enumerate(similarity_scores.argsort(axis=1)[:,-1])]
most_similar_sorted = sorted(most_similar, key=lambda x: x[2],reverse=True)
print("Top 5 most similar pairs:")
for i, j, score in most_similar_sorted[:5]:
  print("Title: {}\nSummary: {}\nSimilarity score: {:.2f}\n".format(titles[i], summaries[i], score))


Top 5 most similar pairs:
Title: India and Pakistan fought 3 wars over Kashmir - here's why international 
law falls short to solve this territorial dispute
Summary: India and Pakistan fought 3 wars over Kashmir - here's why international 
law falls short to solve this territorial dispute.
Similarity score: 1.00

Title: India and Pakistan fought 3 wars over Kashmir - here's why international 
law falls short to solve this territorial dispute
Summary: An armed conflict in Kashmir has thwarted all attempts to solve it for 
three quarters of a century. Kashmir, an 85,806-square-mile valley...
Similarity score: 1.00

Title: The risk of nuclear war was already the highest since the Cuban Missile 
Crisis. Putin has made it far worse, former energy secretary says
Summary: The risk of nuclear war was already the highest since the Cuban Missile 
Crisis. Putin has made it far worse, former energy secretary says. By.
Similarity score: 1.00

Title: Russia-Ukraine War Highlights: Quad countries acc

<b>Word2Vec</b>
<ol>
<li>Representation: Word2Vec represents each word as a dense vector in a continuous vector space. It captures the semantic relationships between words.</li>
<li>Term Similarity: Word2Vec is trained on large corpora to learn word embeddings such that semantically similar words have similar vector representations.</li>
<li>Document Representation: Document vectors can be obtained by averaging or combining the word vectors of the words in the document.</li>
<li>Document Comparison: Similarity between documents is calculated based on the similarity of their word vectors. It captures the semantic similarity between documents.</li>

In [11]:
from gensim.models import KeyedVectors
import numpy as np


In [12]:

# Load pre-trained word embeddings
word2vec_model = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)


In [None]:
glove_model = KeyedVectors.load_word2vec_format("glove.6B/glove.6B.300d.txt", binary=False)

In [None]:
# Create word embedding matrices
title_embeddings = np.array([word2vec_model[word] for word in preprocessed_titles if word in word2vec_model])
summary_embeddings = np.array([glove_model[word] for word in preprocessed_summaries if word in glove_model])


In [None]:

# Calculate cosine similarity
similarity_scores = cosine_similarity(title_embeddings, summary_embeddings)


In [None]:

# Print top 5 most similar pairs
most_similar = [(i, j, similarity_scores[i][j]) for i, j in enumerate(similarity_scores.argsort(axis=1)[:,-1])]
most_similar_sorted = sorted(most_similar, key=lambda x: x[2],reverse=True)
print("Top 5 most similar pairs:")
for i, j, score in most_similar_sorted[:5]:
  print("Title: {}\nSummary: {}\nSimilarity score: {:.2f}\n".format(titles[i], summaries[i], score))

In [None]:
# Print preprocessed titles and summaries
for title, summary in zip(preprocessed_titles, preprocessed_summaries):
    print(f"Preprocessed Title: {title}")
    print(f"Preprocessed Summary: {summary}")


In [None]:
# Testing the Word2Vec Model  
example_index =1  
title_word_vectors = [word2vec_model.get_vector(word) for word in preprocessed_titles[example_index].split() if word in word2vec_model.key_to_index]
summary_word_vectors = [word2vec_model.get_vector(word) for word in preprocessed_summaries[example_index].split() if word in word2vec_model.key_to_index]

print(f"Word Vectors for Example {example_index} Title: {title_word_vectors}")

print(f"Word Vectors for Example {example_index} Summary: {summary_word_vectors}")

 Universal Sentence Encoder (USE)

In [None]:
from tensorflow_hub import load

use_model = load("https://tfhub.dev/google/universal-sentence-encoder/4")

title_embeddings = use_model(preprocessed_titles)
summary_embeddings = use_model(preprocessed_summaries)

similarity_scores = cosine_similarity(title_embeddings, summary_embeddings)


In [None]:
# Print top 5 most similar pairs
most_similar = [(i, j, similarity_scores[i][j]) for i, j in enumerate(similarity_scores.argsort(axis=1)[:,-1])]
most_similar_sorted = sorted(most_similar, key=lambda x: x[2],reverse=True)
print("Top 5 most similar pairs:")
for i, j, score in most_similar_sorted[:5]:
  print("Title: {}\nSummary: {}\nSimilarity score: {:.2f}\n".format(titles[i], summaries[i], score))