# Data Loading

In [1]:
import pandas as pd
news_df = pd.read_csv("reddit_worldnews.csv", engine='python')

# Word2Vec was introduced in 2013
# A search engine in 2012 (BoW):

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(news_df["title"])  # Convert titles to BoW

In [4]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

query = "president" # User prompt
query_vec = vectorizer.transform([query])  # Convert question to BoW

similarities = cosine_similarity(query_vec, X) # Compute similarity

n=10
top_n_indices = np.argsort(similarities[0])[-n:][::-1]  # Get indices of top 5 highest similarities
top_n_results = news_df.iloc[top_n_indices]["title"].tolist() # Output top 5 answers

# Display results
print("Top News:")
top_n_results



Top News:


['French President Nicolas Sarkozy insults US President, Spanish President and German Chancellor',
 'Iran s President Nominates Female Vice President',
 'Phillippine President regrets comments regarding President Obama',
 'Ukrainian president speaks to  fake  Kyrgyz president',
 'President Obama and President Castro shake hands',
 'Uruguay president calls Argentine president an ‘old hag’',
 'President Obama Reveals Phone Call With Iranian President',
 'Egypt ex-president Morsi tells judge  I am president ',
 'President Xi gives President Duterte red carpet treatment',
 'Egypt ex-president Morsi tells judge  I am president ']

# TF_IDF

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)  # Adjust max_features as needed
X = vectorizer.fit_transform(news_df['title'])

In [7]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

query = "president" # User prompt
query_vec = vectorizer.transform([query])  # Convert question to BoW

similarities = cosine_similarity(query_vec, X) # Compute similarity

n=10
top_n_indices = np.argsort(similarities[0])[-n:][::-1]  # Get indices of top 5 highest similarities
top_n_results = news_df.iloc[top_n_indices]["title"].tolist() # Output top 5 answers

# Display results
print("Top News:")
top_n_results

Top News:


['Infographic on Achievements of President Jonathan',
 'Filipinos move to impeach President Aquino',
 'Varela inaugurated as Panamanian president',
 'The President of Liberland interviewed',
 'If I were president\r\n',
 'President Bodyguards Only for Rajput, Jats and Sikhs',
 'An illegitimate president',
 'President of the World',
 'Ultareligious hardliner to be Iran s next president',
 'What if Iran s Next President is a Reformist?']

# Word2Vec

In [8]:
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
model = Word2Vec(
        window=10,
        vector_size=50,
        alpha=0.03,
        min_count=2,
        epochs=5
        )

processed_df = news_df['title'].apply(simple_preprocess)
model.build_vocab(processed_df)

In [10]:
# Function to compute sentence vector using mean of word vectors
def get_sentence_vector(tokens, model):
    vectors = [model.wv[word] for word in tokens if isinstance(word, str) and word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

# Apply the function to generate Word2Vec embeddings
processed_df["word2vec_vector"] = processed_df.apply(lambda tokens: get_sentence_vector(tokens, model))

# Convert list of vectors into a NumPy array
X = np.array(processed_df["word2vec_vector"].tolist())  # More efficient stacking

In [11]:
# ---- Query Processing ----
query = "president"  # User prompt
query_tokens = simple_preprocess(query) # Tokenize the query
query_vec = get_sentence_vector(query_tokens, model).reshape(1, -1)  # Convert query to vector

# Compute cosine similarity
similarities = cosine_similarity(query_vec, X)

# Retrieve top N results
n = 10
top_n_indices = np.argsort(similarities[0])[-n:][::-1]  # Get indices of top N highest similarities
top_n_results = news_df['title'].iloc[top_n_indices].tolist()  # Output top N matching texts

# Display results
print("Top Results:")
for i, result in enumerate(top_n_results, 1):
    print(f"{i}. {result}")

Top Results:
1. Uruguay president calls Argentine president an ‘old hag’
2. Ukrainian president speaks to  fake  Kyrgyz president
3. Incumbent EU President Defends Israeli Counterattack
4. Madagascar President Cedes Power 
5. Iran s President Nominates Female Vice President
6. Phillippine President regrets comments regarding President Obama
7. Palestinian Authority president refuses European Union-brokered meeting with Israeli president
8. Tunisian president flees after 23 years in power
9. Ecuador President injured in ‘coup’
10. Haiti protesters urge president s departure


# RNN

In [12]:
from collections import Counter
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics.pairwise import cosine_similarity
from gensim.utils import simple_preprocess
import numpy as np


# Tokenize sentences
samples = news_df.sample(frac=0.25, random_state=42).reset_index(drop=True)
processed_df = samples['title'].apply(simple_preprocess)


# Build vocabulary
word_counts = Counter(word for tokens in processed_df for word in tokens)
vocab = {word: i+1 for i, (word, _) in enumerate(word_counts.most_common())}  # Start indexing from 1 (0 for padding)

# Convert sentences to sequences
def text_to_sequence(tokens, vocab):
    return [vocab[word] for word in tokens if word in vocab]

processed_df = processed_df.apply(lambda tokens: text_to_sequence(tokens, vocab))

# Pad sequences
max_len = max(len(seq) for seq in processed_df)  # Determine max length
def pad_sequence(seq, max_len):
    return seq + [0] * (max_len - len(seq))  # Pad with zeros

processed_df = processed_df.apply(lambda seq: pad_sequence(seq, max_len))

# Convert to tensor
X = torch.tensor(processed_df.tolist(), dtype=torch.long)

# ---- Define RNN Model ----
class RNNEncoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim=50, hidden_dim=64):
        super(RNNEncoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size + 1, embedding_dim, padding_idx=0)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, embedding_dim)  # Output vector size

    def forward(self, x):
        x = self.embedding(x)
        _, h_n = self.rnn(x)  # Get the final hidden state
        output = self.fc(h_n.squeeze(0))  # Pass through a linear layer
        return output

# Initialize model
vocab_size = len(vocab)
model = RNNEncoder(vocab_size)
model.eval()  # No training, just inference

# Generate document embeddings
with torch.no_grad():
    document_embeddings = model(X).numpy()  # Convert to NumPy

In [14]:
# ---- Query Processing ----
query = "president"  # Example user input
query_tokens = simple_preprocess(query)
query_sequence = text_to_sequence(query_tokens, vocab)
query_padded = torch.tensor(pad_sequence(query_sequence, max_len), dtype=torch.long).unsqueeze(0)

# Get embedding for the query
with torch.no_grad():
    query_embedding = model(query_padded).numpy()

# Compute cosine similarity
similarities = cosine_similarity(query_embedding, document_embeddings)

# Retrieve top N results
n = 10
top_n_indices = np.argsort(similarities[0])[-n:][::-1]  # Get top N highest similarities
top_n_results = news_df['title'].iloc[top_n_indices].tolist()  # Output top N matches

# Display results
print("Top Results:")
for i, result in enumerate(top_n_results, 1):
    print(f"{i}. {result}")

Top Results:
1. Austrian authorities reveal find of buried treasure
2. Thaksin says he plans to return to Thailand
3. Venezuela offers use of air base to Russia: Chavez
4. Stoning lawyer, Mohammed Mostafaei, flees Iran

5. Two Young Women Exchange Letters: one from Israel, one from Gaza
6. Greeks drive hard bargain as creditor talks start
7. North Korea threatens to launch pre-emptive strike against U.S.
8. Airline plot trio get life terms
9. Massive gang shooting spree on Vancouver s west side injures 10 people
10. Why Are The French So Determined To Run The IMF? --- If economics isn t your area, this will make the scales fall from your eyes. Superb, concise commentary on eurozone financial troubles and the IMF, by the fund s former chief economist


# LSTM

In [15]:
from collections import Counter
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics.pairwise import cosine_similarity
from gensim.utils import simple_preprocess
import numpy as np


# Tokenize sentences
samples = news_df.sample(frac=0.25, random_state=42).reset_index(drop=True)
processed_df = samples['title'].apply(simple_preprocess)


# Build vocabulary
word_counts = Counter(word for tokens in processed_df for word in tokens)
vocab = {word: i+1 for i, (word, _) in enumerate(word_counts.most_common())}  # Start indexing from 1 (0 for padding)

# Convert sentences to sequences
def text_to_sequence(tokens, vocab):
    return [vocab[word] for word in tokens if word in vocab]

processed_df = processed_df.apply(lambda tokens: text_to_sequence(tokens, vocab))

# Pad sequences
max_len = max(len(seq) for seq in processed_df)  # Determine max length
def pad_sequence(seq, max_len):
    return seq + [0] * (max_len - len(seq))  # Pad with zeros

processed_df = processed_df.apply(lambda seq: pad_sequence(seq, max_len))

# Convert to tensor
X = torch.tensor(processed_df.tolist(), dtype=torch.long)

# ---- Define LSTM Model ----
class LSTMEncoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim=50, hidden_dim=64):
        super(LSTMEncoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size + 1, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, embedding_dim)  # Output vector size

    def forward(self, x):
        x = self.embedding(x)
        _, (h_n, _) = self.lstm(x)  # Get the final hidden state from LSTM
        output = self.fc(h_n.squeeze(0))  # Pass through a linear layer
        return output

# Initialize model
vocab_size = len(vocab)
model = LSTMEncoder(vocab_size)
# model.eval()  # No training, just inference

# Generate document embeddings
with torch.no_grad():
    document_embeddings = model(X).numpy()  # Convert to NumPy

In [16]:
# ---- Query Processing ----
query = "president"  # Example user input
query_tokens = simple_preprocess(query)
query_sequence = text_to_sequence(query_tokens, vocab)
query_padded = torch.tensor(pad_sequence(query_sequence, max_len), dtype=torch.long).unsqueeze(0)

# Get embedding for the query
with torch.no_grad():
    query_embedding = model(query_padded).numpy()

# Compute cosine similarity
similarities = cosine_similarity(query_embedding, document_embeddings)

# Retrieve top N results
n = 10
top_n_indices = np.argsort(similarities[0])[-n:][::-1]  # Get top N highest similarities
top_n_results = news_df['title'].iloc[top_n_indices].tolist()  # Output top N matches

# Display results
print("Top Results:")
for i, result in enumerate(top_n_results, 1):
    print(f"{i}. {result}")

Top Results:
1. The girls with too much skin - see the photos and watch the video
2. Bhutan: Lost in Democracy
3. 10 North Koreans died in IAF attack on Syrian reactor according to Japanese TV reports
4. Two newspapers have published front page apologies for a series of articles about the parents of missing Madeleine McCann.
5. Controversial Bestseller Shakes the Foundation of the Israeli State
6. Chilean Glacier Ice Heist Lands Thieves in Hot Water - 
Police arrested a crime ring as they drove a refrigerated truck filled with over 5 tons of ice stolen from an already dwindling glacier. They ve never seen a case quite like this before.
7. Brainwashed: Don t Trust U.S. Mainstream News
8. BBC Journalism vs Disney Reporting
9. Britain sends warship to Falkland Islands - Buenos Aires slams UK decision & Prince William in  uniform of a conquistador  to disputed island
10. Obama s $30 Billion Pledge to Israel at AIPAC


# Transformers

In [17]:
from transformers import BertTokenizer, BertModel
from gensim.utils import simple_preprocess
import numpy as np
import torch
from tqdm import tqdm

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Tokenize sentences
samples = news_df.sample(frac=0.1, random_state=42).reset_index(drop=True)


# Load pre-trained BERT model and tokenizer
bert_model = BertModel.from_pretrained("bert-base-uncased").to(device)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Function to get BERT embeddings
def get_bert_embedding(text):
    inputs = tokenizer(text['title'], return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    with torch.no_grad():
        outputs = bert_model(**inputs)
        emb = torch.mean(outputs.last_hidden_state, dim=1).squeeze().cpu().numpy()
    return emb  # Extract token representation

# Generate document embeddings using BERT
a=[]
for i,row in samples[:1000].iterrows():
    a.append(get_bert_embedding(row))

# Convert embeddings to a NumPy array
document_embeddings = np.array(a)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [20]:
from sklearn.metrics.pairwise import cosine_similarity

def get_bert_embedding(text):
    print(text)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    with torch.no_grad():
        outputs = bert_model(**inputs)
        emb = torch.mean(outputs.last_hidden_state, dim=1).squeeze().cpu().numpy()
    return emb  # Extract CLS token representation
# ---- Query Processing ----
query = "president"  # Example user input
query_embedding = get_bert_embedding(query).reshape(1, -1)  # Convert query into BERT embedding

# Compute cosine similarity
similarities = cosine_similarity(query_embedding, document_embeddings)

# Retrieve top N results
n = 10
top_n_indices = np.argsort(similarities[0])[-n:][::-1]  # Get top N highest similarities
top_n_results = news_df.iloc[top_n_indices]['title'].tolist()  # Output top N matches

# Display results
print("Top Results:")
for i, result in enumerate(top_n_results, 1):
    print(f"{i}. {result}")

president
Top Results:
1. Columbia Says Rebels Sought To Make Dirty Bomb
2. Australia apologizes to Aborigines
3. Bishop of Rochester reasserts  no-go  claim
4. As Goes Kosovo, So Goes Quebec ?
5. Official: Iraqi $1.2b Oil Deal  With China Thwarted By Invasion Is Expected to Be Re-Signed Next Month, Oil Ministry Says
6. Airport X-Ray Scanners: Mind Having Your Naked Body Photographed And Stored in a Federal Database?
7. Roadside blast kills 13 in NW Pakistan
8. Mainstream Control Media Ignores Scores of Other Shocking Abuse Videos of US Soldiers in Iraq (videos)
9. Bush Hopes Recession Doesn t Affect Sales Of His Memoirs | The Onion 
10. Nicolas Sarkozy threatens to sue Ryanair 
