<a href="https://colab.research.google.com/github/yashaswini1764/NLP/blob/main/NLP_ASS_8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import random
import re

import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer


# Fetch the NLTK data packages this notebook relies on: the stop-word lists
# and the WordNet database (the 'omw-1.4' package is downloaded with them).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:

# Load the training split of 20 Newsgroups, asking scikit-learn to remove
# headers, footers and quotes from every post.
data = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
documents = data.data  # the texts themselves; the rest of `data` is unused here

print(f"Loaded {len(documents)} documents.")


Loaded 11314 documents.


In [4]:
# Shared helpers for preprocess(): built once so they are not recreated per document.
stop_words = set(stopwords.words('english'))  # set for fast membership tests
lemm = WordNetLemmatizer()
# A token is a run of two or more ASCII letters delimited by word boundaries.
token_pattern = re.compile(r"\b[a-zA-Z]{2,}\b")

def preprocess(text):
    """Normalise one raw document into a space-separated string of lemmas.

    The text is lower-cased, split into alphabetic tokens of two or more
    letters with ``token_pattern``, filtered against ``stop_words``, and each
    remaining token is passed through ``lemm.lemmatize``.

    Args:
        text: Raw document string.

    Returns:
        The cleaned tokens joined by single spaces (empty string if none remain).
    """
    words = token_pattern.findall(text.lower())
    # Stop words are dropped before lemmatisation, matching the token order.
    lemmas = (lemm.lemmatize(word) for word in words if word not in stop_words)
    return " ".join(lemmas)

# Clean every document, keeping the original order.
docs_clean = list(map(preprocess, documents))


In [5]:
# Number of topics extracted by both models.
n_topics = 5


# TF-IDF features (vocabulary capped at 10,000 terms) — input for NMF.
tfidf = TfidfVectorizer(max_features=10000)
tfidf_matrix = tfidf.fit_transform(docs_clean)


# Raw term counts (same vocabulary cap) — input for LDA.
count_vect = CountVectorizer(max_features=10000)
count_matrix = count_vect.fit_transform(docs_clean)


# NMF on the TF-IDF matrix: W is the fit_transform result, H the fitted
# components_ that the topic listings below read. Seed fixed for repeatability.
nmf = NMF(n_components=n_topics, random_state=42)
W = nmf.fit_transform(tfidf_matrix)
H = nmf.components_


# LDA on the count matrix, with the same topic count and seed.
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda.fit(count_matrix)


In [6]:
def get_top_words(model, feature_names, n_top_words=10):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays supporting ``argsort()``.
        feature_names: Sequence mapping a column index to its term.
        n_top_words: How many terms to list per topic.

    Returns:
        None. One line per topic is written to stdout, numbered from 1.
    """
    for number, weights in enumerate(model.components_, start=1):
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[j] for j in ranked]
        print(f"Topic {number}: {', '.join(top_terms)}")

# Each model is paired with the vectorizer it was trained on so that the
# component columns line up with the right vocabulary.
print("\n=== NMF Topics ===")
get_top_words(nmf, tfidf.get_feature_names_out())

print("\n=== LDA Topics ===")
get_top_words(lda, count_vect.get_feature_names_out())



=== NMF Topics ===
Topic 1: would, one, year, like, think, people, get, time, good, game
Topic 2: window, file, thanks, anyone, please, program, know, mail, driver, do
Topic 3: god, christian, jesus, bible, people, believe, say, faith, christ, belief
Topic 4: drive, scsi, disk, card, controller, hard, floppy, ide, system, bus
Topic 5: key, chip, encryption, clipper, bit, system, government, phone, escrow, algorithm

=== LDA Topics ===
Topic 1: one, would, get, drive, like, know, use, car, problem, card
Topic 2: file, window, edu, god, use, program, one, also, com, system
Topic 3: space, key, new, year, file, db, information, encryption, program, game
Topic 4: would, people, one, think, say, know, time, like, year, right
Topic 5: ax, max, pl, bhj, giz, cx, wm, chz, ah, sl


In [7]:
def jaccard_similarity(list1, list2):
    """Return the Jaccard similarity of two collections of hashable items.

    Both inputs are reduced to sets, so duplicates and ordering are ignored.
    The score is ``|A ∩ B| / |A ∪ B|``.

    Args:
        list1: First iterable of hashable items.
        list2: Second iterable of hashable items.

    Returns:
        A float in [0, 1]. When both inputs are empty the ratio is undefined;
        0.0 is returned instead of raising ZeroDivisionError.
    """
    a, b = set(list1), set(list2)
    union = a | b
    if not union:
        # Nothing to compare: treat as "no overlap" rather than dividing by zero.
        return 0.0
    return len(a & b) / len(union)

# Fetch each vocabulary once instead of calling get_feature_names_out()
# again for every single word lookup inside the comprehensions.
nmf_vocab = tfidf.get_feature_names_out()
lda_vocab = count_vect.get_feature_names_out()

# Ten highest-weighted terms per topic: [:-11:-1] walks argsort() from the end.
nmf_words = [[nmf_vocab[i] for i in topic.argsort()[:-11:-1]] for topic in nmf.components_]
lda_words = [[lda_vocab[i] for i in topic.argsort()[:-11:-1]] for topic in lda.components_]

# For every NMF topic, report the LDA topic whose top-word set overlaps most.
print("\n=== Topic Similarity (Jaccard) ===")
for i, nmf_t in enumerate(nmf_words):
    sims = [jaccard_similarity(nmf_t, lda_t) for lda_t in lda_words]
    print(f"NMF Topic {i+1} most similar to LDA Topic {np.argmax(sims)+1} (Score={max(sims):.2f})")



=== Topic Similarity (Jaccard) ===
NMF Topic 1 most similar to LDA Topic 4 (Score=0.54)
NMF Topic 2 most similar to LDA Topic 2 (Score=0.18)
NMF Topic 3 most similar to LDA Topic 4 (Score=0.11)
NMF Topic 4 most similar to LDA Topic 1 (Score=0.11)
NMF Topic 5 most similar to LDA Topic 3 (Score=0.11)


In [8]:
# Compare two words through WordNet using the first synset returned for each.
word1 = "economy"
word2 = "finance"

syns1 = wordnet.synsets(word1)
syns2 = wordnet.synsets(word2)

if syns1 and syns2:
    s1 = syns1[0]
    s2 = syns2[0]
    path_sim = s1.path_similarity(s2)
    wup_sim = s1.wup_similarity(s2)

    print(f"Path Similarity between '{word1}' and '{word2}': {path_sim}")
    print(f"Wu-Palmer Similarity between '{word1}' and '{word2}': {wup_sim}")

    # The similarity methods can return None when the two synsets cannot be
    # connected; comparing None with a float would raise TypeError.
    if wup_sim is None:
        print("→ Similarity is undefined for these synsets.")
    elif wup_sim >= 0.8:
        print("→ Words are semantically very close.")
    elif wup_sim >= 0.5:
        print("→ Words are somewhat related.")
    else:
        print("→ Words are not very similar.")
else:
    print("Synsets not found for one or both words.")


Path Similarity between 'economy' and 'finance': 0.09090909090909091
Wu-Palmer Similarity between 'economy' and 'finance': 0.2857142857142857
→ Words are not very similar.


In [9]:

# Draw three documents reproducibly (fixed seed) and split each cleaned text
# on whitespace. `random` is imported with the other modules at the top of
# the notebook rather than in the middle of the analysis.
random.seed(42)
indices = random.sample(range(len(docs_clean)), 3)
docs = [docs_clean[i].split() for i in indices]

def jaccard_doc(d1, d2):
    """Return the Jaccard similarity between two tokenised documents.

    Each document is treated as the set of its distinct tokens and the score
    is ``|S1 ∩ S2| / |S1 ∪ S2|``.

    Args:
        d1: Iterable of tokens for the first document.
        d2: Iterable of tokens for the second document.

    Returns:
        A float in [0, 1]. If both documents have no tokens (a cleaned
        document can be empty) the ratio is undefined; 0.0 is returned
        instead of raising ZeroDivisionError.
    """
    s1, s2 = set(d1), set(d2)
    union = s1 | s2
    if not union:
        # Two empty documents share nothing measurable.
        return 0.0
    return len(s1 & s2) / len(union)


# Every unordered pair among the three sampled documents.
pairs = [(0, 1), (0, 2), (1, 2)]

# Score each pair and report it as soon as it is computed.
scores = {}
for a, b in pairs:
    sim = jaccard_doc(docs[a], docs[b])
    scores[(a, b)] = sim
    print(f"Jaccard Similarity between Doc{a+1} and Doc{b+1}: {sim:.4f}")

# Pick the extreme pairs by score; ties resolve to the earliest pair.
most_similar = max(scores, key=lambda pair: scores[pair])
least_similar = min(scores, key=lambda pair: scores[pair])

print(f"\nMost Similar Pair: Doc{most_similar[0]+1} & Doc{most_similar[1]+1} ({scores[most_similar]:.4f})")
print(f"Least Similar Pair: Doc{least_similar[0]+1} & Doc{least_similar[1]+1} ({scores[least_similar]:.4f})")


Jaccard Similarity between Doc1 and Doc2: 0.0263
Jaccard Similarity between Doc1 and Doc3: 0.0000
Jaccard Similarity between Doc2 and Doc3: 0.0115

Most Similar Pair: Doc1 & Doc2 (0.0263)
Least Similar Pair: Doc1 & Doc3 (0.0000)
