<a href="https://colab.research.google.com/github/varshitha-janagani/NLP/blob/main/2403a54085_nlp_ass_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Corpus: twenty one-sentence documents, bound to D1 .. D20.
# Note that D16/D18 and D17/D20 carry exactly the same text, so any
# similarity measure computed below yields 1.0 for those pairs.
D1 = "The football team celebrated a great win."
D2 = "The cricket team enjoyed a big victory."
D3 = "The government introduced a new policy."
D4 = "Doctors recommend daily exercise for health."
D5 = "Players trained hard for the match."
D6 = "The team prepared well for the game."
D7 = "Fans cheered loudly during the match."
D8 = "Regular exercise keeps the body strong."
D9 = "A balanced diet improves overall health."
D10 = "Healthy food supports good lifestyle habits."
D11 = "Fitness programs encourage active living."
D12 = "Artificial intelligence supports smart innovation."
D13 = "Exercise and diet support good health."
D14 = "Healthy habits improve daily lifestyle."
D15 = "The team celebrated another great victory."
D16 = "Computers help people solve problems quickly."
D17 = "Digital innovation supports better communication."
D18 = "Computers help people solve problems quickly."
D19 = "Smart devices make life easier for users."
D20 = "Digital innovation supports better communication."

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Gather the individual sentences into one ordered corpus (Doc 1 .. Doc 20).
documents = [
    D1, D2, D3, D4, D5,
    D6, D7, D8, D9, D10,
    D11, D12, D13, D14, D15,
    D16, D17, D18, D19, D20,
]

# Echo the corpus with 1-based labels so the similarity matrices printed
# later can be read against the original sentences.
print("Sample Documents:")
for number, text in enumerate(documents, start=1):
    print(f"Doc {number}: {text}")

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data packages used by the helpers below (run once per
# environment). 'punkt_tab' was added after an NLTK error message asked for it.
for resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(resource)

# 1. Lowercasing
def to_lowercase(text):
    """
    Return a lowercased copy of `text`.
    Purpose: makes later comparisons case-insensitive, so that
    'Football' and 'football' end up as the same token.
    """
    lowered = text.lower()
    return lowered

# 2. Remove punctuation & numbers
def remove_punct_numbers(text):
    """
    Delete every character that is neither an ASCII letter nor whitespace.
    Purpose: strips digits, punctuation and other symbols so only word
    characters remain. Removed characters are dropped, not replaced, so
    the whitespace around them is left as-is.
    """
    non_letter_pattern = r'[^a-zA-Z\s]'
    cleaned = re.sub(non_letter_pattern, '', text)
    return cleaned

# 3. Tokenization
def tokenize(text):
    """
    Break `text` into a list of tokens using NLTK's `word_tokenize`.
    Purpose: gives the word-level units that the stopword filter and
    lemmatizer operate on.
    """
    tokens = word_tokenize(text)
    return tokens

# 4. Remove stopwords
def remove_stopwords(tokens, stop_words=None):
    """
    Drop common function words such as 'the', 'is', 'and' from a token list.
    Purpose: These words add little meaning and can distort similarity.

    Args:
        tokens: Iterable of word strings. The membership test is
            case-sensitive, so tokens should already be lowercased.
        stop_words: Optional collection of words to remove. When None
            (the default) NLTK's English stopword list is loaded, which is
            the behaviour this function always had. Passing a prebuilt
            collection avoids re-reading the corpus on every call and
            allows custom lists.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Convert to a set so each membership check is a hash lookup.
    blocked = set(stop_words)
    return [word for word in tokens if word not in blocked]

# 5. (Optional) Lemmatization
def lemmatize(tokens, pos='n'):
    """
    Reduce words to their base form (lemma) with NLTK's WordNetLemmatizer.
    Purpose: Groups word variants together for better matching.

    The lemmatizer is part-of-speech specific. With the default pos='n'
    every token is treated as a noun, so plural nouns are reduced
    ('wins' → 'win') but verb or adjective forms are generally left
    unchanged. Reductions such as 'running' → 'run' need pos='v', and
    'better' → 'good' needs pos='a'.

    Args:
        tokens: Iterable of word strings.
        pos: WordNet part-of-speech tag applied to every token
            ('n' noun, 'v' verb, 'a' adjective, 'r' adverb).
            Defaults to 'n', which is what the lemmatizer uses when no
            tag is given.

    Returns:
        A list with one lemma per input token, in the same order.
    """
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word, pos) for word in tokens]

# Combined preprocessing pipeline
def preprocess(text, do_lemmatize=True):
    """
    Run the full cleaning pipeline on one document.

    Steps, in order: lowercase → strip punctuation/digits → tokenize →
    remove stopwords → (optionally) lemmatize.

    Args:
        text: Raw document string.
        do_lemmatize: When True (default) the tokens are passed through
            `lemmatize` as the last step.

    Returns:
        The list of processed tokens.
    """
    cleaned = remove_punct_numbers(to_lowercase(text))
    words = remove_stopwords(tokenize(cleaned))
    return lemmatize(words) if do_lemmatize else words

# Example usage
# Demonstrate the pipeline on a sentence with capitals, a digit and punctuation.
sample_text = "The Football Team celebrated 2 Big Wins!!!"
print("Original:", sample_text)
processed_tokens = preprocess(sample_text)
print("Processed:", processed_tokens)


Sample Documents:
Doc 1: The football team celebrated a great win.
Doc 2: The cricket team enjoyed a big victory.
Doc 3: The government introduced a new policy.
Doc 4: Doctors recommend daily exercise for health.
Doc 5: Players trained hard for the match.
Doc 6: The team prepared well for the game.
Doc 7: Fans cheered loudly during the match.
Doc 8: Regular exercise keeps the body strong.
Doc 9: A balanced diet improves overall health.
Doc 10: Healthy food supports good lifestyle habits.
Doc 11: Fitness programs encourage active living.
Doc 12: Artificial intelligence supports smart innovation.
Doc 13: Exercise and diet support good health.
Doc 14: Healthy habits improve daily lifestyle.
Doc 15: The team celebrated another great victory.
Doc 16: Computers help people solve problems quickly.
Doc 17: Digital innovation supports better communication.
Doc 18: Computers help people solve problems quickly.
Doc 19: Smart devices make life easier for users.
Doc 20: Digital innovation supports 

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Original: The Football Team celebrated 2 Big Wins!!!
Processed: ['football', 'team', 'celebrated', 'big', 'win']


In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Corpus: all twenty sentences D1 .. D20 (not just the first few).
documents = [
    D1, D2, D3, D4, D5,
    D6, D7, D8, D9, D10,
    D11, D12, D13, D14, D15,
    D16, D17, D18, D19, D20,
]

# Fit a Bag-of-Words model. CountVectorizer applies its own tokenization and
# built-in English stopword list here; the custom preprocess() helper is not
# involved, so inflected forms such as 'improve'/'improves' remain separate
# vocabulary entries.
vectorizer = CountVectorizer(stop_words='english')
bow_matrix = vectorizer.fit_transform(documents)

# Vocabulary learned from the corpus (one column per unique word).
vocabulary = vectorizer.get_feature_names_out()
print("Vocabulary:", vocabulary)

# Dense document-by-term count matrix (one row per document).
print("\nBag-of-Words Matrix:")
dense_counts = bow_matrix.toarray()
print(dense_counts)


Vocabulary: ['active' 'artificial' 'balanced' 'better' 'big' 'body' 'celebrated'
 'cheered' 'communication' 'computers' 'cricket' 'daily' 'devices' 'diet'
 'digital' 'doctors' 'easier' 'encourage' 'enjoyed' 'exercise' 'fans'
 'fitness' 'food' 'football' 'game' 'good' 'government' 'great' 'habits'
 'hard' 'health' 'healthy' 'help' 'improve' 'improves' 'innovation'
 'intelligence' 'introduced' 'keeps' 'life' 'lifestyle' 'living' 'loudly'
 'make' 'match' 'new' 'overall' 'people' 'players' 'policy' 'prepared'
 'problems' 'programs' 'quickly' 'recommend' 'regular' 'smart' 'solve'
 'strong' 'support' 'supports' 'team' 'trained' 'users' 'victory' 'win']

Bag-of-Words Matrix:
[[0 0 0 ... 0 0 1]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 0 0]]


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Corpus: all twenty sentences D1 .. D20.
documents = [
    D1, D2, D3, D4, D5,
    D6, D7, D8, D9, D10,
    D11, D12, D13, D14, D15,
    D16, D17, D18, D19, D20,
]

# Step 5: Bag-of-Words representation (English stopwords removed).
vectorizer = CountVectorizer(stop_words='english')
bow_matrix = vectorizer.fit_transform(documents)

# Step 6: pairwise cosine similarity between every pair of document vectors.
cosine_sim = cosine_similarity(X=bow_matrix, Y=bow_matrix)

# Label rows and columns Doc1 .. DocN so the matrix is readable.
doc_labels = [f"Doc{position}" for position in range(1, len(documents) + 1)]
cosine_df = pd.DataFrame(cosine_sim, index=doc_labels, columns=doc_labels)

print("Cosine Similarity Matrix (Bag-of-Words):")
print(cosine_df)


Cosine Similarity Matrix (Bag-of-Words):
           Doc1      Doc2  Doc3  Doc4  Doc5      Doc6  Doc7  Doc8  Doc9  \
Doc1   1.000000  0.200000   0.0   0.0  0.00  0.258199  0.00   0.0   0.0   
Doc2   0.200000  1.000000   0.0   0.0  0.00  0.258199  0.00   0.0   0.0   
Doc3   0.000000  0.000000   1.0   0.0  0.00  0.000000  0.00   0.0   0.0   
Doc4   0.000000  0.000000   0.0   1.0  0.00  0.000000  0.00   0.2   0.2   
Doc5   0.000000  0.000000   0.0   0.0  1.00  0.000000  0.25   0.0   0.0   
Doc6   0.258199  0.258199   0.0   0.0  0.00  1.000000  0.00   0.0   0.0   
Doc7   0.000000  0.000000   0.0   0.0  0.25  0.000000  1.00   0.0   0.0   
Doc8   0.000000  0.000000   0.0   0.2  0.00  0.000000  0.00   1.0   0.0   
Doc9   0.000000  0.000000   0.0   0.2  0.00  0.000000  0.00   0.0   1.0   
Doc10  0.000000  0.000000   0.0   0.0  0.00  0.000000  0.00   0.0   0.0   
Doc11  0.000000  0.000000   0.0   0.0  0.00  0.000000  0.00   0.0   0.0   
Doc12  0.000000  0.000000   0.0   0.0  0.00  0.000000  0.00

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Corpus: all twenty sentences D1 .. D20.
documents = [
    D1, D2, D3, D4, D5,
    D6, D7, D8, D9, D10,
    D11, D12, D13, D14, D15,
    D16, D17, D18, D19, D20,
]

# Tokenize documents into sets of words
def jaccard_similarity(doc1, doc2):
    """
    Jaccard similarity between the word sets of two documents.

    Each document is lowercased, split on whitespace, and every token has
    leading/trailing punctuation stripped, so 'lifestyle.' at the end of a
    sentence matches 'lifestyle' in the middle of another. Tokens that
    consist only of punctuation are discarded. Stopwords are NOT removed.

    Args:
        doc1: First document as a string.
        doc2: Second document as a string.

    Returns:
        |A ∩ B| / |A ∪ B| as a float in [0, 1]. When neither document
        contains any word the union is empty and 0.0 is returned instead
        of raising ZeroDivisionError.
    """
    import string  # local import keeps this cell runnable on its own

    def _word_set(text):
        # Strip punctuation from token edges only; inner characters such
        # as the hyphen in 'well-known' are kept.
        stripped = (token.strip(string.punctuation) for token in text.lower().split())
        return {word for word in stripped if word}

    set1, set2 = _word_set(doc1), _word_set(doc2)
    union = set1 | set2
    if not union:
        # No words on either side: nothing to compare.
        return 0.0
    return len(set1 & set2) / len(union)

# Build the full N x N Jaccard matrix: entry [r][c] compares document r with
# document c (so the diagonal compares each document with itself).
n = len(documents)
jaccard_matrix = [
    [jaccard_similarity(row_doc, col_doc) for col_doc in documents]
    for row_doc in documents
]

# Wrap in a DataFrame with Doc1 .. DocN labels on both axes for readability.
doc_labels = [f"Doc{position}" for position in range(1, n+1)]
jaccard_df = pd.DataFrame(jaccard_matrix, index=doc_labels, columns=doc_labels)

print("Jaccard Similarity Matrix:")
print(jaccard_df)


Jaccard Similarity Matrix:
           Doc1      Doc2      Doc3      Doc4      Doc5      Doc6      Doc7  \
Doc1   1.000000  0.272727  0.181818  0.000000  0.083333  0.181818  0.083333   
Doc2   0.272727  1.000000  0.181818  0.000000  0.083333  0.181818  0.083333   
Doc3   0.181818  0.181818  1.000000  0.000000  0.090909  0.090909  0.090909   
Doc4   0.000000  0.000000  0.000000  1.000000  0.090909  0.090909  0.000000   
Doc5   0.083333  0.083333  0.090909  0.090909  1.000000  0.200000  0.200000   
Doc6   0.181818  0.181818  0.090909  0.090909  0.200000  1.000000  0.090909   
Doc7   0.083333  0.083333  0.090909  0.000000  0.200000  0.090909  1.000000   
Doc8   0.083333  0.083333  0.090909  0.090909  0.090909  0.090909  0.090909   
Doc9   0.083333  0.083333  0.090909  0.090909  0.000000  0.000000  0.000000   
Doc10  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
Doc11  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
Doc12  0.000000  0.000000

In [5]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Make sure the NLTK data packages this cell relies on are present
# (already-installed packages are reported as up to date).
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Dataset: the same twenty sentences as D1 .. D20, written out literally so
# this cell does not depend on the variables defined in the first cell.
# Entries 16/18 and 17/20 (1-based) are identical texts.
documents = [
    "The football team celebrated a great win.",
    "The cricket team enjoyed a big victory.",
    "The government introduced a new policy.",
    "Doctors recommend daily exercise for health.",
    "Players trained hard for the match.",
    "The team prepared well for the game.",
    "Fans cheered loudly during the match.",
    "Regular exercise keeps the body strong.",
    "A balanced diet improves overall health.",
    "Healthy food supports good lifestyle habits.",
    "Fitness programs encourage active living.",
    "Artificial intelligence supports smart innovation.",
    "Exercise and diet support good health.",
    "Healthy habits improve daily lifestyle.",
    "The team celebrated another great victory.",
    "Computers help people solve problems quickly.",
    "Digital innovation supports better communication.",
    "Computers help people solve problems quickly.",
    "Smart devices make life easier for users.",
    "Digital innovation supports better communication."
]

# Load NLTK's English stopword list once and keep it as a set; the
# preprocess() helper below reads this module-level name.
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

def preprocess(text):
    """
    Tokenize `text` and keep only alphabetic, non-stopword tokens.

    The text is lowercased, split with `word_tokenize`, and a token is kept
    when it consists solely of letters and is not in the module-level
    `stop_words` set. No lemmatization is applied.

    Note: this definition reuses the name of the earlier
    `preprocess(text, do_lemmatize=True)` helper and replaces it once this
    cell has run.

    Returns:
        List of the surviving tokens in their original order.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if token.isalpha() and token not in stop_words:
            kept.append(token)
    return kept

# WordNet-based similarity between two sentences
def sentence_similarity(sent1, sent2, method="wup"):
    """
    WordNet-based similarity of `sent1` towards `sent2`.

    Both sentences are run through `preprocess`; each remaining word is
    mapped to its first WordNet synset (words with no synset are skipped).
    For every synset of `sent1` the best score against any synset of
    `sent2` is taken, and those best scores are averaged.

    The score is directional: it averages over the words of `sent1` only,
    so sentence_similarity(a, b) may differ from sentence_similarity(b, a).

    Args:
        sent1: First sentence (string).
        sent2: Second sentence (string).
        method: "wup" selects Wu-Palmer similarity; any other value
            selects path similarity.

    Returns:
        The average best-match score as a float, or 0.0 when either
        sentence has no word with a WordNet synset.
    """
    def _first_synsets(words):
        # One WordNet lookup per word; keep the first sense when there is one.
        found = []
        for word in words:
            candidates = wn.synsets(word)
            if candidates:
                found.append(candidates[0])
        return found

    def _pair_score(left, right):
        if method == "wup":
            score = left.wup_similarity(right)
        else:
            score = left.path_similarity(right)
        # The measure may yield None for unrelated synsets; count that as 0.
        return score or 0.0

    synsets1 = _first_synsets(preprocess(sent1))
    synsets2 = _first_synsets(preprocess(sent2))

    if not synsets1 or not synsets2:
        return 0.0

    scores = [max(_pair_score(s1, s2) for s2 in synsets2) for s1 in synsets1]
    return sum(scores) / len(scores)

# Example: score the ten consecutive pairs (Doc1, Doc2), (Doc2, Doc3), ...,
# (Doc10, Doc11) with Wu-Palmer similarity.
pairs = [(documents[k], documents[k + 1]) for k in range(10)]
for pair_number, pair in enumerate(pairs, start=1):
    first_doc, second_doc = pair
    score = sentence_similarity(first_doc, second_doc, method="wup")
    print(f"Pair {pair_number}: {score:.3f}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Pair 1: 0.695
Pair 2: 0.445
Pair 3: 0.242
Pair 4: 0.346
Pair 5: 0.431
Pair 6: 0.367
Pair 7: 0.457
Pair 8: 0.350
Pair 9: 0.436
Pair 10: 0.530
