In [None]:
# ============================================================
# Q1: TEXT PREPROCESSING & REPRESENTATION
# ============================================================

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Download NLTK resources (run only once)
nltk.download('punkt')
# NLTK >= 3.9 loads the tokenizer tables from 'punkt_tab' instead of the
# pickled 'punkt' models; without it word_tokenize raises LookupError.
# On older releases the extra download is harmless.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Example text (can change in exam)
texts = ["Natural Language Processing is amazing!",
         "Language helps humans communicate effectively."]

# --- Step 1: Preprocess text ---
# Pipeline per document: lowercase -> tokenize -> keep alphabetic tokens that
# are not English stop words -> re-join into a single string.
stop_words = set(stopwords.words('english'))  # set gives O(1) membership tests
clean_texts = []
for text in texts:
    tokens = word_tokenize(text.lower())
    # isalpha() drops punctuation and numeric tokens such as "!" or "."
    filtered = [w for w in tokens if w.isalpha() and w not in stop_words]
    # The vectorizers below take whitespace-joined strings, not token lists
    clean_texts.append(" ".join(filtered))
print("Cleaned Texts:", clean_texts)

# --- Step 2: Bag of Words Representation ---
# Fit the vectorizer on the cleaned documents and show the dense count matrix
# together with the learned feature names.
bow = CountVectorizer()
bow_matrix = bow.fit_transform(clean_texts)
bow_dense = bow_matrix.toarray()
bow_features = bow.get_feature_names_out()
print("\nBag of Words Matrix:\n", bow_dense)
print("BOW Features:", bow_features)

# --- Step 3: TF-IDF Representation ---
# Same documents, weighted by TF-IDF instead of raw counts.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(clean_texts)
tfidf_dense = tfidf_matrix.toarray()
tfidf_features = tfidf.get_feature_names_out()
print("\nTF-IDF Matrix:\n", tfidf_dense)
print("TF-IDF Features:", tfidf_features)

"""
# POSSIBLE ERRORS & FIXES (Q1)
1. LookupError: Missing punkt/stopwords → run nltk.download('punkt') and nltk.download('stopwords'); on NLTK 3.9+ word_tokenize also needs nltk.download('punkt_tab')
2. ValueError: Empty vocabulary → ensure texts are not empty or all stopwords
3. Non-English text → may need different tokenizer
"""

# ============================================================
# Q2: SEMANTIC UNDERSTANDING & LANGUAGE MODELING
# ============================================================

from nltk.corpus import wordnet
from collections import Counter

nltk.download('wordnet')
nltk.download('omw-1.4')

# --- Step 1: Synonyms, Antonyms, Hypernyms ---
# Walk every WordNet synset of `word` and collect, as sets of lemma names:
#   synonyms  - all lemmas of each synset
#   antonyms  - all antonyms attached to those lemmas
#   hypernyms - all lemmas of each synset's direct hypernyms
word = "good"
synonyms, antonyms, hypernyms = set(), set(), set()

for syn in wordnet.synsets(word):
    for lemma in syn.lemmas():
        synonyms.add(lemma.name())
        # Keep every antonym of the lemma (not only the first one); an empty
        # antonym list simply adds nothing.
        antonyms.update(ant.name() for ant in lemma.antonyms())
    for hyper in syn.hypernyms():
        hypernyms.update(hyper_lemma.name() for hyper_lemma in hyper.lemmas())

print("\nSynonyms:", synonyms)
print("Antonyms:", antonyms)
print("Hypernyms:", hypernyms)

# --- Step 2: N-Gram Language Model (Bigram + Laplace smoothing) ---
text = "the cat sat on the mat"
tokens = text.split()
# Adjacent token pairs. zip stops at the shorter input, so a text with fewer
# than two tokens just produces an empty list.
bigrams = list(zip(tokens, tokens[1:]))
freq = Counter(bigrams)
vocab = set(tokens)
# Snapshot the unigram counts and vocabulary size now: `tokens` is a generic
# name that later sections rebind, and laplace_prob must keep describing this
# training text. A Counter lookup also avoids rescanning the list per call.
unigram_counts = Counter(tokens)
vocab_size = len(vocab)


def laplace_prob(w1, w2):
    """Return the add-one (Laplace) smoothed bigram estimate of P(w2 | w1).

    Computed as (count(w1, w2) + 1) / (count(w1) + V), where the counts come
    from the training text above and V is its vocabulary size. Words or pairs
    never seen in the text count as 0, so they still get a non-zero
    probability.

    Raises:
        ZeroDivisionError: if the training text was empty (V == 0).
    """
    return (freq[(w1, w2)] + 1) / (unigram_counts[w1] + vocab_size)


print("\nP('sat' | 'cat') =", laplace_prob('cat', 'sat'))
print("P('on' | 'mat') =", laplace_prob('mat', 'on'))

"""
# POSSIBLE ERRORS & FIXES (Q2)
1. LookupError: wordnet not found → nltk.download('wordnet')
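2. ZeroDivisionError → only when the text is empty (no tokens, so the vocabulary size is 0); an unseen word alone still gets a non-zero smoothed probability
3. Text with fewer than two words → no bigrams are produced (no IndexError is raised), so every bigram count is 0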
4. Use lowercase tokens to match word counts properly
"""

# ============================================================
# Q3: INFORMATION EXTRACTION & SENTIMENT ANALYSIS
# ============================================================

import gensim
from nltk import pos_tag, ne_chunk
from nltk.sentiment import SentimentIntensityAnalyzer
from gensim.models import Word2Vec

nltk.download('averaged_perceptron_tagger')
# NLTK >= 3.9 loads the English tagger and the NE chunker from the
# '*_eng' / '*_tab' packages; the legacy names above/below are kept for older
# releases. Without the new packages pos_tag / ne_chunk raise LookupError.
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('maxent_ne_chunker')
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')
nltk.download('vader_lexicon')

# --- Step 1: Named Entity Recognition (NER) ---
# Pipeline: tokenize -> part-of-speech tag -> chunk tagged tokens into a tree
# whose labelled subtrees are the recognised entities.
text = "Elon Musk founded SpaceX in California."
tokens = word_tokenize(text)  # case is preserved here (no lowercasing)
tags = pos_tag(tokens)
ner_tree = ne_chunk(tags)
print("\nNamed Entities:\n", ner_tree)

# --- Step 2: Sentiment Analysis ---
# Score one review sentence with NLTK's SentimentIntensityAnalyzer and print
# the resulting polarity scores.
sia = SentimentIntensityAnalyzer()
review = "The product is great but expensive."
sentiment = sia.polarity_scores(review)
print("\nSentiment Scores:", sentiment)

# --- Step 3: Word Embeddings (Word2Vec Similarity) ---
# Toy corpus: each inner list is one pre-tokenized, lowercased sentence.
sentences = [
    ["nlp", "is", "fun"],
    ["deep", "learning", "is", "powerful"],
    ["language", "models", "learn", "patterns"],
]

# Train 30-dimensional vectors; min_count=1 because every word here is rare.
model = Word2Vec(sentences, vector_size=30, min_count=1)
word_vectors = model.wv
print("\nSimilarity (nlp, learning):", word_vectors.similarity('nlp', 'learning'))
print("Most similar to 'language':", word_vectors.most_similar('language'))

"""
# POSSIBLE ERRORS & FIXES (Q3)
1. LookupError: missing nltk data → run nltk.download() for required modules
2. ModuleNotFoundError: gensim not found → pip install gensim
3. KeyError in similarity → check if word exists in vocabulary
4. Matplotlib error → if visualization required, use %matplotlib inline
"""


In [None]:
# ============================================================
# Q1: TEXT PREPROCESSING & REPRESENTATION
# ============================================================

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Download necessary resources
nltk.download('punkt')
# NLTK >= 3.9 loads the tokenizer tables from 'punkt_tab' instead of the
# pickled 'punkt' models; without it word_tokenize raises LookupError.
# On older releases the extra download is harmless.
nltk.download('punkt_tab')
nltk.download('stopwords')

# --- Predicted Exam Question ---
# Q1: Write a program to preprocess text (tokenize, remove stopwords, lowercase)
# and represent it using Bag-of-Words and TF-IDF. Compare both.

texts = ["Natural Language Processing is amazing!",
         "Language helps humans communicate effectively."]

# Pipeline per document: lowercase -> tokenize -> keep alphabetic tokens that
# are not English stop words -> re-join into a single string.
stop_words = set(stopwords.words('english'))  # set gives O(1) membership tests
clean_texts = []

for text in texts:
    tokens = word_tokenize(text.lower())
    # isalpha() drops punctuation and numeric tokens such as "!" or "."
    filtered = [w for w in tokens if w.isalpha() and w not in stop_words]
    # The vectorizers below take whitespace-joined strings, not token lists
    clean_texts.append(" ".join(filtered))

print("Preprocessed Texts:", clean_texts)

# Bag of Words
# Fit the count vectorizer on the cleaned documents and print the dense matrix.
bow = CountVectorizer()
bow_matrix = bow.fit_transform(clean_texts)
bow_dense = bow_matrix.toarray()
print("\nBag of Words:\n", bow_dense)

# TF-IDF
# Same documents, weighted by TF-IDF instead of raw counts.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(clean_texts)
tfidf_dense = tfidf_matrix.toarray()
print("\nTF-IDF:\n", tfidf_dense)

"""
# Possible Errors (Q1)
1. LookupError: missing punkt/stopwords → run nltk.download('punkt'), nltk.download('stopwords'); on NLTK 3.9+ word_tokenize also needs nltk.download('punkt_tab')
2. ValueError: empty vocabulary → ensure input text not empty
3. UnicodeDecodeError → raised when text is read from a file with the wrong encoding; open the file with the correct one (e.g. encoding='utf-8')
"""

# ============================================================
# Q2: SEMANTIC UNDERSTANDING & LANGUAGE MODELING
# ============================================================

from nltk.corpus import wordnet
from collections import Counter

nltk.download('wordnet')
nltk.download('omw-1.4')

# --- Predicted Exam Question ---
# Q2: Write a program to find synonyms, antonyms, and hypernyms of a given word.
# Then implement a Bigram Language Model with Laplace smoothing.

# Walk every WordNet synset of `word` and collect, as sets of lemma names:
#   synonyms  - all lemmas of each synset
#   antonyms  - all antonyms attached to those lemmas
#   hypernyms - all lemmas of each synset's direct hypernyms
word = "happy"
synonyms, antonyms, hypernyms = set(), set(), set()

for syn in wordnet.synsets(word):
    for lemma in syn.lemmas():
        synonyms.add(lemma.name())
        # Keep every antonym of the lemma (not only the first one); an empty
        # antonym list simply adds nothing.
        antonyms.update(ant.name() for ant in lemma.antonyms())
    for hyper in syn.hypernyms():
        hypernyms.update(hyper_lemma.name() for hyper_lemma in hyper.lemmas())

print("\nSynonyms:", synonyms)
print("Antonyms:", antonyms)
print("Hypernyms:", hypernyms)

# Bigram Model (with Laplace smoothing)
text = "the cat sat on the mat"
tokens = text.split()
# Adjacent token pairs. zip stops at the shorter input, so a text with fewer
# than two tokens just produces an empty list.
bigrams = list(zip(tokens, tokens[1:]))
freq = Counter(bigrams)
vocab = set(tokens)
# Snapshot the unigram counts and vocabulary size now: `tokens` is a generic
# name that later sections rebind, and laplace_prob must keep describing this
# training text. A Counter lookup also avoids rescanning the list per call.
unigram_counts = Counter(tokens)
vocab_size = len(vocab)


def laplace_prob(w1, w2):
    """Return the add-one (Laplace) smoothed bigram estimate of P(w2 | w1).

    Computed as (count(w1, w2) + 1) / (count(w1) + V), where the counts come
    from the training text above and V is its vocabulary size. Words or pairs
    never seen in the text count as 0, so they still get a non-zero
    probability.

    Raises:
        ZeroDivisionError: if the training text was empty (V == 0).
    """
    return (freq[(w1, w2)] + 1) / (unigram_counts[w1] + vocab_size)


print("\nP('sat' | 'cat') =", laplace_prob('cat', 'sat'))
print("P('on' | 'mat') =", laplace_prob('mat', 'on'))

"""
# Possible Errors (Q2)
1. LookupError: wordnet not found → run nltk.download('wordnet')
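2. ZeroDivisionError → only when the text is empty (no tokens, so the vocabulary size is 0); an unseen word alone still gets a non-zero smoothed probability
3. Text with fewer than two words → no bigrams are produced (no IndexError is raised), so every bigram count is 0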
"""

# ============================================================
# Q3: INFORMATION EXTRACTION & SENTIMENT ANALYSIS
# ============================================================

import gensim
from nltk import pos_tag, ne_chunk
from nltk.sentiment import SentimentIntensityAnalyzer
from gensim.models import Word2Vec

nltk.download('averaged_perceptron_tagger')
# NLTK >= 3.9 loads the English tagger and the NE chunker from the
# '*_eng' / '*_tab' packages; the legacy names above/below are kept for older
# releases. Without the new packages pos_tag / ne_chunk raise LookupError.
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('maxent_ne_chunker')
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')
nltk.download('vader_lexicon')

# --- Predicted Exam Question ---
# Q3: Write a program to extract named entities from a text,
# perform sentiment analysis, and visualize word similarities using Word2Vec.

# Pipeline: tokenize -> part-of-speech tag -> chunk tagged tokens into a tree
# whose labelled subtrees are the recognised entities.
text = "Elon Musk founded SpaceX in California."
tokens = word_tokenize(text)  # case is preserved here (no lowercasing)
tags = pos_tag(tokens)
ner = ne_chunk(tags)
print("\nNamed Entities:\n", ner)

# Sentiment Analysis
# Score one sentence with NLTK's SentimentIntensityAnalyzer and print the
# resulting polarity scores.
sia = SentimentIntensityAnalyzer()
sentence = "The movie was great but a bit too long."
scores = sia.polarity_scores(sentence)
print("\nSentiment Scores:", scores)

# Word2Vec Similarity
# Toy corpus: each inner list is one pre-tokenized, lowercased sentence.
sentences = [
    ["nlp", "is", "fun"],
    ["machine", "learning", "is", "powerful"],
    ["language", "models", "learn", "patterns"],
]

# Train 30-dimensional vectors; min_count=1 because every word here is rare.
model = Word2Vec(sentences, vector_size=30, min_count=1)
word_vectors = model.wv
print("\nSimilarity (nlp, learning):", word_vectors.similarity('nlp', 'learning'))
print("Most similar to 'language':", word_vectors.most_similar('language'))

"""
# Possible Errors (Q3)
1. LookupError: missing nltk models → download required resources
2. gensim not installed → pip install gensim
3. KeyError: if word not in vocabulary
"""
