# In [None]:
# ============================================================
# 🧩 NLP PIPELINE — Text Preprocessing → Semantic Understanding → Sentiment & Entity Extraction
# ============================================================

# ✅ 1. INSTALL DEPENDENCIES (run once in Jupyter)
# !pip install nltk spacy textblob scikit-learn pandas numpy matplotlib wordcloud gensim tqdm
# !python -m spacy download en_core_web_sm

# ============================================================
# ✅ IMPORTS
# ============================================================
import nltk
import spacy
import pandas as pd
from nltk.corpus import stopwords, wordnet as wn
from nltk import word_tokenize, ngrams
from collections import Counter
from textblob import TextBlob
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# ============================================================
# ✅ DOWNLOAD NLTK DATA (run once)
# ============================================================
# word_tokenize loads the Punkt sentence models: NLTK 3.9+ reads them from
# 'punkt_tab', older releases from 'punkt', so both are requested here to keep
# the tokenization step working regardless of the installed NLTK version.
for resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
):
    nltk.download(resource)

# ============================================================
# ✅ SAMPLE TEXT INPUT (replace this with your own text in the exam)
# ============================================================
# Input passage analysed by the stages below (tokenization, NER, sentiment).
text = (
    "Apple announced the new iPhone at the event. "
    "Everyone loved it! "
    "The CEO Tim Cook looked happy."
)

# Echo the raw input before any processing is applied.
print("🔹 Original Text:", text, sep="\n")

# ============================================================
# 🧩 Q1 — TEXT PREPROCESSING & REPRESENTATION
# ============================================================
# Step 1: Tokenization — lower-case the text first so later stopword matching
# is case-insensitive, then split it into tokens.
lowered_text = text.lower()
tokens = word_tokenize(lowered_text)

# Step 2: Stopword Removal — keep purely alphabetic tokens (drops punctuation
# and numbers) that are not in NLTK's English stopword list.
stop_words = set(stopwords.words('english'))
filtered_tokens = []
for token in tokens:
    if not token.isalpha() or token in stop_words:
        continue
    filtered_tokens.append(token)

# Step 3: Lemmatization — lemmatize() is called with its default part of
# speech for every token.
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = list(map(lemmatizer.lemmatize, filtered_tokens))

# Step 4: Frequency Distribution — occurrences of each lemma.
word_freq = Counter()
word_freq.update(lemmatized_tokens)

print("\n🔹 Cleaned & Lemmatized Tokens:", lemmatized_tokens, sep="\n")
print("\n🔹 Word Frequency:", word_freq, sep="\n")

# Step 5: WordCloud Visualization
# Keep an explicit reference to the generated cloud and pass it to imshow:
# `_` is only bound by interactive shells (to the last displayed output), so
# it never refers to this cloud and is undefined when run as a script.
# WordCloud cannot be built from zero words, so the plot is skipped when
# preprocessing left no tokens.
if lemmatized_tokens:
    cloud = WordCloud(width=400, height=300, background_color='white')
    cloud.generate(' '.join(lemmatized_tokens))

    plt.figure(figsize=(5, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title("WordCloud of Processed Text")
    plt.show()
else:
    print("No tokens left after preprocessing; skipping WordCloud.")

# ============================================================
# 🧩 Q2 — SEMANTIC UNDERSTANDING & LANGUAGE MODELING
# ============================================================

# Part 1: WordNet — Synonyms, Antonyms, Hypernyms
word = "happy"

# Look the synsets up once; all three relations are derived from them.
synsets = wn.synsets(word)

# Every lemma name across all senses of the word.
synonyms = {lemma.name() for syn in synsets for lemma in syn.lemmas()}

# Every antonym of every lemma (a lemma may list more than one).
antonyms = {
    antonym.name()
    for syn in synsets
    for lemma in syn.lemmas()
    for antonym in lemma.antonyms()
}

# Synset names look like 'word.pos.nn'; keep only the leading word part.
hypernyms = {h.name().split('.')[0] for syn in synsets for h in syn.hypernyms()}

# The header is built from `word` so it stays correct if the query changes.
print(f"\n🔹 WordNet for '{word}'")
print("Synonyms:", synonyms)
print("Antonyms:", antonyms)
print("Hypernyms:", hypernyms)

# Part 2: Simple Bigram Language Model (Laplace Smoothing)
# Build the counts that bigram_prob() reads: per-word counts, adjacent-pair
# counts, and the vocabulary size V used in the smoothing denominator.
unigram_freq = Counter(lemmatized_tokens)
bigrams = [*ngrams(lemmatized_tokens, 2)]
bigram_freq = Counter(bigrams)
vocab = set(unigram_freq)
V = len(vocab)

def bigram_prob(w1, w2):
    """Return the add-one (Laplace) smoothed estimate of P(w2 | w1).

    Reads the module-level ``bigram_freq`` and ``unigram_freq`` counters and
    the vocabulary size ``V``. Words or pairs absent from the counters
    contribute a count of zero, so the result is then ``1 / (count(w1) + V)``.
    """
    pair_count = bigram_freq[(w1, w2)]
    history_count = unigram_freq[w1]
    return (pair_count + 1) / (history_count + V)

# Each entry: (label to print, history word w1, predicted word w2).
# NOTE(review): 'the' is presumably removed as a stopword during
# preprocessing, so the second value would come from smoothing alone — confirm.
examples = (
    ("P(loved | everyone)", 'everyone', 'loved'),
    ("P(new | the)", 'the', 'new'),
)

print("\n🔹 Example Bigram Probabilities:")
for label, history, target in examples:
    print(f"{label} = {bigram_prob(history, target):.4f}")

# ============================================================
# 🧩 Q3 — INFORMATION EXTRACTION & SENTIMENT ANALYSIS
# ============================================================

# Part 1: Named Entity Recognition (NER)
# Run the small English spaCy pipeline over the original (un-lowercased) text
# and list each detected entity with its label.
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

entity_lines = [f"{ent.text} ({ent.label_})" for ent in doc.ents]
# Header and entities are joined so that nothing extra is printed when no
# entities were found.
print("\n".join(["\n🔹 Named Entities:", *entity_lines]))

# Part 2: Sentiment Analysis
# TextBlob scores the raw text; the sign of the polarity picks the label.
blob = TextBlob(text)
sentiment = blob.sentiment
polarity = sentiment.polarity

print("\n🔹 Sentiment Analysis:")
print("Polarity:", polarity)
print("Subjectivity:", sentiment.subjectivity)

if polarity > 0:
    verdict = "Overall Sentiment: 😊 Positive"
elif polarity < 0:
    verdict = "Overall Sentiment: 😠 Negative"
else:
    verdict = "Overall Sentiment: 😐 Neutral"
print(verdict)

# ============================================================
# 🧩 Q3 (Extra) — WORD EMBEDDINGS DEMO (Gensim Word2Vec)
# ============================================================
from gensim.models import Word2Vec

# Train a tiny skip-gram model (sg=1) on the single processed sentence and
# list the nearest neighbours of one query word.
# Two guards keep the demo from crashing when the sample text is replaced:
#   * Word2Vec cannot build a vocabulary from an empty token list;
#   * most_similar raises KeyError for a word outside the trained vocabulary.
query_word = "apple"
similar_words = []

print(f"\n🔹 Word2Vec Similarity for '{query_word}':")
if not lemmatized_tokens:
    print("No tokens available to train Word2Vec.")
else:
    model = Word2Vec([lemmatized_tokens], vector_size=50, window=2, min_count=1, sg=1)
    if query_word in model.wv.key_to_index:
        similar_words = model.wv.most_similar(query_word, topn=3)
        print(similar_words)
    else:
        print(f"'{query_word}' is not in the Word2Vec vocabulary.")

# ============================================================
# ✅ END OF PIPELINE
# ============================================================
# Final status line; the script is sequential, so this is reached only when
# none of the preceding statements raised an exception.
print("\n✅ NLP Pipeline Completed Successfully!")
