# In [None]:
"""
NLP EXAM - COMPLETE READY-TO-USE CODE FOR ALL 3 QUESTIONS
Copy the entire question code and run in Jupyter Notebook
"""

# ============================================================================
# QUESTION 1: TEXT PREPROCESSING & REPRESENTATION (15 MARKS)
# Task: 1. Preprocess text
#       2. Representation of text and comparison of outputs
# ============================================================================

# -------- FIRST RUN THIS CELL (IMPORTS & SETUP) --------
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from collections import Counter
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Download NLTK data (run once). Downloads are best-effort so the notebook
# still runs offline when the data is already installed, but failures are
# reported instead of being silently swallowed.
# 'punkt_tab' is the tokenizer data recent NLTK releases load for
# word_tokenize; older releases use 'punkt' and will simply not find it.
nltk_data = ['punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4']
not_downloaded = []
for item in nltk_data:
    try:
        # In quiet mode nltk.download signals failure through its return value.
        if not nltk.download(item, quiet=True):
            not_downloaded.append(item)
    except Exception as exc:  # e.g. network or file-system errors
        not_downloaded.append(f"{item} ({exc})")

if not_downloaded:
    print(f"Note: NLTK resources not downloaded: {not_downloaded}")

print("✓ Setup Complete for Question 1!")

# -------- THEN RUN THIS CELL (MAIN CODE) --------

# Sample input text (replace with exam question text)
# text1 and text2 are the two documents that are preprocessed, vectorized
# and compared throughout Question 1.
text1 = """Natural Language Processing is an exciting field of artificial intelligence. 
It helps computers understand and process human language. NLP has many applications 
in real world scenarios."""

text2 = """Artificial Intelligence and Machine Learning are transforming technology.
These fields enable computers to learn and make intelligent decisions."""

# ===== PART 1: TEXT PREPROCESSING (5 marks) =====
# Section banner: a 70-character rule above and below the title.
print("="*70)
print("PART 1: TEXT PREPROCESSING")
print("="*70)

def preprocess_text(text):
    """Run the preprocessing pipeline on *text*, printing every stage.

    Stages: lower-case and tokenize, keep only alphanumeric tokens, drop
    English stopwords, then stem and lemmatize the stopword-free tokens.
    The stems are printed for comparison only and are not returned.

    Returns:
        A tuple ``(alphanumeric_tokens, tokens_without_stopwords, lemmas)``.
    """
    def show(heading, items):
        # Every stage is reported the same way: heading + 10-token preview.
        print(f"{heading}:\n{items[:10]}...\n")

    print(f"\n1. Original Text:\n{text}\n")

    # Tokenize the lower-cased text.
    raw_tokens = word_tokenize(text.lower())
    show(f"2. After Tokenization ({len(raw_tokens)} tokens)", raw_tokens)

    # Punctuation tokens are the ones that are not purely alphanumeric.
    alnum_tokens = list(filter(str.isalnum, raw_tokens))
    show(f"3. After Removing Punctuation ({len(alnum_tokens)} tokens)",
         alnum_tokens)

    # Drop English stopwords.
    english_stops = frozenset(stopwords.words('english'))
    content_tokens = [tok for tok in alnum_tokens if tok not in english_stops]
    show(f"4. After Removing Stopwords ({len(content_tokens)} tokens)",
         content_tokens)

    # Stemming and lemmatization both start from the stopword-free tokens.
    stems = list(map(PorterStemmer().stem, content_tokens))
    show("5. After Stemming", stems)

    lemmas = list(map(WordNetLemmatizer().lemmatize, content_tokens))
    show("6. After Lemmatization", lemmas)

    return alnum_tokens, content_tokens, lemmas

# Preprocess both texts; each call prints its own stage-by-stage trace.
banner = "=" * 70

print(f"\n{banner}\nPREPROCESSING TEXT 1:\n{banner}")
tokens1_clean, tokens1_no_stop, tokens1_final = preprocess_text(text1)

print(f"\n{banner}\nPREPROCESSING TEXT 2:\n{banner}")
tokens2_clean, tokens2_no_stop, tokens2_final = preprocess_text(text2)

# ===== PART 2: TEXT REPRESENTATION (5 marks) =====
print(f"\n{banner}\nPART 2: TEXT REPRESENTATION\n{banner}")

# Representation 1: Bag of Words (BoW) -- raw counts of the lemmatized tokens.
print("\n--- Bag of Words Representation ---")
bow1, bow2 = Counter(tokens1_final), Counter(tokens2_final)
for label, bag in (("Text 1", bow1), ("Text 2", bow2)):
    print(f"\n{label} BoW:\n{dict(bag)}")

# Representation 2: Term Frequency (TF)
print("\n--- Term Frequency Representation ---")
def calculate_tf(tokens):
    """Return each token's relative frequency (count / number of tokens).

    The result maps token -> frequency, in order of first occurrence.
    An empty token list yields an empty dict.
    """
    n_tokens = len(tokens)
    frequencies = {}
    for term, occurrences in Counter(tokens).items():
        frequencies[term] = occurrences / n_tokens
    return frequencies

tf1, tf2 = calculate_tf(tokens1_final), calculate_tf(tokens2_final)

# Show the five most frequent terms of each text; the stable sort keeps
# first-occurrence order among equal frequencies.
for label, tf_table in (("Text 1", tf1), ("Text 2", tf2)):
    print(f"\n{label} TF (top 5):")
    ranked = sorted(tf_table.items(), key=lambda pair: -pair[1])
    for term, weight in ranked[:5]:
        print(f"  {term}: {weight:.4f}")

# Representation 3: TF-IDF
print("\n--- TF-IDF Representation ---")
def calculate_tfidf(documents, smooth=False):
    """Calculate TF-IDF for multiple tokenized documents.

    Args:
        documents: list of token lists, one per document.
        smooth: when False (default) use the plain ``log(N / df)`` IDF. With
            that formula a term present in every document gets weight 0, so
            for a two-document corpus every shared term vanishes and the
            cosine similarity of the two TF-IDF vectors is always 0. When
            True use the smoothed form ``log((1 + N) / (1 + df)) + 1``,
            which keeps shared terms at a non-zero weight.

    Returns:
        ``(tfidf_docs, idf)`` where ``tfidf_docs`` is a list of
        ``{word: tf * idf}`` dicts (one per document) and ``idf`` maps every
        word in the corpus to its IDF value.
    """
    n_docs = len(documents)

    # Document frequency: number of documents each word occurs in.
    df = Counter()
    for doc in documents:
        df.update(set(doc))

    if smooth:
        idf = {word: np.log((1 + n_docs) / (1 + freq)) + 1
               for word, freq in df.items()}
    else:
        idf = {word: np.log(n_docs / freq) for word, freq in df.items()}

    # Weight each document's term frequencies by the corpus-wide IDF.
    tfidf_docs = []
    for doc in documents:
        tf = calculate_tf(doc)
        tfidf_docs.append({word: tf_val * idf[word]
                           for word, tf_val in tf.items()})

    return tfidf_docs, idf

documents = [tokens1_final, tokens2_final]
tfidf_results, idf_values = calculate_tfidf(documents)

# Five highest-weighted terms per document (ties keep first-occurrence order).
for doc_number, weights in enumerate(tfidf_results, start=1):
    print(f"\nText {doc_number} TF-IDF (top 5):")
    ranked = sorted(weights.items(), key=lambda pair: -pair[1])
    for term, weight in ranked[:5]:
        print(f"  {term}: {weight:.4f}")

# ===== PART 3: COMPARISON OF REPRESENTATIONS (5 marks) =====
banner = "=" * 70
print(f"\n{banner}\nPART 3: COMPARISON OF REPRESENTATIONS\n{banner}")

# Cosine Similarity
def cosine_similarity(vec1, vec2):
    """Cosine similarity of two sparse vectors given as ``{word: weight}``.

    Both vectors are expanded over the union of their keys (a missing word
    counts as 0). Returns 0.0 when either vector has zero magnitude.
    """
    # Shared axis: every word that appears in at least one vector.
    axis = tuple(set(vec1) | set(vec2))
    left = [vec1.get(term, 0) for term in axis]
    right = [vec2.get(term, 0) for term in axis]

    norm_left = np.sqrt(sum(x ** 2 for x in left))
    norm_right = np.sqrt(sum(y ** 2 for y in right))
    if norm_left == 0 or norm_right == 0:
        return 0.0

    numerator = sum(x * y for x, y in zip(left, right))
    return numerator / (norm_left * norm_right)

# Compare the two texts under each representation.
sim_bow = cosine_similarity(dict(bow1), dict(bow2))
sim_tf = cosine_similarity(tf1, tf2)
sim_tfidf = cosine_similarity(tfidf_results[0], tfidf_results[1])

print("\nSimilarity between Text 1 and Text 2:")
for prefix, value in (
    ("  Using Bag of Words:  ", sim_bow),
    ("  Using TF:            ", sim_tf),
    ("  Using TF-IDF:        ", sim_tfidf),
):
    print(f"{prefix}{value:.4f}")

print("\nComparison Summary:")
for remark in (
    "- Bag of Words: Simple word counts, doesn't consider document length",
    "- TF: Normalized by document length, better for different sized texts",
    "- TF-IDF: Highlights important words unique to documents",
):
    print(remark)
print("\n✓ Question 1 Complete!")


# ============================================================================
# QUESTION 2: SEMANTIC UNDERSTANDING & LANGUAGE MODELING (15 MARKS)
# Task: 1. Extraction of synonyms, antonyms, and hypernyms
#       2. N-Gram Language Model with Laplace smoothing for prediction
# ============================================================================

# -------- FIRST RUN THIS CELL (IMPORTS & SETUP) --------
import nltk
from nltk.corpus import wordnet
from collections import defaultdict, Counter
from nltk.tokenize import word_tokenize
import warnings
warnings.filterwarnings('ignore')

# Download NLTK data. 'punkt_tab' is the tokenizer data recent NLTK releases
# load for word_tokenize; older releases use 'punkt' and will not find it.
# Failed downloads are reported instead of being ignored.
not_downloaded = [
    resource
    for resource in ('wordnet', 'omw-1.4', 'punkt', 'punkt_tab')
    if not nltk.download(resource, quiet=True)
]
if not_downloaded:
    print(f"Note: NLTK resources not downloaded: {not_downloaded}")

print("✓ Setup Complete for Question 2!")

# -------- THEN RUN THIS CELL (MAIN CODE) --------

# ===== PART 1: SEMANTIC ANALYSIS WITH WORDNET (5 marks) =====
print("="*70)
print("PART 1: SEMANTIC ANALYSIS - SYNONYMS, ANTONYMS, HYPERNYMS")
print("="*70)

def get_synonyms(word):
    """Extract synonyms of *word* from WordNet.

    Collects the lemma names of every synset of *word* (underscores replaced
    by spaces), excluding the word itself (case-insensitive).

    Returns:
        Alphabetically sorted list of unique synonyms. Sorting makes the
        result, and any ``[:k]`` preview of it, reproducible between runs;
        plain set order varies with string hash randomization.
    """
    target = word.lower()
    synonyms = set()
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            synonym = lemma.name().replace('_', ' ')
            if synonym.lower() != target:
                synonyms.add(synonym)
    return sorted(synonyms)

def get_antonyms(word):
    """Extract antonyms of *word* from WordNet.

    Every antonym of every lemma in every synset of *word* is collected
    (not just the first antonym of each lemma), with underscores replaced
    by spaces.

    Returns:
        Alphabetically sorted list of unique antonyms (sorted so the output
        is reproducible between runs).
    """
    antonyms = set()
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            for opposite in lemma.antonyms():
                antonyms.add(opposite.name().replace('_', ' '))
    return sorted(antonyms)

def get_hypernyms(word):
    """Extract hypernyms (more general terms) of *word* from WordNet.

    Collects the lemma names of the direct hypernym synsets of every synset
    of *word*, with underscores replaced by spaces.

    Returns:
        Alphabetically sorted list of unique hypernyms (sorted so the output
        is reproducible between runs).
    """
    hypernyms = {
        lemma.name().replace('_', ' ')
        for synset in wordnet.synsets(word)
        for hypernym in synset.hypernyms()
        for lemma in hypernym.lemmas()
    }
    return sorted(hypernyms)

def get_hyponyms(word):
    """Extract hyponyms (more specific terms) of *word* from WordNet.

    Collects the lemma names of the direct hyponym synsets of every synset
    of *word*, with underscores replaced by spaces.

    Returns:
        Alphabetically sorted list of unique hyponyms (sorted so the output
        is reproducible between runs).
    """
    hyponyms = {
        lemma.name().replace('_', ' ')
        for synset in wordnet.synsets(word)
        for hyponym in synset.hyponyms()
        for lemma in hyponym.lemmas()
    }
    return sorted(hyponyms)

# Test words (replace with exam question words)
test_words = ['good', 'happy', 'dog', 'car']

banner = "=" * 70
# (label, extractor, number of items to preview) for each relation shown.
relation_lookups = (
    ("Synonyms", get_synonyms, 5),
    ("Antonyms", get_antonyms, 5),
    ("Hypernyms", get_hypernyms, 5),
    ("Hyponyms", get_hyponyms, 3),
)

for word in test_words:
    print(f"\n{banner}\nWord: '{word}'\n{banner}")

    # Run every lookup first, then report them together.
    found = [(label, fetch(word), limit)
             for label, fetch, limit in relation_lookups]

    print()
    for label, related, limit in found:
        # A non-empty list always has a non-empty preview slice.
        print(f"{label} ({len(related)}): {related[:limit] or 'None found'}")

    # Show the definitions of the first two senses, if any.
    senses = wordnet.synsets(word)
    if senses:
        print("\nDefinitions:")
        for number, sense in enumerate(senses[:2], start=1):
            print(f"  {number}. {sense.definition()}")

# ===== PART 2: N-GRAM LANGUAGE MODEL WITH LAPLACE SMOOTHING (10 marks) =====
print(f"\n{banner}")
print("PART 2: N-GRAM LANGUAGE MODEL WITH LAPLACE SMOOTHING")
print(banner)

class NGramLanguageModel:
    """N-gram Language Model with Laplace Smoothing.

    Stores counts of n-grams and of their (n-1)-token contexts, and exposes
    smoothed / unsmoothed conditional probabilities plus next-word prediction.
    Probability queries never modify the stored counts.
    """

    def __init__(self, n=2):
        """Create an empty model of order *n* (2 = bigram, 3 = trigram, ...)."""
        self.n = n
        self.ngram_counts = defaultdict(int)    # n-gram tuple -> count
        self.context_counts = defaultdict(int)  # (n-1)-gram tuple -> count
        self.vocabulary = set()

    def train(self, text):
        """Train the n-gram model on text.

        The text is lower-cased and tokenized as one sequence, padded with
        (n-1) '<START>' tokens and one '<END>' token; every n-gram and its
        context are then counted. Prints a short training summary.
        """
        tokens = word_tokenize(text.lower())

        self.vocabulary.update(tokens)
        self.vocabulary.update(('<START>', '<END>'))

        padded_tokens = ['<START>'] * (self.n - 1) + tokens + ['<END>']

        for i in range(len(padded_tokens) - self.n + 1):
            ngram = tuple(padded_tokens[i:i + self.n])
            self.ngram_counts[ngram] += 1
            self.context_counts[ngram[:-1]] += 1

        print(f"\nModel Training Complete!")
        print(f"  Vocabulary size: {len(self.vocabulary)}")
        print(f"  Total {self.n}-grams: {sum(self.ngram_counts.values())}")
        print(f"  Unique {self.n}-grams: {len(self.ngram_counts)}")

    def probability_with_smoothing(self, ngram):
        """Return the Laplace-smoothed probability of *ngram*.

        P = (count(ngram) + 1) / (count(context) + |vocabulary|), where the
        context is every token of *ngram* except the last.
        """
        ngram = tuple(ngram)
        context = ngram[:-1]

        # .get() instead of indexing: indexing a defaultdict would insert a
        # zero entry for every queried n-gram and inflate the count tables.
        numerator = self.ngram_counts.get(ngram, 0) + 1
        denominator = self.context_counts.get(context, 0) + len(self.vocabulary)

        return numerator / denominator

    def probability_without_smoothing(self, ngram):
        """Return count(ngram) / count(context), or 0.0 for an unseen context."""
        ngram = tuple(ngram)
        context_total = self.context_counts.get(ngram[:-1], 0)

        if context_total == 0:
            return 0.0

        return self.ngram_counts.get(ngram, 0) / context_total

    def predict_next_word(self, context, top_k=5):
        """Predict the next word given *context* (a sequence of tokens).

        Only the last (n-1) tokens are used; a shorter context is left-padded
        with '<START>'. For a unigram model (n=1) the context is ignored.

        Returns:
            Up to *top_k* tuples ``(word, smoothed_prob, unsmoothed_prob)``,
            ordered by smoothed probability (descending); ties are ordered
            alphabetically so results are reproducible between runs.
        """
        order = self.n - 1
        # Guard order == 0 explicitly: seq[-0:] would keep the whole sequence.
        history = list(context)[-order:] if order else []
        history = ['<START>'] * (order - len(history)) + history
        context = tuple(history)

        predictions = []
        for word in self.vocabulary:
            if word in ('<START>', '<END>'):
                continue  # padding markers are never offered as predictions
            ngram = context + (word,)
            predictions.append((
                word,
                self.probability_with_smoothing(ngram),
                self.probability_without_smoothing(ngram),
            ))

        predictions.sort(key=lambda item: (-item[1], item[0]))

        return predictions[:top_k]

# Training corpus (replace with exam question corpus)
corpus = """
The cat sat on the mat. The dog sat on the log. 
The cat and the dog are friends. They play together every day.
The mat is soft and the log is hard.
"""

banner = "=" * 70


def _print_prediction_table(rows):
    """Print one aligned row per (word, smoothed, unsmoothed) prediction."""
    print(f"\n{'Word':<15} {'With Smoothing':<20} {'Without Smoothing':<20}")
    print("-" * 55)
    for token, smoothed, unsmoothed in rows:
        print(f"{token:<15} {smoothed:<20.6f} {unsmoothed:<20.6f}")


print(f"\nTraining Corpus:\n{corpus}\n")

# Bigram model (n=2): train, then predict the word following 'the'.
print(banner, "BIGRAM MODEL (n=2)", banner, sep="\n")
bigram_model = NGramLanguageModel(n=2)
bigram_model.train(corpus)

context1 = ['the']
joined_context = ' '.join(context1)
print(f"\n--- Predictions after '{joined_context}' ---")
predictions = bigram_model.predict_next_word(context1, top_k=5)
_print_prediction_table(predictions)

# Trigram model (n=3): same corpus, two-word context.
print("\n" + banner, "TRIGRAM MODEL (n=3)", banner, sep="\n")
trigram_model = NGramLanguageModel(n=3)
trigram_model.train(corpus)

context2 = ['the', 'cat']
joined_context = ' '.join(context2)
print(f"\n--- Predictions after '{joined_context}' ---")
predictions = trigram_model.predict_next_word(context2, top_k=5)
_print_prediction_table(predictions)

# Demonstrate the smoothing effect on an n-gram absent from the corpus.
print("\n" + banner, "COMPARISON: IMPACT OF LAPLACE SMOOTHING", banner, sep="\n")
unseen_ngram = ['the', 'zebra']
print(f"\nFor unseen n-gram {unseen_ngram}:")
prob_with = bigram_model.probability_with_smoothing(unseen_ngram)
prob_without = bigram_model.probability_without_smoothing(unseen_ngram)
print(f"  With Laplace Smoothing:    {prob_with:.6f}")
print(f"  Without Smoothing:         {prob_without:.6f}")
print("\nSmoothing prevents zero probabilities for unseen n-grams!")

print("\n✓ Question 2 Complete!")


# ============================================================================
# QUESTION 3: INFORMATION EXTRACTION & SENTIMENT ANALYSIS (20 MARKS)
# Task: 1. Implement NER to extract entities
#       2. Perform Sentiment Analysis
#       3. Implement Word Embeddings and visualize similarity
# ============================================================================

# -------- FIRST RUN THIS CELL (IMPORTS & SETUP) --------
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag, ne_chunk
from nltk.corpus import stopwords
from collections import defaultdict, Counter
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Download NLTK data. Recent NLTK releases load differently named resources
# ('punkt_tab', 'averaged_perceptron_tagger_eng', 'maxent_ne_chunker_tab')
# for word_tokenize / pos_tag / ne_chunk, so both the old and new names are
# requested; whichever set the installed version does not publish is skipped.
downloads = ['punkt', 'punkt_tab',
             'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng',
             'maxent_ne_chunker', 'maxent_ne_chunker_tab',
             'words', 'stopwords']
not_downloaded = [item for item in downloads
                  if not nltk.download(item, quiet=True)]
if not_downloaded:
    # Informational only: one of each old/new pair is expected to be missing.
    print(f"Note: NLTK resources not downloaded: {not_downloaded}")

print("✓ Setup Complete for Question 3!")

# -------- THEN RUN THIS CELL (MAIN CODE) --------

# ===== PART 1: NAMED ENTITY RECOGNITION (5 marks) =====
print("="*70)
print("PART 1: NAMED ENTITY RECOGNITION (NER)")
print("="*70)

def extract_named_entities(text):
    """Extract named entities from text.

    The text is tokenized, POS-tagged and chunked with ``ne_chunk``
    (multi-class mode). Every labelled subtree becomes one entity whose name
    is its tokens joined by spaces.

    Returns:
        ``(entities, chunks)`` where ``entities`` maps entity label -> list of
        entity names and ``chunks`` is the chunk tree returned by
        ``ne_chunk``. The common labels are always present as keys (possibly
        with empty lists); any other label the chunker emits (e.g. LOCATION)
        is added on demand rather than being dropped.
    """
    # Tokenize and POS tag
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)

    # Named Entity Recognition
    chunks = ne_chunk(pos_tags, binary=False)

    # Pre-seed the common types so callers can rely on these keys existing.
    entities = {
        'PERSON': [],
        'ORGANIZATION': [],
        'GPE': [],  # Geo-Political Entity (locations)
        'DATE': [],
        'TIME': [],
        'MONEY': [],
        'FACILITY': []
    }

    for chunk in chunks:
        # Plain (word, tag) tuples have no label(); only entity subtrees do.
        if hasattr(chunk, 'label'):
            entity_name = ' '.join(c[0] for c in chunk)
            entities.setdefault(chunk.label(), []).append(entity_name)

    return entities, chunks

# Test text for NER (replace with exam question text)
ner_texts = [
    "Apple Inc. was founded by Steve Jobs in California. Tim Cook is the current CEO.",
    "Microsoft CEO Satya Nadella announced new products in Seattle last Monday.",
    "The Eiffel Tower in Paris attracts millions of tourists every year."
]

banner = "=" * 70

for number, sentence in enumerate(ner_texts, start=1):
    print(f"\n{banner}\nText {number}: {sentence}\n{banner}")

    entities, chunks = extract_named_entities(sentence)

    # Only entity types that actually matched something are listed.
    print("\nExtracted Entities:")
    for entity_type in entities:
        names = entities[entity_type]
        if not names:
            continue
        print(f"  {entity_type}: {names}")

    # Show POS tags
    pos_tags = pos_tag(word_tokenize(sentence))
    print(f"\nPOS Tags (first 10): {pos_tags[:10]}")

# ===== PART 2: SENTIMENT ANALYSIS (5 marks) =====
print(f"\n{banner}\nPART 2: SENTIMENT ANALYSIS\n{banner}")

class SentimentAnalyzer:
    """Lexicon-based sentiment analyzer.

    Scores each sentiment word found in the text; an immediately preceding
    intensifier multiplies its weight by 1.5 and a preceding negation
    (directly before the word, or directly before its intensifier) flips
    its polarity.
    """

    def __init__(self):
        """Set up the positive/negative lexicons, intensifiers and negations."""
        self.positive_words = {
            'good', 'great', 'excellent', 'amazing', 'wonderful', 'fantastic',
            'awesome', 'brilliant', 'outstanding', 'superb', 'perfect', 'love',
            'loved', 'happy', 'joy', 'joyful', 'delighted', 'beautiful',
            'best', 'better', 'positive', 'nice', 'fine', 'pleasant'
        }

        self.negative_words = {
            'bad', 'terrible', 'awful', 'horrible', 'poor', 'worst',
            'hate', 'hated', 'sad', 'unhappy', 'disappointed', 'disappointing',
            'negative', 'wrong', 'fail', 'failed', 'failure', 'worse',
            'ugly', 'disgusting', 'boring', 'dull', 'unpleasant'
        }

        # Intensifiers
        self.intensifiers = {'very', 'extremely', 'really', 'absolutely', 'incredibly'}

        # Negations
        self.negations = {'not', 'no', 'never', 'neither', "n't", 'nobody', 'nothing'}

    def analyze(self, text):
        """Analyze sentiment of text.

        Tokenizes the lower-cased text and scores it with ``analyze_tokens``;
        see that method for the structure of the returned dict.
        """
        return self.analyze_tokens(word_tokenize(text.lower()))

    def analyze_tokens(self, tokens):
        """Score an already tokenized text.

        Args:
            tokens: sequence of string tokens (lower-cased internally).

        Returns:
            Dict with keys 'sentiment' ('Positive' / 'Negative' / 'Neutral'),
            'positive_score', 'negative_score', 'confidence' (share of the
            winning score, 0.5 when neutral) and 'total_words'.
        """
        tokens = [token.lower() for token in tokens]

        pos_score = 0
        neg_score = 0

        for i, word in enumerate(tokens):
            if word in self.positive_words:
                is_positive = True
            elif word in self.negative_words:
                is_positive = False
            else:
                continue

            # Walk left: an optional intensifier, then an optional negation,
            # so that "not very good" is negated as well as "not good".
            prev = i - 1
            multiplier = 1.0
            if prev >= 0 and tokens[prev] in self.intensifiers:
                multiplier = 1.5
                prev -= 1
            if prev >= 0 and tokens[prev] in self.negations:
                is_positive = not is_positive

            if is_positive:
                pos_score += 1 * multiplier
            else:
                neg_score += 1 * multiplier

        # Determine overall sentiment; a strict winner implies a total > 0.
        total = pos_score + neg_score
        if pos_score > neg_score:
            sentiment = 'Positive'
            confidence = pos_score / total
        elif neg_score > pos_score:
            sentiment = 'Negative'
            confidence = neg_score / total
        else:
            sentiment = 'Neutral'
            confidence = 0.5

        return {
            'sentiment': sentiment,
            'positive_score': pos_score,
            'negative_score': neg_score,
            'confidence': confidence,
            'total_words': len(tokens)
        }

# Test sentiment analysis
sentiment_texts = [
    "This product is amazing and wonderful! I absolutely love it!",
    "This is terrible and disappointing. I hate it.",
    "The weather is okay today.",
    "Not bad, but not great either.",
    "This is not good at all. Very disappointing experience."
]

analyzer = SentimentAnalyzer()
banner = "=" * 70

for number, sentence in enumerate(sentiment_texts, start=1):
    print(f"\n{banner}\nText {number}: {sentence}\n{banner}")

    result = analyzer.analyze(sentence)
    label = result['sentiment']
    positive = result['positive_score']
    negative = result['negative_score']
    certainty = result['confidence']

    print(f"\nSentiment: {label}")
    print(f"Positive Score: {positive:.2f}")
    print(f"Negative Score: {negative:.2f}")
    print(f"Confidence: {certainty:.2%}")

# ===== PART 3: WORD EMBEDDINGS & SIMILARITY (10 marks) =====
print(f"\n{banner}")
print("PART 3: WORD EMBEDDINGS AND SIMILARITY VISUALIZATION")
print(banner)

class SimpleWordEmbeddings:
    """Simple co-occurrence based word embeddings.

    Each word's vector holds its co-occurrence counts with every vocabulary
    word (alphabetical column order) inside a symmetric token window.
    """

    def __init__(self, window_size=2):
        """Create an untrained model; *window_size* tokens are used on each side."""
        self.window_size = window_size
        self.cooccurrence = defaultdict(lambda: defaultdict(int))
        self.vocabulary = set()
        self.word_to_vec = {}

    def train(self, text):
        """Build the co-occurrence matrix and word vectors from text.

        The text is lower-cased and tokenized; non-alphanumeric tokens and
        English stopwords are removed before counting. Vectors for the whole
        vocabulary are rebuilt on every call. Prints a training summary.
        """
        tokens = word_tokenize(text.lower())
        stop_words = set(stopwords.words('english'))
        tokens = [w for w in tokens if w.isalnum() and w not in stop_words]

        self.vocabulary.update(tokens)

        # Count every neighbour within window_size positions on either side.
        for i, word in enumerate(tokens):
            start = max(0, i - self.window_size)
            end = min(len(tokens), i + self.window_size + 1)
            for j in range(start, end):
                if i != j:
                    self.cooccurrence[word][tokens[j]] += 1

        # One column per vocabulary word, in sorted order.
        all_context_words = sorted(self.vocabulary)
        for word in self.vocabulary:
            # .get() so that words without neighbours do not create empty rows.
            row = self.cooccurrence.get(word, {})
            self.word_to_vec[word] = np.array(
                [row.get(context, 0) for context in all_context_words])

        print(f"\nEmbeddings Training Complete!")
        print(f"  Vocabulary size: {len(self.vocabulary)}")
        print(f"  Vector dimension: {len(all_context_words)}")

    def cosine_similarity(self, word1, word2):
        """Cosine similarity between two words' vectors.

        Returns 0.0 when either word is out of vocabulary or has a zero vector.
        """
        if word1 not in self.vocabulary or word2 not in self.vocabulary:
            return 0.0

        vec1 = self.word_to_vec[word1]
        vec2 = self.word_to_vec[word2]

        norm1 = np.linalg.norm(vec1)
        norm2 = np.linalg.norm(vec2)
        if norm1 == 0 or norm2 == 0:
            return 0.0

        return np.dot(vec1, vec2) / (norm1 * norm2)

    def most_similar(self, word, top_k=5):
        """Return up to *top_k* ``(word, similarity)`` pairs most similar to *word*.

        Ordered by similarity (descending); ties are broken alphabetically so
        the ranking is reproducible between runs. An out-of-vocabulary *word*
        yields an empty list.
        """
        if word not in self.vocabulary:
            return []

        similarities = [
            (other_word, self.cosine_similarity(word, other_word))
            for other_word in self.vocabulary
            if other_word != word
        ]
        similarities.sort(key=lambda pair: (-pair[1], pair[0]))
        return similarities[:top_k]

    def visualize_similarity_matrix(self, words):
        """Return an ``len(words) x len(words)`` array of pairwise similarities.

        Entries involving an out-of-vocabulary word are 0.
        """
        n = len(words)
        matrix = np.zeros((n, n))

        for i, word1 in enumerate(words):
            for j, word2 in enumerate(words):
                # cosine_similarity already returns 0.0 for unknown words.
                matrix[i][j] = self.cosine_similarity(word1, word2)

        return matrix

# Training corpus for embeddings
embedding_corpus = """
The cat and dog are animals. Cats and dogs are pets.
A cat is a small animal. A dog is a loyal animal.
People love cats and dogs. Cats are independent. Dogs are friendly.
The cat sleeps on the mat. The dog plays in the yard.
Cats hunt mice. Dogs guard homes.
"""

print(f"\nTraining Corpus for Embeddings:\n{embedding_corpus}\n")

# Train embeddings
embeddings = SimpleWordEmbeddings(window_size=2)
embeddings.train(embedding_corpus)

# Find similar words
test_words = ['cat', 'dog', 'animal']
banner = "=" * 70

for word in test_words:
    print(f"\n{banner}\nMost similar words to '{word}':\n{banner}")

    similar = embeddings.most_similar(word, top_k=5)

    # Ranked table: position, neighbour, cosine similarity.
    print(f"\n{'Rank':<6} {'Word':<15} {'Similarity':<15}")
    print("-" * 40)
    for rank, neighbour in enumerate(similar, start=1):
        neighbour_word, score = neighbour
        print(f"{rank:<6} {neighbour_word:<15} {score:.4f}")

# Similarity matrix visualization (text-based)
print(f"\n{'='*70}")
print("WORD SIMILARITY MATRIX")
print(f"{'='*70}")

# Use 'pets', the form that actually occurs in the training corpus; the
# embeddings are not lemmatized, so 'pet' is out of vocabulary and would
# show up as an all-zero row and column.
compare_words = ['cat', 'dog', 'animal', 'pets']

# Make out-of-vocabulary words visible instead of silently reporting 0.
unknown_words = [w for w in compare_words if w not in embeddings.vocabulary]
if unknown_words:
    print(f"\nNote: not in vocabulary (similarity shown as 0): {unknown_words}")

similarity_matrix = embeddings.visualize_similarity_matrix(compare_words)

# Print header
print(f"\n{'Word':<12}", end='')
for word in compare_words:
    print(f"{word:<12}", end='')
print()
print("-" * (12 + 12 * len(compare_words)))

# Print matrix: one row per word, one 12-character column per word.
for row_word, row in zip(compare_words, similarity_matrix):
    print(f"{row_word:<12}", end='')
    for value in row:
        print(f"{value:<12.4f}", end='')
    print()

# Word pairs comparison
print(f"\n{'='*70}")
print("SPECIFIC WORD PAIR SIMILARITIES")
print(f"{'='*70}")

word_pairs = [
    ('cat', 'dog'),
    ('cat', 'animal'),
    ('dog', 'animal'),
    ('cat', 'pets')
]

for word1, word2 in word_pairs:
    sim = embeddings.cosine_similarity(word1, word2)
    print(f"\nSimilarity between '{word1}' and '{word2}': {sim:.4f}")

# Vector representation sample
banner = "=" * 70
print(f"\n{banner}\nSAMPLE VECTOR REPRESENTATIONS\n{banner}")

sample_word = 'cat'
if sample_word in embeddings.vocabulary:
    vector = embeddings.word_to_vec[sample_word]
    magnitude = np.linalg.norm(vector)
    print(f"\nVector for '{sample_word}' (first 10 dimensions):")
    print(vector[:10])
    print(f"Vector length: {len(vector)}")
    print(f"Vector norm: {magnitude:.4f}")

print("\n✓ Question 3 Complete!")

# Closing banner for the whole notebook.
print("\n" + banner, "ALL QUESTIONS COMPLETED SUCCESSFULLY!", banner, sep="\n")