In [4]:
# === Install & download required packages ===
#!pip install --quiet nltk spacy transformers
#!python -m spacy download --quiet en_core_web_sm

# === Imports ===
import numpy as np
import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from transformers import pipeline

# Fetch the NLTK resources this notebook relies on (tokenizer data and the
# stopword list). 'punkt_tab' is requested alongside 'punkt' because it was
# reported as a missing data package when tokenizing.
for _resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(_resource, quiet=True)

# === Q1: NLP Preprocessing Pipeline ===
def nlp_preprocess(sentence):
    """Print each stage of a tokenize -> stopword-removal -> stemming pipeline.

    Stages, each printed under its own heading:
      1. Tokens produced by ``word_tokenize``.
      2. Tokens whose lowercase form is not in NLTK's English stopword list
         (punctuation tokens are not filtered out).
      3. Porter stems of the remaining tokens.

    Args:
        sentence: The text to process.

    Returns:
        None. The function only prints its intermediate results.
    """
    def _show(heading, value):
        # Shared layout: heading line, then the indented value and a blank line.
        print(heading)
        print(" ", value, "\n")

    _show("Original sentence:", sentence)

    # Stage 1: split the sentence into word and punctuation tokens.
    word_tokens = word_tokenize(sentence)
    _show("1) Original Tokens:", word_tokens)

    # Stage 2: case-insensitive stopword filter; a set gives O(1) membership.
    english_stops = set(stopwords.words('english'))
    kept_tokens = list(
        filter(lambda tok: tok.lower() not in english_stops, word_tokens)
    )
    _show("2) Tokens Without Stopwords:", kept_tokens)

    # Stage 3: reduce every surviving token to its Porter stem.
    porter = PorterStemmer()
    stemmed = list(map(porter.stem, kept_tokens))
    _show("3) Stemmed Words:", stemmed)

# Q1 demo: push the assignment sentence through the preprocessing pipeline.
sentence_q1 = (
    "NLP techniques are used in virtual assistants like Alexa and Siri."
)
nlp_preprocess(sentence_q1)



Original sentence:
  NLP techniques are used in virtual assistants like Alexa and Siri. 

1) Original Tokens:
  ['NLP', 'techniques', 'are', 'used', 'in', 'virtual', 'assistants', 'like', 'Alexa', 'and', 'Siri', '.'] 

2) Tokens Without Stopwords:
  ['NLP', 'techniques', 'used', 'virtual', 'assistants', 'like', 'Alexa', 'Siri', '.'] 

3) Stemmed Words:
  ['nlp', 'techniqu', 'use', 'virtual', 'assist', 'like', 'alexa', 'siri', '.'] 



1. Stemming vs. Lemmatization

Stemming crudely chops off word endings using simple rules (e.g. Porter stemmer):

“running” → “run” (a cruder suffix‑stripper might yield “runn”); “techniques” → “techniqu”, which is not a real word

Lemmatization uses vocabulary and morphological analysis to return the base (dictionary) form:

“running” → “run”

2. When to remove stop words

Useful: in tasks like topic modeling or keyword extraction where common words add noise and do not carry meaning.

Harmful: in tasks like sentiment analysis or question answering, where words like “not,” “very,” or “but” carry critical meaning, so removing them can change or even invert the sense of the sentence.

In [5]:

# === Q2: Named Entity Recognition with spaCy ===
# Load the spaCy pipeline once at module level so ner_example() can reuse it
# on every call. The "en_core_web_sm" model must already be installed (see the
# commented-out `spacy download` command at the top of the notebook).
nlp = spacy.load("en_core_web_sm")

def ner_example(text):
    """Run the module-level spaCy pipeline on ``text`` and print its entities.

    For every entity in ``doc.ents`` the surface text, the entity label and
    the (start_char, end_char) character offsets are printed.

    Args:
        text: The sentence to analyse.

    Returns:
        None. Results are printed only.
    """
    parsed = nlp(text)
    print(f"Input sentence:\n  {text}\n")
    print("Detected Entities:")
    for entity in parsed.ents:
        # Build the three-line report first, then emit it with one print call.
        report = (
            f" - Text: '{entity.text}'\n"
            f"   Label: {entity.label_}\n"
            f"   Char pos: ({entity.start_char}, {entity.end_char})\n"
        )
        print(report)

# Q2 demo: extract named entities from the assignment sentence.
text_q2 = (
    "Barack Obama served as the 44th President of the United States "
    "and won the Nobel Peace Prize in 2009."
)
ner_example(text_q2)




Input sentence:
  Barack Obama served as the 44th President of the United States and won the Nobel Peace Prize in 2009.

Detected Entities:
 - Text: 'Barack Obama'
   Label: PERSON
   Char pos: (0, 12)

 - Text: '44th'
   Label: ORDINAL
   Char pos: (27, 31)

 - Text: 'the United States'
   Label: GPE
   Char pos: (45, 62)

 - Text: 'the Nobel Peace Prize'
   Label: WORK_OF_ART
   Char pos: (71, 92)

 - Text: '2009'
   Label: DATE
   Char pos: (96, 100)



**1. NER vs. POS Tagging**  
- **Named Entity Recognition (NER)** locates spans of text that correspond to real‑world “entities” (people, organizations, locations, dates, etc.) and labels each span with its semantic category.  
- **Part‑of‑Speech (POS) Tagging** assigns every single token a grammatical role (noun, verb, adjective, etc.), focusing on syntax rather than semantic identity.

---

**2. Real‑World NER Applications**  
- **Financial News Analytics:** Automatically extract company names, stock tickers, currencies, and event dates from news feeds to drive algorithmic trading, risk monitoring, or sentiment scoring.  
- **Search Engines & QA Systems:** Disambiguate query terms (“Apple” the company vs. the fruit), link queries to knowledge‑graph entities, and return richer, entity‑centric answers and suggestions.

In [6]:
# === Q3: Scaled Dot‑Product Attention ===
def scaled_dot_product_attention(Q, K, V):
    """Compute scaled dot-product attention: softmax(Q K^T / sqrt(d)) V.

    Inputs may be NumPy arrays or array-likes (e.g. nested lists) and may
    carry leading batch/head dimensions, which are broadcast by ``@``.

    Args:
        Q: Queries, shape (..., seq_q, d).
        K: Keys, shape (..., seq_k, d).
        V: Values, shape (..., seq_k, d_v).

    Returns:
        A tuple ``(weights, output)`` where ``weights`` has shape
        (..., seq_q, seq_k) with each row summing to 1, and ``output`` has
        shape (..., seq_q, d_v).

    Raises:
        ValueError: Propagated from NumPy when the shapes are incompatible
            for matrix multiplication.
    """
    Q, K, V = (np.asarray(a) for a in (Q, K, V))
    d = Q.shape[-1]
    # 1) Q @ K^T. Only the last two axes are swapped: `K.T` would reverse
    #    every axis and break inputs with leading batch dimensions.
    scores = Q @ np.swapaxes(K, -1, -2)
    # 2) Scale by sqrt(d) so the logits do not grow with the key dimension.
    scaled_scores = scores / np.sqrt(d)
    # 3) Row-wise softmax; subtracting the row maximum avoids overflow in exp
    #    without changing the result.
    shifted = scaled_scores - np.max(scaled_scores, axis=-1, keepdims=True)
    exp = np.exp(shifted)
    weights = exp / exp.sum(axis=-1, keepdims=True)
    # 4) Weighted sum of the values.
    output = weights @ V
    return weights, output

# Toy inputs: two mutually orthogonal query rows, keys identical to the queries.
Q = np.array(
    [
        [1, 0, 1, 0],
        [0, 1, 0, 1],
    ],
    dtype=float,
)
K = Q.copy()
V = np.array(
    [
        [1, 2, 3, 4],
        [5, 6, 7, 8],
    ],
    dtype=float,
)

# Run attention and show both the softmax weights and the weighted values.
attn_w, out = scaled_dot_product_attention(Q, K, V)
print("Attention weights (after softmax):\n", attn_w, "\n")
print("Final output:\n", out, "\n")



Attention weights (after softmax):
 [[0.73105858 0.26894142]
 [0.26894142 0.73105858]] 

Final output:
 [[2.07576569 3.07576569 4.07576569 5.07576569]
 [3.92423431 4.92423431 5.92423431 6.92423431]] 



**1. Why divide by √d?**  
For high‑dimensional d, the dot products in Q·Kᵀ grow large in magnitude: if the components of q and k are independent with zero mean and unit variance, q·k has variance d. The softmax then becomes extremely peaked, making gradients very small and training unstable. Dividing by √d rescales the dot‑products to roughly unit variance, keeping the softmax output well‑conditioned and gradients healthy.

---

**2. How self‑attention captures word relationships**  
Self‑attention lets each word’s representation attend to every word in the sequence (including itself), i.e. it is recomputed as a weighted sum over all positions. By learning attention weights, the model can directly link related words—handling long‑range dependencies, resolving pronouns, or emphasizing context—without relying solely on sequential recurrences.

In [7]:

# === Q4: Sentiment Analysis with HuggingFace ===
# Build the sentiment classifier once at module level so sentiment_example()
# can reuse it. The checkpoint and revision are pinned explicitly (they are the
# ones transformers reported falling back to when none was given) so results
# stay reproducible and the "No model was supplied" warning is not emitted.
sentiment = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="714eb0f",
)

def sentiment_example(text):
    """Classify ``text`` with the module-level ``sentiment`` pipeline and print it.

    Prints the input, then the ``label`` and the ``score`` (four decimal
    places) of the first result returned by the pipeline.

    Args:
        text: The sentence to classify.

    Returns:
        None. Results are printed only.
    """
    print("Input:", text)
    predictions = sentiment(text)
    # A single input string is passed, so only the first prediction is shown.
    top = predictions[0]
    print(f"Sentiment: {top['label']}")
    print(f"Confidence Score: {top['score']:.4f}")

# Q4 demo: classify the assignment sentence.
text_q4 = "Despite the high price, the performance of the new MacBook is outstanding."
sentiment_example(text_q4)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


Input: Despite the high price, the performance of the new MacBook is outstanding.
Sentiment: POSITIVE
Confidence Score: 0.9998


**1. BERT vs. GPT**  
- **Architecture:**  
  - **BERT** is a **bidirectional encoder‑only** Transformer: every token attends to both left and right context.  
  - **GPT** is a **unidirectional decoder‑only** Transformer: each token can attend only to itself and earlier tokens (left context), enforced by a causal mask.  
- **Which uses which:**  
  - BERT = **Encoder** stack  
  - GPT  = **Decoder** stack  

---

**2. Benefits of Pre‑trained Models**  
- **Rich Linguistic Knowledge:** Learned syntax, semantics, and world knowledge from massive text corpora.  
- **Data Efficiency:** Downstream tasks need far fewer labeled examples (“fine‑tuning” vs. training from scratch).  
- **Compute Savings:** Avoid the enormous cost of training tens or hundreds of millions of parameters from scratch.  
- **Strong Baselines:** Pre‑trained models achieve state‑of‑the‑art performance on diverse NLP tasks with minimal additional effort.