In [1]:
# Homework 4 Part C Q1
# Text preprocessing pipeline: tokenization → stopword removal → lemmatization → POS filtering

import spacy
from nltk.corpus import stopwords
import nltk

# Fetch the NLTK stopword corpus only when it is not already installed.
# nltk.data.find raises LookupError when the resource cannot be located in
# any NLTK data directory; only in that case do we go to the network.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Input sentence
text = "John enjoys playing football while Mary loves reading books in the library."

# Run the spaCy pipeline over the sentence to obtain annotated tokens
doc = nlp(text)

# Load the English stopword list from NLTK; a set gives fast membership tests
stop_words = set(stopwords.words('english'))

# Build (surface form, lemma, coarse POS tag) triples for every token that
# passes all three filters, checked in this order:
#   1. the token is purely alphabetic,
#   2. its lowercased text is not in the stopword set,
#   3. its POS tag is NOUN or VERB.
filtered = []
for token in doc:
    if not token.is_alpha:
        continue
    if token.text.lower() in stop_words:
        continue
    if token.pos_ not in ("NOUN", "VERB"):
        continue
    filtered.append((token.text, token.lemma_, token.pos_))

print("Filtered tokens (Word – Lemma – POS):")
for word, lemma, pos in filtered:
    print(f"{word:10} → {lemma:10} ({pos})")

# Optional: only the lemma column of the surviving tokens
lemmas = [entry[1] for entry in filtered]
print("\nLemmatized result:", lemmas)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Filtered tokens (Word – Lemma – POS):
enjoys     → enjoy      (VERB)
playing    → play       (VERB)
football   → football   (NOUN)
reading    → read       (VERB)
books      → book       (NOUN)
library    → library    (NOUN)

Lemmatized result: ['enjoy', 'play', 'football', 'read', 'book', 'library']


In [2]:
# Homework 4 Part C Q2
# Named Entity Recognition + Pronoun ambiguity warning

import spacy

# Load the small English spaCy pipeline
nlp = spacy.load("en_core_web_sm")

text = "Chris met Alex at Apple headquarters in California. He told him about the new iPhone launch."

# Run the pipeline; recognised entity spans are exposed on doc.ents
doc = nlp(text)

print("Named Entities Detected:")
for entity in doc.ents:
    print(f" - {entity.text:25} → {entity.label_}")

# Lowercased text of every token tagged as a pronoun
pronouns = [tok.text.lower() for tok in doc if tok.pos_ == "PRON"]

# Third-person pronouns treated as potentially ambiguous.
# NOTE: this is a presence check only -- any listed pronoun triggers the
# warning; no attempt is made to count or resolve candidate antecedents.
ambiguous_prons = {"he", "she", "they", "him", "her", "them"}
if ambiguous_prons.isdisjoint(pronouns):
    print("\nNo pronoun ambiguity detected.")
else:
    print("\n⚠️ Warning: Possible pronoun ambiguity detected!")


Named Entities Detected:
 - Chris                     → PERSON
 - Alex                      → PERSON
 - Apple                     → ORG
 - California                → GPE
 - iPhone                    → ORG

