# NLP Text Analysis

Demonstrates natural language processing with spaCy, NLTK, and sentence-transformers.

**Libraries:**
- [spaCy](https://spacy.io/) — Industrial-strength NLP: NER, dependency parsing, tokenization
- [NLTK](https://www.nltk.org/) — Classic NLP toolkit: VADER sentiment, WordNet, stemming
- [Sentence Transformers](https://www.sbert.net/) — Semantic embeddings for search and clustering

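> **Note:** The spaCy model `en_core_web_sm` is required. The notebook tries to download it automatically if it is missing; to install it manually instead, run once:  
> `python -m spacy download en_core_web_sm`  
> The sentence transformer model (~80 MB) downloads automatically on first use.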

In [None]:
import os
import sys
import subprocess

import numpy as np
import matplotlib.pyplot as plt
import nltk

# Fetch the NLTK resources requested by this notebook. quiet=True asks the
# downloader not to print its progress output into the cell.
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'stopwords',
    'vader_lexicon',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
)
for resource in NLTK_RESOURCES:
    nltk.download(resource, quiet=True)

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

%matplotlib inline

In [None]:
# Short factual sentences containing people, places, organisations and dates.
# Used by the spaCy cells (NER, dependency parsing) and as the corpus for the
# sentence-embedding search and similarity heatmap.
TEXTS = [
    "Apple Inc. was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne in Cupertino, California in 1976.",
    "The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France.",
    "NASA's Mars rover Perseverance successfully landed in Jezero Crater on February 18, 2021.",
    "Albert Einstein was a German-born theoretical physicist who developed the theory of relativity.",
    "The Amazon River flows through nine nations in South America and discharges into the Atlantic Ocean.",
]

# Product reviews of mixed polarity. Scored with VADER in the sentiment cells
# and later clustered by sentence embedding.
REVIEWS = [
    "This product is absolutely fantastic! I love everything about it. Highly recommended!",
    "Terrible experience. The item broke after one day. Complete waste of money.",
    "It's okay, nothing special. Does what it's supposed to do, I guess.",
    "Outstanding quality and fast shipping. Will definitely buy again!",
    "Not what I expected. The description was misleading and customer service was unhelpful.",
    "Pretty good overall. A few minor issues but nothing dealbreaking.",
    "Worst purchase ever. Avoid at all costs. Save your money.",
    "Exceeded all expectations. Remarkable craftsmanship and attention to detail.",
]

# Report the size of both sample collections.
print(f"Corpus: {len(TEXTS)} fact sentences, {len(REVIEWS)} product reviews")

## spaCy: Named Entity Recognition and Dependency Parsing

spaCy's NLP pipeline processes text into `Doc` objects with rich linguistic annotations.

In [None]:
import spacy

# Load the small English pipeline. spacy.load raises OSError when the model
# package is not installed, in which case we download it with the same
# interpreter that runs this kernel and try again.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    print("Downloading en_core_web_sm...")
    try:
        subprocess.run([sys.executable, '-m', 'spacy', 'download', 'en_core_web_sm'],
                       check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as err:
        # The downloader's output is captured, so without this the only
        # visible symptom of a failed download would be "non-zero exit
        # status". Show what the subprocess reported, then re-raise unchanged.
        print(err.stderr or err.stdout, file=sys.stderr)
        raise
    nlp = spacy.load('en_core_web_sm')

print(f"Pipeline components: {nlp.pipe_names}")

In [None]:
# Named Entity Recognition
# Run every fact sentence through the pipeline, print its entities, and keep
# a flat list of (text, label, label description) tuples across all sentences.
all_entities = []
for text in TEXTS:
    doc = nlp(text)
    entities = []
    for ent in doc.ents:
        description = spacy.explain(ent.label_)
        entities.append((ent.text, ent.label_, description))
    all_entities += entities

    # Heading: the first 75 characters of the sentence followed by "...".
    print(f"\n{text[:75]}...")
    for ent_text, label, explanation in entities:
        print(f"  [{label:8s}] {ent_text!r:35s} — {explanation}")

In [None]:
# Dependency parsing on sentence 1
# Prints one row per non-punctuation token: its text, dependency relation,
# coarse part of speech, and the text of its syntactic head.
first_sentence = TEXTS[0]
doc = nlp(first_sentence)
print(f"Sentence: {first_sentence}\n")

header = f"{'Token':<20} {'Dep':<14} {'POS':<10} {'Head'}"
print(header)
print("-" * 60)
for token in doc:
    if token.is_punct:
        continue  # punctuation rows are left out of the table
    print(f"{token.text:<20} {token.dep_:<14} {token.pos_:<10} {token.head.text!r}")

In [None]:
# Token-level linguistic features
# Reuses `doc` (the parsed first sentence) from the dependency-parsing cell
# and lists text, lemma, coarse POS and stop-word flag for each token.
header = f"{'Token':<20} {'Lemma':<20} {'POS':<10} {'Is Stop'}"
print(header)
print("-" * 60)

# Whitespace and punctuation tokens are left out of the table.
content_tokens = (tok for tok in doc if not (tok.is_space or tok.is_punct))
for token in content_tokens:
    print(f"{token.text:<20} {token.lemma_:<20} {token.pos_:<10} {token.is_stop}")

## NLTK: Tokenization, Preprocessing, and Sentiment Analysis

NLTK provides classical NLP primitives and the VADER lexicon-based sentiment analyzer.

In [None]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()


def _wordnet_pos(treebank_tag):
    """Map a Penn Treebank tag (as returned by nltk.pos_tag) to a WordNet POS.

    Tags starting with J/V/R map to adjective/verb/adverb; everything else
    falls back to noun, which is also the lemmatizer's own default.
    """
    return {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'R': wordnet.ADV,
    }.get(treebank_tag[:1], wordnet.NOUN)


text = "The quick brown foxes were jumping over the lazily sleeping dogs in the park."
words = word_tokenize(text)

# Tag the full token sequence before filtering so the tagger sees each word
# in its sentence context.
tagged = nltk.pos_tag(words)
kept = [(w, tag) for w, tag in tagged
        if w.isalpha() and w.lower() not in stop_words]

filtered = [w for w, _ in kept]
# Without a POS the lemmatizer treats every word as a noun, so verb forms
# such as "jumping" would be left unchanged; pass the tagged POS instead.
lemmatized = [lemmatizer.lemmatize(w.lower(), pos=_wordnet_pos(tag)) for w, tag in kept]
stemmed = [stemmer.stem(w.lower()) for w in filtered]

print(f"Original  : {text}")
print(f"Tokens    : {words}")
print(f"Filtered  : {filtered}")
print(f"Lemmatized: {lemmatized}")
print(f"Stemmed   : {stemmed}")

In [None]:
# VADER sentiment analysis
# Scores every review, labels it from the compound score, prints a table row,
# and stores one record per review in `sentiments` for the following cells.
sia = SentimentIntensityAnalyzer()
sentiments = []

header = f"{'Review':<55} {'Compound':>9} {'Label':>10}"
print(header)
print("-" * 77)
for review in REVIEWS:
    scores = sia.polarity_scores(review)
    compound = scores['compound']

    # compound >= 0.05 is positive, <= -0.05 is negative, otherwise neutral.
    if compound >= 0.05:
        label = 'positive'
    elif compound <= -0.05:
        label = 'negative'
    else:
        label = 'neutral'

    # Keep the review text and label alongside all raw VADER scores.
    record = {'text': review, 'compound': compound, 'label': label}
    record.update(scores)
    sentiments.append(record)

    # Shorten long reviews so they fit the 55-character column.
    if len(review) > 55:
        short = review[:53] + '..'
    else:
        short = review
    print(f"{short:<55} {compound:>9.3f} {label:>10}")

In [None]:
# Two side-by-side charts built from `sentiments`: label counts on the left,
# per-review compound scores on the right.
labels = ['positive', 'neutral', 'negative']
colors = ['#2ecc71', '#95a5a6', '#e74c3c']
positive_color, neutral_color, negative_color = colors

# Count how many reviews received each label, in display order.
assigned_labels = [entry['label'] for entry in sentiments]
counts = [assigned_labels.count(name) for name in labels]

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
dist_ax, score_ax = axes

dist_ax.bar(labels, counts, color=colors, edgecolor='black', alpha=0.85)
dist_ax.set_title("Sentiment Distribution (VADER)")
dist_ax.set_ylabel("Count")

compound_scores = [entry['compound'] for entry in sentiments]

# Colour each bar with the same thresholds used to assign the labels.
bar_colors = []
for score in compound_scores:
    if score >= 0.05:
        bar_colors.append(positive_color)
    elif score <= -0.05:
        bar_colors.append(negative_color)
    else:
        bar_colors.append(neutral_color)

score_ax.barh(range(len(compound_scores)), compound_scores,
              color=bar_colors, edgecolor='black', alpha=0.85)

# Reference lines: zero, then the positive and negative label thresholds.
for x_pos, line_color, line_style in (
    (0, 'black', '--'),
    (0.05, 'green', ':'),
    (-0.05, 'red', ':'),
):
    score_ax.axvline(x=x_pos, color=line_color, linewidth=0.8, linestyle=line_style)

review_positions = range(len(REVIEWS))
score_ax.set_yticks(review_positions)
score_ax.set_yticklabels([f"Review {i+1}" for i in review_positions], fontsize=9)
score_ax.set_xlabel("Compound Score")
score_ax.set_title("Per-Review Compound Scores")

plt.tight_layout()
plt.show()

## NLTK: WordNet — Synonyms, Antonyms, and Definitions

In [None]:
# For each sample word, print the definitions of its first three synsets and
# collect the synonyms and antonyms found on those synsets' lemmas.
for word in ('happy', 'fast', 'beautiful'):
    synsets = wordnet.synsets(word)
    print(f"\n'{word}' — {len(synsets)} synset(s):")

    synonyms = set()
    antonyms = set()
    for syn in synsets[:3]:
        print(f"  [{syn.name()}] {syn.definition()}")
        lemmas = syn.lemmas()
        # Lemma names use underscores for multi-word entries; show spaces.
        synonyms.update(lem.name().replace('_', ' ') for lem in lemmas)
        antonyms.update(
            ant.name().replace('_', ' ')
            for lem in lemmas
            for ant in lem.antonyms()
        )

    # At most eight synonyms are shown, in alphabetical order.
    shown_synonyms = sorted(synonyms)[:8]
    print(f"  Synonyms : {', '.join(shown_synonyms)}")
    if antonyms:
        print(f"  Antonyms : {', '.join(sorted(antonyms))}")

## Sentence Transformers: Semantic Similarity and Search

Unlike keyword matching, sentence embeddings capture *meaning*. Similar sentences cluster
together in vector space regardless of exact wording.

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the sentence encoder and embed the fact sentences once; the search and
# heatmap cells below reuse `model` and `corpus_embeddings`.
model = SentenceTransformer('all-MiniLM-L6-v2')
corpus_embeddings = model.encode(TEXTS)

n_sentences = len(TEXTS)
embedding_shape = corpus_embeddings.shape
print(f"Encoded {n_sentences} sentences → shape {embedding_shape}")

In [None]:
# Semantic search: embed each question and report the corpus sentence whose
# embedding has the highest cosine similarity to it.
QUERIES = [
    "Who founded Apple?",
    "Where is the Eiffel Tower located?",
    "What did Einstein develop?",
]

query_embeddings = model.encode(QUERIES)
for query, q_emb in zip(QUERIES, query_embeddings):
    # One row of similarities: this query against every corpus sentence.
    similarity_row = cosine_similarity([q_emb], corpus_embeddings)
    scores = similarity_row[0]
    best_idx = int(scores.argmax())
    best_score = scores[best_idx]
    best_text = TEXTS[best_idx]
    print(f"\nQuery : {query!r}")
    print(f"Match : [{best_score:.3f}] {best_text}")

In [None]:
# Pairwise cosine similarity between all fact sentences, drawn as an
# annotated heatmap.
sim_matrix = cosine_similarity(corpus_embeddings)

# Axis labels: the first 35 characters of each sentence followed by "...".
short_labels = []
for sentence in TEXTS:
    short_labels.append(sentence[:35] + '...')
tick_positions = range(len(TEXTS))

fig, ax = plt.subplots(figsize=(8, 7))
im = ax.imshow(sim_matrix, cmap='Blues', vmin=0, vmax=1)
plt.colorbar(im, ax=ax, label='Cosine Similarity')

ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(short_labels, rotation=30, ha='right', fontsize=8)
ax.set_yticklabels(short_labels, fontsize=8)
ax.set_title("Sentence Similarity Heatmap (all-MiniLM-L6-v2)")

# Write each similarity value into its cell; imshow puts the column index on
# the x axis and the row index on the y axis.
for (i, j), value in np.ndenumerate(sim_matrix):
    ax.text(j, i, f"{value:.2f}", ha='center', va='center', fontsize=8)

plt.tight_layout()
plt.show()

In [None]:
# Cluster the reviews into three groups with k-means on their sentence
# embeddings. Each review is printed with its VADER compound score (from
# `sentiments`) so the clusters can be compared with the sentiment labels.
from sklearn.cluster import KMeans

review_embeddings = model.encode(REVIEWS)
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(review_embeddings)

# Group review indices by assigned cluster, preserving review order.
members = {}
for idx, assigned in enumerate(cluster_labels):
    members.setdefault(assigned, []).append(idx)

for cluster_id in sorted(members):
    print(f"\nCluster {cluster_id}:")
    for idx in members[cluster_id]:
        compound = sentiments[idx]['compound']
        print(f"  [{compound:+.3f}] {REVIEWS[idx][:70]}")

---
## Summary

1. **spaCy** — NER extracts named entities (persons, places, orgs, dates); dependency parsing reveals sentence grammar
2. **NLTK** — VADER gives fast, lexicon-based sentiment without model training; WordNet provides a rich lexical database
3. **Sentence Transformers** — Dense embeddings enable semantic search and meaning-aware clustering

**Resources:**
- [spaCy Usage Guide](https://spacy.io/usage)
- [NLTK Book (free)](https://www.nltk.org/book/)
- [SBERT Documentation](https://www.sbert.net/)