# Natural Language Processing (NLP)

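> A hands-on tour of core NLP techniques using NLTK, scikit-learn, and spaCy: tokenization, sentiment analysis, text preprocessing, n-grams, vectorization, Naive Bayes classification, part-of-speech tagging, named entity recognition, and parsing.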



## Tokenization

In [None]:
# Word and sentence tokenization with NLTK
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Tokenizer data is downloaded before either tokenizer is called.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# --- Word tokenization ---
text = "The universe is change; our life is what our thoughts make it."
tokens = word_tokenize(text)
print(tokens)

# --- Sentence tokenization ---
# Passage source: Austen, Jane. Pride and Prejudice. 1813, vol. 3, ch. 1
# Adjacent string literals inside the parentheses are joined into one string.
quote = (
    'They gradually ascended for half a mile, and then found '
    'themselves at the top of a considerable eminence, where '
    'the wood ceased, and the eye was instantly caught by '
    'Pemberley House, situated on the opposite side of a '
    'valley, into which the road with some abruptness wound. '
    'It was a large, handsome, stone building, standing well '
    'on rising ground, and backed by a ridge of high woody '
    'hills;—and in front, a stream of some natural importance '
    'was swelled into greater, but without any artificial '
    'appearance. Its banks were neither formal, nor falsely '
    'adorned. Elizabeth was delighted. She had never seen a '
    'place where nature had done more, or where natural '
    'beauty had been so little counteracted by an awkward '
    'taste. They were all of them warm in her admiration; '
    'and at that moment she felt that to be mistress of '
    'Pemberley might be something!'
)

# Print each detected sentence with its zero-based position.
sentences = sent_tokenize(quote)
for index, sentence in enumerate(sentences):
    print(f'  sentence {index}: {sentence}')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


['The', 'universe', 'is', 'change', ';', 'our', 'life', 'is', 'what', 'our', 'thoughts', 'make', 'it', '.']
  sentence 0: They gradually ascended for half a mile, and then found themselves at the top of a considerable eminence, where the wood ceased, and the eye was instantly caught by Pemberley House, situated on the opposite side of a valley, into which the road with some abruptness wound.
  sentence 1: It was a large, handsome, stone building, standing well on rising ground, and backed by a ridge of high woody hills;—and in front, a stream of some natural importance was swelled into greater, but without any artificial appearance.
  sentence 2: Its banks were neither formal, nor falsely adorned.
  sentence 3: Elizabeth was delighted.
  sentence 4: She had never seen a place where nature had done more, or where natural beauty had been so little counteracted by an awkward taste.
  sentence 5: They were all of them warm in her admiration; and at that moment she felt that to be mistress 

## Sentiment Analysis

In [None]:
# Sentiment scoring of short course evaluations with NLTK's
# SentimentIntensityAnalyzer.
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# The lexicon resource is downloaded before the analyzer is constructed.
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

# Sample evaluations to score.
evals = ['This course is absolutely amazing!',
         'Absolute waste of time.  I learned nothing.',
         'It was fine, but moved too slow',
         'I like turtles!']

# The loop variable is deliberately NOT called `eval`: at notebook top level
# that name would shadow the built-in eval() for every later cell.
for evaluation in evals:
    # polarity_scores() returns the score dictionary printed below.
    scores = sia.polarity_scores(evaluation)
    print(f'{evaluation}\n\t{scores}')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


This course is absolutely amazing!
	{'neg': 0.0, 'neu': 0.477, 'pos': 0.523, 'compound': 0.6581}
Absolute waste of time.  I learned nothing.
	{'neg': 0.359, 'neu': 0.641, 'pos': 0.0, 'compound': -0.4215}
It was fine, but moved too slow
	{'neg': 0.0, 'neu': 0.811, 'pos': 0.189, 'compound': 0.1027}
I like turtles!
	{'neg': 0.0, 'neu': 0.264, 'pos': 0.736, 'compound': 0.4199}


## Text Preprocessing for Statistical NLP
Clean text for statistical models like Naive Bayes or word frequency analysis


In [None]:
# Text preprocessing pipeline: tokenize -> filter -> stem / lemmatize -> count.
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download required NLTK data (run once).
# nltk.download() signals most failures through a False return value rather
# than by raising, so each result is checked explicitly. 'punkt_tab' is
# included because the other cells in this notebook download it alongside
# 'punkt' before calling word_tokenize.
required_packages = ('stopwords', 'wordnet', 'punkt', 'punkt_tab')
failed_packages = []
for package in required_packages:
    try:
        downloaded = nltk.download(package, quiet=True)
    except Exception:
        # Any unexpected error is treated as a failed download; unlike a bare
        # `except:`, this does not swallow KeyboardInterrupt/SystemExit.
        downloaded = False
    if not downloaded:
        failed_packages.append(package)
if failed_packages:
    print("NLTK download failed—ensure internet connection!", failed_packages)

# Initialize preprocessing tools
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Sample text for preprocessing
text = "Follies and nonsense, whims and inconsistencies do divert me, I own, and I laugh at them whenever I can."

# Step-by-step preprocessing
tokens = word_tokenize(text.lower())  # Lowercase and tokenize
# Keep purely alphabetic tokens that are not stop words (drops punctuation).
filtered = [w for w in tokens if w.isalpha() and w not in stop_words]
stemmed = [stemmer.stem(w) for w in filtered]  # Stem words
lemmatized = [lemmatizer.lemmatize(w) for w in filtered]  # Lemmatize words

# Word frequency for statistical NLP, counted over the lemmatized words
freq = Counter(lemmatized)

# Display results
print("Step 1 - Tokenized & Lowercased:", tokens)
print("Step 2 - Filtered (No Punctuation/Stop Words):", filtered)
print("Step 3 - Stemmed (Porter):", stemmed)
print("Step 4 - Lemmatized (WordNet):", lemmatized)
print("Step 5 - Word Frequency (Statistical NLP):", freq)


Step 1 - Tokenized & Lowercased: ['follies', 'and', 'nonsense', ',', 'whims', 'and', 'inconsistencies', 'do', 'divert', 'me', ',', 'i', 'own', ',', 'and', 'i', 'laugh', 'at', 'them', 'whenever', 'i', 'can', '.']
Step 2 - Filtered (No Punctuation/Stop Words): ['follies', 'nonsense', 'whims', 'inconsistencies', 'divert', 'laugh', 'whenever']
Step 3 - Stemmed (Porter): ['folli', 'nonsens', 'whim', 'inconsist', 'divert', 'laugh', 'whenev']
Step 4 - Lemmatized (WordNet): ['folly', 'nonsense', 'whim', 'inconsistency', 'divert', 'laugh', 'whenever']
Step 5 - Word Frequency (Statistical NLP): Counter({'folly': 1, 'nonsense': 1, 'whim': 1, 'inconsistency': 1, 'divert': 1, 'laugh': 1, 'whenever': 1})


## N-Gram Analysis
This section preprocesses a sample text with NLTK by tokenizing, filtering, stemming, and lemmatizing it, then generates bigrams and trigrams from the lemmatized tokens and counts word frequencies.


In [None]:
# N-gram generation over preprocessed text.
# NOTE: this cell reuses word_tokenize, stop_words, stemmer, lemmatizer and
# Counter from the preprocessing cell above, so that cell must be run first.
from nltk.util import ngrams

text = "Natural language processing is fascinating.  Absolutely fascinating!"
tokens = word_tokenize(text.lower())  # Lowercase and tokenize
# Keep purely alphabetic tokens that are not stop words (drops punctuation).
filtered = [w for w in tokens if w.isalpha() and w not in stop_words]
stemmed = [stemmer.stem(w) for w in filtered]  # Stem words
lemmatized = [lemmatizer.lemmatize(w) for w in filtered]  # Lemmatize words

# N-Gram generation
bigrams = list(ngrams(lemmatized, 2))  # Bigrams from lemmatized tokens
trigrams = list(ngrams(lemmatized, 3))  # Trigrams from lemmatized tokens

# Word frequency for statistical NLP.
# Counted over the lemmatized words (as in the preprocessing cell) so that
# punctuation marks and stop words are not reported as "words".
freq = Counter(lemmatized)

print("Bigrams:", bigrams)
print("Trigrams:", trigrams)
print("Word Frequency:", freq)


Bigrams: [('natural', 'language'), ('language', 'processing'), ('processing', 'fascinating'), ('fascinating', 'absolutely'), ('absolutely', 'fascinating')]
Trigrams: [('natural', 'language', 'processing'), ('language', 'processing', 'fascinating'), ('processing', 'fascinating', 'absolutely'), ('fascinating', 'absolutely', 'fascinating')]
Word Frequency: Counter({'fascinating': 2, 'natural': 1, 'language': 1, 'processing': 1, 'is': 1, '.': 1, 'absolutely': 1, '!': 1})


## Text Vectorization with CountVectorizer
This example demonstrates the use of scikit-learn's CountVectorizer to convert a small corpus of text into a numerical feature matrix. It tokenizes three sentences, builds a vocabulary of unique words, and transforms the text into a bag-of-words representation, printing both the feature names (words) and the resulting array.

In [None]:
# Bag-of-words counts for a three-document toy corpus.
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    "I am malicious because I am miserable.",
    "The fallen angel becomes a malignant devil.",
    "Life, although it may only be an accumulation of anguish, is dear to me."
]

# Fit the vectorizer on the corpus and transform it in a single call.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Feature names first, then the dense count matrix, one per line.
print(vectorizer.get_feature_names_out(), X.toarray(), sep='\n')

['accumulation' 'although' 'am' 'an' 'angel' 'anguish' 'be' 'because'
 'becomes' 'dear' 'devil' 'fallen' 'is' 'it' 'life' 'malicious'
 'malignant' 'may' 'me' 'miserable' 'of' 'only' 'the' 'to']
[[0 0 2 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0]
 [0 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 1 0]
 [1 1 0 1 0 1 1 0 0 1 0 0 1 1 1 0 0 1 1 0 1 1 0 1]]


## TF-IDF Vectorization Example
This example showcases TF-IDF vectorization using scikit-learn’s TfidfVectorizer. It transforms a small text corpus into a weighted matrix, emphasizing rare terms over common ones, and prints the feature names (vocabulary) and the resulting TF-IDF array.

In [None]:
# TF-IDF weighting for a three-document toy corpus.
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample corpus: one string per document
corpus = [
    "I am malicious because I am miserable",
    "The fallen angel becomes a malignant devil",
    "Life is dear to me despite its anguish"
]

# Fit the TF-IDF vectorizer and transform the corpus in a single call
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# Pull out the pieces to report, then print them
feature_names = vectorizer.get_feature_names_out()
tfidf_matrix = X.toarray()
print("Feature Names (Words):", feature_names)
print("TF-IDF Matrix:\n", tfidf_matrix)

Feature Names (Words): ['am' 'angel' 'anguish' 'because' 'becomes' 'dear' 'despite' 'devil'
 'fallen' 'is' 'its' 'life' 'malicious' 'malignant' 'me' 'miserable' 'the'
 'to']
TF-IDF Matrix:
 [[0.75592895 0.         0.         0.37796447 0.         0.
  0.         0.         0.         0.         0.         0.
  0.37796447 0.         0.         0.37796447 0.         0.        ]
 [0.         0.40824829 0.         0.         0.40824829 0.
  0.         0.40824829 0.40824829 0.         0.         0.
  0.         0.40824829 0.         0.         0.40824829 0.        ]
 [0.         0.         0.35355339 0.         0.         0.35355339
  0.35355339 0.         0.         0.35355339 0.35355339 0.35355339
  0.         0.         0.35355339 0.         0.         0.35355339]]


## Naive Bayes Sentiment Classification
This example demonstrates text classification using scikit-learn’s Multinomial Naive Bayes in a pipeline with CountVectorizer. It trains on a small sentiment dataset (positive/negative reviews) and predicts labels for test samples, showcasing probabilistic classification in action.



In [None]:
# Sentiment classification with a CountVectorizer + MultinomialNB pipeline.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Training data with distinct sentiment cues (labels are aligned by position)
X_train = [
    "I love this amazing product",
    "Terrible experience with this junk",
    "Fantastic service and quality",
    "Horrible and utterly disappointing",
    "Really joyful happy purchase",
    "Awful disgusting bad service"
]
y_train = ["positive", "negative", "positive", "negative", "positive", "negative"]

# NOTE: 'is', 'wonderful', 'totally' and 'dreadful' never occur in X_train, so
# they are not part of the learned vocabulary; only 'this' is. Expect the
# probabilities for these samples to stay close to 0.5.
X_test = ["This is wonderful", "Totally dreadful"]

# Create and train the pipeline
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),  # Convert text to word counts
    ('classifier', MultinomialNB())     # Apply Naive Bayes classifier
])

# Training and prediction
pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)

# Display results
print("Test Samples:", X_test)
print("Predictions:", predictions)

# Inspect the fitted steps
vectorizer = pipeline.named_steps['vectorizer']
classifier = pipeline.named_steps['classifier']
print("\nVocabulary:", vectorizer.get_feature_names_out())
# The columns of predict_proba follow the order of classifier.classes_, so the
# class order is printed first to make the probability matrix readable.
print("Class Order (probability columns):", classifier.classes_)
print("Class Probabilities for Test Samples:\n", pipeline.predict_proba(X_test))

Test Samples: ['This is wonderful', 'Totally dreadful']
Predictions: ['positive' 'negative']

Vocabulary: ['amazing' 'and' 'awful' 'bad' 'disappointing' 'disgusting' 'experience'
 'fantastic' 'happy' 'horrible' 'joyful' 'junk' 'love' 'product'
 'purchase' 'quality' 'really' 'service' 'terrible' 'this' 'utterly'
 'with']
Class Probabilities for Test Samples:
 [[0.49275362 0.50724638]
 [0.5        0.5       ]]


## Parts of Speech Tagging with NLTK

This cell demonstrates basic POS tagging using NLTK’s pre-trained averaged perceptron tagger (`averaged_perceptron_tagger_eng`). It tokenizes a sample sentence ("The quick brown fox jumps over the lazy dog") and assigns a grammatical tag (e.g., noun, verb, adjective) to each word, printing the resulting list of (word, tag) pairs.

In [None]:
# Part-of-speech tagging of a single sentence with NLTK.
import nltk
from nltk.tokenize import word_tokenize

# Required NLTK resources, fetched in order:
# tokenizer data first, then the POS tagger model.
for resource in ('punkt_tab', 'averaged_perceptron_tagger_eng'):
    nltk.download(resource)

text = "The quick brown fox jumps over the lazy dog"

# Tokenize, then tag each token; the result is printed as a list of pairs.
tokens = word_tokenize(text)
pos_tags = nltk.pos_tag(tokens)
print(pos_tags)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN')]


## Named Entity Recognition with spaCy
This example demonstrates how to identify and label entities like organizations, locations, dates, and monetary values in a sentence using spaCy's pre-trained language model.

In [None]:
# Named entity recognition with spaCy's small English pipeline.
import spacy

nlp = spacy.load("en_core_web_sm")

# Adjacent string literals inside the parentheses are joined into one string.
text = (
    'President Donald Trump unveiled 54% tariffs on all Chinese imports into the United States '
    'Wednesday as part of his sweeping “Liberation Day” reset of American trade global policy.'
)
doc = nlp(text)

# Collect (entity text, entity label) pairs, then print one entity per line.
entities = [(ent.text, ent.label_) for ent in doc.ents]
for entity_text, entity_label in entities:
    print(entity_text, "→", entity_label)

Donald Trump → PERSON
54% → PERCENT
Chinese → NORP
the United States → GPE
Wednesday → DATE
American → NORP


## Context-Free Grammars (CFGs) with NLTK
CFGs define the syntactic structure of sentences using formal production rules. This example sets up a grammar and demonstrates how to generate parse trees, which help visualize sentence structure.



In [None]:
# Parsing with a small hand-written context-free grammar.
from nltk import CFG
from nltk.parse import ChartParser

# Grammar: sentence / noun-phrase / verb-phrase / prepositional-phrase rules
# followed by the terminal words each category may produce.
grammar = CFG.fromstring("""
    S -> NP VP
    NP -> Det N | Det N PP
    VP -> V NP | V NP PP
    PP -> P NP
    Det -> 'the' | 'a'
    N -> 'dog' | 'cat'
    V -> 'chased' | 'saw'
    P -> 'in' | 'with'
""")

# Chart parser bound to the grammar above
parser = ChartParser(grammar)

# --- Positive example: every word is covered by the grammar ---
sentence1 = ['the', 'dog', 'chased', 'a', 'cat']
print("Parsing valid sentence:", ' '.join(sentence1))
for tree in parser.parse(sentence1):
    tree.pretty_print()

# --- Negative example: 'barked' and 'at' are not terminals of the grammar ---
sentence2 = ['the', 'dog', 'barked', 'at', 'a', 'cat']
print("\nAttempting to parse invalid sentence:", ' '.join(sentence2))
try:
    # Materialise the parses so an empty result can be told apart from trees.
    parsed = list(parser.parse(sentence2))
    if not parsed:
        print("No parse tree — sentence doesn't match the grammar rules.")
    for tree in parsed:
        tree.pretty_print()
except Exception as e:
    # Any error raised while parsing or printing is reported, not propagated.
    print("Error during parsing:", e)

Parsing valid sentence: the dog chased a cat
              S               
      ________|_____           
     |              VP        
     |         _____|___       
     NP       |         NP    
  ___|___     |      ___|___   
Det      N    V    Det      N 
 |       |    |     |       |  
the     dog chased  a      cat


Attempting to parse invalid sentence: the dog barked at a cat
Error during parsing: Grammar does not cover some of the input words: "'barked', 'at'".


## Dependency Parsing with spaCy
This example analyzes the grammatical structure of a sentence by identifying how each word depends on others. It uses spaCy to extract and display syntactic relationships, such as subjects, objects, and modifiers.



In [None]:
# Dependency parsing of a single sentence with spaCy.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The quick brown fox jumps over the lazy dog")

# One report line per token: token text, its dependency label, and the text
# of the head token it attaches to.
relations = [f"{token.text} --> {token.dep_} --> {token.head.text}" for token in doc]
for relation in relations:
    print(relation)


The --> det --> fox
quick --> amod --> fox
brown --> amod --> fox
fox --> nsubj --> jumps
jumps --> ROOT --> jumps
over --> prep --> jumps
the --> det --> dog
lazy --> amod --> dog
dog --> pobj --> over
