#### Tokenization

In [1]:
from nltk.tokenize import word_tokenize, sent_tokenize
text = "NLTK is a powerful tool for NLP tasks. It can handle tokenization effectively."

words = word_tokenize(text)
sentences = sent_tokenize(text)

print("Words: ", words)
print("Sentences: ", sentences)

Words:  ['NLTK', 'is', 'a', 'powerful', 'tool', 'for', 'NLP', 'tasks', '.', 'It', 'can', 'handle', 'tokenization', 'effectively', '.']
Sentences:  ['NLTK is a powerful tool for NLP tasks.', 'It can handle tokenization effectively.']


#### Stopword Removal

In [2]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))
filter_words = list(filter(lambda x: x.lower() not in stop_words, words))

print("Filtered words: ", filter_words)

Filtered words:  ['NLTK', 'powerful', 'tool', 'NLP', 'tasks', '.', 'handle', 'tokenization', 'effectively', '.']


#### Stemming and Lemmaatization

In [3]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

porter_stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

words = ['running', 'ran', 'jumps', 'jumping']
stemmed_words = list(map(lambda x: porter_stemmer.stem(x), words))
lemmatized_words = list(map(lambda x: lemmatizer.lemmatize(x, pos='v'), words))

print('Stemmed words:', stemmed_words)
print('Lemmatized words:', lemmatized_words)

Stemmed words: ['run', 'ran', 'jump', 'jump']
Lemmatized words: ['run', 'run', 'jump', 'jump']


#### Part-of-speech (POS) Tagging

In [4]:
from nltk import pos_tag

tagged_words = pos_tag(words)

print("POS Tagging: ", tagged_words)

POS Tagging:  [('running', 'VBG'), ('ran', 'VBD'), ('jumps', 'NNS'), ('jumping', 'VBG')]


#### Named Entity Recognition (NER)

In [5]:
from nltk import ne_chunk

sentence = "Barack Obama was born in Hawaii."

tagged_sentence = pos_tag(word_tokenize(sentence))
named_entities = ne_chunk(tagged_sentence)

print('Named Entities: ', named_entities)

Named Entities:  (S
  (PERSON Barack/NNP)
  (PERSON Obama/NNP)
  was/VBD
  born/VBN
  in/IN
  (GPE Hawaii/NNP)
  ./.)


#### Corpra and Resources

In [8]:
from nltk.corpus import gutenberg

# Load the gunteberg corpus
emma = gutenberg.words('austen-emma.txt')

# Find the number of words in the gunteberg corpus
num_words = len(emma)

# Find the number of sentences in the gunteberg corpus
num_sentences = len(gutenberg.sents('austen-emma.txt'))

# Calculate the average words per sentence
avg_words_per_sentence = num_words / num_sentences

print('Number of words: ', num_words)
print('Number of sentences: ', num_sentences)
print('Average words per sentence: ', avg_words_per_sentence)


Number of words:  192427
Number of sentences:  7752
Average words per sentence:  24.822884416924666


#### Wordnet and Applications

In [10]:
from nltk.corpus import wordnet

# Get synonyms of the word happy
synsets = wordnet.synsets('happy')
synonyms = list(map(lambda x: x.lemmas()[0].name(), synsets))

# Get hypernyms for dog
synsets_dog = wordnet.synsets('dog')
hypernyms = synsets_dog[0].hypernyms()

print("Synonyms of 'happy': ", synonyms)
print("Hypernyms of 'dog': ", hypernyms)

Synonyms of 'happy':  ['happy', 'felicitous', 'glad', 'happy']
Hypernyms of 'dog':  [Synset('canine.n.02'), Synset('domestic_animal.n.01')]


#### Feature extraction from Text

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

documents = [
    "NLTK is a powerful tool for NLP tasks.",
    "Sentiment analysis helps understand user feelings.",
    "Topic modeling finds hidden patterns in data.",
]

# Create a CountVectorizer instance
vectorizer = CountVectorizer()

# Fit and transform the text data into a feature matrix
feature_matrix = vectorizer.fit_transform(documents)

print("Feature matrix")
print(feature_matrix.toarray())
print("Vocabulary: ", vectorizer.get_feature_names_out())

Feature matrix
[[0 0 0 0 1 0 0 0 1 0 1 1 0 1 0 1 1 0 0 0]
 [1 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 1]
 [0 1 0 1 0 0 1 1 0 1 0 0 1 0 0 0 0 1 0 0]]
Vocabulary:  ['analysis' 'data' 'feelings' 'finds' 'for' 'helps' 'hidden' 'in' 'is'
 'modeling' 'nlp' 'nltk' 'patterns' 'powerful' 'sentiment' 'tasks' 'tool'
 'topic' 'understand' 'user']


### Text classifier

In [3]:
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy as nltk_accuracy

# Prepare data for sentiment analysis
positive_reviews = [(movie_reviews.words(fileId), 'positive') for fileId in movie_reviews.fileids('pos')]
negative_reviews = [(movie_reviews.words(fileId), 'negative') for fileId in movie_reviews.fileids('neg')]
reviews = positive_reviews + negative_reviews

# Create feature sets using BoW representation
def extract_features(words):
    return dict(list(map(lambda x: (x, True), words)))

feature_sets = [(extract_features(words), sentiment) for (words, sentiment) in reviews]

# Split the data in training and testing sets
train_set = feature_sets[:800]
test_set = feature_sets[800:]

# Build the Naive Bayes classifier
classifier = NaiveBayesClassifier.train(train_set)

# Test the accuracy of the classifier
accuracy = nltk_accuracy(classifier, test_set)

print('Accuracy: ', accuracy)

Accuracy:  0.16666666666666666


#### Sentiment analysis

In [6]:
# Using the VADER sentiment analysis from NLTK
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Sample text for sentiment analysis
text = "I love this product! It is amazing!"

# Create SentimentIntensityAnalyzer instance
sid = SentimentIntensityAnalyzer()

# Get the sentiment score for the text
sentiment_score = sid.polarity_scores(text)

print('Sentiment score: ', sentiment_score)

Sentiment score:  {'neg': 0.0, 'neu': 0.318, 'pos': 0.682, 'compound': 0.8619}


#### Sentiment analyzer

In [10]:
# Prepare data for sentiment analysis
positive_reviews = [(movie_reviews.raw(fileId), 'positive') for fileId in movie_reviews.fileids('pos')]
negative_reviews = [(movie_reviews.raw(fileId), 'negative') for fileId in movie_reviews.fileids('neg')]
reviews = positive_reviews + negative_reviews

# Shuffle the reviews for training and testing sets
import random
random.shuffle(reviews)

# Create feature sets using BoW representation
feature_sets = [(extract_features(words), sentiment) for (words, sentiment) in reviews]

# Split the data in training and testing sets
train_set = feature_sets[:1600]
test_set = feature_sets[1600:]

# Build the Naive Bayes classifier
classifier = NaiveBayesClassifier.train(feature_sets)

# Test the classifier on the test set
accuracy = nltk_accuracy(classifier, test_set)

print('Accuracy: ', accuracy)

Accuracy:  0.5475


#### Topic Modeling

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.corpus import inaugural, brown, reuters, gutenberg, movie_reviews, webtext, nps_chat, treebank, conll2000, names, wordnet

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

corpus = [
    "NLTK is a powerful tool for NLP tasks.", "Topic modeling finds hidden patterns in data.",
    "Sentiment analysis helps understand user feelings.", "LDA is a popular topic modeling algorithm.", 
    "Natural Language Processing is an exciting field.", "Text classification categorizes documents in classes."
]

# Function to prepare the text data
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Tokenize the text
    words = word_tokenize(text.lower())

    # Remove punctuation and non-alphabetic characters
    words = list(filter(lambda x: x.isalpha(), words))

    # Remove stopwords
    words = list(filter(lambda x: x not in stop_words, words))

    # Lemmatize words
    words = list(map(lambda x: lemmatizer.lemmatize(x), words))

    return " ".join(words)

# Preprocess the corpus
preprocessed_corpus = [preprocess_text(doc) for doc in corpus]

# Create a CountVectorizer instance
vectorizer = CountVectorizer()

# Fit and transform the preprocessed text data into a feature matrix
feature_matrix = vectorizer.fit_transform(preprocessed_corpus)

# Build the LDA model
num_topics = 2
lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda_model.fit(feature_matrix)

# Display the topics
def display_topics(model, feature_names, num_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx + 1}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-num_top_words -1 : -1]]))

num_top_words = 5
display_topics(lda_model, vectorizer.get_feature_names_out(), num_top_words)


Topic 1:
feeling user analysis sentiment understand
Topic 2:
modeling topic data find hidden
