In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.probability import FreqDist

In [2]:
#Corpus : Body of text, singular. Corpora is the plural of this. 
#Lexicon : Words and their meanings. 
#Token : Each “entity” that is a part of whatever was split up based on rules.
#In corpus linguistics, part-of-speech tagging (POS tagging or PoS tagging or POST), also called grammatical tagging or word-category disambiguation.

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ibande/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
text="""Today, we revel in the sweet taste of victory in the annual Nam Oyiech tournament. Our success wasn't about reinventing the wheel; it was about being strategists, designers, and developers—small enough to be quick, big enough to deliver excellence."""

In [4]:
#tokenize words
#separate each word and puntuation
tokenized_word=word_tokenize(text)
print(tokenized_word)

['Today', ',', 'we', 'revel', 'in', 'the', 'sweet', 'taste', 'of', 'victory', 'in', 'the', 'annual', 'Nam', 'Oyiech', 'tournament', '.', 'Our', 'success', 'was', "n't", 'about', 'reinventing', 'the', 'wheel', ';', 'it', 'was', 'about', 'being', 'strategists', ',', 'designers', ',', 'and', 'developers—small', 'enough', 'to', 'be', 'quick', ',', 'big', 'enough', 'to', 'deliver', 'excellence', '.']


In [5]:
#sent_tokenize
#separate paragraphs
print(sent_tokenize(text))

['Today, we revel in the sweet taste of victory in the annual Nam Oyiech tournament.', "Our success wasn't about reinventing the wheel; it was about being strategists, designers, and developers—small enough to be quick, big enough to deliver excellence."]


In [6]:
#frequency distribution
#the most_common () displayed
fd = FreqDist(tokenized_word)
print(fd.most_common(3))

[(',', 4), ('the', 3), ('in', 2)]


In [8]:
#remove stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/ibande/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [9]:
#remove stopwords from tokenized_word
tokenized_word_without_stop_words = []

for word in tokenized_word:
    if word not in stop_words:
        tokenized_word_without_stop_words.append(word)
print(tokenized_word_without_stop_words)

['Today', ',', 'revel', 'sweet', 'taste', 'victory', 'annual', 'Nam', 'Oyiech', 'tournament', '.', 'Our', 'success', "n't", 'reinventing', 'wheel', ';', 'strategists', ',', 'designers', ',', 'developers—small', 'enough', 'quick', ',', 'big', 'enough', 'deliver', 'excellence', '.']


In [10]:
#see the difference
print(set(tokenized_word)-set(tokenized_word_without_stop_words))
print(len(tokenized_word))
print(len(tokenized_word_without_stop_words))

{'and', 'in', 'was', 'the', 'be', 'it', 'being', 'about', 'we', 'of', 'to'}
47
30


In [12]:
#sentiment analysis,lemmatization and stemming
#converts word to their original form

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/ibande/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [13]:
# Stemming is a crude heuristic process that chops off the ends of words to obtain their root forms, known as stems.The resulting stem 
# may not always be a valid word in the language but is intended to represent the core meaning shared by related words. 

# Lemmatization, on the other hand, uses a more sophisticated approach based on vocabulary and morphological analysis of words.
# It reduces words to their canonical or dictionary forms, known as lemmas, which are actual words found in the dictionary

from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

demoWords = "After experiencing sadness and disappointment, she found joy and excitement in the peaceful moments, feeling grateful and thrilled despite her initial fear and anxiety."
tokenized_words = word_tokenize(demoWords)
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

for word in tokenized_words:
    print(word, stemmer.stem(word), lemmatizer.lemmatize(word))


After after After
experiencing experienc experiencing
sadness sad sadness
and and and
disappointment disappoint disappointment
, , ,
she she she
found found found
joy joy joy
and and and
excitement excit excitement
in in in
the the the
peaceful peac peaceful
moments moment moment
, , ,
feeling feel feeling
grateful grate grateful
and and and
thrilled thrill thrilled
despite despit despite
her her her
initial initi initial
fear fear fear
and and and
anxiety anxieti anxiety
. . .


In [14]:
#sentment analysis
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/ibande/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [11]:
from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()
#we use polarity analysis to show negative,neutral or positive sentiment
print(sia.polarity_scores("i love food"))

{'neg': 0.0, 'neu': 0.192, 'pos': 0.808, 'compound': 0.6369}


In [14]:
#find meaning,synonyms and antonyms
from nltk.corpus import wordnet

word = wordnet.synset('food.n.01')  # 'leg.n.01' is the unique identifier for the first noun sense of 'leg'
print(word.definition())


any substance that can be metabolized by an animal to give energy and build tissue


In [15]:
#synonym
synonyms = []
for syn in wordnet.synsets('generous'):
    for lemma in syn.lemmas():
        synonyms.append(lemma.name())
print(synonyms)


['generous', 'generous', 'generous']


In [17]:
#antonyms

antonyms = []
for syn in wordnet.synsets('sleep'):
    for lemma in syn.lemmas():
        for antonym in lemma.antonyms():
            antonyms.append(antonym.name())
print(antonyms)


['wake']


In [19]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ibande/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [18]:
from nltk import pos_tag, word_tokenize

# Define a sentence
a_sentence = "NLTK is a powerful tool for natural language processing."

# Tokenize the sentence into words
tokens = word_tokenize(a_sentence)

# Perform part-of-speech tagging on the tokenized words
pos_tags = pos_tag(tokens)

# Print the result
print(pos_tags)



[('NLTK', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('powerful', 'JJ'), ('tool', 'NN'), ('for', 'IN'), ('natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('.', '.')]


In [21]:
# nltk.download('tagsets')
nltk.help.upenn_tagset('JJ')

JJ: adjective or numeral, ordinal
    third ill-mannered pre-war regrettable oiled calamitous first separable
    ectoplasmic battery-powered participatory fourth still-to-be-named
    multilingual multi-disciplinary ...


In [19]:
# Named Entity Recognition (NER): 
# Identifying and classifying named entities in text into pre-defined categories like person names,
# organizations, locations, etc.

# nltk.download('maxent_ne_chunker')
# nltk.download('words')

from nltk import ne_chunk
words = word_tokenize("Barack Obama was the 44th President of the United States.")
ner_tags = ne_chunk(pos_tag(words))
print(ner_tags)

# Geo-Political Entity

(S
  (PERSON Barack/NNP)
  (PERSON Obama/NNP)
  was/VBD
  the/DT
  44th/JJ
  President/NNP
  of/IN
  the/DT
  (GPE United/NNP States/NNPS)
  ./.)


In [20]:
# Text Similarity: Measuring how similar two texts are to each other.

#Example (using cosine similarity):

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

documents = ["NLTK is a powerful tool for natural language processing.",
             "Natural language processing is the future of the world."]

# Retrieve English stopwords as a string
stop_words = 'english'


# Term frequency inverse Document frequency
# The TfidfVectorizer internally handles the stopwords removal based on the language specified in the stop_words parameter.
# In this case, setting stop_words='english' instructs the vectorizer to use NLTK's list of English
# stopwords to remove common English words from the text data during the TF-IDF vectorization process.


# Initialize TF-IDF vectorizer with English stopwords
vectorizer = TfidfVectorizer(stop_words=stop_words)

# Convert Documents to TF-IDF Matrix
tfidf_matrix = vectorizer.fit_transform(documents)
print(tfidf_matrix)

# Calculate Similarity
similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])

print(similarity)




  (0, 5)	0.3347122780719073
  (0, 1)	0.3347122780719073
  (0, 2)	0.3347122780719073
  (0, 6)	0.4704264280854632
  (0, 4)	0.4704264280854632
  (0, 3)	0.4704264280854632
  (1, 7)	0.5330978245262535
  (1, 0)	0.5330978245262535
  (1, 5)	0.3793034928087496
  (1, 1)	0.3793034928087496
  (1, 2)	0.3793034928087496
[[0.38087261]]


In [32]:
# Text Classification: Assigning predefined categories or labels to text documents.

#Example (using Naive Bayes Classifier):
from sklearn.feature_extraction.text import CountVectorizer
#  convert text data into a numerical representation

from sklearn.naive_bayes import MultinomialNB
# Represents the Naive Bayes classifier we'll use for text classification

from sklearn.model_selection import train_test_split
# split our dataset into training and testing sets

from sklearn.metrics import accuracy_score
# calculate the accuracy of our classifier.

reviews = [
    ("This movie is amazing!", "positive"),
    ("I didn't like this movie.", "negative"),
    ("The acting was superb.", "positive"),
    ("The plot was confusing.", "negative"),
    ("I absolutely loved it!", "positive"),
    ("The worst movie I've ever seen.", "negative"),
    ("The cinematography was breathtaking.", "positive"),
    ("I couldn't stand watching it.", "negative"),
    ("A must-watch for everyone!", "positive"),
    ("Terrible acting and terrible plot.", "negative"),
    ("I was blown away by this film.", "positive"),
    ("I wasted my money on this garbage.", "negative"),
    ("The best movie of the year!", "positive"),
    ("I fell asleep halfway through.", "negative"),
    ("Great storyline and great performances.", "positive"),
    ("Don't bother watching it.", "negative"),
    ("One of the worst movies I've ever seen.", "negative"),
    ("Captivating from start to finish.", "positive"),
    ("Not worth the ticket price.", "negative"),
    ("Highly recommend it to everyone!", "positive")
]

#  Extract the review texts (features) from the reviews dataset.
X = [review[0] for review in reviews]

# Extract the sentiment labels (targets) from the reviews dataset.
y = [review[1] for review in reviews]


#  Initialize a CountVectorizer object, convert the review texts into a numerical format suitable for the classifier.
vectorizer = CountVectorizer()

# convert the raw text data into a sparse matrix of token counts.
X_vectorized = vectorizer.fit_transform(X)
print(vectorizer.get_feature_names_out())
print(X_vectorized.toarray())
# We allocate 20% of the data for testing and 80% for training. 
# The random_state parameter ensures reproducibility by fixing the random seed.
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

# initialize the model
classifier = MultinomialNB()

# Train the classifier using the training data (features X_train and labels y_train) using the fit method.
classifier.fit(X_train, y_train)

predictions = classifier.predict(X_test)

# Calculate the accuracy of the classifier by comparing the predicted labels (predictions)
# with the true labels (y_test) using the accuracy_score function.

accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)


['absolutely' 'acting' 'amazing' 'and' 'asleep' 'away' 'best' 'blown'
 'bother' 'breathtaking' 'by' 'captivating' 'cinematography' 'confusing'
 'couldn' 'didn' 'don' 'ever' 'everyone' 'fell' 'film' 'finish' 'for'
 'from' 'garbage' 'great' 'halfway' 'highly' 'is' 'it' 'like' 'loved'
 'money' 'movie' 'movies' 'must' 'my' 'not' 'of' 'on' 'one' 'performances'
 'plot' 'price' 'recommend' 'seen' 'stand' 'start' 'storyline' 'superb'
 'terrible' 'the' 'this' 'through' 'ticket' 'to' 've' 'was' 'wasted'
 'watch' 'watching' 'worst' 'worth' 'year']
[[0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 0]]
Accuracy: 0.75


In [24]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample documents
documents = ["This is the first document.",
             "This document is the second document.",
             "And this is the third one.",
             "Is this the first document?"]

# Create a CountVectorizer instance
vectorizer = CountVectorizer()

# Fit the vectorizer to the documents and transform the documents into a matrix of token counts
X = vectorizer.fit_transform(documents)

# Print the vocabulary
print(vectorizer.get_feature_names_out())

# Print the matrix of token counts
print(X.toarray())
print(X)


# The numbers in the matrix represent the frequency of each word in the corresponding document.
# For example, the value 2 in row 1, column 1 indicates that the word "document" appears twice in the second document.
# In the first document, the word "and" doesn't appear, "document", "first", "is", "the", and "this" each appear once.

['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]
  (0, 8)	1
  (0, 3)	1
  (0, 6)	1
  (0, 2)	1
  (0, 1)	1
  (1, 8)	1
  (1, 3)	1
  (1, 6)	1
  (1, 1)	2
  (1, 5)	1
  (2, 8)	1
  (2, 3)	1
  (2, 6)	1
  (2, 0)	1
  (2, 7)	1
  (2, 4)	1
  (3, 8)	1
  (3, 3)	1
  (3, 6)	1
  (3, 2)	1
  (3, 1)	1
