In [1]:
raw_docs = ["The sun sets behind the mountains, casting a warm orange glow.",
            "In a distant galaxy, a new civilization is born, full of hope and curiosity.",
            "Amidst the bustling city, a street musician plays a melancholic tune on his guitar.",
            "The aroma of freshly baked bread fills the cozy bakery, enticing passersby.",
            "A lone wolf prowls through the dense forest, its eyes gleaming in the moonlight."]

In [2]:
# Tokenizing text into bags of words
from nltk.tokenize import word_tokenize
tokenized_docs = [word_tokenize(doc) for doc in raw_docs]
print(tokenized_docs)

[['The', 'sun', 'sets', 'behind', 'the', 'mountains', ',', 'casting', 'a', 'warm', 'orange', 'glow', '.'], ['In', 'a', 'distant', 'galaxy', ',', 'a', 'new', 'civilization', 'is', 'born', ',', 'full', 'of', 'hope', 'and', 'curiosity', '.'], ['Amidst', 'the', 'bustling', 'city', ',', 'a', 'street', 'musician', 'plays', 'a', 'melancholic', 'tune', 'on', 'his', 'guitar', '.'], ['The', 'aroma', 'of', 'freshly', 'baked', 'bread', 'fills', 'the', 'cozy', 'bakery', ',', 'enticing', 'passersby', '.'], ['A', 'lone', 'wolf', 'prowls', 'through', 'the', 'dense', 'forest', ',', 'its', 'eyes', 'gleaming', 'in', 'the', 'moonlight', '.']]


In [3]:
# Removing punctuation
import re
import string
regex = re.compile("[%s]" % re.escape(string.punctuation))

# Initialize an empty list to store tokenized documents without punctuation
tokenized_docs_no_punctuation = []

# Loop through each tokenized review in the tokenized_docs list
for review in tokenized_docs:
    # Initialize an empty list to store tokens in the current review without punctuation
    new_review = []
    for token in review:
        # Remove punctuation from the token using the regex pattern
        new_token = regex.sub(u'', token)
        # Check if the new token is not an empty string
        if not new_token == u'':
            # Append the cleaned token to the new_review list
            new_review.append(new_token) 
    # Append the new_review (cleaned review) to the tokenized_docs_no_punctuation list
    tokenized_docs_no_punctuation.append(new_review)
    
print(tokenized_docs_no_punctuation)

[['The', 'sun', 'sets', 'behind', 'the', 'mountains', 'casting', 'a', 'warm', 'orange', 'glow'], ['In', 'a', 'distant', 'galaxy', 'a', 'new', 'civilization', 'is', 'born', 'full', 'of', 'hope', 'and', 'curiosity'], ['Amidst', 'the', 'bustling', 'city', 'a', 'street', 'musician', 'plays', 'a', 'melancholic', 'tune', 'on', 'his', 'guitar'], ['The', 'aroma', 'of', 'freshly', 'baked', 'bread', 'fills', 'the', 'cozy', 'bakery', 'enticing', 'passersby'], ['A', 'lone', 'wolf', 'prowls', 'through', 'the', 'dense', 'forest', 'its', 'eyes', 'gleaming', 'in', 'the', 'moonlight']]


In [4]:
# Cleaning the text of stopwords
from nltk.corpus import stopwords

tokenized_docs_no_stopwords = []

for doc in tokenized_docs_no_punctuation:
    new_term_vector = []
    for word in doc:
        if not word in stopwords.words("english"):
            new_term_vector.append(word)
    tokenized_docs_no_stopwords.append(new_term_vector)
    
print(tokenized_docs_no_stopwords)

[['The', 'sun', 'sets', 'behind', 'mountains', 'casting', 'warm', 'orange', 'glow'], ['In', 'distant', 'galaxy', 'new', 'civilization', 'born', 'full', 'hope', 'curiosity'], ['Amidst', 'bustling', 'city', 'street', 'musician', 'plays', 'melancholic', 'tune', 'guitar'], ['The', 'aroma', 'freshly', 'baked', 'bread', 'fills', 'cozy', 'bakery', 'enticing', 'passersby'], ['A', 'lone', 'wolf', 'prowls', 'dense', 'forest', 'eyes', 'gleaming', 'moonlight']]


In [5]:
#Stemming and lemmatizing
'''
Porter Stemmer (PorterStemmer): Porter stemming is an algorithm that attempts to remove suffixes (and sometimes prefixes) 
from words in order to obtain their root form. It is a simple and widely used stemming algorithm. 
For example, it would reduce words like "running," "runner," and "ran" to the root form "run." 

However, it may not always produce valid words, as it focuses on rule-based transformations rather than linguistic accuracy. 
Porter stemming is fast and efficient.

Snowball Stemmer (SnowballStemmer): Snowball stemming is an extension of the Porter stemming algorithm and is available 
in multiple languages. It aims to provide more accurate stemming by applying language-specific rules. The 'english' variant 
specifically targets the English language. It's considered an improvement over the original Porter stemmer and can 
produce more linguistically valid stems.

WordNet Lemmatizer (WordNetLemmatizer): Lemmatization is a different approach from stemming. Instead of reducing words to 
their root forms, lemmatization reduces words to their base or dictionary forms (lemmas). 
The WordNet Lemmatizer is based on WordNet, a lexical database of English. It can provide more accurate results 
because it considers word meanings and parts of speech. For example, it would reduce "running," "runner," and "ran" to the
lemma "run." Lemmatization often results in valid dictionary words but can be slower than stemming.
'''
from nltk.stem.porter import PorterStemmer 
from nltk.stem.snowball import SnowballStemmer 
from nltk.stem.wordnet import WordNetLemmatizer 

porter = PorterStemmer()
snowball = SnowballStemmer("english")
wordnet = WordNetLemmatizer() 

preprocessed_docs = []

for doc in tokenized_docs_no_stopwords:
    #print(doc)
    final_doc = []
    for word in doc:
        #print(word)
        final_doc.append(word)
    preprocessed_docs.append(final_doc)
    
print(preprocessed_docs)

[['The', 'sun', 'sets', 'behind', 'mountains', 'casting', 'warm', 'orange', 'glow'], ['In', 'distant', 'galaxy', 'new', 'civilization', 'born', 'full', 'hope', 'curiosity'], ['Amidst', 'bustling', 'city', 'street', 'musician', 'plays', 'melancholic', 'tune', 'guitar'], ['The', 'aroma', 'freshly', 'baked', 'bread', 'fills', 'cozy', 'bakery', 'enticing', 'passersby'], ['A', 'lone', 'wolf', 'prowls', 'dense', 'forest', 'eyes', 'gleaming', 'moonlight']]


## Basic Tasks


In [6]:
import nltk
nltk.download('gutenberg')
nltk.download('genesis')
nltk.download('inaugural')
nltk.download('nps_chat')
nltk.download('webtext')
nltk.download('treebank')
from nltk.book import *

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\bhise\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package genesis to
[nltk_data]     C:\Users\bhise\AppData\Roaming\nltk_data...
[nltk_data]   Package genesis is already up-to-date!
[nltk_data] Downloading package inaugural to
[nltk_data]     C:\Users\bhise\AppData\Roaming\nltk_data...
[nltk_data]   Package inaugural is already up-to-date!
[nltk_data] Downloading package nps_chat to
[nltk_data]     C:\Users\bhise\AppData\Roaming\nltk_data...
[nltk_data]   Package nps_chat is already up-to-date!
[nltk_data] Downloading package webtext to
[nltk_data]     C:\Users\bhise\AppData\Roaming\nltk_data...
[nltk_data]   Package webtext is already up-to-date!
[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\bhise\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [7]:
text2

<Text: Sense and Sensibility by Jane Austen 1811>

In [8]:
sents()

sent1: Call me Ishmael .
sent2: The family of Dashwood had long been settled in Sussex .
sent3: In the beginning God created the heaven and the earth .
sent4: Fellow - Citizens of the Senate and of the House of Representatives :
sent5: I have a problem with people PMing me to lol JOIN
sent6: SCENE 1 : [ wind ] [ clop clop clop ] KING ARTHUR : Whoa there !
sent7: Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 .
sent8: 25 SEXY MALE , seeks attrac older single lady , for discreet encounters .
sent9: THE suburb of Saffron Park lay on the sunset side of London , as red and ragged as a cloud of sunset .


In [9]:
sent1

['Call', 'me', 'Ishmael', '.']

In [10]:
print(text7, len(text7))

<Text: Wall Street Journal> 100676


In [11]:
print(sent7, len(sent7))

['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.'] 18


In [12]:
list(set(text7))[:10]

['firmness',
 '130',
 'players',
 'predict',
 'reservations',
 '47.1',
 'palace',
 'providing',
 'legal',
 '6.70']

In [13]:
# # Compute the frequency distribution of words in 'text7'
dist = FreqDist(text7)
len(dist)

12408

In [14]:
vocab1 = list(dist.keys())
# Get a list of the first 10 unique words in the vocabulary
vocab1[:10]

['Pierre', 'Vinken', ',', '61', 'years', 'old', 'will', 'join', 'the', 'board']

In [15]:
dist['Vinken']

2

In [16]:
freqwords = [w for w in vocab1 if len(w) > 5 and dist[w] > 100]
freqwords

['billion',
 'company',
 'president',
 'because',
 'market',
 'million',
 'shares',
 'trading',
 'program']

In [17]:
# different forms of the same "word"
input1 = "List listed lists listing listings"
words1 = input1.lower().split(' ')
words1

['list', 'listed', 'lists', 'listing', 'listings']

In [18]:
[porter.stem(t) for t in words1]

['list', 'list', 'list', 'list', 'list']

In [19]:
#tokenization
text1 = "Life in general is a slippery slope with many crests and troughs."
text1.split(' ')

['Life',
 'in',
 'general',
 'is',
 'a',
 'slippery',
 'slope',
 'with',
 'many',
 'crests',
 'and',
 'troughs.']

In [20]:
nltk.word_tokenize(text1)

['Life',
 'in',
 'general',
 'is',
 'a',
 'slippery',
 'slope',
 'with',
 'many',
 'crests',
 'and',
 'troughs',
 '.']

In [21]:
# Sentence splitting
text12 = "The shimmering stars illuminated the night sky, creating a mesmerizing celestial display. Wow"
sentences = nltk.sent_tokenize(text12)
len(sentences)

2

In [22]:
sentences

['The shimmering stars illuminated the night sky, creating a mesmerizing celestial display.',
 'Wow']