### Sentence tokenization: Task-1

In [4]:
# sentence tokenization using regular expression implementation

import re
import numpy as np

sample_text = ("US unveils world's most powerful supercomputer, beats China. " 
               "The US has unveiled the world's most powerful supercomputer called 'Summit', " 
               "beating the previous record-holder China's Sunway TaihuLight. With a peak performance "
               "of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, "
               "which is capable of 93,000 trillion calculations per second. Summit has 4,608 servers, "
               "which reportedly take up the size of two tennis courts.")

sentences = re.split(r"(?<=[.!?])\s", sample_text)
print("Number of sentences: ", len(sentences))
print(np.array(sentences))

Number of sentences:  4
["US unveils world's most powerful supercomputer, beats China."
 "The US has unveiled the world's most powerful supercomputer called 'Summit', beating the previous record-holder China's Sunway TaihuLight."
 'With a peak performance of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, which is capable of 93,000 trillion calculations per second.'
 'Summit has 4,608 servers, which reportedly take up the size of two tennis courts.']


In [5]:
# sentence tokenization using NLTK's implementation


import nltk
import numpy as np
default_st = nltk.sent_tokenize

sample_text = ("US unveils world's most powerful supercomputer, beats China. " 
               "The US has unveiled the world's most powerful supercomputer called 'Summit', " 
               "beating the previous record-holder China's Sunway TaihuLight. With a peak performance "
               "of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, "
               "which is capable of 93,000 trillion calculations per second. Summit has 4,608 servers, "
               "which reportedly take up the size of two tennis courts.")

sample_sentences = default_st(text=sample_text)
print('Total sentences in sample_text:', len(sample_sentences))
print('Sample text sentences :-')
print(np.array(sample_sentences))

Total sentences in sample_text: 4
Sample text sentences :-
["US unveils world's most powerful supercomputer, beats China."
 "The US has unveiled the world's most powerful supercomputer called 'Summit', beating the previous record-holder China's Sunway TaihuLight."
 'With a peak performance of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, which is capable of 93,000 trillion calculations per second.'
 'Summit has 4,608 servers, which reportedly take up the size of two tennis courts.']


In [6]:
from nltk import sent_tokenize

text = "US unveils world's most powerful supercomputer, beats China. The US has unveiled the world's most powerful supercomputer called 'Summit', beating the previous record-holder China's Sunway TaihuLight. With a peak performance of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, which is capable of 93,000 trillion calculations per second. Summit has 4,608 servers, which reportedly take up the size of two tennis courts."

output = sent_tokenize(text)
print(len(output))
print(output)



4
["US unveils world's most powerful supercomputer, beats China.", "The US has unveiled the world's most powerful supercomputer called 'Summit', beating the previous record-holder China's Sunway TaihuLight.", 'With a peak performance of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, which is capable of 93,000 trillion calculations per second.', 'Summit has 4,608 servers, which reportedly take up the size of two tennis courts.']


### Word tokenization: Task-1

In [7]:
########################     Tokenization using regular expression    ##########################

import re

raw_text = """Prof. Mark Lee Ph.D at the University of Birmingham, U.K, says Crungus is merely a composite of data that Craiyon has seen. "I think we could say that it's producing things which are original," he says. "But they are based on previous examples. It could be just a blended image that's come from multiple sources. And it looks very scary, right?"""

tokenized_word = re.findall(r"\w+(?:[-.']\w+)*|'\S\w*", raw_text)
print(tokenized_word)

print('\n')

tokenized_text = " ".join(tokenized_word)
print(tokenized_text)

['Prof', 'Mark', 'Lee', 'Ph.D', 'at', 'the', 'University', 'of', 'Birmingham', 'U.K', 'says', 'Crungus', 'is', 'merely', 'a', 'composite', 'of', 'data', 'that', 'Craiyon', 'has', 'seen', 'I', 'think', 'we', 'could', 'say', 'that', "it's", 'producing', 'things', 'which', 'are', 'original', 'he', 'says', 'But', 'they', 'are', 'based', 'on', 'previous', 'examples', 'It', 'could', 'be', 'just', 'a', 'blended', 'image', "that's", 'come', 'from', 'multiple', 'sources', 'And', 'it', 'looks', 'very', 'scary', 'right']


Prof Mark Lee Ph.D at the University of Birmingham U.K says Crungus is merely a composite of data that Craiyon has seen I think we could say that it's producing things which are original he says But they are based on previous examples It could be just a blended image that's come from multiple sources And it looks very scary right


In [8]:
########################  Tokenization using NLTK's RegexpTokenizer implementation   ###################################



from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
tokenized_text = tokenizer.tokenize(raw_text)
tokenized_text = " ".join(tokenized_text)
print(tokenized_text)

print('\n')

tokenized_text = tokenized_text.split()
print(tokenized_text)

Prof Mark Lee Ph D at the University of Birmingham U K says Crungus is merely a composite of data that Craiyon has seen I think we could say that it s producing things which are original he says But they are based on previous examples It could be just a blended image that s come from multiple sources And it looks very scary right


['Prof', 'Mark', 'Lee', 'Ph', 'D', 'at', 'the', 'University', 'of', 'Birmingham', 'U', 'K', 'says', 'Crungus', 'is', 'merely', 'a', 'composite', 'of', 'data', 'that', 'Craiyon', 'has', 'seen', 'I', 'think', 'we', 'could', 'say', 'that', 'it', 's', 'producing', 'things', 'which', 'are', 'original', 'he', 'says', 'But', 'they', 'are', 'based', 'on', 'previous', 'examples', 'It', 'could', 'be', 'just', 'a', 'blended', 'image', 'that', 's', 'come', 'from', 'multiple', 'sources', 'And', 'it', 'looks', 'very', 'scary', 'right']


In [9]:
########################  Tokenization using NLTK's word_tokenize implementation   ###################################

from nltk.tokenize import word_tokenize

word_tokenize(raw_text)

['Prof.',
 'Mark',
 'Lee',
 'Ph.D',
 'at',
 'the',
 'University',
 'of',
 'Birmingham',
 ',',
 'U.K',
 ',',
 'says',
 'Crungus',
 'is',
 'merely',
 'a',
 'composite',
 'of',
 'data',
 'that',
 'Craiyon',
 'has',
 'seen',
 '.',
 '``',
 'I',
 'think',
 'we',
 'could',
 'say',
 'that',
 'it',
 "'s",
 'producing',
 'things',
 'which',
 'are',
 'original',
 ',',
 "''",
 'he',
 'says',
 '.',
 '``',
 'But',
 'they',
 'are',
 'based',
 'on',
 'previous',
 'examples',
 '.',
 'It',
 'could',
 'be',
 'just',
 'a',
 'blended',
 'image',
 'that',
 "'s",
 'come',
 'from',
 'multiple',
 'sources',
 '.',
 'And',
 'it',
 'looks',
 'very',
 'scary',
 ',',
 'right',
 '?']

In [12]:
########################  Tokenization using Python string module   ###################################


from string import punctuation as punc

for words in raw_text:
    if words in punc:
        tokenized_word = raw_text.replace(words, '')
print(tokenized_word)

print('\n')
tokenized_word = tokenized_word.split()
print(tokenized_word)

Prof. Mark Lee Ph.D at the University of Birmingham, U.K, says Crungus is merely a composite of data that Craiyon has seen. "I think we could say that it's producing things which are original," he says. "But they are based on previous examples. It could be just a blended image that's come from multiple sources. And it looks very scary, right


['Prof.', 'Mark', 'Lee', 'Ph.D', 'at', 'the', 'University', 'of', 'Birmingham,', 'U.K,', 'says', 'Crungus', 'is', 'merely', 'a', 'composite', 'of', 'data', 'that', 'Craiyon', 'has', 'seen.', '"I', 'think', 'we', 'could', 'say', 'that', "it's", 'producing', 'things', 'which', 'are', 'original,"', 'he', 'says.', '"But', 'they', 'are', 'based', 'on', 'previous', 'examples.', 'It', 'could', 'be', 'just', 'a', 'blended', 'image', "that's", 'come', 'from', 'multiple', 'sources.', 'And', 'it', 'looks', 'very', 'scary,', 'right']


In [5]:
########################  Tokenization using Python string module with translate method  ###################################

import string

tokenized_text = raw_text.translate(str.maketrans('','', string.punctuation))
print(tokenized_text)

print('\n')

tokenized_text = tokenized_text.split()
print(tokenized_text)

Prof Mark Lee PhD at the University of Birmingham UK says Crungus is merely a composite of data that Craiyon has seen I think we could say that its producing things which are original he says But they are based on previous examples It could be just a blended image thats come from multiple sources And it looks very scary right


['Prof', 'Mark', 'Lee', 'PhD', 'at', 'the', 'University', 'of', 'Birmingham', 'UK', 'says', 'Crungus', 'is', 'merely', 'a', 'composite', 'of', 'data', 'that', 'Craiyon', 'has', 'seen', 'I', 'think', 'we', 'could', 'say', 'that', 'its', 'producing', 'things', 'which', 'are', 'original', 'he', 'says', 'But', 'they', 'are', 'based', 'on', 'previous', 'examples', 'It', 'could', 'be', 'just', 'a', 'blended', 'image', 'thats', 'come', 'from', 'multiple', 'sources', 'And', 'it', 'looks', 'very', 'scary', 'right']


### Word tokenization: Task-2

In [6]:
########################  Tokenization using regular expression  ###################################

import re

raw_text2 = """I won't have any pepper in my kitchen at all. Soup does very well without-Maybe it's always pepper that makes people hot-tempered."""


tokenized_word = re.findall(r"\w+(?:[-.']\w+)*|'\S\w*", raw_text2)
print(tokenized_word)

['I', "won't", 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'at', 'all', 'Soup', 'does', 'very', 'well', 'without-Maybe', "it's", 'always', 'pepper', 'that', 'makes', 'people', 'hot-tempered']


In [7]:
########################  Tokenization using NLTK's RegexpTokenizer implementation  ###################################


from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
tokenized_text = tokenizer.tokenize(raw_text2)
print(tokenized_text)

['I', 'won', 't', 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'at', 'all', 'Soup', 'does', 'very', 'well', 'without', 'Maybe', 'it', 's', 'always', 'pepper', 'that', 'makes', 'people', 'hot', 'tempered']


In [8]:
########################  Tokenization using NLTK's word_tokenize implementation  ###################################

from nltk.tokenize import word_tokenize

word_tokenize(raw_text2)

['I',
 'wo',
 "n't",
 'have',
 'any',
 'pepper',
 'in',
 'my',
 'kitchen',
 'at',
 'all',
 '.',
 'Soup',
 'does',
 'very',
 'well',
 'without-Maybe',
 'it',
 "'s",
 'always',
 'pepper',
 'that',
 'makes',
 'people',
 'hot-tempered',
 '.']

### Word tokenization: Task-3

In [9]:
########################  Tokenization using regular expression  ###################################

raw_text3 = """We had a meeting with our industrial partner on October 22nd, 2022, it was scheduled for 2 hours, but it ended up lasting for 2.5 hours. The meeting was productive and we were able to finalize the budget for next quarter, which is $250,000."""

tokenized_word = re.findall(r"(\w+(?:[-.']\w+)*|[$]\d+[,]?\d+|'\S\w*)", raw_text3)
print(tokenized_word)

print('\n')

tokenized_text = " ".join(tokenized_word)
print(tokenized_text)

['We', 'had', 'a', 'meeting', 'with', 'our', 'industrial', 'partner', 'on', 'October', '22nd', '2022', 'it', 'was', 'scheduled', 'for', '2', 'hours', 'but', 'it', 'ended', 'up', 'lasting', 'for', '2.5', 'hours', 'The', 'meeting', 'was', 'productive', 'and', 'we', 'were', 'able', 'to', 'finalize', 'the', 'budget', 'for', 'next', 'quarter', 'which', 'is', '$250,000']


We had a meeting with our industrial partner on October 22nd 2022 it was scheduled for 2 hours but it ended up lasting for 2.5 hours The meeting was productive and we were able to finalize the budget for next quarter which is $250,000


In [10]:
########################  Tokenization using NLTK's RegexpTokenizer implementation  ###################################

from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
tokenized_text = tokenizer.tokenize(raw_text3)
print(tokenized_text)

print('\n')

tokenized_text = " ".join(tokenized_word)
print(tokenized_text)

['We', 'had', 'a', 'meeting', 'with', 'our', 'industrial', 'partner', 'on', 'October', '22nd', '2022', 'it', 'was', 'scheduled', 'for', '2', 'hours', 'but', 'it', 'ended', 'up', 'lasting', 'for', '2', '5', 'hours', 'The', 'meeting', 'was', 'productive', 'and', 'we', 'were', 'able', 'to', 'finalize', 'the', 'budget', 'for', 'next', 'quarter', 'which', 'is', '250', '000']


We had a meeting with our industrial partner on October 22nd 2022 it was scheduled for 2 hours but it ended up lasting for 2.5 hours The meeting was productive and we were able to finalize the budget for next quarter which is $250,000


In [11]:
########################  Tokenization using NLTK's word_tokenizer implementation  ###################################

from nltk.tokenize import word_tokenize

word_tokenize(raw_text3)


['We',
 'had',
 'a',
 'meeting',
 'with',
 'our',
 'industrial',
 'partner',
 'on',
 'October',
 '22nd',
 ',',
 '2022',
 ',',
 'it',
 'was',
 'scheduled',
 'for',
 '2',
 'hours',
 ',',
 'but',
 'it',
 'ended',
 'up',
 'lasting',
 'for',
 '2.5',
 'hours',
 '.',
 'The',
 'meeting',
 'was',
 'productive',
 'and',
 'we',
 'were',
 'able',
 'to',
 'finalize',
 'the',
 'budget',
 'for',
 'next',
 'quarter',
 ',',
 'which',
 'is',
 '$',
 '250,000',
 '.']

### Word tokenization: Task-4

In [12]:
########################  Tokenization using regular expression  ###################################

raw_text4 = """We had a meeting with industrial partner and Ph.D researchers on October 22nd, 2022, it was scheduled for 2 hours, but it ended up lasting for 2.5 hours. The meeting was productive and we were able to finalize the budget for next quarter, which is $250,000. I won't have any pepper in my kitchen at all. Soup does very well without-Maybe it's always pepper that makes people hot-tempered."""
print(raw_text4)

print('\n\n')
tokenized_word = re.findall(r"(\w+(?:[-.']\w+)*|[$]\d+[,]?\d+|'\S\w*)", raw_text4)
print(tokenized_word)

We had a meeting with industrial partner and Ph.D researchers on October 22nd, 2022, it was scheduled for 2 hours, but it ended up lasting for 2.5 hours. The meeting was productive and we were able to finalize the budget for next quarter, which is $250,000. I won't have any pepper in my kitchen at all. Soup does very well without-Maybe it's always pepper that makes people hot-tempered.



['We', 'had', 'a', 'meeting', 'with', 'industrial', 'partner', 'and', 'Ph.D', 'researchers', 'on', 'October', '22nd', '2022', 'it', 'was', 'scheduled', 'for', '2', 'hours', 'but', 'it', 'ended', 'up', 'lasting', 'for', '2.5', 'hours', 'The', 'meeting', 'was', 'productive', 'and', 'we', 'were', 'able', 'to', 'finalize', 'the', 'budget', 'for', 'next', 'quarter', 'which', 'is', '$250,000', 'I', "won't", 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'at', 'all', 'Soup', 'does', 'very', 'well', 'without-Maybe', "it's", 'always', 'pepper', 'that', 'makes', 'people', 'hot-tempered']


In [13]:
########################  Tokenization using NLTK's RegexpTokenizer implementation  ###################################

from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
tokenized_text = tokenizer.tokenize(raw_text4)
print(tokenized_text)

['We', 'had', 'a', 'meeting', 'with', 'industrial', 'partner', 'and', 'Ph', 'D', 'researchers', 'on', 'October', '22nd', '2022', 'it', 'was', 'scheduled', 'for', '2', 'hours', 'but', 'it', 'ended', 'up', 'lasting', 'for', '2', '5', 'hours', 'The', 'meeting', 'was', 'productive', 'and', 'we', 'were', 'able', 'to', 'finalize', 'the', 'budget', 'for', 'next', 'quarter', 'which', 'is', '250', '000', 'I', 'won', 't', 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'at', 'all', 'Soup', 'does', 'very', 'well', 'without', 'Maybe', 'it', 's', 'always', 'pepper', 'that', 'makes', 'people', 'hot', 'tempered']


In [14]:
########################  Tokenization using NLTK's word_tokenize implementation  ###################################


from nltk.tokenize import word_tokenize

word_tokenize(raw_text4)

['We',
 'had',
 'a',
 'meeting',
 'with',
 'industrial',
 'partner',
 'and',
 'Ph.D',
 'researchers',
 'on',
 'October',
 '22nd',
 ',',
 '2022',
 ',',
 'it',
 'was',
 'scheduled',
 'for',
 '2',
 'hours',
 ',',
 'but',
 'it',
 'ended',
 'up',
 'lasting',
 'for',
 '2.5',
 'hours',
 '.',
 'The',
 'meeting',
 'was',
 'productive',
 'and',
 'we',
 'were',
 'able',
 'to',
 'finalize',
 'the',
 'budget',
 'for',
 'next',
 'quarter',
 ',',
 'which',
 'is',
 '$',
 '250,000',
 '.',
 'I',
 'wo',
 "n't",
 'have',
 'any',
 'pepper',
 'in',
 'my',
 'kitchen',
 'at',
 'all',
 '.',
 'Soup',
 'does',
 'very',
 'well',
 'without-Maybe',
 'it',
 "'s",
 'always',
 'pepper',
 'that',
 'makes',
 'people',
 'hot-tempered',
 '.']

### Stopword Removal

In [None]:
text_with_stopwords = """The life-span of trees is determined by growth rings. These can be seen if the tree is cut down or in cores taken from the edge to the center of the tree. Correct determination is only possible for trees which make growth rings, generally those which occur in seasonal climates. Trees in uniform non-seasonal tropical climates are always growing and do not have distinct growth rings. It is also only possible for trees which are solid to the center of the tree; many very old trees become hollow as the dead heartwood decays away. For some of these species, age estimates have been made on the basis of extrapolating current growth rates, but the results are usually little better than guesses or speculation. White proposed a method of estimating the age of large and veteran trees in the United Kingdom by correlation between a trees stem diameter, growth character and age."""
print(text_with_stopwords)

In [1]:
# let'f first print the available list of stopwords

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# printing the ntlk stop words list
nltk_stopwords = stopwords.words('english')
print(nltk_stopwords)

print('\n\n')

# convert text to lower case
text_with_stopwords = text_with_stopwords.lower()

# convert text to tokens
text_tokenized = word_tokenize(text_with_stopwords)
print(text_tokenized)

# remove stop words
text_without_sw = [word for word in text_tokenized if not word in stopwords.words()]

print(text_without_sw)
print('\n\n')

# if you want to rejoin the tokens to make sentences
filtered_sentence = (" ").join(text_without_sw)
print(filtered_sentence)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

NameError: name 'text_with_stopwords' is not defined

In [2]:
# Add new stop words to default stop words list, and remove the stop words by using updated stop words list

nltk_stopwords = stopwords.words('english')

# adding new word to existing nltk stop word list
nltk_stopwords = nltk_stopwords.append('made')


text_tokenized = word_tokenize(text_with_stopwords)
text_without_sw = [word for word in text_tokenized if not word in stopwords.words()]

print(text_without_sw)

print('\n\n')

filtered_sentence = (" ").join(text_without_sw)
print(filtered_sentence)

NameError: name 'text_with_stopwords' is not defined

In [None]:
# open a file for writing and create it if it does not exist
f = open("text_without_sw.txt", "w")

# write to the file
f.write(str(text_without_sw))

# close file
f.close()