In [9]:
paragraph = """ChemAnalyst is a digital platform, which keeps a real-time eye on the chemicals and petrochemicals market fluctuations, thus, enabling its customers to make wise business decisions. With over 450 chemical products traded globally, we bring detailed market information and pricing data at your fingertip's. Our real-time pricing and commentary updates enable users to stay acquainted with new commercial opportunities. """

## Tokenization

In [10]:
from nltk.tokenize import sent_tokenize

sentences = sent_tokenize(paragraph, language='english')
print(sentences)

['ChemAnalyst is a digital platform, which keeps a real-time eye on the chemicals and petrochemicals market fluctuations, thus, enabling its customers to make wise business decisions.', "With over 450 chemical products traded globally, we bring detailed market information and pricing data at your fingertip's.", 'Our real-time pricing and commentary updates enable users to stay acquainted with new commercial opportunities.']


In [11]:
from nltk.tokenize import word_tokenize

words = word_tokenize(paragraph)
print(words)

['ChemAnalyst', 'is', 'a', 'digital', 'platform', ',', 'which', 'keeps', 'a', 'real-time', 'eye', 'on', 'the', 'chemicals', 'and', 'petrochemicals', 'market', 'fluctuations', ',', 'thus', ',', 'enabling', 'its', 'customers', 'to', 'make', 'wise', 'business', 'decisions', '.', 'With', 'over', '450', 'chemical', 'products', 'traded', 'globally', ',', 'we', 'bring', 'detailed', 'market', 'information', 'and', 'pricing', 'data', 'at', 'your', 'fingertip', "'s", '.', 'Our', 'real-time', 'pricing', 'and', 'commentary', 'updates', 'enable', 'users', 'to', 'stay', 'acquainted', 'with', 'new', 'commercial', 'opportunities', '.']


In [12]:
from nltk.tokenize import wordpunct_tokenize

punc_words = wordpunct_tokenize(paragraph)
print(punc_words)

['ChemAnalyst', 'is', 'a', 'digital', 'platform', ',', 'which', 'keeps', 'a', 'real', '-', 'time', 'eye', 'on', 'the', 'chemicals', 'and', 'petrochemicals', 'market', 'fluctuations', ',', 'thus', ',', 'enabling', 'its', 'customers', 'to', 'make', 'wise', 'business', 'decisions', '.', 'With', 'over', '450', 'chemical', 'products', 'traded', 'globally', ',', 'we', 'bring', 'detailed', 'market', 'information', 'and', 'pricing', 'data', 'at', 'your', 'fingertip', "'", 's', '.', 'Our', 'real', '-', 'time', 'pricing', 'and', 'commentary', 'updates', 'enable', 'users', 'to', 'stay', 'acquainted', 'with', 'new', 'commercial', 'opportunities', '.']


In [13]:
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
tokens = tokenizer.tokenize(paragraph)
tokens

['ChemAnalyst',
 'is',
 'a',
 'digital',
 'platform',
 ',',
 'which',
 'keeps',
 'a',
 'real-time',
 'eye',
 'on',
 'the',
 'chemicals',
 'and',
 'petrochemicals',
 'market',
 'fluctuations',
 ',',
 'thus',
 ',',
 'enabling',
 'its',
 'customers',
 'to',
 'make',
 'wise',
 'business',
 'decisions.',
 'With',
 'over',
 '450',
 'chemical',
 'products',
 'traded',
 'globally',
 ',',
 'we',
 'bring',
 'detailed',
 'market',
 'information',
 'and',
 'pricing',
 'data',
 'at',
 'your',
 "fingertip's.",
 'Our',
 'real-time',
 'pricing',
 'and',
 'commentary',
 'updates',
 'enable',
 'users',
 'to',
 'stay',
 'acquainted',
 'with',
 'new',
 'commercial',
 'opportunities',
 '.']

## Stemming

In [15]:
from nltk.stem import PorterStemmer

In [16]:
words = ["go", "gone", "going", "goes", "eats", "eating", "eat", "work", "working", "worked"]
ps = PorterStemmer()

In [17]:
for word in words:
    print(word, "-->", ps.stem(word))

go --> go
gone --> gone
going --> go
goes --> goe
eats --> eat
eating --> eat
eat --> eat
work --> work
working --> work
worked --> work


In [18]:
from nltk.stem import RegexpStemmer

In [19]:
reg = RegexpStemmer('ing$|s$|e$|able$|ed$|es$', min=4)

for word in words:
    print(word, "-->", reg.stem(word))

go --> go
gone --> gon
going --> go
goes --> go
eats --> eat
eating --> eat
eat --> eat
work --> work
working --> work
worked --> work


In [20]:
from nltk.stem import SnowballStemmer

In [21]:
snow = SnowballStemmer('english')
for word in words:
    print(word, "-->", snow.stem(word))

go --> go
gone --> gone
going --> go
goes --> goe
eats --> eat
eating --> eat
eat --> eat
work --> work
working --> work
worked --> work


## Lemmatization

In [22]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

In [9]:
import nltk

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\D S
[nltk_data]     Patwal\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [23]:
'''
POS -
Noun-n
Verb-v
Adjective-a
Abverb-r
'''

lemmatizer.lemmatize('going', pos='v')

'go'

In [25]:
for word in words:
    print(word, "-->", lemmatizer.lemmatize(word, pos='v'))

go --> go
gone --> go
going --> go
goes --> go
eats --> eat
eating --> eat
eat --> eat
work --> work
working --> work
worked --> work


## Stopword removal

In [36]:
paragraph = """In machine learning, a Type I error (false positive) occurs when a model incorrectly predicts a positive outcome, rejecting the null hypothesis when it is actually true. A Type II error (false negative) happens when a model fails to detect a positive outcome, failing to reject a false null hypothesis. These errors are inversely related; reducing the chance of one increases the chance of the other"""

In [27]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\D S
[nltk_data]     Patwal\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [31]:
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [33]:
stemmer = PorterStemmer()
sentences = nltk.sent_tokenize(paragraph)

In [34]:
# apply stopwords and filter and then apply stemming

for i in range(len(sentences)):
    words = nltk.word_tokenize(sentences[i])
    words = [stemmer.stem(word) for word in words if word not in stopwords.words('english')]
    sentences[i] = ' '.join(words)


In [35]:
sentences

['in machin learn , type i error ( fals posit ) occur model incorrectli predict posit outcom , reject null hypothesi actual true .',
 'a type ii error ( fals neg ) happen model fail detect posit outcom , fail reject fals null hypothesi .',
 'these error invers relat ; reduc chanc one increas chanc']

In [37]:
# apply stopwords and filter and then apply snowball stemming
from nltk.stem import SnowballStemmer
snow = SnowballStemmer('english')

for i in range(len(sentences)):
    words = nltk.word_tokenize(sentences[i])
    words = [snow.stem(word) for word in words if word not in stopwords.words('english')]
    sentences[i] = ' '.join(words)

sentences


['machin learn , type error ( fal posit ) occur model incorrect predict posit outcom , reject null hypothesi actual true .',
 'type ii error ( fal neg ) happen model fail detect posit outcom , fail reject fal null hypothesi .',
 'error inver relat ; reduc chanc one increa chanc']

In [40]:
# apply stopwords and filter and then apply lemmatizer
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

for i in range(len(sentences)):
    words = nltk.word_tokenize(sentences[i])
    words = [lemmatizer.lemmatize(word, pos='v') for word in words if word not in stopwords.words('english')]
    sentences[i] = ' '.join(words)

sentences


['machin learn , type error ( fal posit ) occur model incorrect predict posit outcom , reject null hypothesi actual true .',
 'type ii error ( fal neg ) happen model fail detect posit outcom , fail reject fal null hypothesi .',
 'error inver relat ; reduc chanc one increa chanc']