In [2]:
import pandas as pd
import nltk

from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer 

from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
nltk.download('punkt')

text = """
Anthony Fauci, the U.S. government’s top infectious-disease expert, told Congress he was seeing a “disturbing surge” in new cases. California reported its biggest daily jump, and Florida’s infection rate climbed above 10%.

As many as 31 states have R0 figures above 1, according to the Rt.live website, meaning that each person with the virus infects at least one other. The World Trade Organization said its worst-case scenario 
for cross-border commerce this year will likely be avoided, depending on whether there’s a second wave of outbreaks.

Novak Djokovic, the world’s leading men’s tennis player, tested positive for Covid-19 days after an exhibition tournament in the Balkans featuring him was cut short. England eased more restrictions as deaths continued to fall.
"""

tokens = word_tokenize(text)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ritasousabritopereira/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
print(tokens)

['Anthony', 'Fauci', ',', 'the', 'U.S.', 'government', '’', 's', 'top', 'infectious-disease', 'expert', ',', 'told', 'Congress', 'he', 'was', 'seeing', 'a', '“', 'disturbing', 'surge', '”', 'in', 'new', 'cases', '.', 'California', 'reported', 'its', 'biggest', 'daily', 'jump', ',', 'and', 'Florida', '’', 's', 'infection', 'rate', 'climbed', 'above', '10', '%', '.', 'As', 'many', 'as', '31', 'states', 'have', 'R0', 'figures', 'above', '1', ',', 'according', 'to', 'the', 'Rt.live', 'website', ',', 'meaning', 'that', 'each', 'person', 'with', 'the', 'virus', 'infects', 'at', 'least', 'one', 'other', '.', 'The', 'World', 'Trade', 'Organization', 'said', 'its', 'worst-case', 'scenario', 'for', 'cross-border', 'commerce', 'this', 'year', 'will', 'likely', 'be', 'avoided', ',', 'depending', 'on', 'whether', 'there', '’', 's', 'a', 'second', 'wave', 'of', 'outbreaks', '.', 'Novak', 'Djokovic', ',', 'the', 'world', '’', 's', 'leading', 'men', '’', 's', 'tennis', 'player', ',', 'tested', 'positi

In [10]:
#stemming 
#Porter Stemmer was the first one and improvments are baed on this one

ps = PorterStemmer()

In [11]:
stemmed = [ps.stem(word) for word in tokens]

In [13]:
print(stemmed)

['anthoni', 'fauci', ',', 'the', 'u.s.', 'govern', '’', 's', 'top', 'infectious-diseas', 'expert', ',', 'told', 'congress', 'he', 'wa', 'see', 'a', '“', 'disturb', 'surg', '”', 'in', 'new', 'case', '.', 'california', 'report', 'it', 'biggest', 'daili', 'jump', ',', 'and', 'florida', '’', 's', 'infect', 'rate', 'climb', 'abov', '10', '%', '.', 'As', 'mani', 'as', '31', 'state', 'have', 'R0', 'figur', 'abov', '1', ',', 'accord', 'to', 'the', 'rt.live', 'websit', ',', 'mean', 'that', 'each', 'person', 'with', 'the', 'viru', 'infect', 'at', 'least', 'one', 'other', '.', 'the', 'world', 'trade', 'organ', 'said', 'it', 'worst-cas', 'scenario', 'for', 'cross-bord', 'commerc', 'thi', 'year', 'will', 'like', 'be', 'avoid', ',', 'depend', 'on', 'whether', 'there', '’', 's', 'a', 'second', 'wave', 'of', 'outbreak', '.', 'novak', 'djokov', ',', 'the', 'world', '’', 's', 'lead', 'men', '’', 's', 'tenni', 'player', ',', 'test', 'posit', 'for', 'covid-19', 'day', 'after', 'an', 'exhibit', 'tournament

In [17]:
#alternatively let's do lemmatization

from nltk.stem import WordNetLemmatizer 
nltk.download('wordnet') 
lemmatizer = WordNetLemmatizer()

lemmatized = [lemmatizer.lemmatize(word) for word in tokens]

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ritasousabritopereira/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [20]:
print(lemmatized)

['Anthony', 'Fauci', ',', 'the', 'U.S.', 'government', '’', 's', 'top', 'infectious-disease', 'expert', ',', 'told', 'Congress', 'he', 'wa', 'seeing', 'a', '“', 'disturbing', 'surge', '”', 'in', 'new', 'case', '.', 'California', 'reported', 'it', 'biggest', 'daily', 'jump', ',', 'and', 'Florida', '’', 's', 'infection', 'rate', 'climbed', 'above', '10', '%', '.', 'As', 'many', 'a', '31', 'state', 'have', 'R0', 'figure', 'above', '1', ',', 'according', 'to', 'the', 'Rt.live', 'website', ',', 'meaning', 'that', 'each', 'person', 'with', 'the', 'virus', 'infects', 'at', 'least', 'one', 'other', '.', 'The', 'World', 'Trade', 'Organization', 'said', 'it', 'worst-case', 'scenario', 'for', 'cross-border', 'commerce', 'this', 'year', 'will', 'likely', 'be', 'avoided', ',', 'depending', 'on', 'whether', 'there', '’', 's', 'a', 'second', 'wave', 'of', 'outbreak', '.', 'Novak', 'Djokovic', ',', 'the', 'world', '’', 's', 'leading', 'men', '’', 's', 'tennis', 'player', ',', 'tested', 'positive', 'fo

In [22]:
ps.stem("university")
ps.stem("universal")
ps.stem("universe")

#3 words which should be different features are the same...

'univers'