In [39]:
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import pandas as pd

In [40]:
# Load the pipe-delimited reviews file and keep only the review text column
raw_data = pd.read_csv("reviews_rt_all.csv", sep="|")
text_data = raw_data.loc[:, 'text']
text_data.head(5)

0    To an entire generation of filmgoers, it just ...
1    Pixar classic is one of the best kids' movies ...
2    Apesar de representar um imenso avanço tecnoló...
3    When Woody perks up in the opening scene, it's...
4    Introduced not one but two indelible character...
Name: text, dtype: object

In [41]:
# Lowercase every review and glue the non-missing ones into a single
# space-separated string
txt = ' '.join(text_data.str.lower().dropna())
# Split the combined text into tokens, then tally how often each token occurs
words = nltk.word_tokenize(txt)
word_dist = nltk.FreqDist(words)

In [42]:
# The 30 highest-count tokens, as (token, count) pairs in descending order
most_common_words = word_dist.most_common(n=30)
most_common_words

[('.', 105164),
 (',', 99936),
 ('the', 97231),
 ('a', 68842),
 ('of', 59638),
 ('and', 58935),
 ('to', 37337),
 ("'s", 35572),
 ('is', 34932),
 ('it', 31843),
 ('in', 25590),
 ('that', 22234),
 ('as', 17683),
 ('but', 16423),
 ('with', 16064),
 ('film', 15909),
 ('this', 15334),
 ('for', 14499),
 ('an', 11712),
 ('its', 10822),
 ('movie', 10503),
 ('on', 8854),
 ('you', 8743),
 ('one', 8628),
 ("n't", 8350),
 ('be', 8282),
 ('...', 8271),
 ('not', 8132),
 ('by', 7632),
 ('are', 7014)]

In [43]:
# Use the most frequent tokens as the stop-word list: keep each token and
# discard its count
stop_words = [token for token, _count in most_common_words]
stop_words

['.',
 ',',
 'the',
 'a',
 'of',
 'and',
 'to',
 "'s",
 'is',
 'it',
 'in',
 'that',
 'as',
 'but',
 'with',
 'film',
 'this',
 'for',
 'an',
 'its',
 'movie',
 'on',
 'you',
 'one',
 "n't",
 'be',
 '...',
 'not',
 'by',
 'are']

In [None]:
# Delete stop words from the entire data.
# Membership is tested against a set built once, so each lookup is a hash
# probe instead of a scan of the stop_words list for every token in the corpus.
stop_word_set = set(stop_words)
cleaned_words = [word for word in words if word not in stop_word_set]
# Sanity check: recount the remaining tokens and list the 30 most frequent;
# none of the stop words should appear in the result.
# NOTE: this rebinds word_dist and most_common_words, so re-running the cell
# that derives stop_words after this one would use the filtered counts.
word_dist = nltk.FreqDist(cleaned_words)
most_common_words = word_dist.most_common(30)
most_common_words

[('his', 6885),
 ('at', 6871),
 ('from', 6609),
 ('more', 6344),
 ('has', 6335),
 ('about', 6190),
 ('all', 5988),
 ('than', 5910),
 ('have', 5467),
 ('like', 5257),
 ('i', 5239),
 ('most', 4698),
 ('so', 4676),
 ('if', 4558),
 ('--', 4549),
 ('there', 4504),
 ('comedy', 4308),
 ('story', 4193),
 ('or', 4153),
 ('what', 4005),
 ('good', 3995),
 ("'", 3943),
 ('who', 3840),
 ('just', 3826),
 ('much', 3728),
 ("''", 3719),
 ('does', 3685),
 ('up', 3665),
 ('``', 3661),
 ('some', 3545)]

In [None]:
# Reduce every remaining token to its Porter stem
stemmer = PorterStemmer()
stemmes = list(map(stemmer.stem, cleaned_words))

In [None]:
# Lemmatize every remaining token with WordNet, using the default part of speech
lemmatizer = WordNetLemmatizer()
lemmas = list(map(lemmatizer.lemmatize, cleaned_words))

Let's see the difference between the stemmer and the lemmatizer.

In [None]:
# Preview the first ten stemmed tokens
stemmes[:10]

In [None]:
# Preview the first ten lemmatized tokens for comparison with the stems
lemmas[:10]

The lemmatizer needs to know a word's part of speech to return the correct lemma; by default it treats every word as a noun. Let's see an example:

In [None]:
# Stem a single example word with the Porter stemmer
stemmer.stem('doing')

In [None]:
# Lemmatize the same word without specifying a part of speech (default is used)
lemmatizer.lemmatize('doing')

In [None]:
# Lemmatize the same word again, this time passing 'v' as the part of speech
lemmatizer.lemmatize('doing', 'v')