### 1. Imports & Load Data

In [16]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Read the movie-review CSV into a DataFrame and preview its first rows
reviews_path = 'imdb_movie_reviews.csv'
df = pd.read_csv(reviews_path)
df.head()

Unnamed: 0,label,review
0,negative,"In the ten years since Wildside aired, nothing..."
1,positive,This is a better-than-average entry in the Sai...
2,negative,"""The Mayor Of Hell"" has the feel of an early D..."
3,positive,This is a really great short from Hal Roach. T...
4,positive,A rather charming depiction of European union ...


### 2. Remove Punctuation

In [17]:
# Inspect which characters the `string` module lists as punctuation.
import string

punctuation_chars = string.punctuation
print(punctuation_chars)
# → '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [18]:
# Create a function
def remove_punctuation(review, replacement=""):
    """Replace every character listed in ``string.punctuation`` in a review.

    Args:
        review: The review text (a ``str``).
        replacement: Text substituted for each punctuation character.
            The default ``""`` simply deletes punctuation. Pass ``" "`` to
            keep words that are joined by punctuation apart, e.g.
            ``"better-than-average"`` becomes ``"better than average"``
            instead of ``"betterthanaverage"``.

    Returns:
        str: A new string in which each punctuation character has been
        replaced by ``replacement``.
    """
    # A translation table lets str.translate process the whole string in one
    # call instead of testing membership character by character.
    table = str.maketrans(dict.fromkeys(string.punctuation, replacement))
    return review.translate(table)

In [19]:
# Sanity-check remove_punctuation on a sample sentence
sample_text = "Wow!!! This, right here, is amazing...?"
print(remove_punctuation(sample_text))

Wow This right here is amazing


In [27]:
# Store a punctuation-free copy of every review in a new column
df['review_no_punct'] = df['review'].apply(remove_punctuation)
df[['review', 'review_no_punct']].head()

Unnamed: 0,review,review_no_punct
0,"In the ten years since Wildside aired, nothing...",In the ten years since Wildside aired nothing ...
1,This is a better-than-average entry in the Sai...,This is a betterthanaverage entry in the Saint...
2,"""The Mayor Of Hell"" has the feel of an early D...",The Mayor Of Hell has the feel of an early Dea...
3,This is a really great short from Hal Roach. T...,This is a really great short from Hal Roach Th...
4,A rather charming depiction of European union ...,A rather charming depiction of European union ...


### 3. Tokenize

In [28]:
# Create a function
def tokenize(review):
    """Lower-case a review and split it on whitespace.

    Args:
        review: Review text as a string.

    Returns:
        list: The lower-cased tokens in their original order.
    """
    lowered = review.lower()
    return lowered.split()

In [29]:
# Try the tokenizer on a mixed-case sample string.
sample_review = "This Is A SAMPLE Review"
print(tokenize(sample_review))

['this', 'is', 'a', 'sample', 'review']


In [30]:
# Tokenize every punctuation-free review into a new column
df['review_tokenized'] = df['review_no_punct'].apply(tokenize)
df[['review_no_punct', 'review_tokenized']].head()

Unnamed: 0,review_no_punct,review_tokenized
0,In the ten years since Wildside aired nothing ...,"[in, the, ten, years, since, wildside, aired, ..."
1,This is a betterthanaverage entry in the Saint...,"[this, is, a, betterthanaverage, entry, in, th..."
2,The Mayor Of Hell has the feel of an early Dea...,"[the, mayor, of, hell, has, the, feel, of, an,..."
3,This is a really great short from Hal Roach Th...,"[this, is, a, really, great, short, from, hal,..."
4,A rather charming depiction of European union ...,"[a, rather, charming, depiction, of, european,..."


In [33]:
# Report the shortest, longest, and mean review length, measured in tokens.
lengths = df['review_tokenized'].apply(len)
print(f"Min words: {lengths.min()}")
print(f"Max words: {lengths.max()}")
print(f"Avg words: {lengths.mean():.2f}")

Min words: 10
Max words: 2469
Avg words: 232.00


### 4. Stop Word Removal

In [35]:
# Load NLTK's English stop-word list into a set for fast membership tests.
# The 'stopwords' corpus has to be downloaded before it can be read; without
# this call stopwords.words() raises LookupError on a machine that has never
# fetched it.
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [36]:
# Show how many stop words there are and the first 20 in alphabetical order
print(len(stop_words), "stop words")
first_twenty = sorted(stop_words)[:20]
print(first_twenty, "…")

198 stop words
['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been'] …


In [37]:
# Define a function that removes any stop word from a token list
def remove_stopwords(review, stopword_set=None):
    """Return the tokens of a review that are not stop words.

    Args:
        review: Iterable of token strings.
        stopword_set: Optional collection of words to drop. When omitted
            (``None``), the notebook-level ``stop_words`` set is used, so
            existing one-argument calls behave as before. Passing a set
            explicitly lets the function be used with a custom list and
            without relying on the global.

    Returns:
        list: The tokens from ``review`` that are not in the stop-word
        collection, in their original order.
    """
    if stopword_set is None:
        # Fall back to the set built from NLTK's English list.
        stopword_set = stop_words
    return [word for word in review if word not in stopword_set]

In [38]:
# Run remove_stopwords on a small hand-written token list
demo_tokens = ['this', 'is', 'an', 'example', 'of', 'stop', 'word', 'removal']
print(remove_stopwords(demo_tokens))

['example', 'stop', 'word', 'removal']


In [39]:
# Drop stop words from every tokenized review and keep the result in a new column
df['review_no_stop'] = df['review_tokenized'].apply(remove_stopwords)
df[['review_tokenized', 'review_no_stop']].head()

Unnamed: 0,review_tokenized,review_no_stop
0,"[in, the, ten, years, since, wildside, aired, ...","[ten, years, since, wildside, aired, nothing, ..."
1,"[this, is, a, betterthanaverage, entry, in, th...","[betterthanaverage, entry, saint, series, hold..."
2,"[the, mayor, of, hell, has, the, feel, of, an,...","[mayor, hell, feel, early, dead, end, kids, fi..."
3,"[this, is, a, really, great, short, from, hal,...","[really, great, short, hal, roach, two, main, ..."
4,"[a, rather, charming, depiction, of, european,...","[rather, charming, depiction, european, union,..."


In [43]:
# Compare token counts before and after stop-word removal

# Compute each length Series once rather than re-running .apply(len)
# for every statistic.
tokens_before = df['review_tokenized'].apply(len)
tokens_after = df['review_no_stop'].apply(len)

# minimum, maximum, and average for the original tokenized reviews
print("Original:")
print("  Min tokens:  ", tokens_before.min())
print("  Max tokens:  ", tokens_before.max())
print("  Avg tokens:  {:.2f}".format(tokens_before.mean()))

# the same statistics once stop words have been removed; these are token
# counts, not a count of stop words, so the heading says so
print("\nWithout stop words:")
print("  Min tokens:  ", tokens_after.min())
print("  Max tokens:  ", tokens_after.max())
print("  Avg tokens:  {:.2f}".format(tokens_after.mean()))

Original:
  Min tokens:   10
  Max tokens:   2469
  Avg tokens:  232.00

  No of stop words:
  Min tokens:   6
  Max tokens:   1449
  Avg tokens:  124.06


### 5. Stemming

In [44]:
# Fetch NLTK's 'punkt' data package and import the Porter stemmer used by
# the stemming cells below.
# NOTE(review): the visible cells tokenize with str.split() and never call
# word_tokenize, so 'punkt' looks unused here — confirm before removing.
import nltk
nltk.download('punkt')

from nltk.stem import PorterStemmer

[nltk_data] Downloading package punkt to /Users/vunguyen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [45]:
# Create one PorterStemmer instance; the stemming cells below reuse it
stemmer = PorterStemmer()

In [46]:
# Preview stemming on the first ten tokens of the first stop-word-free review
sample = df['review_no_stop'].iloc[0]
preview = sample[:10]
print("Original:", preview)
print("Stemmed: ", list(map(stemmer.stem, preview)))

Original: ['ten', 'years', 'since', 'wildside', 'aired', 'nothing', 'really', 'come', 'close', 'quality']
Stemmed:  ['ten', 'year', 'sinc', 'wildsid', 'air', 'noth', 'realli', 'come', 'close', 'qualiti']


In [47]:
# Stem every token of every stop-word-free review
df['review_stemmed'] = df['review_no_stop'].apply(
    lambda tokens: [stemmer.stem(token) for token in tokens]
)

# Show the unstemmed and stemmed columns side by side
df[['review_no_stop','review_stemmed']].head()

Unnamed: 0,review_no_stop,review_stemmed
0,"[ten, years, since, wildside, aired, nothing, ...","[ten, year, sinc, wildsid, air, noth, realli, ..."
1,"[betterthanaverage, entry, saint, series, hold...","[betterthanaverag, entri, saint, seri, hold, i..."
2,"[mayor, hell, feel, early, dead, end, kids, fi...","[mayor, hell, feel, earli, dead, end, kid, fil..."
3,"[really, great, short, hal, roach, two, main, ...","[realli, great, short, hal, roach, two, main, ..."
4,"[rather, charming, depiction, european, union,...","[rather, charm, depict, european, union, begin..."


### 6. Lemmatization

In [50]:
# Download the 'wordnet' data package and import the WordNetLemmatizer class
# that the lemmatization cells below instantiate and use.
import nltk
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vunguyen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [52]:
# Create one WordNetLemmatizer instance; the lemmatization cells below reuse it
lemmatizer = WordNetLemmatizer()

In [53]:
# Preview lemmatization on the first ten tokens of the first stop-word-free review
sample_tokens = df['review_no_stop'].iloc[0]
head_tokens = sample_tokens[:10]
print("Original:", head_tokens)
print("Lemmatized (nouns):", [lemmatizer.lemmatize(tok) for tok in head_tokens])
# A part-of-speech tag can also be supplied, e.g. pos='v', 'a', 'r':
print("Lemmatized (verbs):", [lemmatizer.lemmatize(tok, pos='v') for tok in head_tokens])

Original: ['ten', 'years', 'since', 'wildside', 'aired', 'nothing', 'really', 'come', 'close', 'quality']
Lemmatized (nouns): ['ten', 'year', 'since', 'wildside', 'aired', 'nothing', 'really', 'come', 'close', 'quality']
Lemmatized (verbs): ['ten', 'years', 'since', 'wildside', 'air', 'nothing', 'really', 'come', 'close', 'quality']


In [54]:
# Lemmatize every token of every stop-word-free review
df['review_lemmatized'] = df['review_no_stop'].apply(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens))
)

# Show the original and lemmatized token lists side by side
df[['review_no_stop', 'review_lemmatized']].head()

Unnamed: 0,review_no_stop,review_lemmatized
0,"[ten, years, since, wildside, aired, nothing, ...","[ten, year, since, wildside, aired, nothing, r..."
1,"[betterthanaverage, entry, saint, series, hold...","[betterthanaverage, entry, saint, series, hold..."
2,"[mayor, hell, feel, early, dead, end, kids, fi...","[mayor, hell, feel, early, dead, end, kid, fil..."
3,"[really, great, short, hal, roach, two, main, ...","[really, great, short, hal, roach, two, main, ..."
4,"[rather, charming, depiction, european, union,...","[rather, charming, depiction, european, union,..."
