In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Explore the Data

In [2]:
# Load the IMDB review dataset; the path is relative to the notebook's working directory.
data = pd.read_csv("dataset/IMDB Dataset.csv")

In [3]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# Check class balance: count how many reviews carry each sentiment label.
data["sentiment"].value_counts()

negative    25000
positive    25000
Name: sentiment, dtype: int64

## Label Encoding

In [5]:
from sklearn import preprocessing

# Encode the string labels as integers. Comparing the previews before and
# after this cell, "negative" becomes 0 and "positive" becomes 1.
label_encoder = preprocessing.LabelEncoder()
# This overwrites the "sentiment" column, so the original strings are no
# longer present in `data` after the cell runs.
data["sentiment"] = label_encoder.fit_transform(data["sentiment"])
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


## Split Training and Test Sets

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the reviews for testing. random_state is fixed so the same
# split is produced on every run.
data_train, data_test, train_label, test_label = train_test_split(data["review"], data["sentiment"], test_size = 0.25, random_state = 0)

## Process a Sample Review 

In [7]:
# Draw a single training review with a fixed seed so the same example is used
# throughout the step-by-step walkthrough below.
sample_review = data_train.sample(1, random_state = 1).iloc[0]
sample_review

'Cage (1989) was another one of those low budget "buddy" action flicks that were produced during the 80\'s thanks in large part due to the films such as 48hrs. and Lethal Weapon. This one stars Reb Brown and Lou Ferrigno as to former Vietnam Vets who happen to run a local dive bar. Reb takes care of Lou because he saved his life in \'Nam. But Lou was shot in the head and is now pretty soft. Although he\'s huge, Lou has the brain of a child. One day some ruffians throw their wait around in the bar and Lou and Rebb beat the tar out of them. But payback\'s a mother. They crash the bar leaving Lou and Reb with nothing. That is until these two thugs come into the picture (one of them\'s a real nice guy) who have a plan in mind.<br /><br />The film\'s a waste of time. Maybe if they went all they way and made a hard core action flick instead of trying to tone down the gruesomeness of the situation perhaps it could have worked. Alas, it doesn\'t and the audience is left holding the bag. Oh wel

Before vectorization, each review needs to be preprocessed in several steps:
1. Remove HTML markup such as `<br />`
2. Remove punctuation
3. Convert all letters to lowercase
4. Remove stopwords
5. Apply lemmatization

#### Remove HTML Contents

In [8]:
from bs4 import BeautifulSoup

# Parse the review as HTML and keep only its text content, which drops tags
# such as <br />.
soup = BeautifulSoup(sample_review, "html.parser")
# sample_review is rebound here; each following step continues from the
# value produced by the previous cell, so the cells must run in order.
sample_review = soup.get_text()
sample_review

'Cage (1989) was another one of those low budget "buddy" action flicks that were produced during the 80\'s thanks in large part due to the films such as 48hrs. and Lethal Weapon. This one stars Reb Brown and Lou Ferrigno as to former Vietnam Vets who happen to run a local dive bar. Reb takes care of Lou because he saved his life in \'Nam. But Lou was shot in the head and is now pretty soft. Although he\'s huge, Lou has the brain of a child. One day some ruffians throw their wait around in the bar and Lou and Rebb beat the tar out of them. But payback\'s a mother. They crash the bar leaving Lou and Reb with nothing. That is until these two thugs come into the picture (one of them\'s a real nice guy) who have a plan in mind.The film\'s a waste of time. Maybe if they went all they way and made a hard core action flick instead of trying to tone down the gruesomeness of the situation perhaps it could have worked. Alas, it doesn\'t and the audience is left holding the bag. Oh well. It\'s too

#### Remove Punctuation

In [9]:
import re

# Replace every character that is not a regex word character (\w: letters,
# digits, underscore) with a space. Digits such as "1989" therefore survive.
sample_review = re.sub(r'[^\w]', ' ', sample_review)
sample_review

'Cage  1989  was another one of those low budget  buddy  action flicks that were produced during the 80 s thanks in large part due to the films such as 48hrs  and Lethal Weapon  This one stars Reb Brown and Lou Ferrigno as to former Vietnam Vets who happen to run a local dive bar  Reb takes care of Lou because he saved his life in  Nam  But Lou was shot in the head and is now pretty soft  Although he s huge  Lou has the brain of a child  One day some ruffians throw their wait around in the bar and Lou and Rebb beat the tar out of them  But payback s a mother  They crash the bar leaving Lou and Reb with nothing  That is until these two thugs come into the picture  one of them s a real nice guy  who have a plan in mind The film s a waste of time  Maybe if they went all they way and made a hard core action flick instead of trying to tone down the gruesomeness of the situation perhaps it could have worked  Alas  it doesn t and the audience is left holding the bag  Oh well  It s too bad bec

#### Lowercase

In [10]:
# Lowercase before stopword removal so that capitalised words such as "The"
# are compared in the same case as the rest of the text.
sample_review = sample_review.lower()
sample_review

'cage  1989  was another one of those low budget  buddy  action flicks that were produced during the 80 s thanks in large part due to the films such as 48hrs  and lethal weapon  this one stars reb brown and lou ferrigno as to former vietnam vets who happen to run a local dive bar  reb takes care of lou because he saved his life in  nam  but lou was shot in the head and is now pretty soft  although he s huge  lou has the brain of a child  one day some ruffians throw their wait around in the bar and lou and rebb beat the tar out of them  but payback s a mother  they crash the bar leaving lou and reb with nothing  that is until these two thugs come into the picture  one of them s a real nice guy  who have a plan in mind the film s a waste of time  maybe if they went all they way and made a hard core action flick instead of trying to tone down the gruesomeness of the situation perhaps it could have worked  alas  it doesn t and the audience is left holding the bag  oh well  it s too bad bec

#### Remove Stopwords

In [11]:
import nltk
from nltk.corpus import stopwords
# One-time setup: uncomment the next line to download the stopword corpus.
#nltk.download('stopwords')

# split() with no arguments also collapses the runs of spaces left behind by
# the punctuation step.
word_lst = sample_review.split()
# Build the stopword set once so each membership test below is a set lookup.
stop_words = set(stopwords.words("english"))
sample_review = " ".join([word for word in word_lst if word not in stop_words])
sample_review

'cage 1989 another one low budget buddy action flicks produced 80 thanks large part due films 48hrs lethal weapon one stars reb brown lou ferrigno former vietnam vets happen run local dive bar reb takes care lou saved life nam lou shot head pretty soft although huge lou brain child one day ruffians throw wait around bar lou rebb beat tar payback mother crash bar leaving lou reb nothing two thugs come picture one real nice guy plan mind film waste time maybe went way made hard core action flick instead trying tone gruesomeness situation perhaps could worked alas audience left holding bag oh well bad elements great b movie better luck next time guess recommended xxx'

#### Lemmatization

In [12]:
from nltk.stem import WordNetLemmatizer
# One-time setup: uncomment the next line to download the WordNet data.
#nltk.download('wordnet')

lem = WordNetLemmatizer()
word_lst = sample_review.split()
# lemmatize() is called without a part-of-speech argument. In the output
# below "flicks" becomes "flick" while "went" is left unchanged.
sample_review = " ".join([lem.lemmatize(word) for word in word_lst])
sample_review

'cage 1989 another one low budget buddy action flick produced 80 thanks large part due film 48hrs lethal weapon one star reb brown lou ferrigno former vietnam vet happen run local dive bar reb take care lou saved life nam lou shot head pretty soft although huge lou brain child one day ruffian throw wait around bar lou rebb beat tar payback mother crash bar leaving lou reb nothing two thug come picture one real nice guy plan mind film waste time maybe went way made hard core action flick instead trying tone gruesomeness situation perhaps could worked ala audience left holding bag oh well bad element great b movie better luck next time guess recommended xxx'

#### Bring All Steps into a Function

In [13]:
def review_processor(review):
    """Clean one raw review and return it as a space-separated token string.

    The steps are applied in this order: strip HTML markup, replace every
    non-word character with a space, lowercase, drop English stopwords, and
    lemmatize each remaining token.

    Parameters
    ----------
    review : str
        Raw review text, possibly containing HTML such as ``<br />``.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    soup = BeautifulSoup(review, "html.parser")
    # NOTE(review): get_text() is called without a separator, so text on
    # either side of a tag is joined directly ("mind.<br /><br />The" becomes
    # "mind.The" in the walkthrough above). Words stay separate there only
    # because the "." is later replaced by a space; consider
    # get_text(separator=" ") if tags can sit between two words - confirm.
    html_parsed = soup.get_text()
    # Anything that is not a letter, digit or underscore becomes a space.
    punct_removed = re.sub(r'[^\w]', ' ', html_parsed)
    lowercase = punct_removed.lower()
    word_lst = lowercase.split()
    # Build the stopword set once per call. Evaluating
    # set(stopwords.words('english')) inside the comprehension condition
    # would reload the word list and rebuild the set for every token.
    stop_words = set(stopwords.words('english'))
    stopword_removed = [word for word in word_lst if word not in stop_words]
    lem = WordNetLemmatizer()
    lemmatized = [lem.lemmatize(word) for word in stopword_removed]
    res = " ".join(lemmatized)
    return res

In [14]:
# Re-draw the same raw sample (same seed as before) and run it through the
# combined function; the result should match the step-by-step output above.
sample_review = data_train.sample(1, random_state = 1).iloc[0]
review_processor(sample_review)

'cage 1989 another one low budget buddy action flick produced 80 thanks large part due film 48hrs lethal weapon one star reb brown lou ferrigno former vietnam vet happen run local dive bar reb take care lou saved life nam lou shot head pretty soft although huge lou brain child one day ruffian throw wait around bar lou rebb beat tar payback mother crash bar leaving lou reb nothing two thug come picture one real nice guy plan mind film waste time maybe went way made hard core action flick instead trying tone gruesomeness situation perhaps could worked ala audience left holding bag oh well bad element great b movie better luck next time guess recommended xxx'