In [150]:
import pandas as pd

In [151]:
df = pd.read_csv('IMDB_Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Text Cleaning Process

1. sample 10000 data
2. remove html tag
3. convert to lower case
4. remove stop words
5. stemming

In [152]:
# df=df.sample(10000)

In [153]:
# df.shape

In [154]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [155]:
# convert the sentimental column value to numeric
# positive = 1
# negative = 0
df['sentiment'].replace({'positive': 1, 'negative': 0}, inplace=True)
df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['sentiment'].replace({'positive': 1, 'negative': 0}, inplace=True)
  df['sentiment'].replace({'positive': 1, 'negative': 0}, inplace=True)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [156]:
import re
clean = re.compile('<.*?>')

def remove_html_content(text):
    t = re.sub(clean, '', text)
    return t

In [157]:
df['review'] = df['review'].apply(remove_html_content)
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. The filming tec...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


In [158]:
# convert to lowercase
def lowerCase(text):
    return text.lower()

In [159]:
df['review'] = df['review'].apply(lowerCase)
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production. the filming tec...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there's a family where a little boy ...,0
4,"petter mattei's ""love in the time of money"" is...",1
...,...,...
49995,i thought this movie did a down right good job...,1
49996,"bad plot, bad dialogue, bad acting, idiotic di...",0
49997,i am a catholic taught in parochial elementary...,0
49998,i'm going to have to disagree with the previou...,0


In [160]:
# remove special character
def remove_special_char(text):
    newT=''
    for t in text:
        if t.isalnum():
            newT+=t
        else:
            newT+=' '
    return newT

In [161]:
df['review'] = df['review'].apply(remove_special_char)

In [162]:
# used to download the stopwords 

import ssl
import nltk

ssl._create_default_https_context = ssl._create_unverified_context
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/prajapativaibhavyogeshkumar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [163]:
# remove stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
print(stop_words)


def remove_stop_words(text):
    ans=[]
    for word in text.split():
        if word not in stop_words:
            ans.append(word)
        
    return ' '.join(ans)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [164]:
df['review'] = df['review'].apply(remove_stop_words)
df

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,1
1,wonderful little production filming technique ...,1
2,thought wonderful way spend time hot summer we...,1
3,basically family little boy jake thinks zombie...,0
4,petter mattei love time money visually stunnin...,1
...,...,...
49995,thought movie right good job creative original...,1
49996,bad plot bad dialogue bad acting idiotic direc...,0
49997,catholic taught parochial elementary schools n...,0
49998,going disagree previous comment side maltin on...,0


In [165]:
# perfor stemming

from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

def stemming(text):
    ans=[]
    for word in text.split():
        ans.append(ps.stem(word))
    return ' '.join(ans)

In [166]:
df['review'] = df['review'].apply(stemming)
df

Unnamed: 0,review,sentiment
0,one review mention watch 1 oz episod hook righ...,1
1,wonder littl product film techniqu unassum old...,1
2,thought wonder way spend time hot summer weeke...,1
3,basic famili littl boy jake think zombi closet...,0
4,petter mattei love time money visual stun film...,1
...,...,...
49995,thought movi right good job creativ origin fir...,1
49996,bad plot bad dialogu bad act idiot direct anno...,0
49997,cathol taught parochi elementari school nun ta...,0
49998,go disagre previou comment side maltin one sec...,0


# BAG OF WORDS

In [167]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000)

In [168]:

X = cv.fit_transform(df['review']).toarray()
X.shape

(50000, 5000)

In [169]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [170]:
y = df.iloc[:, -1].values
y

array([1, 1, 1, ..., 0, 0, 0])

In [171]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [172]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

m1 = GaussianNB()
m2 = MultinomialNB()
m3 = BernoulliNB()

In [173]:
m1.fit(x_train, y_train)
m2.fit(x_train, y_train)
m3.fit(x_train, y_train)

In [174]:
y_pred1 = m1.predict(x_test)
y_pred2 = m2.predict(x_test)
y_pred3 = m3.predict(x_test)

In [175]:
from sklearn.metrics import accuracy_score
def show(model, a_score):
    print(f'{model} achieved: {a_score} accuracy')

show('Gausian',accuracy_score(y_test, y_pred1))
show('Multinomial',accuracy_score(y_test, y_pred2))
show('Bernoulli',accuracy_score(y_test, y_pred3))

Gausian achieved: 0.7105 accuracy
Multinomial achieved: 0.8392 accuracy
Bernoulli achieved: 0.8388 accuracy


In [188]:
import numpy as np
def show_counts(y1, y2):
    u1,c1 = np.unique(y1, return_counts=True)
    u2,c2 = np.unique(y2, return_counts=True)
    print(f'unique: {u1}, counts: {c1}')
    print(f'unique: {u2}, counts: {c2}')
show_counts(y_test, y_pred1)
show_counts(y_test, y_pred2)
show_counts(y_test, y_pred3)

unique: [0 1], counts: [4967 5033]
unique: [0 1], counts: [6412 3588]
unique: [0 1], counts: [4967 5033]
unique: [0 1], counts: [5057 4943]
unique: [0 1], counts: [4967 5033]
unique: [0 1], counts: [5109 4891]
