In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split as tts
from  sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score
import pickle

In [2]:
data = pd.read_csv('IMDB-Dataset.csv')
print(data.shape)
data.head()

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [4]:
data['sentiment'].unique()

array(['positive', 'negative'], dtype=object)

In [5]:
data['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [6]:
data.sentiment.replace('positive',1, inplace=True)
data.sentiment.replace('negative', 0, inplace = True)
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [7]:
data.review[0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [8]:
def clean(text):                                               ## removed Tags
    cleaned = re.compile(r'<.*?>')
    return re.sub(cleaned, '', text)

data.review = data.review.apply(clean)
data.review[0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.I would say the main appeal of the show is due to the fact that it goes where other shows wo

In [9]:
def is_special(text):
    rem=""
    for i in text:
        if i.isalnum():
            rem=rem+i
        else:
            rem=rem+' '
    return rem
data.review=data.review.apply(is_special)
data.review[1]

'A wonderful little production  The filming technique is very unassuming  very old time BBC fashion and gives a comforting  and sometimes discomforting  sense of realism to the entire piece  The actors are extremely well chosen  Michael Sheen not only  has got all the polari  but he has all the voices down pat too  You can truly see the seamless editing guided by the references to Williams  diary entries  not only is it well worth the watching but it is a terrificly written and performed piece  A masterful production about one of the great master s of comedy and his life  The realism really comes home with the little things  the fantasy of the guard which  rather than use the traditional  dream  techniques remains solid then disappears  It plays on our knowledge and our senses  particularly with the scenes concerning Orton and Halliwell and the sets  particularly of their flat with Halliwell s murals decorating every surface  are terribly well done '

In [10]:
def to_lower(text):
    return text.lower()

data.review=data.review.apply(to_lower)
data.review[1]

'a wonderful little production  the filming technique is very unassuming  very old time bbc fashion and gives a comforting  and sometimes discomforting  sense of realism to the entire piece  the actors are extremely well chosen  michael sheen not only  has got all the polari  but he has all the voices down pat too  you can truly see the seamless editing guided by the references to williams  diary entries  not only is it well worth the watching but it is a terrificly written and performed piece  a masterful production about one of the great master s of comedy and his life  the realism really comes home with the little things  the fantasy of the guard which  rather than use the traditional  dream  techniques remains solid then disappears  it plays on our knowledge and our senses  particularly with the scenes concerning orton and halliwell and the sets  particularly of their flat with halliwell s murals decorating every surface  are terribly well done '

In [11]:
import nltk
nltk.download('stopwords')

def rem_stopwords(text):
    stop_words=set(stopwords.words('english'))
    words=word_tokenize(str(text))
    return [w for w in words if w not in stop_words]

data.review=data.review.apply(rem_stopwords)
data.review[1]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['wonderful',
 'little',
 'production',
 'filming',
 'technique',
 'unassuming',
 'old',
 'time',
 'bbc',
 'fashion',
 'gives',
 'comforting',
 'sometimes',
 'discomforting',
 'sense',
 'realism',
 'entire',
 'piece',
 'actors',
 'extremely',
 'well',
 'chosen',
 'michael',
 'sheen',
 'got',
 'polari',
 'voices',
 'pat',
 'truly',
 'see',
 'seamless',
 'editing',
 'guided',
 'references',
 'williams',
 'diary',
 'entries',
 'well',
 'worth',
 'watching',
 'terrificly',
 'written',
 'performed',
 'piece',
 'masterful',
 'production',
 'one',
 'great',
 'master',
 'comedy',
 'life',
 'realism',
 'really',
 'comes',
 'home',
 'little',
 'things',
 'fantasy',
 'guard',
 'rather',
 'use',
 'traditional',
 'dream',
 'techniques',
 'remains',
 'solid',
 'disappears',
 'plays',
 'knowledge',
 'senses',
 'particularly',
 'scenes',
 'concerning',
 'orton',
 'halliwell',
 'sets',
 'particularly',
 'flat',
 'halliwell',
 'murals',
 'decorating',
 'every',
 'surface',
 'terribly',
 'well',
 'done']

In [12]:
def stem_txt(text):
    ss = SnowballStemmer('english')
    return " ".join([ss.stem(w) for w in text])

data.review=data.review.apply(stem_txt)
data.review[1]

'wonder littl product film techniqu unassum old time bbc fashion give comfort sometim discomfort sens realism entir piec actor extrem well chosen michael sheen got polari voic pat truli see seamless edit guid refer william diari entri well worth watch terrif written perform piec master product one great master comedi life realism realli come home littl thing fantasi guard rather use tradit dream techniqu remain solid disappear play knowledg sens particular scene concern orton halliwel set particular flat halliwel mural decor everi surfac terribl well done'

In [13]:
x = np.array(data.review.values)        # also written as --> x = np.array(data.iloc[:,0].values) 
y = np.array(data.sentiment.values)

In [14]:
cv = CountVectorizer(max_features = 1000)
x = cv.fit_transform(data.review).toarray()
print(x.shape)
print(y.shape)

(50000, 1000)
(50000,)


In [15]:
print(x)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [16]:
x_train,x_test,y_train,y_test = tts(x,y, test_size=0.2, random_state = 0)

In [17]:
gnb, mnb, bnb = GaussianNB(), MultinomialNB(alpha = 1.0, fit_prior = True), BernoulliNB(alpha = 1.0, fit_prior = True)
gnb.fit(x_train, y_train)
mnb.fit(x_train, y_train)
bnb.fit(x_train, y_train)

In [18]:
ypg = gnb.predict(x_test)
ypm = mnb.predict(x_test)
ypb = bnb.predict(x_test)

In [19]:
print(ypg)
print(ypm)
print(ypb)

[1 0 1 ... 1 0 0]
[1 0 1 ... 1 0 0]
[1 0 1 ... 1 0 0]


In [20]:
print("Accuracy of GaussianNB :",accuracy_score(y_test,ypg))
print("Accuracy of MultinomialNB :",accuracy_score(y_test,ypm))
print("Accuracy of BernoulliNB:",accuracy_score(y_test,ypb))

Accuracy of GaussianNB : 0.7869
Accuracy of MultinomialNB : 0.8259
Accuracy of BernoulliNB: 0.8312


In [27]:
rev = """You are witnessing the most accurate description of this movie . 
One day R.chandru woke up and realised he has sh** ton of money and was wondering how to spend it . 
While having breakfast he watched kgf and then during lunch peaky blinders( a netflix series ) and finally during dinner he watched mirzapur( prime series) . While trying to sleep a great idea flashed into his head . My man decided to make a pan India movie named Kabzaa which was a mixture of all the above mentioned movies and series. 
Little did he know about the skill , art and technicalities which would be required to produce great pan Indian movies . 
But then what can you not aches with so much cash . R.Chandru got in the big names ( upendra , sudeep, shivanna etc ) 
And directed the most shi**iest movie I have ever watched . 


This movie emphasises the fact that money can’t buy everything and hardwork is a key element. 
Even the story is pure sh**. Only the actors did a decent job in acting . 
And also The love story , their shi**y romantic dialogues always managed to get me real annoyed ever single time . No proper context, scenes dumped in here and there randomly, Pathetic writing , poor bgm etc . 

I can confirm that releasing this movie pan Indian has brought down the standards of our rich Regional  language movies . 
A pure disgrace and unworthy movie to be precise . You could rather sleep and spend your time much better .
84 people found this helpful.
"""

In [28]:
f1 = clean(rev)
f2 = is_special(f1)
f3 = to_lower(f2)
f4 = rem_stopwords(f3)
f5 = stem_txt(f4)

In [29]:
bow, words =[],word_tokenize(f5)
for word in words:
    bow.append(words.count(word))
    
word_dict = cv.vocabulary_
pickle.dump(word_dict, open('bow.pkl','wb'))

In [30]:
inp = []
for i in word_dict:
    inp.append(f5.count(i[0]))

y_pred = bnb.predict(np.array(inp).reshape(1,1000))
print('bernoulliNB prediction is =', y_pred)

bernoulliNB prediction is = [0]


In [31]:
y_pred = gnb.predict(np.array(inp).reshape(1,1000))
print('GaussianNB prediction is =', y_pred)

GaussianNB prediction is = [1]


In [32]:
y_pred = mnb.predict(np.array(inp).reshape(1,1000))
print('MultinomialNB prediction is =', y_pred)

MultinomialNB prediction is = [1]
