In [1]:
import numpy as np
import pandas as pd

#Datasets from "https://www.kaggle.com/c/naive-bayes-imdb/data"

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')


In [2]:
train_data.head()

Unnamed: 0.1,Unnamed: 0,review,label
0,0,I think they really let the quality of the DVD...,0
1,1,I'm sorry but this is just awful. I have told ...,0
2,2,"The Japenese sense of pacing, editing and musi...",0
3,3,"In the '60's/'70's, David Jason was renowned f...",1
4,4,"""Hail The Woman"" is one of the most moving fil...",1


In [3]:
test_data.head()

Unnamed: 0.1,Unnamed: 0,review
0,0,The make -or-break of a love story for me is w...
1,1,"""Bend It Like Beckham"" is a film that got very..."
2,2,"Pete's Meteor. I seen this referred to as ""aut..."
3,3,Funny that I find myself forced to review this...
4,4,Bare Wench is another softcore parody of the B...


In [4]:
from nltk.corpus import stopwords

StopWords = set(stopwords.words('english'))
    
print(StopWords)

{'an', 'herself', "you'd", 'on', 'very', 'about', 'is', 'does', 'has', 'she', 'not', 'only', 'hasn', "aren't", 'ma', "mightn't", 'do', 'doing', 'd', 'because', 'll', 'at', 'they', "hasn't", 'couldn', 't', 'its', 'yourself', "didn't", "it's", 'hadn', 'with', 'them', 'by', 'all', 'each', 'for', 'having', 'over', 'the', 'any', 'o', "should've", 'your', 'was', 'me', 'should', 'had', 'that', 'between', 'mustn', "won't", 'himself', 'haven', "haven't", 'against', 'when', 'out', 'from', 'no', 'why', 'too', "weren't", 'off', 're', 'again', 'yourselves', 'been', 'a', 'will', 'my', 'were', 'isn', 'myself', 'nor', 'this', 'than', 'm', 'it', 'if', 'what', 'itself', 'as', "shouldn't", 'which', 'shan', 'he', 'his', "wasn't", 'after', 'ours', 'their', 'did', 'such', 'so', "you're", 'few', 'there', 'wouldn', 'up', 'shouldn', 'whom', 'before', 'most', 'our', "wouldn't", 'aren', 'while', 'further', 'themselves', 'below', 'where', 'i', 'or', 'am', 'down', 'during', "isn't", 'how', "you'll", 'these', 'here

In [5]:
import string
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer   # Identifying semantically similar words 

sno = SnowballStemmer(language = 'english')

for i in range(len(train_data)):
    sent = train_data.loc[i,'review']
    l = [char for char in sent if char not in string.punctuation]  # Removing punctuation marks
    l = ''.join(l)
    p = word_tokenize(l)
    x = [sno.stem(word.lower()) for word in p if word.lower() not in StopWords]  # Excluding the Stopwords
    x = ' '.join(x)
    train_data.loc[i,'review'] = x
    
for i in range(len(test_data)):
    sent = test_data.loc[i,'review']
    l = [char for char in sent if char not in string.punctuation]
    l = ''.join(l)
    p = word_tokenize(l)
    x = [sno.stem(word.lower()) for word in p if word.lower() not in StopWords]
    x = ' '.join(x)
    test_data.loc[i,'review'] = x    
    

In [6]:
train_data.head()

Unnamed: 0.1,Unnamed: 0,review,label
0,0,think realli let qualiti dvd product get away ...,0
1,1,im sorri aw told peopl film bad act almost don...,0
2,2,japenes sens pace edit music score must differ...,0
3,3,60s70s david jason renown mani support role te...,1
4,4,hail woman one move film ever seen entir life ...,1


In [7]:
test_data.head()

Unnamed: 0.1,Unnamed: 0,review
0,0,make orbreak love stori whether like charact a...
1,1,bend like beckham film got littl exposur unit ...
2,2,pete meteor seen refer authent gem caught movi...
3,3,funni find forc review movi ambr br review rec...
4,4,bare wench anoth softcor parodi blair witch pr...


In [8]:
# Converting textual data into matrix form
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectorizer.fit(train_data['review'].values)
vectorized_train_data = vectorizer.transform(train_data['review'])

In [9]:
vectorized_train_data.shape

(40000, 125190)

In [10]:
vectorized_test_data = vectorizer.transform(test_data['review'])

In [11]:
vectorized_test_data.shape

(10000, 125190)

In [12]:
from sklearn.naive_bayes import BernoulliNB

In [13]:
clf = BernoulliNB() # Classifier
model = clf.fit(vectorized_train_data,train_data['label'])
result = model.predict(vectorized_test_data)

In [14]:
type(result)

numpy.ndarray

In [15]:
print(result)

[0 1 0 ... 1 1 1]


In [16]:
#Accuracy of model
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
X_train,X_test,Y_train,Y_test = train_test_split(vectorized_train_data,train_data['label'],train_size = 0.67)
model.fit(X_train,Y_train)
result2 = model.predict(X_test)

result2 = pd.DataFrame(confusion_matrix(result2,Y_test),columns = ['Pred0','Pred1'],index = ['Actual0','Actual1'])
result2

Unnamed: 0,Pred0,Pred1
Actual0,5810,1099
Actual1,809,5482


In [17]:
final_predictions = pd.DataFrame({"Id" : [i for i in range(0,len(test_data))],"Predicted" : result})
final_predictions.to_csv(r'final_predictions.csv',index = False)