In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the labelled training set; the preview below shows a `review` text
# column and a `label` column.
df = pd.read_csv('Train.csv')
# Plain array view of the frame: later cells read column 0 as the review text
# and column 1 as the label.
data = df.values

In [3]:
# Preview the first rows to check the column layout.
df.head()

Unnamed: 0,review,label
0,mature intelligent and highly charged melodram...,pos
1,http://video.google.com/videoplay?docid=211772...,pos
2,Title: Opera (1987) Director: Dario Argento Ca...,pos
3,I think a lot of people just wrote this off as...,pos
4,This is a story of two dogs and a cat looking ...,pos


In [4]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [25]:
# Shared text-processing objects used by getStemmedReview.
# A token is a run of ASCII letters, hyphens and apostrophes, so forms such as
# "co-work" and "i'm" stay in one piece; digits and other punctuation split tokens.
tokenizer = RegexpTokenizer("[A-Za-z-']+")
# Set for O(1) membership tests when filtering tokens.
# NOTE(review): needs the NLTK "stopwords" corpus to be available locally.
en_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

In [26]:
from bs4 import BeautifulSoup
import regex as re

In [27]:
# Matches http(s) URLs and bare "www." hosts. Compiled once here instead of on
# every call (the function runs once per review).
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')


def getStemmedReview(review) :
    """Normalise one raw review into a space-separated string of stemmed tokens.

    Pipeline: lower-case -> strip HTML markup -> drop URLs -> tokenize with the
    module-level ``tokenizer`` -> remove English stopwords (``en_stopwords``)
    -> stem each remaining token with the Porter stemmer ``ps``.

    Parameters
    ----------
    review : str
        Raw review text, possibly containing HTML tags and URLs. ``None`` and
        NaN (how pandas represents an empty CSV cell) are treated as an empty
        review instead of raising on ``.lower()``.

    Returns
    -------
    str
        The cleaned review; ``''`` when the input is missing or nothing
        survives the filtering.
    """
    # Missing value guard: NaN is the only float that is not equal to itself.
    if review is None or (isinstance(review, float) and review != review) :
        return ''
    review = str(review).lower()
    # Strip markup BEFORE URLs. The URL pattern consumes every non-space
    # character, so running it on raw HTML can swallow a tag (or the word
    # after it) that is glued to a URL, and mangles URLs inside attributes
    # such as href="..." before the parser ever sees the tag.
    soup = BeautifulSoup(review, 'html.parser')
    review = soup.get_text(separator = ' ')
    review = _URL_PATTERN.sub(' ', review)
    tokens = tokenizer.tokenize(review)
    # Filter stopwords and stem in a single pass.
    stemmed_tokens = [ps.stem(token) for token in tokens if token not in en_stopwords]
    return ' '.join(stemmed_tokens)

In [28]:
# Raw text of the second training review, used as a cleaning example.
data[1, 0]

'http://video.google.com/videoplay?docid=211772166650071408&hl=en Distribution was tried.<br /><br />We opted for mass appeal.<br /><br />We want the best possible viewing range so, we forgo profit and continue our manual labor jobs gladly to entertain you for working yours.<br /><br />View Texas tale, please write about it... If you like it or not, if you like Alex or not, if you like Stuie, Texas or Texas tale... Just write about it.<br /><br />Your opinion rules.'

In [29]:
# Same review after cleaning, to eyeball the effect of the pipeline.
getStemmedReview(data[1, 0])

'distribut tri opt mass appeal want best possibl view rang forgo profit continu manual labor job gladli entertain work view texa tale pleas write like like alex like stuie texa texa tale write opinion rule'

In [30]:
def getStemmedDocument(document):
    """Clean every review in ``document`` with ``getStemmedReview``.

    Parameters
    ----------
    document : iterable
        Raw reviews, processed in iteration order.

    Returns
    -------
    list
        One cleaned string per input review, in the same order.
    """
    return [getStemmedReview(raw_text) for raw_text in document]

In [31]:
# Clean every training review (column 0 of `data`); one cleaned string per row.
cleaned_document = getStemmedDocument(data[:, 0])

In [32]:
# Spot-check the first cleaned review.
cleaned_document[0]

"matur intellig highli charg melodrama unbelivebl film china wei wei' stun perform catylast love triangl simpli stun oppurun see magnific film take"

In [33]:
# Spot-check the second cleaned review (the URL/HTML example shown earlier).
cleaned_document[1]

'distribut tri opt mass appeal want best possibl view rang forgo profit continu manual labor job gladli entertain work view texa tale pleas write like like alex like stuie texa texa tale write opinion rule'

In [34]:
# Spot-check a longer cleaned review.
cleaned_document[2]

"titl opera director dario argento cast cristina masillach ian charleson urbano barberini daria nicolodi review argento movi seen suspiria one blew away style color spooki stori line next decid go opera told one best man think i'm discov ultim one favorit horror director opera young opera singer get big break main star creepi modern opera take mc beth get hit car betti understudi get part bad there' psycho make watch brutal murder friend co-work wow id heard good thing flick prepar level great film would take yeah movi shortcom ill get later part movi blew away first movi fill lot color suspiria expect bit like suspiria depart surpris look feel film somehow devoid color lot color certain scene like master kitchen live room sequenc argento fill screen lush green blue part film grayish black tone like distinct look real star show incred well orchestr death sequenc wow everi death scene like work art beauti destruct typic hack slash death sequenc death care construct shock get situat love

In [36]:
# Raw string labels, taken from column 1 of the training array.
y = data[:, 1]

In [38]:
# Encode the string labels as integers: 'pos' -> 1, anything else -> 0.
Y = [1 if label == 'pos' else 0 for label in y]

In [39]:
# Check the first few encoded labels.
Y[:10]

[1, 1, 1, 1, 1, 1, 0, 0, 1, 1]

In [40]:
 # Feature input for the vectorizer: the cleaned training reviews.
 # NOTE(review): this line has a stray leading space; it only runs because
 # IPython strips leading indentation from a cell. It would be an
 # IndentationError in a plain .py file.
 X = cleaned_document

In [41]:
from sklearn.feature_extraction.text import CountVectorizer

In [74]:
# Bag-of-words features over unigrams and bigrams.
cv = CountVectorizer(ngram_range=(1, 2))

# Learn the vocabulary from the training reviews and build the count matrix.
x_vec = cv.fit_transform(X)
# Densify a single row only, to peek at it; `.toarray()` is not called on the whole matrix.
print(x_vec[3, :].toarray())
# (number of reviews, vocabulary size)
print(x_vec.shape)

[[0 0 0 ... 0 0 0]]
(40000, 2286831)


In [75]:
# Load the test set that predictions are generated for.
dftest = pd.read_csv('Test.csv')

In [76]:
# First column of the test frame.
# NOTE(review): presumably the review text, mirroring Train.csv; confirm against the file.
x_test_raw = dftest.values[:, 0]

In [77]:
# Apply the same cleaning to the test reviews as was applied to the training reviews.
x_test = getStemmedDocument(x_test_raw)

In [78]:
# Preview a handful of (n-gram -> column index) entries instead of echoing the
# whole mapping: with ~2.3M n-grams the full repr floods the notebook output.
dict(list(cv.vocabulary_.items())[:20])

{'matur': 1243991,
 'intellig': 1021167,
 'highli': 931979,
 'charg': 322128,
 'melodrama': 1259189,
 'unbelivebl': 2111935,
 'film': 730217,
 'china': 336168,
 'wei': 2206116,
 'stun': 1938369,
 'perform': 1478804,
 'catylast': 302521,
 'love': 1189553,
 'triangl': 2084540,
 'simpli': 1823513,
 'oppurun': 1423336,
 'see': 1756141,
 'magnific': 1208516,
 'take': 1979343,
 'matur intellig': 1244097,
 'intellig highli': 1021459,
 'highli charg': 932030,
 'charg melodrama': 322329,
 'melodrama unbelivebl': 1259426,
 'unbelivebl film': 2111936,
 'film china': 731388,
 'china wei': 336389,
 'wei wei': 2206129,
 'wei stun': 2206127,
 'stun perform': 1938656,
 'perform catylast': 1479192,
 'catylast love': 302522,
 'love triangl': 1192850,
 'triangl simpli': 2084628,
 'simpli stun': 1824528,
 'stun oppurun': 1938642,
 'oppurun see': 1423337,
 'see magnific': 1758727,
 'magnific film': 1208622,
 'film take': 736929,
 'distribut': 546635,
 'tri': 2082388,
 'opt': 1423380,
 'mass': 1237065,
 'ap

In [79]:
# Vectorize the test reviews with the vocabulary fitted on the training set
# (transform only, no refit), so columns line up with x_vec.
x_test_vec = cv.transform(x_test)

In [80]:
# The column count should match the training matrix.
print(x_test_vec.shape)

(10000, 2286831)


## Multinomial Naive Bayes

In [81]:
from sklearn.naive_bayes import MultinomialNB

In [82]:
# Multinomial Naive Bayes with scikit-learn's default hyper-parameters.
mnb = MultinomialNB()

In [83]:
# Train on the n-gram count matrix and the 0/1 labels.
mnb.fit(x_vec, Y)

MultinomialNB()

In [84]:
# Predicted 0/1 class for each test review.
y_t = mnb.predict(x_test_vec)

In [90]:
# Build [row index, label string] pairs for the submission file:
# prediction 1 -> 'pos', anything else -> 'neg'.
y_test = [
    [row_id, 'pos' if prediction == 1 else 'neg']
    for row_id, prediction in enumerate(y_t)
]

In [91]:
# Rebind y_test from a list of pairs to a 2-D array.
# NOTE(review): because each pair mixes an int and a str, NumPy stores every
# element as a string; the CSV text written later is unaffected.
y_test = np.array(y_test)

In [92]:
# Two-column frame in the submission layout: Id, label.
dfy = pd.DataFrame(y_test, columns = ['Id', 'label'])

In [93]:
# Write the multinomial-model predictions; the frame index is omitted because
# 'Id' already carries the row number.
dfy.to_csv('answer.csv', index = False)

In [94]:
# Per-class probabilities for the test reviews, one row per review.
mnb.predict_proba(x_test_vec)

array([[1.00000000e+00, 3.38772631e-14],
       [9.98168972e-01, 1.83102796e-03],
       [1.00000000e+00, 2.57373093e-35],
       ...,
       [7.24657094e-27, 1.00000000e+00],
       [2.65645081e-13, 1.00000000e+00],
       [1.00000000e+00, 4.64574062e-43]])

In [95]:
# Show the hard 0/1 predictions from the multinomial model.
y_t

array([0, 0, 0, ..., 1, 1, 0])

## Multivariate Bernoulli Event Naive Bayes

In [96]:
from sklearn.naive_bayes import BernoulliNB

In [97]:
# Bernoulli Naive Bayes with scikit-learn's default hyper-parameters.
bnb = BernoulliNB()

In [98]:
# Train on the same count matrix and labels as the multinomial model.
bnb.fit(x_vec, Y)

BernoulliNB()

In [99]:
# Per-class probabilities from the Bernoulli model, for comparison with the
# multinomial output above.
bnb.predict_proba(x_test_vec)

array([[1.00000000e+00, 5.48236453e-13],
       [9.99617416e-01, 3.82583541e-04],
       [1.00000000e+00, 6.14521393e-34],
       ...,
       [1.54521240e-20, 1.00000000e+00],
       [1.66735203e-10, 1.00000000e+00],
       [1.00000000e+00, 1.04841127e-36]])

In [101]:
# Predicted 0/1 class for each test review from the Bernoulli model.
y_bern = bnb.predict(x_test_vec)

In [102]:
# Build [row index, label string] pairs from the Bernoulli predictions:
# prediction 1 -> 'pos', anything else -> 'neg'.
y_test_bern = [
    [row_id, 'pos' if prediction == 1 else 'neg']
    for row_id, prediction in enumerate(y_bern)
]

In [103]:
# Mean accuracy of the multinomial model on the data it was trained on.
# NOTE(review): this sits in the Bernoulli section but scores `mnb`, not `bnb`;
# confirm which model was intended. Either way, accuracy on the training set
# is not an estimate of accuracy on unseen reviews.
mnb.score(x_vec, Y)

0.99485

In [104]:
# Write the Bernoulli-model predictions in the same Id/label layout as answer.csv.
dfnew = pd.DataFrame(y_test_bern, columns = ['Id', 'label'])
dfnew.to_csv('answer2.csv', index = False)