In [1]:
import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression

In [2]:
from bs4 import BeautifulSoup

In [3]:
# Single shared lemmatizer instance, reused by my_tokenizer for every token.
wordnet_lemma = WordNetLemmatizer()

In [4]:
# Load the stop-word list: one entry per line, trailing whitespace/newline stripped.
# The context manager guarantees the file handle is closed once the set is built.
with open('stopwords.txt') as stopwords_file:
    stopwords = set(line.rstrip() for line in stopwords_file)

In [5]:
# Parse the positive reviews and keep only the <review_text> elements.
# - `with` closes the file handle deterministically.
# - The parser is named explicitly so the parse does not depend on which
#   optional parsers happen to be installed (and bs4 does not have to guess).
# - find_all is the current spelling of the legacy findAll alias.
with open('positive.review') as positive_file:
    positive_soup = BeautifulSoup(positive_file.read(), 'html.parser')
positive_reviews = positive_soup.find_all('review_text')

In [6]:
# Parse the negative reviews and keep only the <review_text> elements.
# - `with` closes the file handle deterministically.
# - The parser is named explicitly so the parse does not depend on which
#   optional parsers happen to be installed (and bs4 does not have to guess).
# - find_all is the current spelling of the legacy findAll alias.
with open('negative.review') as negative_file:
    negative_soup = BeautifulSoup(negative_file.read(), 'html.parser')
negative_reviews = negative_soup.find_all('review_text')

In [7]:
def my_tokenizer(s):
    """Turn a raw review string into a list of cleaned tokens.

    Steps, in order: lowercase the text, split it with NLTK's word tokenizer,
    drop tokens of two characters or fewer, lemmatize what remains with the
    shared ``wordnet_lemma`` instance, and finally remove any lemma found in
    ``stopwords``.

    Note that the length filter runs *before* lemmatization, so a lemma that
    comes out shorter than three characters is still kept.
    """
    words = nltk.tokenize.word_tokenize(s.lower())
    lemmas = (wordnet_lemma.lemmatize(word) for word in words if len(word) > 2)
    return [lemma for lemma in lemmas if lemma not in stopwords]

In [8]:
# Vocabulary bookkeeping: current_index is the next free column index,
# word_index_map maps each token to the column it was assigned.
current_index = 0
word_index_map = dict()

In [9]:
# Per-class containers holding one token list per review.
positive_tokenized, negative_tokenized = [], []

In [10]:
# Tokenize every positive review and grow the shared vocabulary.
# The list is rebuilt from scratch rather than appended to, so re-running this
# cell cannot duplicate reviews in positive_tokenized.
positive_tokenized = [my_tokenizer(review.text) for review in positive_reviews]
for tokens in positive_tokenized:
    for token in tokens:
        # Only unseen tokens receive a new column index, in first-seen order.
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

In [11]:
# Tokenize every negative review and grow the shared vocabulary.
# The list is rebuilt from scratch rather than appended to, so re-running this
# cell cannot duplicate reviews in negative_tokenized.
negative_tokenized = [my_tokenizer(review.text) for review in negative_reviews]
for tokens in negative_tokenized:
    for token in tokens:
        # Only unseen tokens receive a new column index, in first-seen order.
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

In [12]:
def tokens_to_vector(tokens, label, index_map=None):
    """Convert one tokenized review into a normalized bag-of-words row plus label.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single review. Every token must be a key of the index map.
    label : int or float
        Class label, stored in the last slot of the returned vector.
    index_map : dict, optional
        Mapping ``token -> column index``. When omitted, the notebook-level
        ``word_index_map`` is used, which keeps existing two-argument calls
        working unchanged.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``len(index_map) + 1``: the relative
        frequency of each token followed by ``label``. A review with no
        tokens yields all-zero features instead of NaNs.

    Raises
    ------
    KeyError
        If a token is not present in the index map.
    """
    if index_map is None:
        index_map = word_index_map
    # One extra slot at the end is reserved for the label.
    x = np.zeros(len(index_map) + 1)
    for t in tokens:
        x[index_map[t]] += 1
    # The label slot is still 0 here, so the sum equals the token count.
    total = x.sum()
    # Guard against 0/0: an empty review would otherwise turn the row into NaNs.
    if total > 0:
        x = x / total
    x[-1] = label
    return x

In [13]:
# One row per review; one column per vocabulary word plus a trailing label column.
N = len(positive_tokenized) + len(negative_tokenized)
n_columns = len(word_index_map) + 1
data = np.zeros((N, n_columns))

In [14]:
# Write the positive reviews (label 1) into the first rows of `data`.
i = 0
for i, tokens in enumerate(positive_tokenized):
    xy = tokens_to_vector(tokens, 1)
    data[i, :] = xy
# Leave `i` pointing at the first unused row, as the following cell expects.
i = len(positive_tokenized)

In [15]:
# Write the negative reviews (label 0) into the rows after the positive block.
# The starting row is derived from len(positive_tokenized) instead of relying
# on a counter left behind by an earlier cell, so this cell can be re-run
# without writing past the end of `data`.
for i, tokens in enumerate(negative_tokenized, start=len(positive_tokenized)):
    xy = tokens_to_vector(tokens, 0)
    data[i, :] = xy

In [16]:
# Shuffle the rows in place with a fixed seed so that the train/test split,
# and therefore the reported accuracies, are reproducible after
# Restart Kernel -> Run All. A dedicated RandomState is used so the global
# NumPy random state is left untouched.
SEED = 42
np.random.RandomState(SEED).shuffle(data)
X = data[:, :-1]  # every column except the last: bag-of-words features
Y = data[:, -1]   # last column: class label

In [17]:
# Hold out the last 1000 shuffled rows for testing; everything before them is training data.
n_test = 1000
xtrain, ytrain = X[:-n_test], Y[:-n_test]
xtest, ytest = X[-n_test:], Y[-n_test:]

In [18]:
# Fit a logistic-regression classifier and report its accuracy on the held-out rows.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LogisticRegression(solver='lbfgs').fit(xtrain, ytrain)
lr_accuracy = model.score(xtest, ytest)
print("Accuracy of Logistic Regression:", lr_accuracy)

Accuracy of Logistic Regression: 0.72


In [19]:
# Print every vocabulary word whose learned coefficient has magnitude above the threshold.
threshold = 0.5
coefficients = model.coef_[0]
for word, index in word_index_map.items():
    weight = coefficients[index]
    if abs(weight) > threshold:
        print(word, weight)

purchased -0.724598292900975
unit -1.2979438204282727
bad -3.1365105040974464
cable 0.5757178615496364
minute -1.4389003589411649
time -1.0239358441986353
save -0.6746804566377006
clean 1.5933510078546589
've 1.248369287272526
month -1.6213353343331054
simple 0.8600882054435354
light 0.5391282828761181
sound 0.6221416982771426
lot 1.1362664085002327
you 2.2589885594394112
n't -3.987761925842323
easy 4.877339399254914
quality 1.5257464554627485
item -1.1498044035630024
wa -4.430350691936134
perfect 2.304194558203345
sturdy 0.5139860167385207
collection 0.5271652474464766
flimsy -0.5080798408180156
fast 1.2827448700365423
ha 1.5680456492535475
complaint 0.656127622911944
price 3.454942729663306
value 0.7634991358027196
money -2.2880520597665
memory 0.863874083437682
game 0.5897352793617908
buy -1.3002107039442603
... -1.7294762983290843
thank 0.5687466005167601
review -1.502484750588869
fine 0.5004058380020165
bit 0.8591558294211723
happy 1.0762869378823128
super 0.5234826619182064
prett

In [20]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes classifier with default settings.
model2 = MultinomialNB()

In [21]:
# Train on the same split used for logistic regression; the fitted estimator is the cell's displayed value.
model2.fit(xtrain, ytrain)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [22]:
# Accuracy of the Naive Bayes model on the held-out rows.
nb_accuracy = model2.score(xtest, ytest)
print('Accuracy of MultinomialNB', nb_accuracy)

Accuracy of MultinomialNB 0.79


In [None]:
from sklearn.svm import SVC
# Support-vector classifier with gamma='auto', trained on the same split as the other models.
# The fitted estimator is the cell's displayed value.
svm_options = {'gamma': 'auto'}
clf = SVC(**svm_options)
clf.fit(xtrain, ytrain)

In [None]:
# Accuracy of the SVM on the held-out rows.
svm_accuracy = clf.score(xtest, ytest)
print("Accuracy of SVM", svm_accuracy)