In [38]:
import nltk
nltk.download('wordnet')
import numpy as np 
from sklearn.utils import shuffle

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/yuanjingma/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [3]:
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup

In [4]:
wordnet_lemmatizer = WordNetLemmatizer()

In [12]:
stopwords = set(w.rstrip() for w in open('stopwords.txt'))
# stopwords can also be imported using:
# from nltk.corpus import stopwords
# stopwords.words('english')

In [14]:
positive_reviews = BeautifulSoup(open('electronics/positive.review').read())
positive_reviews = positive_reviews.findAll('review_text')
negative_reviews = BeautifulSoup(open('electronics/negative.review').read())
negative_reviews = negative_reviews.findAll('review_text')

In [33]:
def my_tokenizer(s):
    s = s.lower()
    tokens = nltk.tokenize.word_tokenize(s)
    tokens = [t for t in tokens if len(t) > 2]
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens]
    tokens = [t for t in tokens if t not in stopwords]
    return tokens

In [46]:
word_index_map = {}
current_index = 0 
positive_tokenized = []
negative_tokenized = []
orig_reviews = []

for review in positive_reviews:
    orig_reviews.append(review.text)
    tokens = my_tokenizer(review.text)
    positive_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1 

for review in negative_reviews:
    orig_reviews.append(review.text)
    tokens = my_tokenizer(review.text)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1 

In [48]:
def tokens_to_vector(tokens, label):
    x = np.zeros(len(word_index_map) + 1)
    for t in tokens:
        i = word_index_map[t]
        x[i] += 1
    x = x / x.sum()
    x[-1] = label
    return x 

In [49]:
N = len(positive_tokenized) + len(negative_tokenized)
data = np.zeros((N, len(word_index_map) + 1))
i = 0 
for tokens in positive_tokenized:
    xy = tokens_to_vector(tokens, 1)
    data[i,:] = xy
    i += 1 

for tokens in negative_tokenized:
    xy = tokens_to_vector(tokens, 0)
    data[i,:] = xy
    i += 1 

In [50]:
orig_reviews, data = shuffle(orig_reviews, data)
X = data[:, :-1]
Y = data[:, -1]
Xtrain = X[:-100,]
Ytrain = Y[:-100,]
Xtest = X[-100:,]
Ytest = Y[-100:,]

In [51]:
model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print("Training accuracy:", model.score(Xtrain, Ytrain))
print("Test accuracy:", model.score(Xtest, Ytest))



Training accuracy: 0.7768421052631579
Test accuracy: 0.76


In [59]:
model.predict_proba(X)[:,1]

array([0.46706299, 0.52060852, 0.46479846, ..., 0.50817971, 0.50764533,
       0.45673638])

In [56]:
threshold = 0.5
for word, index in word_index_map.items():
    weight = model.coef_[0][index]
    if weight > threshold or weight < -threshold:
        print(word, weight)

unit -0.7009634319344006
bad -0.7394378351762326
cable 0.640379205425057
time -0.6904883886435361
've 0.8114146440667723
month -0.7248614931308413
sound 1.0970061759041818
lot 0.6801008901661839
you 1.1444423021289543
n't -2.056091674691703
easy 1.780069780294795
quality 1.3713289436818674
item -0.9353186494278319
wa -1.5829966057514344
perfect 0.9689734740488272
fast 0.9116200518262833
ha 0.664385006237124
price 2.7131527736259486
value 0.5367314976041344
money -1.1320823299411364
memory 0.8851087675186428
picture 0.5385029950893537
buy -0.8260324080546809
bit 0.6546273691490053
happy 0.5251639773738379
pretty 0.6970089086751402
doe -1.2734694158180864
highly 0.995255153811027
recommend 0.6962689140281912
customer -0.6411876138039375
support -0.8090414019027581
little 0.9554206419078429
returned -0.8029726060964223
excellent 1.4167902236991112
love 1.206844219157245
home 0.5253681502417883
week -0.7518863624046811
size 0.5306549051072578
using 0.5979041919218326
video 0.57021729268325

In [60]:
preds = model.predict(X)
P = model.predict_proba(X)[:,1]

minP_whenYis1 = 1
maxP_whenYis0 = 0
wrong_positive_review = None
wrong_negative_review = None
wrong_positive_prediction = None
wrong_negative_prediction = None

for i in range(N):
    p = P[i]
    y = Y[i]
    if y == 1 and p < 0.5:
        if p < minP_whenYis1:
            wrong_positive_review = orig_reviews[i]
            wrong_positive_prediction = preds[i]
            minP_whenYis1 = p
    elif y == 0 and p > 0.5:
        if p > maxP_whenYis0:
            wrong_negative_review = orig_reviews[i]
            wrong_negative_prediction = preds[i]
            maxP_whenYis0 = p


print("Most wrong positive review (prob = %s, pred = %s):" % (minP_whenYis1, wrong_positive_prediction))
print(wrong_positive_review)
print("Most wrong negative review (prob = %s, pred = %s):" % (maxP_whenYis0, wrong_negative_prediction))
print(wrong_negative_review)


Most wrong positive review (prob = 0.3359175281350011, pred = 0.0):

A device like this either works or it doesn't.  This one happens to work

Most wrong negative review (prob = 0.6061815857718108, pred = 1.0):

The Voice recorder meets all my expectations and more
Easy to use, easy to transfer great results

