**Importing Necessary Libraries for implementation**

In [1]:
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range

In [2]:
import nltk
import numpy as np
from sklearn.utils import shuffle

In [3]:
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup

In [4]:
wordnet_lemmatizer = WordNetLemmatizer()

**Getting a list of stopwords**

Link for downloading file: http://www.lextek.com/manuals/onix/stopwords1.html 

an alternative source of stopwords
 #from nltk.corpus import stopwords
 #stopwords.words('english')

In [5]:
stopwords = set(w.rstrip() for w in open('stopwords.txt'))

load the reviews

 data courtesy of http://www.cs.jhu.edu/~mdredze/datasets/sentiment/index2.html

In [6]:
positive_reviews = BeautifulSoup(open('positive.review.txt').read(), features="lxml")
positive_reviews = positive_reviews.findAll('review_text')

In [7]:
negative_reviews = BeautifulSoup(open('negative.review.txt').read(), features="lxml")
negative_reviews = negative_reviews.findAll('review_text')

first let's just try to tokenize the text using nltk's tokenizer

let's take the first review for example:

[t = positive_reviews[0]

nltk.tokenize.word_tokenize(t.text)]

notice how it doesn't downcase, so It != it

not only that, but do we really want to include the word "it" anyway?

you can imagine it wouldn't be any more common in a positive review than a 
negative review

so it might only add noise to our model.

so let's create a function that does all this pre-processing for us

In [8]:
def my_tokenizer(s):
    s = s.lower() # downcase
    tokens = nltk.tokenize.word_tokenize(s) # split string into words (tokens)
    tokens = [t for t in tokens if len(t) > 2] # remove short words, they're probably not useful
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] # put words into base form
    tokens = [t for t in tokens if t not in stopwords] # remove stopwords
    return tokens

create a word-to-index map so that we can create our word-frequency vectors later

 let's also save the tokenized versions so we don't have to tokenize again later

In [9]:
word_index_map = {}
current_index = 0
positive_tokenized = []
negative_tokenized = []
orig_reviews = []

**Downloading the necessary libraries from nltk**

In [10]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [13]:
for review in positive_reviews:
    orig_reviews.append(review.text)
    tokens = my_tokenizer(review.text)
    positive_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

In [14]:
for review in negative_reviews:
    orig_reviews.append(review.text)
    tokens = my_tokenizer(review.text)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

In [15]:
print("len(word_index_map):", len(word_index_map))

len(word_index_map): 10948


now let's create our input matrices

In [16]:
def tokens_to_vector(tokens, label):
    x = np.zeros(len(word_index_map) + 1) # last element is for the label
    for t in tokens:
        i = word_index_map[t]
        x[i] += 1
    x = x / x.sum() # normalize it before setting label
    x[-1] = label
    return x

In [17]:
N = len(positive_tokenized) + len(negative_tokenized)
# (N x D+1 matrix - keeping them together for now so we can shuffle more easily later
data = np.zeros((N, len(word_index_map) + 1))
i = 0

In [18]:
for tokens in positive_tokenized:
    xy = tokens_to_vector(tokens, 1)
    data[i,:] = xy
    i += 1

In [19]:
for tokens in negative_tokenized:
    xy = tokens_to_vector(tokens, 0)
    data[i,:] = xy
    i += 1

shuffle the data and create train/test splits

In [20]:
orig_reviews, data = shuffle(orig_reviews, data)

In [21]:
X = data[:,:-1]
Y = data[:,-1]

last 100 rows will be test

In [22]:
Xtrain = X[:-100,]
Ytrain = Y[:-100,]
Xtest = X[-100:,]
Ytest = Y[-100:,]

In [23]:
model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print("Train accuracy:", model.score(Xtrain, Ytrain))
print("Test accuracy:", model.score(Xtest, Ytest))

Train accuracy: 0.7842105263157895
Test accuracy: 0.77


let's look at the weights for each word

In [24]:
threshold = 0.5
for word, index in iteritems(word_index_map):
    weight = model.coef_[0][index]
    if weight > threshold or weight < -threshold:
        print(word, weight)

unit -0.737895751518615
bad -0.7928197937863067
cable 0.5659943949932409
time -0.5977058923416557
've 0.74880187153881
month -0.7834323741158365
sound 0.9840963701879415
lot 0.586952295889688
you 1.0458997746532022
n't -2.083028657548949
easy 1.7709332688917743
quality 1.4060061624571984
company -0.5669853583815873
item -0.9845688251717922
wa -1.6159306611491042
perfect 0.9959313767105659
fast 0.9709390116417677
ha 0.6836983691616917
price 2.6306778318226174
value 0.5434826838501908
money -1.111246846068905
memory 0.985920738810667
buy -0.8837188239896172
bit 0.6027602644776905
happy 0.6483995874911666
pretty 0.730993791979793
doe -1.2000788972859866
highly 1.075781379771506
recommend 0.6417621638154619
customer -0.6386738828366063
support -0.8081001275550859
little 0.9184422057734457
returned -0.7855152271696372
excellent 1.2843440296194788
love 1.2527278939795603
piece -0.5683585045433528
useless -0.5234679625005486
week -0.7789700542175677
using 0.6485848920994011
laptop 0.560988257

check misclassified examples

In [25]:
preds = model.predict(X)
P = model.predict_proba(X)[:,1] # p(y = 1 | x)

since there are many, just print the "most" wrong samples

In [26]:
minP_whenYis1 = 1
maxP_whenYis0 = 0
wrong_positive_review = None
wrong_negative_review = None
wrong_positive_prediction = None
wrong_negative_prediction = None

In [27]:
for i in range(N):
    p = P[i]
    y = Y[i]
    if y == 1 and p < 0.5:
        if p < minP_whenYis1:
            wrong_positive_review = orig_reviews[i]
            wrong_positive_prediction = preds[i]
            minP_whenYis1 = p
    elif y == 0 and p > 0.5:
        if p > maxP_whenYis0:
            wrong_negative_review = orig_reviews[i]
            wrong_negative_prediction = preds[i]
            maxP_whenYis0 = p

In [28]:
print("Most wrong positive review (prob = %s, pred = %s):" % (minP_whenYis1, wrong_positive_prediction))
print(wrong_positive_review)
print("Most wrong negative review (prob = %s, pred = %s):" % (maxP_whenYis0, wrong_negative_prediction))
print(wrong_negative_review)

Most wrong positive review (prob = 0.35392278011586775, pred = 0.0):

A device like this either works or it doesn't.  This one happens to work

Most wrong negative review (prob = 0.6021959922952513, pred = 1.0):

The Voice recorder meets all my expectations and more
Easy to use, easy to transfer great results

