In [21]:
import nltk
import numpy as np

from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup

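Turn words into their base (lemma) form, e.g. cats -> cat, jumping -> jump. Note that `WordNetLemmatizer.lemmatize` treats every word as a noun by default, so verb forms such as "jumping" are only reduced when `pos='v'` is passed.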

In [2]:
# Single shared WordNet lemmatizer instance, reused by every tokenization step below.
wordnet_lemmatizer = WordNetLemmatizer()

In [3]:
# Load the stopword list (one entry per line, trailing whitespace stripped).
# The `with` block closes the file handle instead of leaving it to the
# garbage collector.
with open('stopwords.txt') as stopwords_file:
    stopwords = set(w.rstrip() for w in stopwords_file)

In [4]:
# Parse the positive reviews with the lxml parser and keep only the
# <review_text> elements. The file is read inside a `with` block so the
# handle is closed as soon as the content has been loaded.
with open('positive.review') as positive_file:
    positive_reviews = BeautifulSoup(positive_file.read(), "lxml")
# find_all is the current bs4 name for the legacy findAll alias.
positive_reviews = positive_reviews.find_all('review_text')

In [5]:
# Parse the negative reviews with the lxml parser and keep only the
# <review_text> elements. The file is read inside a `with` block so the
# handle is closed as soon as the content has been loaded.
with open('negative.review') as negative_file:
    negative_reviews = BeautifulSoup(negative_file.read(), "lxml")
# find_all is the current bs4 name for the legacy findAll alias.
negative_reviews = negative_reviews.find_all('review_text')

There are more positive than negative reviews. To balance the classes, shuffle the positive reviews and drop the surplus so that both classes have the same number of reviews.

In [6]:
# Seed NumPy's global RNG so the same subset of positive reviews is kept on
# every fresh run; without it the downstream results change on each run.
np.random.seed(0)
np.random.shuffle(positive_reviews)
# Keep as many positive reviews as there are negative ones.
positive_reviews = positive_reviews[:len(negative_reviews)]

Build a word-to-index map.
* The number of entries in the map gives the size of the vocabulary.

In [36]:
def my_tokenizer(s):
    """Turn raw review text into a list of cleaned tokens.

    The text is lowercased and split with NLTK's word tokenizer. Tokens of
    length 2 or less are dropped, stopwords are removed, and the remaining
    tokens are lemmatized with the shared ``wordnet_lemmatizer``.

    Args:
        s: Raw review text.

    Returns:
        List of lemmatized token strings with stopwords removed.
    """
    s = s.lower()
    tokens = nltk.tokenize.word_tokenize(s)
    # Drop tokens of length 2 or less, and drop stopwords by their surface
    # form. Filtering only after lemmatization lets words like "was", "has"
    # and "does" slip through, because the default (noun) lemmatizer turns
    # them into "wa", "ha" and "doe", which no longer match the stopword list
    # (assuming the list holds the surface forms -- TODO confirm against
    # stopwords.txt).
    tokens = [t for t in tokens if len(t) > 2 and t not in stopwords]
    # Reduce words to their base form.
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens]
    # Filter again so a lemma that is itself a stopword is still removed,
    # exactly as the original post-lemmatization filter did.
    tokens = [t for t in tokens if t not in stopwords]
    return tokens

In [29]:
# Exploratory check: lowercase, tokenize and lemmatize the first positive
# review (no length or stopword filtering here).
s = positive_reviews[0].text.lower()
tokens = [
    wordnet_lemmatizer.lemmatize(t)
    for t in nltk.tokenize.word_tokenize(s)
]

In [50]:
# Vocabulary: token -> column index, assigned in order of first appearance.
word_index_map = {}
current_index = 0

# Tokenized reviews are cached so each review is tokenized only once.
positive_tokenized = []
negative_tokenized = []

# Both classes share one vocabulary; positive reviews are indexed first,
# then negative ones, so the index assignment order is unchanged.
for reviews, tokenized in ((positive_reviews, positive_tokenized),
                           (negative_reviews, negative_tokenized)):
    for review in reviews:
        tokens = my_tokenizer(review.text)
        tokenized.append(tokens)
        for token in tokens:
            if token in word_index_map:
                continue
            # First time this token is seen: give it the next free index.
            word_index_map[token] = current_index
            current_index += 1

In [49]:
def tokens_to_vector(tokens, label, index_map=None):
    """Convert a token list into a normalized bag-of-words row with its label.

    Args:
        tokens: Iterable of tokens; each must be a key of the index map.
        label: Class label stored in the last element of the result.
        index_map: Optional mapping token -> column index. Defaults to the
            notebook-level ``word_index_map``.

    Returns:
        1-D NumPy array of length ``len(index_map) + 1``. The first
        ``len(index_map)`` entries are the token counts divided by the total
        count (all zeros when ``tokens`` is empty); the last entry is
        ``label``.

    Raises:
        KeyError: If a token is not present in the index map.
    """
    if index_map is None:
        index_map = word_index_map
    x = np.zeros(len(index_map) + 1)
    for t in tokens:
        x[index_map[t]] += 1
    total = x.sum()
    # A review whose tokens were all filtered out would otherwise produce 0/0
    # and fill the row with NaN, which breaks model fitting downstream.
    if total > 0:
        x = x / total
    # The label slot is written after normalization so it is not rescaled.
    x[-1] = label
    return x

In [52]:
# Total number of reviews across both classes; used as the row count of the data matrix.
N = len(positive_tokenized) + len(negative_tokenized)

In [53]:
# Pre-allocate the data matrix: one row per review, one column per vocabulary
# word, plus a final column that holds the label.
data = np.zeros((N, len(word_index_map)+1))

In [55]:
# Fill the matrix row by row: positive reviews (label 1) first, then
# negative reviews (label 0). `i` is the running row index.
i = 0
for label, tokenized in ((1, positive_tokenized), (0, negative_tokenized)):
    for tokens in tokenized:
        xy = tokens_to_vector(tokens, label)
        data[i, :] = xy
        i += 1

In [57]:
# Seed NumPy's global RNG so the row shuffle, and therefore the train/test
# split, is the same on every fresh run.
np.random.seed(0)
# Shuffle rows in place so the two classes are mixed before splitting.
np.random.shuffle(data)
# Every column but the last holds features; the last column is the label.
X = data[:, :-1]
Y = data[:, -1]

# Number of rows held out for evaluation (was a repeated literal 100).
N_TEST = 100

Xtrain = X[:-N_TEST]
Ytrain = Y[:-N_TEST]

Xtest = X[-N_TEST:]
Ytest = Y[-N_TEST:]

model = LogisticRegression()
model.fit(Xtrain, Ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [59]:
# Mean classification accuracy of the fitted model on the held-out test rows.
model.score(Xtest, Ytest)

0.81000000000000005

In [69]:
# List the words whose learned coefficient is large in magnitude: strongly
# positive weights push towards label 1, strongly negative towards label 0.
threshold = 0.5
for word, index in word_index_map.items():
    weight = model.coef_[0][index]
    if abs(weight) > threshold:
        print(word, weight)

this -0.552770978504
little 0.952785317163
memory 1.00453417598
software -0.50405559167
you 0.881455709499
time -0.685009979542
excellent 1.33604047121
price 2.64496121988
then -1.00750625743
perfect 1.01408645517
n't -2.12230633217
doe -1.18190630891
space 0.583033568048
look 0.552919787595
sound 1.09033434125
ha 0.796290238263
lot 0.725438679158
laptop 0.589969307401
comfortable 0.652956174389
try -0.710490198675
bad -0.774702283072
've 0.740250016157
month -0.683028586558
... -0.527780830878
wa -1.54786690101
fast 0.818903404724
buy -0.63958044976
using 0.593397786425
picture 0.561185082146
tried -0.817403122803
paper 0.532317388363
quality 1.57716311219
highly 1.02828027189
recommend 0.719949290884
week -0.688959652919
bit 0.590708240921
cable 0.734706164764
unit -0.694281972939
love 1.15195079402
speaker 0.80411505714
pretty 0.750035485146
easy 1.70200166385
money -1.06877885223
company -0.537828599665
pro 0.509487801822
expected 0.575430977289
video 0.56567586709
hour -0.56233560