In [27]:
import nltk 
import numpy as np

from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

wordnet_lemm = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Done for electronics section

# Loading data
positive_reviews = BeautifulSoup(open('electronics/positive_review.txt').read(), "html.parser")
positive_reviews = positive_reviews.findAll('review_text')

negative_reviews = BeautifulSoup(open('electronics/negative_review.txt').read(), "html.parser")
negative_reviews = negative_reviews.findAll('review_text')

unlabeled_reviews = BeautifulSoup(open('electronics/unlabeled_review.txt').read(), "html.parser")
unlabeled_reviews = unlabeled_reviews.findAll('review_text')

# print len(positive_reviews), len(negative_reviews)
# Making positive and negative reviews same
np.random.shuffle(positive_reviews)
positive_reviews = positive_reviews[:len(negative_reviews)]

# Word-index map for vocabulary
def tokenizer(s):
    s = s.lower()
    tokens = nltk.tokenize.word_tokenize(s)
    tokens = [t for t in tokens if len(t) >2]
    tokens = [wordnet_lemm.lemmatize(t) for t in tokens]
    tokens = [t for t in tokens if not t in stop_words]
    return tokens

# Creating a word index map which makes a vocabulary of words with its indices
word_index_map = {}
current_index = 0

# Saving positive and negative tokenized words
positive_tokenized = []
negative_tokenized = []

# For positive tokens
for review in positive_reviews:
    tokens = tokenizer(review.text)
    positive_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

# For negative tokens
for review in negative_reviews:
    tokens = tokenizer(review.text)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1
            
# Proportion count
def tokens_to_vector(tokens, label):
    x = np.zeros(len(word_index_map)+1)
    for t in tokens:
        i = word_index_map[t]
        x[i] += 1
    x = x / x.sum()
    x[-1] = label
    return x

# Length of tokenized words
N = len(positive_tokenized) + len(negative_tokenized)

# Making a matrix which has N as rows and word_index_map + label as columns
data = np.zeros((N, len(word_index_map) + 1))
i = 0

for tokens in positive_tokenized:
    xy = tokens_to_vector(tokens, 1)
    data[i,:] = xy
    i+=1

for tokens in negative_tokenized:
    xy = tokens_to_vector(tokens, 0)
    data[i,:] = xy
    i+=1
    
# Creating X and Y datasets
np.random.shuffle(data)

X = data[:, :-1]
Y = data[:, -1]

X_train = X[:-100,]
Y_train = Y[:-100,]
X_test = X[-100:,]
Y_test = Y[-100:,]

# Training a logistic regression model
model_log = LogisticRegression()
model_log.fit(X_train, Y_train)

print "Classification Rate: ", model_log.score(X_test, Y_test)

# Making a threshold to see which words carry what sentiment
threshold = 0.5

for word, index in word_index_map.iteritems():
    weight = model_log.coef_[0][index]
    if weight > threshold or weight < -threshold:
        print "Word: {}, {}".format(word, weight)

Classification Rate:  0.74
Word: easy, 0.6165544919
Word: love, 0.554215699207
Word: cable, 0.644853738159
Word: great, 1.40312009039
Word: wa, -0.504378791738
Word: price, 0.85707758034
Word: quality, 0.654682793347
Word: item, -0.51429693109
Word: n't, -0.79937277189
Word: good, 0.622980905976
Word: back, -0.502187427048
Word: use, 0.807589746759
