In [81]:
""" Sentiment Analysis """


' Sentiment Analysis '

In [82]:
import numpy as np # for matrix operations - randomization, making arrays etc.
from bs4 import BeautifulSoup # For parsing the reviews which are xml files
import nltk # For all the NLP stuff
import sklearn # For the learning part

import pdb # python debugger for breakpoints etc.

In [83]:
"""Order of operations"""
# Get Data
# Processing on the raw text : Tokenization, Lower-Case ,Lemmatization, Stop words etc. etc.
# build the vocabulary (as a word_index_map)
# Make the numeric data
# learn

'Order of operations'

In [84]:
"""
the review files are XML, we are interested in the tag review_text
"""
review_files = ['electronics/positive.review','electronics/negative.review']
pos_rev = BeautifulSoup(open(review_files[0],'r').read())
pos_rev = pos_rev.findAll('review_text')

print type(pos_rev) # While pos_rev is a class object, it is actually a wrapper around a list.
print pos_rev[:2] # each element of the list is a review_text (with the tag intact)

# similar for the negative review
neg_rev = BeautifulSoup(open(review_files[1],'r').read())
neg_rev = neg_rev.findAll('review_text')


<class 'bs4.element.ResultSet'>
[<review_text>\nI purchased this unit due to frequent blackouts in my area and 2 power supplies going bad.  It will run my cable modem, router, PC, and LCD monitor for 5 minutes.  This is more than enough time to save work and shut down.   Equally important, I know that my electronics are receiving clean power.\n\nI feel that this investment is minor compared to the loss of valuable data or the failure of equipment due to a power spike or an irregular power supply.\n\nAs always, Amazon had it to me in &lt;2 business days\n</review_text>, <review_text>\nI ordered 3 APC Back-UPS ES 500s on the recommendation of an employee of mine who used to work at APC. I've had them for about a month now without any problems. They've functioned properly through a few unexpected power interruptions. I'll gladly order more if the need arises.\n\nPros:\n - Large plug spacing, good for power adapters\n - Simple design\n - Long cord\n\nCons:\n - No line conditioning (usually

In [85]:
len(neg_rev),len(pos_rev)

(1000, 1000)

In [86]:
from nltk.stem import WordNetLemmatizer # Uses WordNet to find the words to stem to
import pdb
lemmatizer = WordNetLemmatizer()
stop_words = [w.rstrip() for w in open('stopwords.txt','r')] # the for ..  in open(...) reads in lines...i.e. till the next new line
                                                             # rstrip removes the '\n' from the end
def process_text(txt):
    txt_lower = txt.lower() # convert all the text to lower case
    tokens = nltk.tokenize.word_tokenize(txt_lower) # Get all the words
    tokens = [t for t in tokens if len(t)> 2]# remove all the 2 letter tokens "in,on" etc.
    stemmed = [lemmatizer.lemmatize(t) for t in tokens] # stem all the words
    stemmed = [w for w in stemmed if w not in stop_words] # remove all the stop words
    return stemmed
    pass

        

    

In [109]:
"""
Dry Run 1, see how the loops work
"""
pos_tokens = [] # empty lists for storing the tokens derived from the reviews
neg_tokens = [] # empty lists for storing the tokens derived from the reviews
word_index_map = {} # a dictionary that maps a word to an index
nwords = 0
nrev = 0 # number of reviews

for rev,rev_tokens in zip([pos_rev,neg_rev],[pos_tokens,neg_tokens]):  
    # zip lets us iterate over multiple lists simultaneously
    nrev = 0 
    for txt in rev:
        this_tokens = process_text(txt.text) # tokens from this review
        rev_tokens.append(this_tokens)# add it to the list for this sentiment
        for t in this_tokens:
            if t not in word_index_map: # if the word is not already seen
                word_index_map[t] = nwords # add it to the word_index_map
                nwords +=1 
        nrev += 1
        if nrev >1: # just a break for the dry run
            break
print word_index_map # see what the word index map looks like after have seen 2 positive and 2 negative reviews
print nwords
print len(pos_tokens),nrev


{u'money': 113, u'valuable': 28, u'month': 44, u'functioned': 45, u'hole': 94, u'carpet': 68, u'slot': 80, u'monitor': 14, u'fit': 92, u'simple': 56, u'easy': 67, u'employee': 41, u'save': 17, u'...': 96, u'easily': 105, u'break': 89, u'lcd': 13, u'putting': 85, u'day': 36, u'minute': 15, u'wa': 86, u'loss': 27, u'receiving': 21, u'cable': 10, u"'ve": 43, u'bad': 8, u'dispite': 109, u'rack': 77, u'blackout': 5, u'interruption': 48, u'unexpected': 47, u'individual': 79, u'design': 57, u'recommendation': 40, u'investment': 24, u'business': 35, u'bottom': 81, u'expensive': 63, u'electronics': 20, u'poorly': 74, u'bought': 88, u'cord': 58, u'shut': 18, u'run': 9, u'vertical': 76, u'loose': 99, u'spacing': 54, u'rug': 107, u'conditioning': 61, u'plug': 53, u'irregular': 33, u'adapter': 55, u'usually': 62, u'router': 12, u'frequent': 4, u'con': 59, u'equally': 19, u'supply': 7, u'feel': 23, u'ca': 98, u'cd': 71, u'doesnt': 78, u'apc': 37, u'unit': 2, u'guess': 101, u'data': 29, u'top': 73, u

In [88]:
"""
Actual Loop to make tokens out of the reviews
"""
pos_tokens = []
neg_tokens = []
word_index_map = {}
nwords = 0
nrev = 0
for rev,rev_tokens in zip([pos_rev,neg_rev],[pos_tokens,neg_tokens]): 
    for txt in rev:
        this_tokens = process_text(txt.text)
        rev_tokens.append(this_tokens)
        for t in this_tokens:
            if t not in word_index_map:
                word_index_map[t] = nwords
                nwords +=1



In [90]:
"""
Convert all the tokenized reviews to numeric data
"""
pos_numeric = [] # numeric form for the positive reviews
neg_numeric = []
for sentiment_numeric,sentiment_tokens,class_label in zip([pos_numeric,neg_numeric],[pos_tokens,neg_tokens],[1,0]):
    # zip over the numerics, tokens, and labels for each entiment
    for rev in sentiment_tokens: # each review in the tokens for this sentiment
        this_numeric = np.zeros((nwords+1,)) # make a zeros of size (vocabulary size + (1 for the label) )
        for w in rev:
            this_numeric[word_index_map[w]] += 1 # each word in review adds to its index
        this_numeric = this_numeric/sum(this_numeric) # so that it sums to 1, 
                                                      # makes the feature invariant to number of words in the review
        this_numeric[-1] = class_label # last element is the sentiment label
        sentiment_numeric.append(this_numeric) # append this review to the sentiments numeric list
    
# we'll want to use numpy functions now, so convert the numeric lists to numpy arrays
pos_numeric = np.array(pos_numeric) 
neg_numeric = np.array(neg_numeric)
np.random.shuffle(pos_numeric) # IN PLACE shuffle
np.random.shuffle(neg_numeric) # IN PLACE shuffle

n_neg_rev = len(neg_rev)
n_pos_rev = len(pos_rev)
samples_per_sentiment = min(n_neg_rev,n_pos_rev) # in case the 2 sentiments had different number of reviews

pos_numeric = pos_numeric[:samples_per_sentiment+1,:] # keep the minimum number of reviews
neg_numeric = neg_numeric[:samples_per_sentiment+1,:]

xy = np.concatenate((pos_numeric,neg_numeric)) # make the combined dataset
np.random.shuffle(xy) # shuffle so that pos and neg are randomly distibuted across the dataset
X = xy[:,:-1] # break down into feature X
Y = xy[:,-1] # and labels Y

In [91]:
# make the training and test sets
# test is the last 100 rows, and the train is the rest
Xtr = X[:-100,:]
Ytr = Y[:-100]

Xts = X[-100:,:]
Yts = Y[-100:]

In [92]:
# Make a logistic regression classifier
log_reg = sklearn.linear_model.LogisticRegression()
log_reg.fit(Xtr,Ytr) # fit to the train
print 'Train Accuracy %f , Test Accuracy %f'%(log_reg.score(Xtr,Ytr),log_reg.score(Xts,Yts))

Train Accuracy 0.780000 , Test Accuracy 0.680000


In [99]:
# Explore the weights
print log_reg.coef_.shape # coef_ stores the coefficients/weights 
word_weights = log_reg.coef_[0,:]
index_word_map = dict(zip(word_index_map.values(),word_index_map.keys())) 
# this is the reverse of the vocabulary dictionary, maps from index to the word 
# this syntax is standard, keys and values the original dictionary's keys and values

thresh = 0.5 # same as from lecture
sig_idx=[] # indices of significant words
sig_weights = [] # the logistic regression weights for these words

# loop to keep the significant words and indices
for idx in range(nwords): 
    if word_weights[idx] > thresh or word_weights[idx] < -thresh:
#         print index_word_map[idx], word_weights[idx]
        sig_idx.extend([idx])
        sig_weights.extend([word_weights[idx]])


(1, 11091)


In [111]:
# we want to see the words from most significant to least 
# so sort the ABSOLUTE VALUE of weights in descending, and then order the indices according to the weights' sort order

sig_order = np.argsort(np.abs(sig_weights))[::-1] # sort defaults to ascending order, we want descending
sorted_sig_weights = np.array(sig_weights)[sig_order] # weights sorted in highest to lowest
sorted_sig_idx = np.array(sig_idx)[sig_order] # indices sorted according to the weight sorting

print 'Most Significant Positive\n'
for idx,wt in zip(sorted_sig_idx,sorted_sig_weights):
    if wt > thresh:
        print index_word_map[idx],wt

print '\n\nMost Significant Negative\n'
for idx,wt in zip(sorted_sig_idx,sorted_sig_weights):
    if wt < -thresh:
        print index_word_map[idx],wt


Most Significant Positive

price 2.84765236511
easy 1.82483671107
quality 1.58858499785
excellent 1.41636015886
love 1.21764470522
you 1.1744308924
sound 1.10778914646
little 1.00895974045
memory 0.998228889794
perfect 0.970594408271
fast 0.929110568025
highly 0.914956423304
speaker 0.851184743616
've 0.822992039887
lot 0.764332900057
pretty 0.755552463275
happy 0.69205100046
cable 0.689636106396
recommend 0.679645962543
bit 0.674672116113
using 0.669429206461
comfortable 0.625733109924
space 0.618130611566
picture 0.588406375074
ha 0.581043209685
paper 0.572210536955
expected 0.547978801697
value 0.540389468463
video 0.528032517014
home 0.512271512669


Most Significant Negative

n't -2.02656043081
wa -1.6730906974
doe -1.26445720469
return -1.17250643504
then -1.05140342941
money -1.02772474092
item -0.980317302503
waste -0.950470617889
support -0.905622417865
buy -0.804823906648
tried -0.778415033022
returned -0.762839766864
poor -0.756632228452
month -0.728957372861
week -0.7225809