In [26]:
from bs4 import BeautifulSoup
from future.utils import iteritems
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle

In [2]:
wordnet_lemmatizer = WordNetLemmatizer()

In [3]:
#Read in our stop words file and then strip the line breaks and other such special characters to leave the word
with open(r'/Users/prof/Documents/machine_learning_examples/nlp_class/stopwords.txt') as f:
    lines = f.readlines()
    
stopwords = set(w.rstrip() for w in lines)

In [4]:
#Open a file for both the positive and negative electronics reviews
Reviews_pos = BeautifulSoup(open('/Users/prof/Documents/machine_learning_examples/nlp_class/electronics/positive.review').read(), features = "html5lib")
Reviews_pos = Reviews_pos.findAll('review_text')

Reviews_neg = BeautifulSoup(open('/Users/prof/Documents/machine_learning_examples/nlp_class/electronics/negative.review').read(), features = "html5lib")
Reviews_neg = Reviews_neg.findAll('review_text')

The objective of this example it to use logistic regression to to classify the sentiment of our reviews as either positive or negative and then compare the classification to the actual sentiment. There will be some preprocessing of our data before we can make predictions.
* Our words are in various cases like Home and home, in the model these would be two different words. We will make all words lowercase so we dont run into this issue

* We will also remove any short words like it, me, he and any stop words in our stop words list

* Also we will tokenize the words in each of our review referring to them as tokens in the example below

In the code below, we use wordnet_lemmatizer, this is a process used to normalize the text by grouping together words using their root. As an example, with lemmatization, the word Caring becomes Care. The opposite is Stemming which would turn caring into Car, which would be incorrect in this example.

In [5]:
def tokenizer(r):
    r = r.lower()
    tokens = nltk.tokenize.word_tokenize(r)
    tokens = [t for t in tokens if len(t) > 2] #removing short words    
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] 
    tokens = [t for t in tokens if t not in stopwords]
    return tokens

The code below creates a word map from our tokenized text. As it loops through the reviews, if the token is not in out list we append it, but if the token is in our list already we increase its index by 1 to keep track of how many times the word appears in our positive reviews vs our negative reviews. 

In [6]:
word_index_map = {}
current_index = 0
positive_tokenized = []
negative_tokenized = []
orig_reviews = []

for review in Reviews_pos:
    orig_reviews.append(review.text)
    tokens = tokenizer(review.text)
    positive_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index +=1
            
for review in Reviews_neg:
    orig_reviews.append(review.text)
    tokens = tokenizer(review.text)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index +=1
            
print("Length of word index map: ", len(word_index_map))

Length of word index map:  10950


In [7]:
def tokens_to_vector(tokens, label):
    x = np.zeros(len(word_index_map)+1)
    for t in tokens:
        i = word_index_map[t]
        x[i] += 1
    x = x / x.sum()
    x[-1] = label
    return x

In [19]:
N = len(positive_tokenized) + len(negative_tokenized)
data = np.zeros((N, len(word_index_map)+1))
i = 0

for tokens in positive_tokenized:
    xy = tokens_to_vector(tokens,1)
    data[i,:] = xy
    i +=1
    
for tokens in negative_tokenized:
    xy = tokens_to_vector(tokens,0)
    data[i,:] = xy
    i +=1

A logistic regression model is used to predict our training and testing dataset, whether the review was positive or negative based on the vectorized text. We also output an accuracy score to see how our training and testing models performed relative to the actual sentiment. 

In [25]:
orig_reviews, data = shuffle(orig_reviews,data)

X = data[:,:-1]
Y = data[:,-1]

Xtrain = X[:-100,]
Ytrain = Y[:-100,]
Xtest = X[-100:,]
Ytest = Y[-100:,]


model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print("Train Accuracy: ", model.score(Xtrain, Ytrain))

print("Test Accuracy: ", model.score(Xtest, Ytest))

Train Accuracy:  0.7794736842105263
Test Accuracy:  0.76


In [28]:
threshold = 0.5
for word, index in iteritems(word_index_map):
    weight = model.coef_[0][index]
    if weight > threshold or weight < threshold:
        print(word,weight)

purchased -0.27903340449023717
this -0.36423064515501946
unit -0.7155819536788771
due -0.2939751855776533
frequent 0.029643179581998975
blackout 0.025674394126539
power -0.0020420166427574356
supply 0.08125402137292406
bad -0.7839346957111885
run 0.09973193996140321
cable 0.6422017570822114
modem -0.06392388625848935
router -0.28220898962241386
lcd 0.006710647033585313
monitor 0.33350192470014123
minute -0.31030245261840655
time -0.6084616983431862
save -0.20087181640468982
shut -0.14587180839364589
equally 0.05323153688019802
electronics -0.10232907472447211
receiving -0.019452020727943452
clean 0.14870792095201602
feel 0.12474000458469099
investment 0.18539551318684686
minor 0.16277792109902217
compared 0.1919481762544949
loss -0.039491722252119915
valuable 0.011685494750986514
data 0.09401589560128527
failure -0.07462461826942601
equipment 0.10260857300523742
spike 0.01437415182376908
irregular 0.011685494750986514
amazon 0.3677880409944986
business -0.12975370914046477
day -0.32676

nemo 0.009911598205229919
harry 0.009911598205229919
potter 0.009911598205229919
hdmi 0.0018312349222999011
kdf-42e2000 0.009911598205229919
3lcd 0.009911598205229919
rear 0.07804576494578064
projection 0.009911598205229919
hdtv -0.0037953103625302277
combination -0.11301404977257913
annoyance -0.01278099773174693
responds 0.0333173641690287
load 0.13046686072851152
slowly -0.010145409314504696
sacrifice 0.009911598205229919
incredibly -0.008885337402897002
low -0.16006241593825227
hot -0.038015764368901736
king -0.005807987509620913
stretch 0.024309393218053504
squat 0.009911598205229919
figure -0.20732067530025247
recode 0.009673551571764439
sleep -0.07046446891329226
wake -0.01236470951179063
finished 0.010753697241317963
awaits 0.009673551571764439
sharpen 0.009673551571764439
correct -0.0609718326417842
bring -0.0380588153229493
transfered 0.03893690502841747
released -0.04214381294784406
start -0.08731634520784538
library 0.026672223831563285
editing 0.028338488411847912
tricky 0

hinder 0.0027185765752971405
changing -0.049494854325597845
child -0.0021037130566328374
skipdoctor 0.005437153150594281
a/c 0.032468220536238705
contained 0.009441980991211461
automatic -0.01014876508288975
equivalent -0.0029547362271953362
repaired -0.05825838926263596
played 0.012473781368528837
skipped -0.012784016823820087
useable 0.0027185765752971405
condition 0.1786313047369521
visible -0.007285925901893646
necessarily 0.0044212313773616405
affect -0.003963387371008859
cloth 0.003197325904775833
attempting -0.03117674934584709
agree -0.31262073828251896
reviewer -0.17814051390820052
severely -0.007990113588735961
damaged -0.03096676386630013
pass 0.0027185765752971405
consuming -0.026253475124190417
acheive 0.0027185765752971405
usage 0.020408394895283143
repeating 0.0027185765752971405
buffing/sanding 0.0027185765752971405
row -0.02548705409417214
deeper -0.014751292315208402
cream 0.0006011078601352953
pas -0.04674256798503232
severity 0.0027185765752971405
grime 0.0027185765

klh 0.006008886845801335
10inch 0.006008886845801335
tremor 0.006008886845801335
series 0.13321667904998818
estimate -0.0025194794260009245
650.00 0.006008886845801335
colleague 0.006008886845801335
noice 0.024450085494531364
cancelling 0.03403896944835008
ant 0.05374609577455006
owns 0.03456247757466765
portability 0.02599388629002328
basically -0.041998762939893915
roun 0.006497704740473571
cylinder 0.006497704740473571
shaped 0.010220944618762925
wasnt 0.03537284633400038
impressive 0.046202978573412656
bulky 0.1300382422254141
costlier 0.006497704740473571
bestbuy -0.011138752749349666
outweighs 0.015433391465854396
yea 0.004142718845727039
playlist 0.010857009160978166
play/pause/next/prev 0.006497704740473571
griffin -0.12686990602405004
neither -0.07956766454882941
charged -0.0661109652700589
lasted -0.148865124901294
pump 0.01414621850680638
consistant 0.005888166977633807
1-2-3 0.005888166977633807
clearer 0.01772216097862289
happier 0.06263699821595202
view 0.0294915338956048

attempt -0.1365513650344824
repeat -0.08218430393595502
multifunction 0.0031657566097616115
regardless -0.01304303633014374
former 0.0004808132564374245
midiland 0.001792149769337132
thoroughly 0.01580132344244503
apart -0.09094632821259646
aim 0.001792149769337132
account -0.06805604159800491
999 0.001792149769337132
sooo -0.01946898076341951
bi-directional 0.001792149769337132
midrange 0.025416165782893664
include -0.023485270160498877
customize 0.001792149769337132
his/her 0.001792149769337132
and/or 0.06354060998394867
aesthetically 0.001792149769337132
pleasing 0.001792149769337132
rectangular -0.010119363684406461
attractiveness 0.001792149769337132
reproducing 0.001792149769337132
sound-granted 0.001792149769337132
paired 0.006361439407245708
competent 0.0065320048536716005
qualm 0.001792149769337132
hidden -0.003990107266836551
marking 0.001792149769337132
belong 0.0035726075470573697
to-a 0.001792149769337132
hide -0.02178564546402358
serious -0.12296179731999986
onkyo -0.0076

lived -0.012814274997744101
9volt 0.0022450320908407096
waste -0.9717685915109618
bedside 0.0022450320908407096
palm -0.13698435714445492
bell 0.07311715042428378
whistle 0.0594632260004435
checking -0.012993062334186243
straightforward 0.022785302163442215
emptying 0.016581870384227967
erasing 0.004802104597140167
technique 0.024168312767381416
manufacuters 0.016581870384227967
explain -0.02301245925393795
attach -0.021026951311715034
tread 0.016581870384227967
laniard 0.016581870384227967
struggle -0.004379585712639643
sleakest 0.010551948556187693
best-looking 0.010551948556187693
apple-white 0.010551948556187693
drone 0.010551948556187693
focus -0.017983811420329312
footprint -0.005647836735313468
unimposing 0.010551948556187693
sounddock 0.021564042211267863
recomendation 0.011258394453711133
listend 0.011258394453711133
raido 0.011258394453711133
disadvantage 0.01866931721196352
jogger 0.01699373164754245
unobtrusive 0.011730423229824389
convenient 0.16073521957227774
accidentall

eponymous 0.0009899497224594815
1979 0.0009899497224594815
steely 0.0009899497224594815
dan 0.0009899497224594815
gaucho 0.0009899497224594815
1980 0.0029698491673784445
joe -0.008487330969876791
walsh 0.0009899497224594815
smoker 0.0009899497224594815
drink 6.669034999088821e-05
1973 0.0009899497224594815
10cc 0.0009899497224594815
1975 0.0029698491673784445
godley 0.0009899497224594815
creme 0.0009899497224594815
consequence -0.00444473909222316
1977 0.001979899444918963
supertramp 0.0009899497224594815
crime 0.0009899497224594815
century 0.0009899497224594815
1974 0.001979899444918963
xtc 0.0009899497224594815
peter 0.0009899497224594815
gabriel 0.0009899497224594815
kate 0.0009899497224594815
bush -0.010382054563101626
dreaming 0.0009899497224594815
wind -0.017110042970792095
pursuit 0.0009899497224594815
cobham 0.0009899497224594815
1972 0.0009899497224594815
frank 0.0009899497224594815
zappa 0.0009899497224594815
goodnight 0.0009899497224594815
vienna 0.0009899497224594815
prince

dr. 0.06525070233001572
corrected -0.017757732244703114
auto-reroute 0.009346065396482153
soundwise 0.002791361838849645
5.1s 0.002791361838849645
tweaked 0.002791361838849645
reaching 0.002791361838849645
mediocre 0.002757171425884416
ancient -0.001067981843752134
soundblaster 0.06433713616291467
throwing -0.030523024500433073
128kbps 0.002791361838849645
shoddy -0.023475788953617623
encoding 0.002791361838849645
192kbps -0.007245501509271202
lastly -0.009454142269115558
loudness 0.002791361838849645
favor -0.053860347145465566
homework 0.04410186135318932
sparse 0.029369542483058304
compartment 0.02343558961094707
arranged -0.007980064551761343
-cds 0.00460774446556161
paperback 0.00460774446556161
squeezed 0.00804584465426941
desire 0.014999246432367129
flap -0.0007604317742078515
carrying 0.03665612462745544
e.g. 0.00460774446556161
testbooks 0.00460774446556161
p.s. 0.00460774446556161
-prompt 0.00460774446556161
courteous 0.00460774446556161
'fix 0.01456117242540929
velcro -0.022

yield 0.008348715651558498
30-35 -0.00977022738056491
monies 0.0029099727443358064
twelve 0.026922350252635698
boose 0.026922350252635698
acceptabl 0.026922350252635698
unbelievably 0.015282464885630327
heavily 0.01627328004476264
giveaway 0.015282464885630327
low-tech 0.0
musi 0.06122559836464935
deemed -0.007950092783163321
consists 0.0
enclosed 0.0
drifted 0.006067755212169309
jiggle -0.002525688369101457
profiler 0.007656294153447182
gamepads 0.006067755212169309
splinter 0.006067755212169309
sim 0.006067755212169309
x-wing 0.006067755212169309
alliance 0.006067755212169309
diagonal 0.006067755212169309
unusable -0.1441747105301114
fighter 0.006067755212169309
flat-panels 0.003993216289803858
500:1 0.003993216289803858
700:1 0.003993216289803858
20.1 0.003993216289803858
non-widescreen 0.003993216289803858
crystal-clear 0.003993216289803858
half-life 0.003993216289803858
unreal 0.003993216289803858
tournament 0.003993216289803858
wide-screen 0.003993216289803858
4:3 0.0147361974253

6-disk -0.021773092506988834
lease -0.02008393279307669
reassuring -0.02008393279307669
irreplaceable -0.02008393279307669
destructs -0.009373319150730714
10 -0.018746638301461428
all-in-all -0.009373319150730714
retirement -0.009373319150730714
anylonger -0.009373319150730714
dreaded -0.024542716448745425
unplug/protect -0.0046238029288973815
stone -0.01144718926516643
unhook -0.0046238029288973815
repack -0.0046238029288973815
reship -0.0046238029288973815
hastle -0.0046238029288973815
monday-friday -0.0046238029288973815
9-5pm -0.0046238029288973815
doornail -0.0046238029288973815
derable -0.017465922630019774
death -0.08673137272726406
un-recoverable -0.019918913519848046
pitching -0.019918913519848046
revive -0.031129830497908483
hush-hush -0.019918913519848046
green-orange 0.0
salesman 0.0
genius 0.0
involve 0.0
understanding -0.0029115367108539037
trendy 0.0
haircut 0.0
'genius 0.0
retitled 0.0
extendable -0.021792414493772082
256mb -0.010261609918877266
finalized -0.02197123694

freaked -0.0025391574334799466
unimpressed -0.0025391574334799466
amused -0.0025391574334799466
laughing -0.0025391574334799466
re-setting -0.0025391574334799466
logon -0.0025391574334799466
60-page -0.0025391574334799466
rom -0.0025391574334799466
norm -0.0025391574334799466
stink -0.004758660989858014
wusb54gc -0.0025391574334799466
unannounced -0.0025391574334799466
non-functional -0.0025391574334799466
rude -0.046018509604046326
redirect -0.005172752599499347
county -0.007037682852556706
stuttering -0.005172752599499347
acknowledged -0.010203878364266212
mapquest -0.010345505198998694
teleatlas -0.0016361605150597603
producer -0.0016361605150597603
interacts -0.0016361605150597603
easy-to-use -0.0016361605150597603
irritant -0.0032723210301195205
vie -0.0016361605150597603
informing -0.0016361605150597603
interstate -0.002947497027959467
allowance -0.0016361605150597603
travelling -0.029202525131436015
anew -0.0016361605150597603
omission -0.0016361605150597603
wisconsin -0.0016361

rigg -0.012368650258414614
thread -0.01940942815992719
futzed -0.004937159697094929
half- -0.004937159697094929
toooo -0.004937159697094929
plier -0.024578301970297894
recept -0.004937159697094929
vice-grip/clamp -0.004937159697094929
forcefully -0.004937159697094929
exposing -0.011470605062911482
gentle -0.024276614206656544
dcr-trv33 -0.009951619057038648
clogged -0.034637448890054955
cleaning/service -0.009951619057038648
lubricates -0.009951619057038648
stil -0.009951619057038648
zr-90 -0.008228609944338768
warehouse -0.008228609944338768
videotape -0.008228609944338768
basketball -0.008228609944338768
inform -0.02691362502312919
tossing -0.02691362502312919
340.00 0.0
irive -0.02691362502312919
wrapped -0.016977531236269973
palmtop -0.004389722218947025
cold-boot -0.004389722218947025
incessantly -0.004389722218947025
auto-power-off -0.004389722218947025
injury -0.004389722218947025
chinese-made -0.004389722218947025
meat -0.005701058731846731
microtrack -0.027335859552496296
woef

-bad -0.0218154059319109
11-1-05 -0.0218154059319109
......... -0.0218154059319109
eating -0.009990493606900167
posting -0.019168789682318447
rat -0.019168789682318447
differentce -0.026917464457327497
chassis -0.007040656988659113
tighted -0.007040656988659113
tighten -0.007040656988659113
maxtor -0.016044562745924307
winning -0.007040656988659113
trusting -0.013901673803168074
non-responsive -0.00936733880859149
dsc-s40 -0.011598866290633392
veiw -0.011598866290633392
persistance -0.011450491050170332
restocking -0.023797443901937086
fee -0.056367774234360304
restock -0.011450491050170332
landlord -0.0034609059094560914
'mac -0.0034609059094560914
cloning -0.0034609059094560914
arose -0.009389521146891748
40+ -0.0034609059094560914
5-year-old -0.0067553390717803325
re-entered -0.0067553390717803325
wrt54g -0.0067553390717803325
hp4705 -0.04315655937543245
non-bluetooth -0.05044713640021166
hp5510 -0.011591009331401296
website-could -0.011591009331401296
alignment -0.01451322534496306

In [30]:
preds = model.predict(X)
pred_probability = model.predict_proba(X)[:,1] #p(y=1 | x)
minP_whenYis1 = 1
maxP_whenYis0 = 0
wrong_positive_review = None
wrong_negative_review = None
wrong_positive_prediction = None
wrong_negative_prediction = None
for i in range(N):
    p = pred_probability[i]
    y = Y[i]
    if y == 1 and p < 0.5:
        if p < minP_whenYis1:
            wrong_positive_review = orig_reviews[i]
            wrong_positive_prediction = preds[i]
            minP_whenYis1 = p
    elif y == 0 and p > 0.5:
        if p > maxP_whenYis0:
            wrong_negative_review = orig_reviews[i]
            wrong_negative_prediction = preds[i]
            maxP_whenYis0 = p

print("Most wrong positive review (prob = %s, pred = %s):" % (minP_whenYis1, wrong_positive_prediction))
print(wrong_positive_review)
print("Most wrong negative review (prob = %s, pred = %s):" % (maxP_whenYis0, wrong_negative_prediction))
print(wrong_negative_review)


Most wrong positive review (prob = 0.36253920180971655, pred = 0.0):

callers can barely hear me, even with the mic right in my mouth (volume adjustment on cord doesnt affect it).
and even worse, callers hear an echo of themselves, the mic picks up the caller in the earpiece enough to resend it!  (echo!). Must be because its all on the same arm or something crappy.  This makes it basically unusable, so waste of money, throw away, going to buy a better one

Most wrong negative review (prob = 0.594819951118996, pred = 1.0):

My family and I just relocated 3 states away from our extended family and wanted to get a good webcam to keep intouch with everyone and let our 2 year old see his grandmother once and a while.  When I saw the Microsoft LifeCam at Target stores and that it was compatible with Windows Live I thought it would be worth the $100 price tag to be able to keep in touch with family.

I purchased the software and started the installation process on my computer.    Just so you 