In [None]:
# This code is for my NLP Udemy class, which can be found at:
# https://deeplearningcourses.com/c/data-science-natural-language-processing-in-python
# https://www.udemy.com/data-science-natural-language-processing-in-python
# It is written in such a way that tells a story.
# i.e. So you can follow a thought process of starting from a
# simple idea, hitting an obstacle, overcoming it, etc.
# i.e. It is not optimized for anything.

# Author: http://lazyprogrammer.me
% matplotlib notebook
import nltk
import numpy as np

from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup


wordnet_lemmatizer = WordNetLemmatizer()

# from http://www.lextek.com/manuals/onix/stopwords1.html
stopwords = set(w.rstrip() for w in open('stopwords.txt'))

# load the reviews
# data courtesy of http://www.cs.jhu.edu/~mdredze/datasets/sentiment/index2.html
positive_reviews = BeautifulSoup(open('electronics/positive.review').read())
positive_reviews = positive_reviews.findAll('review_text')

negative_reviews = BeautifulSoup(open('electronics/negative.review').read())
negative_reviews = negative_reviews.findAll('review_text')

# there are more positive reviews than negative reviews
# so let's take a random sample so we have balanced classes
np.random.shuffle(positive_reviews)
positive_reviews = positive_reviews[:len(negative_reviews)]

# first let's just try to tokenize the text using nltk's tokenizer
# let's take the first review for example:
# t = positive_reviews[0]
# nltk.tokenize.word_tokenize(t.text)
#
# notice how it doesn't downcase, so It != it
# not only that, but do we really want to include the word "it" anyway?
# you can imagine it wouldn't be any more common in a positive review than a negative review
# so it might only add noise to our model.
# so let's create a function that does all this pre-processing for us

def my_tokenizer(s):
    s = s.lower() # downcase
    tokens = nltk.tokenize.word_tokenize(s) # split string into words (tokens)
    tokens = [t for t in tokens if len(t) > 2] # remove short words, they're probably not useful
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] # put words into base form
    tokens = [t for t in tokens if t not in stopwords] # remove stopwords
    return tokens


# create a word-to-index map so that we can create our word-frequency vectors later
# let's also save the tokenized versions so we don't have to tokenize again later
word_index_map = {}
current_index = 0
positive_tokenized = []
negative_tokenized = []

for review in positive_reviews:
    tokens = my_tokenizer(review.text)
    positive_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

for review in negative_reviews:
    tokens = my_tokenizer(review.text)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1


# now let's create our input matrices
def tokens_to_vector(tokens, label):
    x = np.zeros(len(word_index_map) + 1) # last element is for the label
    for t in tokens:
        i = word_index_map[t]
        x[i] += 1
    x = x / x.sum() # normalize it before setting label
    x[-1] = label
    return x

N = len(positive_tokenized) + len(negative_tokenized)
# (N x D+1 matrix - keeping them together for now so we can shuffle more easily later
data = np.zeros((N, len(word_index_map) + 1))
i = 0
for tokens in positive_tokenized:
    xy = tokens_to_vector(tokens, 1)
    data[i,:] = xy
    i += 1

for tokens in negative_tokenized:
    xy = tokens_to_vector(tokens, 0)
    data[i,:] = xy
    i += 1

# shuffle the data and create train/test splits
# try it multiple times!
np.random.shuffle(data)

X = data[:,:-1]
Y = data[:,-1]

# last 100 rows will be test
Xtrain = X[:-100,]
Ytrain = Y[:-100,]
Xtest = X[-100:,]
Ytest = Y[-100:,]

model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print "Classification rate:", model.score(Xtest, Ytest)


# let's look at the weights for each word
# try it with different threshold values!
threshold = 0.5
for word, index in word_index_map.iteritems():
    weight = model.coef_[0][index]
    if weight > threshold or weight < -threshold:
        print word, weight

In [22]:
# read the data
import pdb
import sklearn
import nltk
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from bs4 import BeautifulSoup
positive_reviews1 = BeautifulSoup(open('electronics/positive.review').read())
positive_reviews = positive_reviews1.findAll('review_text')
negative_reviews = BeautifulSoup(open('electronics/negative.review').read())
negative_reviews = negative_reviews.findAll('review_text')


In [25]:
b = BeautifulSoup('<haha> nanana</haha>')
b.findAll('haha')

[<haha> nanana</haha>]

In [24]:
# see data
if False: print positive_reviews1
if False: print positive_reviews

In [27]:
# make a lemmatizer
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()


bs4.element.ResultSet

In [56]:
# get a list of stopwords
stopwords1  = open('stopwords.txt','r')
stopwords = [w.rstrip() for w in stopwords1]
# lens = [abs(len(w1)-len(w2)) for w1,w2 in zip(stopwords1,stopwords)]
# where_unequal= np.where(lens)
# print where_unequal
# print [w for ix,w in enumerate(stopwords) if ix in where_unequal]
print stopwords

['a', 'about', 'above', 'across', 'after', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'among', 'an', 'and', 'another', 'any', 'anybody', 'anyone', 'anything', 'anywhere', 'are', 'area', 'areas', 'around', 'as', 'ask', 'asked', 'asking', 'asks', 'at', 'away', 'b', 'back', 'backed', 'backing', 'backs', 'be', 'became', 'because', 'become', 'becomes', 'been', 'before', 'began', 'behind', 'being', 'beings', 'best', 'better', 'between', 'big', 'both', 'but', 'by', 'c', 'came', 'can', 'cannot', 'case', 'cases', 'certain', 'certainly', 'clear', 'clearly', 'come', 'could', 'd', 'did', 'differ', 'different', 'differently', 'do', 'does', 'done', 'down', 'down', 'downed', 'downing', 'downs', 'during', 'e', 'each', 'early', 'either', 'end', 'ended', 'ending', 'ends', 'enough', 'even', 'evenly', 'ever', 'every', 'everybody', 'everyone', 'everything', 'everywhere', 'f', 'face', 'faces', 'fact', 'facts', 'far', 'felt', 'few', 'find', 'finds', 'first

In [59]:
# the tokenizer is built over the lemmatizer, and does a sequence of operations
import pdb
def tokenizer(s): #s is text
    
    tokens = nltk.tokenize.word_tokenize(s)
    tokens = [t for t in tokens if len(t)>2]
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] # does not convert going to go
    tokens = [t for t in tokens if t not in stopwords]
#     print tokens
#     pdb.set_trace()
    return tokens
    pass

In [62]:
all_pos_tokens = []
vocab = []

for review in positive_reviews:
    rtokens = tokenizer(review.text)
    all_pos_tokens.append(rtokens)
    vocab.extend(rtokens)
#     pdb.set_trace()
vocab = list(set(vocab))
all_neg_tokens = []
for review in negative_reviews:
    rtokens = tokenizer(review.text)
    all_neg_tokens.append(rtokens)
    vocab.extend(rtokens)
vocab = list(set(vocab))


[[u'con', u'tip', u'extremely', u'easy', u'carpet', u'you', u'lot', u'cd', u'stacked', u'top', u'poorly', u'designed', u'vertical', u'rack', u'doesnt', u'individual', u'slot', u'cd', u'you', u'bottom', u'stack', u'you', u'basically', u'pull', u'stack', u'putting', u'wa', u'pain', u'bought', u'break', u'piece', u'metal', u'fit', u'guide', u'hole', u'again..poorly', u'designed', u'...', u'doesnt', u'fit', u'cd', u'gap', u'ca', u'loose', u'fitting', u'pro', u'...', u'...', u'...', u'guess', u'hold', u'lot', u'cd', u'...'], [u'nice', u'look', u'tip', u'easily', u'steady', u'rug', u'surface', u'dispite', u'picture', u'box', u'advice', u'you', u'rack', u'hold', u'lot', u'Save', u'money', u'invest', u'nicer', u'sturdy'], [u'bought', u'returned', u'unit', u'Each', u'ha', u'defective', u'finally', u'returning', u'system', u'The', u'DVD', u'player', u'constantly', u'Bad', u'Disc', u'error', u'skip', u'slightest', u'smudge', u'disc', u'The', u'sound', u'quality', u'nice', u'price', u'player', u'd