# Sentiment Analysis
### Author : H.Yang
### Time: Spring 2017

## Data Reading

In [1]:
def pretty_print_review_and_label(i):
    print(labels[i] + "\t:\t" + reviews[i][:80] + "...")
g = open('reviews.txt','r')  
reviews = list(map(lambda x:x[:-1], g.readlines()))
g.close()

In [2]:
g = open('labels.txt', 'r')
labels = list(map(lambda x:x[:-1].upper(), g.readlines()))
g.close()

In [4]:
len(reviews)

25000

In [4]:
reviews[0]

'bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   '

In [5]:
labels[0]

'POSITIVE'

# Develop a Predictive Theory 

In [6]:
#print("labels.txt \t : \t reviews.txt\n")
#pretty_print_review_and_label(1)
#pretty_print_review_and_label(12816)
#pretty_print_review_and_label(7)
#pretty_print_review_and_label(21934)
#pretty_print_review_and_label(5297)
#pretty_print_review_and_label(4998)

In [6]:
from collections import Counter
import numpy as np

In [7]:
## Count the Positive and Negative words bag

positive_counts = Counter()
negative_counts = Counter()
total_counts = Counter()

In [8]:
for i in range(len(reviews)):
    if(labels[i] == 'POSITIVE'):
        for word in reviews[i].split(" "):
            positive_counts[word] += 1
            total_counts[word] += 1
    else:
        for word in reviews[i].split(" "):
            negative_counts[word] += 1
            total_counts[word] += 1

In [10]:
# Examine the counts of the most common words in positive reviews
positive_counts.most_common()[:20]

[('', 550468),
 ('the', 173324),
 ('.', 159654),
 ('and', 89722),
 ('a', 83688),
 ('of', 76855),
 ('to', 66746),
 ('is', 57245),
 ('in', 50215),
 ('br', 49235),
 ('it', 48025),
 ('i', 40743),
 ('that', 35630),
 ('this', 35080),
 ('s', 33815),
 ('as', 26308),
 ('with', 23247),
 ('for', 22416),
 ('was', 21917),
 ('film', 20937)]

In [11]:
# Examine the counts of the most common words in negative reviews
# negative_counts.most_common()[:20]

### (1) Caluculate the Ratio, Developing a Metric
### Check all the words and calculate the ratio of postive to negative uses and store that ratio in pos_neg_ratios.

In [11]:
pos_neg_ratios = Counter()

# Calculate the ratios of positive and negative uses of the most common words
# Consider words to be "common" if they've been used at least 100 times
for term,cnt in list(total_counts.most_common()):
    if(cnt > 100):
        pos_neg_ratio = positive_counts[term] / float(negative_counts[term]+1)
        pos_neg_ratios[term] = pos_neg_ratio

In [12]:
print("Pos-to-neg ratio for 'the' = {}".format(pos_neg_ratios["the"]))
print("Pos-to-neg ratio for 'amazing' = {}".format(pos_neg_ratios["amazing"]))
print("Pos-to-neg ratio for 'terrible' = {}".format(pos_neg_ratios["terrible"]))

Pos-to-neg ratio for 'the' = 1.0607993145235326
Pos-to-neg ratio for 'amazing' = 4.022813688212928
Pos-to-neg ratio for 'terrible' = 0.17744252873563218


In [14]:
# Convert ratios to logs
for word,ratio in pos_neg_ratios.most_common():
    if(ratio > 1):
        pos_neg_ratios[word] = np.log(ratio)
    else:
        pos_neg_ratios[word] = -np.log((1 / (ratio+0.01)))

In [15]:
print("Pos-to-neg ratio for 'the' = {}".format(pos_neg_ratios["the"]))
print("Pos-to-neg ratio for 'amazing' = {}".format(pos_neg_ratios["amazing"]))
print("Pos-to-neg ratio for 'terrible' = {}".format(pos_neg_ratios["terrible"]))

Pos-to-neg ratio for 'the' = 0.05902269426102881
Pos-to-neg ratio for 'amazing' = 1.3919815802404802
Pos-to-neg ratio for 'terrible' = -1.6742829939664696


In [16]:
# words most frequently seen in a review with a "POSITIVE" label
# pos_neg_ratios.most_common()[:20]

In [17]:
# list(pos_neg_ratios.most_common())[0:20]

In [18]:
# words most frequently seen in a review with a "NEGATIVE" label
# list(reversed(pos_neg_ratios.most_common()))[0:30]


## (2) Creating the Input/Output Data

In [19]:
vocab = set(total_counts.keys())

In [20]:
vocab_size = len(vocab)
print(vocab_size)

74074


In [21]:
layer_0 = np.zeros((1,vocab_size))

In [22]:
layer_0.shape

(1, 74074)

In [23]:
# Attribute index to each word 
word2index = {}
for i, word in enumerate(vocab):
    word2index[word] = i

# word2index

In [24]:
def update_input_layer(review):
    global layer_0
    layer_0 *= 0
    for word in review.split(" "):
        layer_0[0][word2index[word]] += 1

In [25]:
update_input_layer(reviews[0])
layer_0

array([[ 18.,   0.,   0., ...,   0.,   0.,   0.]])

In [26]:
def get_target_for_label(label):
    if(label == 'POSITIVE'):
        return 1
    else:
        return 0

In [27]:
# labels[0]
# get_target_for_label(labels[0])

## (3) Building a Neural Network

In [35]:
import time
import sys
import numpy as np

In [36]:
class SentimentNetwork:
    def __init__(self, reviews,labels,hidden_nodes = 10, learning_rate = 0.1):
        """Create a SentimenNetwork with the given settings
        Args:
            reviews(list) - List of reviews used for training
            labels(list) - List of POSITIVE/NEGATIVE labels associated with the given reviews
            hidden_nodes(int) - Number of nodes to create in the hidden layer
            learning_rate(float) - Learning rate to use while training
        
        """
        # Assign a seed to our random number generator to ensure we get
        # reproducable results during development 
        np.random.seed(1)

        # process the reviews and their associated labels so that everything
        # is ready for training
        self.pre_process_data(reviews, labels)
        
        # Build the network to have the number of hidden nodes and the learning rate that
        # were passed into this initializer. Make the same number of input nodes as
        # there are vocabulary words and create a single output node.
        self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate)

    def pre_process_data(self, reviews, labels):
        
        # populate review_vocab with all of the words in the given reviews
        review_vocab = set()
        for review in reviews:
            for word in review.split(" "):
                review_vocab.add(word)

        # Convert the vocabulary set to a list so we can access words via indices
        self.review_vocab = list(review_vocab)
        
        # populate label_vocab with all of the words in the given labels.
        label_vocab = set()
        for label in labels:
            label_vocab.add(label)
        
        # Convert the label vocabulary set to a list so we can access labels via indices
        self.label_vocab = list(label_vocab)
        
        # Store the sizes of the review and label vocabularies.
        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)
        
        # Create a dictionary of words in the vocabulary mapped to index positions
        self.word2index = {}
        for i, word in enumerate(self.review_vocab):
            self.word2index[word] = i
        
        # Create a dictionary of labels mapped to index positions
        self.label2index = {}
        for i, label in enumerate(self.label_vocab):
            self.label2index[label] = i
        
    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        # Set number of nodes in input, hidden and output layers.
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        # Store the learning rate
        self.learning_rate = learning_rate

        # Initialize weights

        # These are the weights between the input layer and the hidden layer.
        self.weights_0_1 = np.zeros((self.input_nodes,self.hidden_nodes))
    
        # These are the weights between the hidden layer and the output layer.
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5, 
                                                (self.hidden_nodes, self.output_nodes))
        
        # The input layer, a two-dimensional matrix with shape 1 x input_nodes
        self.layer_0 = np.zeros((1,input_nodes))
    
    def update_input_layer(self,review):

        # clear out previous state, reset the layer to be all 0s
        self.layer_0 *= 0
        
        for word in review.split(" "):
            
            if(word in self.word2index.keys()):
                self.layer_0[0][self.word2index[word]] += 1
                
    def get_target_for_label(self,label):
        if(label == 'POSITIVE'):
            return 1
        else:
            return 0
        
    def sigmoid(self,x):
        return 1 / (1 + np.exp(-x))
    
    def sigmoid_output_2_derivative(self,output):
        return output * (1 - output)
    
    def train(self, training_reviews, training_labels):
        
        # make sure out we have a matching number of reviews and labels
        assert(len(training_reviews) == len(training_labels))
        
        # Keep track of correct predictions to display accuracy during training 
        correct_so_far = 0

        # Remember when we started for printing time statistics
        start = time.time()
        
        # loop through all the given reviews and run a forward and backward pass,
        # updating weights for every item
        for i in range(len(training_reviews)):
            
            # Get the next review and its correct label
            review = training_reviews[i]
            label = training_labels[i]
            
            #### Implement the forward pass here ####
            ### Forward pass ###

            # Input Layer
            self.update_input_layer(review)

            # Hidden layer
            layer_1 = self.layer_0.dot(self.weights_0_1)

            # Output layer
            layer_2 = self.sigmoid(layer_1.dot(self.weights_1_2))
            
            #### Implement the backward pass here ####
            ### Backward pass ###

            # Output error
            layer_2_error = layer_2 - self.get_target_for_label(label) # Output layer error is the difference between desired target and actual output.
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)

            # Backpropagated error
            layer_1_error = layer_2_delta.dot(self.weights_1_2.T) # errors propagated to the hidden layer
            layer_1_delta = layer_1_error # hidden layer gradients - no nonlinearity so it's the same as the error

            # Update the weights
            self.weights_1_2 -= layer_1.T.dot(layer_2_delta) * self.learning_rate # update hidden-to-output weights with gradient descent step
            self.weights_0_1 -= self.layer_0.T.dot(layer_1_delta) * self.learning_rate # update input-to-hidden weights with gradient descent step

            # Keep track of correct predictions.
            if(layer_2 >= 0.5 and label == 'POSITIVE'):
                correct_so_far += 1
            elif(layer_2 < 0.5 and label == 'NEGATIVE'):
                correct_so_far += 1
            
            # For debug purposes, print out our prediction accuracy and speed 
            # throughout the training process. 
            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) \
                             + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            if(i % 2500 == 0):
                print("")
    
    def test(self, testing_reviews, testing_labels):
        """
        Attempts to predict the labels for the given testing_reviews,
        and uses the test_labels to calculate the accuracy of those predictions.
        """
        
        # keep track of how many correct predictions we make
        correct = 0

        # we'll time how many predictions per second we make
        start = time.time()

        # Loop through each of the given reviews and call run to predict
        # its label. 
        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1
            
            # For debug purposes, print out our prediction accuracy and speed 
            # throughout the prediction process. 

            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct) + " #Tested:" + str(i+1) \
                             + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")
    
    def run(self, review):
        """
        Returns a POSITIVE or NEGATIVE prediction for the given review.
        """
        # Run a forward pass through the network, like in the "train" function.
        
        # Input Layer
        self.update_input_layer(review.lower())

        # Hidden layer
        layer_1 = self.layer_0.dot(self.weights_0_1)

        # Output layer
        layer_2 = self.sigmoid(layer_1.dot(self.weights_1_2))
        
        # Return POSITIVE for values above greater-than-or-equal-to 0.5 in the output layer;
        # return NEGATIVE for other values
        if(layer_2[0] >= 0.5):
            return "POSITIVE"
        else:
            return "NEGATIVE"

In [37]:
# mlp = SentimentNetwork(reviews[:-1000], labels[:-1000], learning_rate = 0.01)

In [38]:
# mlp.test(reviews[-1000:], labels[-1000:])

In [39]:
# mlp.train(reviews[-1000:], labels[-1000:])

In [41]:
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000], learning_rate=0.001)
mlp.train(reviews[:-1000],labels[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):103.0 #Correct:1267 #Trained:2501 Training Accuracy:50.6%
Progress:20.8% Speed(reviews/sec):103.0 #Correct:2629 #Trained:5001 Training Accuracy:52.5%
Progress:31.2% Speed(reviews/sec):99.92 #Correct:4100 #Trained:7501 Training Accuracy:54.6%
Progress:41.6% Speed(reviews/sec):96.92 #Correct:5651 #Trained:10001 Training Accuracy:56.5%
Progress:52.0% Speed(reviews/sec):93.68 #Correct:7205 #Trained:12501 Training Accuracy:57.6%
Progress:62.5% Speed(reviews/sec):91.62 #Correct:8813 #Trained:15001 Training Accuracy:58.7%
Progress:72.9% Speed(reviews/sec):90.48 #Correct:10476 #Trained:17501 Training Accuracy:59.8%
Progress:83.3% Speed(reviews/sec):89.47 #Correct:12130 #Trained:20001 Training Accuracy:60.6%
Progress:93.7% Speed(reviews/sec):88.96 #Correct:13824 #Trained:22501 Training Accuracy:61.4%
Progress:99.9% Speed(reviews/sec):88.39 #Correct:14916 #Trained:24000 Training A

In [28]:
def update_input_layer(review):
    
    global layer_0
    
    # clear out previous state, reset the layer to be all 0s
    layer_0 *= 0
    for word in review.split(" "):
        layer_0[0][word2index[word]] += 1

update_input_layer(reviews[0])

In [29]:
layer_0

array([[ 18.,   0.,   0., ...,   0.,   0.,   0.]])

In [30]:
review_counter = Counter()

In [31]:
for word in reviews[0].split(" "):
    review_counter[word] += 1

In [33]:
review_counter.most_common()[:30]

[('.', 27),
 ('', 18),
 ('the', 9),
 ('to', 6),
 ('high', 5),
 ('i', 5),
 ('bromwell', 4),
 ('is', 4),
 ('a', 4),
 ('teachers', 4),
 ('that', 4),
 ('of', 4),
 ('it', 2),
 ('at', 2),
 ('as', 2),
 ('school', 2),
 ('my', 2),
 ('in', 2),
 ('me', 2),
 ('students', 2),
 ('their', 2),
 ('student', 2),
 ('cartoon', 1),
 ('comedy', 1),
 ('ran', 1),
 ('same', 1),
 ('time', 1),
 ('some', 1),
 ('other', 1),
 ('programs', 1)]

## (4) Update Input Layer --- Improve the performance 
### Reducing the noise of input

In [49]:
import time
import sys
import numpy as np

# Encapsulate our neural network 
class SentimentNetwork:
    def __init__(self, reviews,labels,hidden_nodes = 10, learning_rate = 0.1):
        """Create a SentimenNetwork with the given settings
        Args:
            reviews(list) - List of reviews used for training
            labels(list) - List of POSITIVE/NEGATIVE labels associated with the given reviews
            hidden_nodes(int) - Number of nodes to create in the hidden layer
            learning_rate(float) - Learning rate to use while training
        
        """
        # Assign a seed to our random number generator to ensure we get
        # reproducable results during development 
        np.random.seed(1)

        # process the reviews and their associated labels so that everything
        # is ready for training
        self.pre_process_data(reviews, labels)
        
        # Build the network to have the number of hidden nodes and the learning rate that
        # were passed into this initializer. Make the same number of input nodes as
        # there are vocabulary words and create a single output node.
        self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate)

    def pre_process_data(self, reviews, labels):
        
        # populate review_vocab with all of the words in the given reviews
        review_vocab = set()
        for review in reviews:
            for word in review.split(" "):
                review_vocab.add(word)

        # Convert the vocabulary set to a list so we can access words via indices
        self.review_vocab = list(review_vocab)
        
        # populate label_vocab with all of the words in the given labels.
        label_vocab = set()
        for label in labels:
            label_vocab.add(label)
        
        # Convert the label vocabulary set to a list so we can access labels via indices
        self.label_vocab = list(label_vocab)
        
        # Store the sizes of the review and label vocabularies.
        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)
        
        # Create a dictionary of words in the vocabulary mapped to index positions
        self.word2index = {}
        for i, word in enumerate(self.review_vocab):
            self.word2index[word] = i
        
        # Create a dictionary of labels mapped to index positions
        self.label2index = {}
        for i, label in enumerate(self.label_vocab):
            self.label2index[label] = i
        
    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        # Set number of nodes in input, hidden and output layers.
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        # Store the learning rate
        self.learning_rate = learning_rate

        # Initialize weights

        # These are the weights between the input layer and the hidden layer.
        self.weights_0_1 = np.zeros((self.input_nodes,self.hidden_nodes))
    
        # These are the weights between the hidden layer and the output layer.
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5, 
                                                (self.hidden_nodes, self.output_nodes))
        
        # The input layer, a two-dimensional matrix with shape 1 x input_nodes
        self.layer_0 = np.zeros((1,input_nodes))
    
        
    def update_input_layer(self,review):

        # clear out previous state, reset the layer to be all 0s
        self.layer_0 *= 0
        
        for word in review.split(" "):
            
            if(word in self.word2index.keys()):
                # changed to set to 1 instead of add 1
                self.layer_0[0][self.word2index[word]] = 1
                
    def get_target_for_label(self,label):
        if(label == 'POSITIVE'):
            return 1
        else:
            return 0
        
    def sigmoid(self,x):
        return 1 / (1 + np.exp(-x))
    
    def sigmoid_output_2_derivative(self,output):
        return output * (1 - output)
    
    def train(self, training_reviews, training_labels):
        
        # make sure out we have a matching number of reviews and labels
        assert(len(training_reviews) == len(training_labels))
        
        # Keep track of correct predictions to display accuracy during training 
        correct_so_far = 0

        # Remember when we started for printing time statistics
        start = time.time()
        
        # loop through all the given reviews and run a forward and backward pass,
        # updating weights for every item
        for i in range(len(training_reviews)):
            
            # Get the next review and its correct label
            review = training_reviews[i]
            label = training_labels[i]
            
            #### Implement the forward pass here ####
            ### Forward pass ###

            # Input Layer
            self.update_input_layer(review)

            # Hidden layer
            layer_1 = self.layer_0.dot(self.weights_0_1)

            # Output layer
            layer_2 = self.sigmoid(layer_1.dot(self.weights_1_2))
            
            #### Implement the backward pass here ####
            ### Backward pass ###

            # Output error
            layer_2_error = layer_2 - self.get_target_for_label(label) # Output layer error is the difference between desired target and actual output.
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)

            # Backpropagated error
            layer_1_error = layer_2_delta.dot(self.weights_1_2.T) # errors propagated to the hidden layer
            layer_1_delta = layer_1_error # hidden layer gradients - no nonlinearity so it's the same as the error

            # Update the weights
            self.weights_1_2 -= layer_1.T.dot(layer_2_delta) * self.learning_rate # update hidden-to-output weights with gradient descent step
            self.weights_0_1 -= self.layer_0.T.dot(layer_1_delta) * self.learning_rate # update input-to-hidden weights with gradient descent step

            # Keep track of correct predictions.
            if(layer_2 >= 0.5 and label == 'POSITIVE'):
                correct_so_far += 1
            elif(layer_2 < 0.5 and label == 'NEGATIVE'):
                correct_so_far += 1
            
            # For debug purposes, print out our prediction accuracy and speed 
            # throughout the training process. 
            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) \
                             + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            if(i % 2500 == 0):
                print("")
    
    def test(self, testing_reviews, testing_labels):
        """
        Attempts to predict the labels for the given testing_reviews,
        and uses the test_labels to calculate the accuracy of those predictions.
        """
        
        # keep track of how many correct predictions we make
        correct = 0

        # we'll time how many predictions per second we make
        start = time.time()

        # Loop through each of the given reviews and call run to predict
        # its label. 
        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1
            
            # For debug purposes, print out our prediction accuracy and speed 
            # throughout the prediction process. 

            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct) + " #Tested:" + str(i+1) \
                             + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")
    
    def run(self, review):
        """
        Returns a POSITIVE or NEGATIVE prediction for the given review.
        """
        # Run a forward pass through the network, like in the "train" function.
        
        # Input Layer
        self.update_input_layer(review.lower())

        # Hidden layer
        layer_1 = self.layer_0.dot(self.weights_0_1)

        # Output layer
        layer_2 = self.sigmoid(layer_1.dot(self.weights_1_2))
        
        # Return POSITIVE for values above greater-than-or-equal-to 0.5 in the output layer;
        # return NEGATIVE for other values
        if(layer_2[0] >= 0.5):
            return "POSITIVE"
        else:
            return "NEGATIVE"
        

In [50]:
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000], learning_rate=0.1)
mlp.train(reviews[:-1000],labels[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):135.8 #Correct:1814 #Trained:2501 Training Accuracy:72.5%
Progress:20.8% Speed(reviews/sec):134.5 #Correct:3801 #Trained:5001 Training Accuracy:76.0%
Progress:31.2% Speed(reviews/sec):130.8 #Correct:5875 #Trained:7501 Training Accuracy:78.3%
Progress:41.6% Speed(reviews/sec):126.2 #Correct:7999 #Trained:10001 Training Accuracy:79.9%
Progress:52.0% Speed(reviews/sec):121.3 #Correct:10130 #Trained:12501 Training Accuracy:81.0%
Progress:62.5% Speed(reviews/sec):117.1 #Correct:12261 #Trained:15001 Training Accuracy:81.7%
Progress:72.9% Speed(reviews/sec):113.4 #Correct:14375 #Trained:17501 Training Accuracy:82.1%
Progress:83.3% Speed(reviews/sec):110.8 #Correct:16553 #Trained:20001 Training Accuracy:82.7%
Progress:93.7% Speed(reviews/sec):108.8 #Correct:18751 #Trained:22501 Training Accuracy:83.3%
Progress:99.9% Speed(reviews/sec):107.1 #Correct:20075 #Trained:24000 Training

In [51]:
mlp.test(reviews[-1000:],labels[-1000:])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Tested:1 Testing Accuracy:100.%Progress:0.1% Speed(reviews/sec):368.8 #Correct:1 #Tested:2 Testing Accuracy:50.0%Progress:0.2% Speed(reviews/sec):572.9 #Correct:1 #Tested:3 Testing Accuracy:33.3%Progress:0.3% Speed(reviews/sec):649.7 #Correct:2 #Tested:4 Testing Accuracy:50.0%Progress:0.4% Speed(reviews/sec):727.0 #Correct:3 #Tested:5 Testing Accuracy:60.0%Progress:0.5% Speed(reviews/sec):769.8 #Correct:4 #Tested:6 Testing Accuracy:66.6%Progress:0.6% Speed(reviews/sec):779.5 #Correct:5 #Tested:7 Testing Accuracy:71.4%Progress:0.7% Speed(reviews/sec):807.6 #Correct:6 #Tested:8 Testing Accuracy:75.0%Progress:0.8% Speed(reviews/sec):829.3 #Correct:7 #Tested:9 Testing Accuracy:77.7%Progress:0.9% Speed(reviews/sec):768.4 #Correct:8 #Tested:10 Testing Accuracy:80.0%Progress:1.0% Speed(reviews/sec):792.3 #Correct:9 #Tested:11 Testing Accuracy:81.8%Progress:1.1% Speed(reviews/sec):818.1 #Correct:10 #Tested:12 Testing Accuracy:83.3%P

Progress:19.9% Speed(reviews/sec):977.3 #Correct:174 #Tested:200 Testing Accuracy:87.0%Progress:20.0% Speed(reviews/sec):978.2 #Correct:175 #Tested:201 Testing Accuracy:87.0%Progress:20.1% Speed(reviews/sec):977.5 #Correct:176 #Tested:202 Testing Accuracy:87.1%Progress:20.2% Speed(reviews/sec):978.6 #Correct:177 #Tested:203 Testing Accuracy:87.1%Progress:20.3% Speed(reviews/sec):978.8 #Correct:178 #Tested:204 Testing Accuracy:87.2%Progress:20.4% Speed(reviews/sec):979.7 #Correct:179 #Tested:205 Testing Accuracy:87.3%Progress:20.5% Speed(reviews/sec):980.2 #Correct:180 #Tested:206 Testing Accuracy:87.3%Progress:20.6% Speed(reviews/sec):981.1 #Correct:181 #Tested:207 Testing Accuracy:87.4%Progress:20.7% Speed(reviews/sec):982.3 #Correct:182 #Tested:208 Testing Accuracy:87.5%Progress:20.8% Speed(reviews/sec):982.7 #Correct:183 #Tested:209 Testing Accuracy:87.5%Progress:20.9% Speed(reviews/sec):980.3 #Correct:183 #Tested:210 Testing Accuracy:87.1%Progress:21.0% Speed(reviews/se

Progress:39.7% Speed(reviews/sec):981.0 #Correct:347 #Tested:398 Testing Accuracy:87.1%Progress:39.8% Speed(reviews/sec):976.2 #Correct:347 #Tested:399 Testing Accuracy:86.9%Progress:39.9% Speed(reviews/sec):975.0 #Correct:348 #Tested:400 Testing Accuracy:87.0%Progress:40.0% Speed(reviews/sec):972.0 #Correct:349 #Tested:401 Testing Accuracy:87.0%Progress:40.1% Speed(reviews/sec):971.2 #Correct:350 #Tested:402 Testing Accuracy:87.0%Progress:40.2% Speed(reviews/sec):969.7 #Correct:351 #Tested:403 Testing Accuracy:87.0%Progress:40.3% Speed(reviews/sec):970.3 #Correct:351 #Tested:404 Testing Accuracy:86.8%Progress:40.4% Speed(reviews/sec):971.0 #Correct:352 #Tested:405 Testing Accuracy:86.9%Progress:40.5% Speed(reviews/sec):971.6 #Correct:353 #Tested:406 Testing Accuracy:86.9%Progress:40.6% Speed(reviews/sec):971.8 #Correct:354 #Tested:407 Testing Accuracy:86.9%Progress:40.7% Speed(reviews/sec):972.5 #Correct:355 #Tested:408 Testing Accuracy:87.0%Progress:40.8% Speed(reviews/se

Progress:56.9% Speed(reviews/sec):933.7 #Correct:501 #Tested:570 Testing Accuracy:87.8%Progress:57.0% Speed(reviews/sec):932.9 #Correct:501 #Tested:571 Testing Accuracy:87.7%Progress:57.1% Speed(reviews/sec):932.7 #Correct:502 #Tested:572 Testing Accuracy:87.7%Progress:57.2% Speed(reviews/sec):931.8 #Correct:503 #Tested:573 Testing Accuracy:87.7%Progress:57.3% Speed(reviews/sec):931.7 #Correct:504 #Tested:574 Testing Accuracy:87.8%Progress:57.4% Speed(reviews/sec):931.5 #Correct:505 #Tested:575 Testing Accuracy:87.8%Progress:57.5% Speed(reviews/sec):931.6 #Correct:505 #Tested:576 Testing Accuracy:87.6%Progress:57.6% Speed(reviews/sec):932.1 #Correct:506 #Tested:577 Testing Accuracy:87.6%Progress:57.7% Speed(reviews/sec):932.6 #Correct:507 #Tested:578 Testing Accuracy:87.7%Progress:57.8% Speed(reviews/sec):931.8 #Correct:507 #Tested:579 Testing Accuracy:87.5%Progress:57.9% Speed(reviews/sec):932.1 #Correct:508 #Tested:580 Testing Accuracy:87.5%Progress:58.0% Speed(reviews/se

Progress:76.4% Speed(reviews/sec):941.0 #Correct:651 #Tested:765 Testing Accuracy:85.0%Progress:76.5% Speed(reviews/sec):940.9 #Correct:651 #Tested:766 Testing Accuracy:84.9%Progress:76.6% Speed(reviews/sec):940.8 #Correct:651 #Tested:767 Testing Accuracy:84.8%Progress:76.7% Speed(reviews/sec):941.0 #Correct:652 #Tested:768 Testing Accuracy:84.8%Progress:76.8% Speed(reviews/sec):941.1 #Correct:652 #Tested:769 Testing Accuracy:84.7%Progress:76.9% Speed(reviews/sec):941.2 #Correct:653 #Tested:770 Testing Accuracy:84.8%Progress:77.0% Speed(reviews/sec):939.9 #Correct:653 #Tested:771 Testing Accuracy:84.6%Progress:77.1% Speed(reviews/sec):939.8 #Correct:653 #Tested:772 Testing Accuracy:84.5%Progress:77.2% Speed(reviews/sec):939.9 #Correct:654 #Tested:773 Testing Accuracy:84.6%Progress:77.3% Speed(reviews/sec):939.2 #Correct:655 #Tested:774 Testing Accuracy:84.6%Progress:77.4% Speed(reviews/sec):939.1 #Correct:656 #Tested:775 Testing Accuracy:84.6%Progress:77.5% Speed(reviews/se

Progress:97.0% Speed(reviews/sec):956.6 #Correct:820 #Tested:971 Testing Accuracy:84.4%Progress:97.1% Speed(reviews/sec):956.7 #Correct:821 #Tested:972 Testing Accuracy:84.4%Progress:97.2% Speed(reviews/sec):956.9 #Correct:822 #Tested:973 Testing Accuracy:84.4%Progress:97.3% Speed(reviews/sec):956.7 #Correct:823 #Tested:974 Testing Accuracy:84.4%Progress:97.4% Speed(reviews/sec):956.6 #Correct:824 #Tested:975 Testing Accuracy:84.5%Progress:97.5% Speed(reviews/sec):956.4 #Correct:825 #Tested:976 Testing Accuracy:84.5%Progress:97.6% Speed(reviews/sec):956.4 #Correct:826 #Tested:977 Testing Accuracy:84.5%Progress:97.7% Speed(reviews/sec):956.0 #Correct:826 #Tested:978 Testing Accuracy:84.4%Progress:97.8% Speed(reviews/sec):956.3 #Correct:827 #Tested:979 Testing Accuracy:84.4%Progress:97.9% Speed(reviews/sec):956.4 #Correct:828 #Tested:980 Testing Accuracy:84.4%Progress:98.0% Speed(reviews/sec):956.6 #Correct:829 #Tested:981 Testing Accuracy:84.5%Progress:98.1% Speed(reviews/se

In [52]:
import time
import sys
import numpy as np

# Encapsulate our neural network
class SentimentNetwork:
    def __init__(self, reviews,labels,hidden_nodes = 10, learning_rate = 0.1):
        """Create a SentimenNetwork with the given settings
        Args:
            reviews(list) - List of reviews used for training
            labels(list) - List of POSITIVE/NEGATIVE labels associated with the given reviews
            hidden_nodes(int) - Number of nodes to create in the hidden layer
            learning_rate(float) - Learning rate to use while training
        
        """
        # Assign a seed to our random number generator to ensure we get
        # reproducable results during development 
        np.random.seed(1)

        # process the reviews and their associated labels so that everything
        # is ready for training
        self.pre_process_data(reviews, labels)
        
        # Build the network to have the number of hidden nodes and the learning rate that
        # were passed into this initializer. Make the same number of input nodes as
        # there are vocabulary words and create a single output node.
        self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate)

    def pre_process_data(self, reviews, labels):
        
        # populate review_vocab with all of the words in the given reviews
        review_vocab = set()
        for review in reviews:
            for word in review.split(" "):
                review_vocab.add(word)

        # Convert the vocabulary set to a list so we can access words via indices
        self.review_vocab = list(review_vocab)
        
        # populate label_vocab with all of the words in the given labels.
        label_vocab = set()
        for label in labels:
            label_vocab.add(label)
        
        # Convert the label vocabulary set to a list so we can access labels via indices
        self.label_vocab = list(label_vocab)
        
        # Store the sizes of the review and label vocabularies.
        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)
        
        # Create a dictionary of words in the vocabulary mapped to index positions
        self.word2index = {}
        for i, word in enumerate(self.review_vocab):
            self.word2index[word] = i
        
        # Create a dictionary of labels mapped to index positions
        self.label2index = {}
        for i, label in enumerate(self.label_vocab):
            self.label2index[label] = i

    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        # Set number of nodes in input, hidden and output layers.
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        # Store the learning rate
        self.learning_rate = learning_rate

        # Initialize weights

        # These are the weights between the input layer and the hidden layer.
        self.weights_0_1 = np.zeros((self.input_nodes,self.hidden_nodes))

        # These are the weights between the hidden layer and the output layer.
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5, 
                                                (self.hidden_nodes, self.output_nodes))
        
        ## Removed self.layer_0; added self.layer_1
        # The input layer, a two-dimensional matrix with shape 1 x hidden_nodes
        self.layer_1 = np.zeros((1,hidden_nodes))
    
    ## Removed update_input_layer function
    
    def get_target_for_label(self,label):
        if(label == 'POSITIVE'):
            return 1
        else:
            return 0
        
    def sigmoid(self,x):
        return 1 / (1 + np.exp(-x))
    
    def sigmoid_output_2_derivative(self,output):
        return output * (1 - output)
    
    ##  changed name of first parameter form 'training_reviews' 
    #   to 'training_reviews_raw'
    def train(self, training_reviews_raw, training_labels):

        ## pre-process training reviews so we can deal 
        #  directly with the indices of non-zero inputs
        training_reviews = list()
        for review in training_reviews_raw:
            indices = set()
            for word in review.split(" "):
                if(word in self.word2index.keys()):
                    indices.add(self.word2index[word])
            training_reviews.append(list(indices))

        # make sure out we have a matching number of reviews and labels
        assert(len(training_reviews) == len(training_labels))
        
        # Keep track of correct predictions to display accuracy during training 
        correct_so_far = 0

        # Remember when we started for printing time statistics
        start = time.time()
        
        # loop through all the given reviews and run a forward and backward pass,
        # updating weights for every item
        for i in range(len(training_reviews)):
            
            # Get the next review and its correct label
            review = training_reviews[i]
            label = training_labels[i]
            
            #### Implement the forward pass here ####
            ### Forward pass ###

            ##  Removed call to 'update_input_layer' function
            #   because 'layer_0' is no longer used

            # Hidden layer
            ##  Add in only the weights for non-zero items
            self.layer_1 *= 0
            for index in review:
                self.layer_1 += self.weights_0_1[index]

            # Output layer
            ##  changed to use 'self.layer_1' instead of 'local layer_1'
            layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))            
            
            #### Implement the backward pass here ####
            ### Backward pass ###

            # Output error
            layer_2_error = layer_2 - self.get_target_for_label(label) # Output layer error is the difference between desired target and actual output.
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)

            # Backpropagated error
            layer_1_error = layer_2_delta.dot(self.weights_1_2.T) # errors propagated to the hidden layer
            layer_1_delta = layer_1_error # hidden layer gradients - no nonlinearity so it's the same as the error

            # Update the weights
            ## changed to use 'self.layer_1' instead of local 'layer_1'
            self.weights_1_2 -= self.layer_1.T.dot(layer_2_delta) * self.learning_rate # update hidden-to-output weights with gradient descent step
            
            ## Only update the weights that were used in the forward pass
            for index in review:
                self.weights_0_1[index] -= layer_1_delta[0] * self.learning_rate # update input-to-hidden weights with gradient descent step

            # Keep track of correct predictions.
            if(layer_2 >= 0.5 and label == 'POSITIVE'):
                correct_so_far += 1
            elif(layer_2 < 0.5 and label == 'NEGATIVE'):
                correct_so_far += 1
            
            # For debug purposes, print out our prediction accuracy and speed 
            # throughout the training process. 
            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) \
                             + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            if(i % 2500 == 0):
                print("")
    
    def test(self, testing_reviews, testing_labels):
        """
        Attempts to predict the labels for the given testing_reviews,
        and uses the test_labels to calculate the accuracy of those predictions.
        """
        
        # keep track of how many correct predictions we make
        correct = 0

        # we'll time how many predictions per second we make
        start = time.time()

        # Loop through each of the given reviews and call run to predict
        # its label. 
        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1
            
            # For debug purposes, print out our prediction accuracy and speed 
            # throughout the prediction process. 

            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct) + " #Tested:" + str(i+1) \
                             + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")
    
    def run(self, review):
        """
        Returns a POSITIVE or NEGATIVE prediction for the given review.
        """
        # Run a forward pass through the network, like in the "train" function.
        
        ## New for Project 5: Removed call to update_input_layer function
        #                     because layer_0 is no longer used

        # Hidden layer
        ## New for Project 5: Identify the indices used in the review and then add
        #                     just those weights to layer_1 
        self.layer_1 *= 0
        unique_indices = set()
        for word in review.lower().split(" "):
            if word in self.word2index.keys():
                unique_indices.add(self.word2index[word])
        for index in unique_indices:
            self.layer_1 += self.weights_0_1[index]
        
        # Output layer
        ## 
        layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))
        
        # Return POSITIVE for values above greater-than-or-equal-to 0.5 in the output layer;
        # return NEGATIVE for other values
        if(layer_2[0] >= 0.5):
            return "POSITIVE"
        else:
            return "NEGATIVE"


In [53]:
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000], learning_rate=0.1)
mlp.train(reviews[:-1000],labels[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):916.6 #Correct:1802 #Trained:2501 Training Accuracy:72.0%
Progress:20.8% Speed(reviews/sec):884.2 #Correct:3794 #Trained:5001 Training Accuracy:75.8%
Progress:31.2% Speed(reviews/sec):873.3 #Correct:5883 #Trained:7501 Training Accuracy:78.4%
Progress:41.6% Speed(reviews/sec):878.3 #Correct:8021 #Trained:10001 Training Accuracy:80.2%
Progress:52.0% Speed(reviews/sec):879.2 #Correct:10142 #Trained:12501 Training Accuracy:81.1%
Progress:62.5% Speed(reviews/sec):886.7 #Correct:12272 #Trained:15001 Training Accuracy:81.8%
Progress:72.9% Speed(reviews/sec):885.4 #Correct:14400 #Trained:17501 Training Accuracy:82.2%
Progress:83.3% Speed(reviews/sec):884.2 #Correct:16584 #Trained:20001 Training Accuracy:82.9%
Progress:93.7% Speed(reviews/sec):877.3 #Correct:18771 #Trained:22501 Training Accuracy:83.4%
Progress:99.9% Speed(reviews/sec):875.5 #Correct:20094 #Trained:24000 Training

In [54]:
mlp.test(reviews[-1000:],labels[-1000:])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Tested:1 Testing Accuracy:100.%Progress:0.1% Speed(reviews/sec):684.0 #Correct:2 #Tested:2 Testing Accuracy:100.%Progress:0.2% Speed(reviews/sec):1166. #Correct:2 #Tested:3 Testing Accuracy:66.6%Progress:0.3% Speed(reviews/sec):1341. #Correct:3 #Tested:4 Testing Accuracy:75.0%Progress:0.4% Speed(reviews/sec):1442. #Correct:4 #Tested:5 Testing Accuracy:80.0%Progress:0.5% Speed(reviews/sec):1518. #Correct:5 #Tested:6 Testing Accuracy:83.3%Progress:0.6% Speed(reviews/sec):1451. #Correct:6 #Tested:7 Testing Accuracy:85.7%Progress:0.7% Speed(reviews/sec):1470. #Correct:7 #Tested:8 Testing Accuracy:87.5%Progress:0.8% Speed(reviews/sec):1525. #Correct:8 #Tested:9 Testing Accuracy:88.8%Progress:0.9% Speed(reviews/sec):1252. #Correct:9 #Tested:10 Testing Accuracy:90.0%Progress:1.0% Speed(reviews/sec):1228. #Correct:10 #Tested:11 Testing Accuracy:90.9%Progress:1.1% Speed(reviews/sec):1277. #Correct:11 #Tested:12 Testing Accuracy:91.6%

Progress:24.4% Speed(reviews/sec):1263. #Correct:218 #Tested:245 Testing Accuracy:88.9%Progress:24.5% Speed(reviews/sec):1265. #Correct:219 #Tested:246 Testing Accuracy:89.0%Progress:24.6% Speed(reviews/sec):1267. #Correct:220 #Tested:247 Testing Accuracy:89.0%Progress:24.7% Speed(reviews/sec):1268. #Correct:221 #Tested:248 Testing Accuracy:89.1%Progress:24.8% Speed(reviews/sec):1272. #Correct:222 #Tested:249 Testing Accuracy:89.1%Progress:24.9% Speed(reviews/sec):1274. #Correct:222 #Tested:250 Testing Accuracy:88.8%Progress:25.0% Speed(reviews/sec):1276. #Correct:223 #Tested:251 Testing Accuracy:88.8%Progress:25.1% Speed(reviews/sec):1277. #Correct:224 #Tested:252 Testing Accuracy:88.8%Progress:25.2% Speed(reviews/sec):1279. #Correct:225 #Tested:253 Testing Accuracy:88.9%Progress:25.3% Speed(reviews/sec):1280. #Correct:226 #Tested:254 Testing Accuracy:88.9%Progress:25.4% Speed(reviews/sec):1282. #Correct:227 #Tested:255 Testing Accuracy:89.0%Progress:25.5% Speed(reviews/se

Progress:48.4% Speed(reviews/sec):1235. #Correct:434 #Tested:485 Testing Accuracy:89.4%Progress:48.5% Speed(reviews/sec):1234. #Correct:435 #Tested:486 Testing Accuracy:89.5%Progress:48.6% Speed(reviews/sec):1231. #Correct:436 #Tested:487 Testing Accuracy:89.5%Progress:48.7% Speed(reviews/sec):1231. #Correct:436 #Tested:488 Testing Accuracy:89.3%Progress:48.8% Speed(reviews/sec):1230. #Correct:437 #Tested:489 Testing Accuracy:89.3%Progress:48.9% Speed(reviews/sec):1225. #Correct:437 #Tested:490 Testing Accuracy:89.1%Progress:49.0% Speed(reviews/sec):1224. #Correct:438 #Tested:491 Testing Accuracy:89.2%Progress:49.1% Speed(reviews/sec):1225. #Correct:439 #Tested:492 Testing Accuracy:89.2%Progress:49.2% Speed(reviews/sec):1224. #Correct:440 #Tested:493 Testing Accuracy:89.2%Progress:49.3% Speed(reviews/sec):1223. #Correct:441 #Tested:494 Testing Accuracy:89.2%Progress:49.4% Speed(reviews/sec):1224. #Correct:442 #Tested:495 Testing Accuracy:89.2%Progress:49.5% Speed(reviews/se

Progress:79.3% Speed(reviews/sec):1228. #Correct:677 #Tested:794 Testing Accuracy:85.2%Progress:79.4% Speed(reviews/sec):1228. #Correct:678 #Tested:795 Testing Accuracy:85.2%Progress:79.5% Speed(reviews/sec):1229. #Correct:679 #Tested:796 Testing Accuracy:85.3%Progress:79.6% Speed(reviews/sec):1229. #Correct:680 #Tested:797 Testing Accuracy:85.3%Progress:79.7% Speed(reviews/sec):1230. #Correct:681 #Tested:798 Testing Accuracy:85.3%Progress:79.8% Speed(reviews/sec):1230. #Correct:682 #Tested:799 Testing Accuracy:85.3%Progress:79.9% Speed(reviews/sec):1231. #Correct:683 #Tested:800 Testing Accuracy:85.3%Progress:80.0% Speed(reviews/sec):1232. #Correct:684 #Tested:801 Testing Accuracy:85.3%Progress:80.1% Speed(reviews/sec):1233. #Correct:685 #Tested:802 Testing Accuracy:85.4%Progress:80.2% Speed(reviews/sec):1233. #Correct:686 #Tested:803 Testing Accuracy:85.4%Progress:80.3% Speed(reviews/sec):1234. #Correct:687 #Tested:804 Testing Accuracy:85.4%Progress:80.4% Speed(reviews/se

In [55]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

In [56]:
hist, edges = np.histogram(list(map(lambda x:x[1],pos_neg_ratios.most_common())), density=True, bins=100, normed=True)

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="Word Positive/Negative Affinity Distribution")
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555")
show(p)

In [57]:
frequency_frequency = Counter()

for word, cnt in total_counts.most_common():
    frequency_frequency[cnt] += 1

In [58]:
hist, edges = np.histogram(list(map(lambda x:x[1],frequency_frequency.most_common())), density=True, bins=100, normed=True)

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="The frequency distribution of the words in our corpus")
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555")
show(p)

In [59]:
import time
import sys
import numpy as np

# Encapsulate our neural network 
class SentimentNetwork:
    ##  added min_count and polarity_cutoff parameters
    def __init__(self, reviews,labels,min_count = 10,polarity_cutoff = 0.1,hidden_nodes = 10, learning_rate = 0.1):
        """Create a SentimenNetwork with the given settings
        Args:
            reviews(list) - List of reviews used for training
            labels(list) - List of POSITIVE/NEGATIVE labels associated with the given reviews
            min_count(int) - Words should only be added to the vocabulary 
                             if they occur more than this many times
            polarity_cutoff(float) - The absolute value of a word's positive-to-negative
                                     ratio must be at least this big to be considered.
            hidden_nodes(int) - Number of nodes to create in the hidden layer
            learning_rate(float) - Learning rate to use while training
        
        """
        # Assign a seed to our random number generator to ensure we get
        # reproducable results during development 
        np.random.seed(1)

        # process the reviews and their associated labels so that everything
        # is ready for training
        # added min_count and polarity_cutoff arguments to pre_process_data call
        self.pre_process_data(reviews, labels, polarity_cutoff, min_count)
        
        # Build the network to have the number of hidden nodes and the learning rate that
        # were passed into this initializer. Make the same number of input nodes as
        # there are vocabulary words and create a single output node.
        self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate)

    ## added min_count and polarity_cutoff parameters
    def pre_process_data(self, reviews, labels, polarity_cutoff, min_count):
        
        ## ----------------------------------------
        ##  Calculate positive-to-negative ratios for words before
        #   building vocabulary
        #
        positive_counts = Counter()
        negative_counts = Counter()
        total_counts = Counter()

        for i in range(len(reviews)):
            if(labels[i] == 'POSITIVE'):
                for word in reviews[i].split(" "):
                    positive_counts[word] += 1
                    total_counts[word] += 1
            else:
                for word in reviews[i].split(" "):
                    negative_counts[word] += 1
                    total_counts[word] += 1

        pos_neg_ratios = Counter()

        for term,cnt in list(total_counts.most_common()):
            if(cnt >= 50):
                pos_neg_ratio = positive_counts[term] / float(negative_counts[term]+1)
                pos_neg_ratios[term] = pos_neg_ratio

        for word,ratio in pos_neg_ratios.most_common():
            if(ratio > 1):
                pos_neg_ratios[word] = np.log(ratio)
            else:
                pos_neg_ratios[word] = -np.log((1 / (ratio + 0.01)))
        #
        ## 
        ## ----------------------------------------

        # populate review_vocab with all of the words in the given reviews
        review_vocab = set()
        for review in reviews:
            for word in review.split(" "):
                ##                    only add words that occur at least min_count times
                #                     and for words with pos/neg ratios, only add words
                #                     that meet the polarity_cutoff
                if(total_counts[word] > min_count):
                    if(word in pos_neg_ratios.keys()):
                        if((pos_neg_ratios[word] >= polarity_cutoff) or (pos_neg_ratios[word] <= -polarity_cutoff)):
                            review_vocab.add(word)
                    else:
                        review_vocab.add(word)

        # Convert the vocabulary set to a list so we can access words via indices
        self.review_vocab = list(review_vocab)
        
        # populate label_vocab with all of the words in the given labels.
        label_vocab = set()
        for label in labels:
            label_vocab.add(label)
        
        # Convert the label vocabulary set to a list so we can access labels via indices
        self.label_vocab = list(label_vocab)
        
        # Store the sizes of the review and label vocabularies.
        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)
        
        # Create a dictionary of words in the vocabulary mapped to index positions
        self.word2index = {}
        for i, word in enumerate(self.review_vocab):
            self.word2index[word] = i
        
        # Create a dictionary of labels mapped to index positions
        self.label2index = {}
        for i, label in enumerate(self.label_vocab):
            self.label2index[label] = i

    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        # Set number of nodes in input, hidden and output layers.
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        # Store the learning rate
        self.learning_rate = learning_rate

        # Initialize weights

        # These are the weights between the input layer and the hidden layer.
        self.weights_0_1 = np.zeros((self.input_nodes,self.hidden_nodes))

        # These are the weights between the hidden layer and the output layer.
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5, 
                                                (self.hidden_nodes, self.output_nodes))
        
        ## 
        # The input layer, a two-dimensional matrix with shape 1 x hidden_nodes
        self.layer_1 = np.zeros((1,hidden_nodes))
    
    ## New for Project 5: Removed update_input_layer function
    
    def get_target_for_label(self,label):
        if(label == 'POSITIVE'):
            return 1
        else:
            return 0
        
    def sigmoid(self,x):
        return 1 / (1 + np.exp(-x))
    
    def sigmoid_output_2_derivative(self,output):
        return output * (1 - output)
    
    ## 
    #   
    def train(self, training_reviews_raw, training_labels):

        ## New for Project 5: pre-process training reviews so we can deal 
        #                     directly with the indices of non-zero inputs
        training_reviews = list()
        for review in training_reviews_raw:
            indices = set()
            for word in review.split(" "):
                if(word in self.word2index.keys()):
                    indices.add(self.word2index[word])
            training_reviews.append(list(indices))

        # make sure out we have a matching number of reviews and labels
        assert(len(training_reviews) == len(training_labels))
        
        # Keep track of correct predictions to display accuracy during training 
        correct_so_far = 0

        # Remember when we started for printing time statistics
        start = time.time()
        
        # loop through all the given reviews and run a forward and backward pass,
        # updating weights for every item
        for i in range(len(training_reviews)):
            
            # Get the next review and its correct label
            review = training_reviews[i]
            label = training_labels[i]
            
            #### Implement the forward pass here ####
            ### Forward pass ###

            ## New for Project 5: Removed call to 'update_input_layer' function
            #                     because 'layer_0' is no longer used

            # Hidden layer
            ## New for Project 5: Add in only the weights for non-zero items
            self.layer_1 *= 0
            for index in review:
                self.layer_1 += self.weights_0_1[index]

            # Output layer
            ## 
            layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))            
            
            #### Implement the backward pass here ####
            ### Backward pass ###

            # Output error
            layer_2_error = layer_2 - self.get_target_for_label(label) # Output layer error is the difference between desired target and actual output.
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)

            # Backpropagated error
            layer_1_error = layer_2_delta.dot(self.weights_1_2.T) # errors propagated to the hidden layer
            layer_1_delta = layer_1_error # hidden layer gradients - no nonlinearity so it's the same as the error

            # Update the weights
            ## New for Project 5: changed to use 'self.layer_1' instead of local 'layer_1'
            self.weights_1_2 -= self.layer_1.T.dot(layer_2_delta) * self.learning_rate # update hidden-to-output weights with gradient descent step
            
            ## 
            for index in review:
                self.weights_0_1[index] -= layer_1_delta[0] * self.learning_rate # update input-to-hidden weights with gradient descent step

            # Keep track of correct predictions.
            if(layer_2 >= 0.5 and label == 'POSITIVE'):
                correct_so_far += 1
            elif(layer_2 < 0.5 and label == 'NEGATIVE'):
                correct_so_far += 1
            
            # For debug purposes, print out our prediction accuracy and speed 
            # throughout the training process. 
            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) \
                             + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            if(i % 2500 == 0):
                print("")
    
    def test(self, testing_reviews, testing_labels):
        """
        Attempts to predict the labels for the given testing_reviews,
        and uses the test_labels to calculate the accuracy of those predictions.
        """
        
        # keep track of how many correct predictions we make
        correct = 0

        # we'll time how many predictions per second we make
        start = time.time()

        # Loop through each of the given reviews and call run to predict
        # its label. 
        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1
            
            # For debug purposes, print out our prediction accuracy and speed 
            # throughout the prediction process. 

            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct) + " #Tested:" + str(i+1) \
                             + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")
    
    def run(self, review):
        """
        Returns a POSITIVE or NEGATIVE prediction for the given review.
        """
        # Run a forward pass through the network, like in the "train" function.
        
        ## New for Project 5: Removed call to update_input_layer function
        #                     because layer_0 is no longer used

        # Hidden layer
        ## Identify the indices used in the review and then add
        #  just those weights to layer_1 
        self.layer_1 *= 0
        unique_indices = set()
        for word in review.lower().split(" "):
            if word in self.word2index.keys():
                unique_indices.add(self.word2index[word])
        for index in unique_indices:
            self.layer_1 += self.weights_0_1[index]
        
        # Output layer
        ## changed to use self.layer_1 instead of local layer_1
        layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))
         
        # Return POSITIVE for values above greater-than-or-equal-to 0.5 in the output layer;
        # return NEGATIVE for other values
        if(layer_2[0] >= 0.5):
            return "POSITIVE"
        else:
            return "NEGATIVE"


In [60]:
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=20,polarity_cutoff=0.05,learning_rate=0.01)
mlp.train(reviews[:-1000],labels[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):1058. #Correct:1994 #Trained:2501 Training Accuracy:79.7%
Progress:20.8% Speed(reviews/sec):1038. #Correct:4063 #Trained:5001 Training Accuracy:81.2%
Progress:31.2% Speed(reviews/sec):1036. #Correct:6176 #Trained:7501 Training Accuracy:82.3%
Progress:41.6% Speed(reviews/sec):1042. #Correct:8336 #Trained:10001 Training Accuracy:83.3%
Progress:52.0% Speed(reviews/sec):1037. #Correct:10501 #Trained:12501 Training Accuracy:84.0%
Progress:62.5% Speed(reviews/sec):1036. #Correct:12641 #Trained:15001 Training Accuracy:84.2%
Progress:72.9% Speed(reviews/sec):1032. #Correct:14782 #Trained:17501 Training Accuracy:84.4%
Progress:83.3% Speed(reviews/sec):1029. #Correct:16954 #Trained:20001 Training Accuracy:84.7%
Progress:93.7% Speed(reviews/sec):1022. #Correct:19143 #Trained:22501 Training Accuracy:85.0%
Progress:99.9% Speed(reviews/sec):1022. #Correct:20461 #Trained:24000 Training

In [61]:
mlp.test(reviews[-1000:],labels[-1000:])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Tested:1 Testing Accuracy:100.%Progress:0.1% Speed(reviews/sec):1025. #Correct:1 #Tested:2 Testing Accuracy:50.0%Progress:0.2% Speed(reviews/sec):1389. #Correct:2 #Tested:3 Testing Accuracy:66.6%Progress:0.3% Speed(reviews/sec):1630. #Correct:3 #Tested:4 Testing Accuracy:75.0%Progress:0.4% Speed(reviews/sec):1932. #Correct:4 #Tested:5 Testing Accuracy:80.0%Progress:0.5% Speed(reviews/sec):2014. #Correct:5 #Tested:6 Testing Accuracy:83.3%Progress:0.6% Speed(reviews/sec):1888. #Correct:6 #Tested:7 Testing Accuracy:85.7%Progress:0.7% Speed(reviews/sec):1832. #Correct:7 #Tested:8 Testing Accuracy:87.5%Progress:0.8% Speed(reviews/sec):1830. #Correct:8 #Tested:9 Testing Accuracy:88.8%Progress:0.9% Speed(reviews/sec):1483. #Correct:9 #Tested:10 Testing Accuracy:90.0%Progress:1.0% Speed(reviews/sec):1450. #Correct:10 #Tested:11 Testing Accuracy:90.9%Progress:1.1% Speed(reviews/sec):1487. #Correct:11 #Tested:12 Testing Accuracy:91.6%

Progress:24.1% Speed(reviews/sec):1481. #Correct:216 #Tested:242 Testing Accuracy:89.2%Progress:24.2% Speed(reviews/sec):1480. #Correct:217 #Tested:243 Testing Accuracy:89.3%Progress:24.3% Speed(reviews/sec):1476. #Correct:218 #Tested:244 Testing Accuracy:89.3%Progress:24.4% Speed(reviews/sec):1475. #Correct:219 #Tested:245 Testing Accuracy:89.3%Progress:24.5% Speed(reviews/sec):1477. #Correct:220 #Tested:246 Testing Accuracy:89.4%Progress:24.6% Speed(reviews/sec):1478. #Correct:221 #Tested:247 Testing Accuracy:89.4%Progress:24.7% Speed(reviews/sec):1479. #Correct:222 #Tested:248 Testing Accuracy:89.5%Progress:24.8% Speed(reviews/sec):1483. #Correct:223 #Tested:249 Testing Accuracy:89.5%Progress:24.9% Speed(reviews/sec):1485. #Correct:223 #Tested:250 Testing Accuracy:89.2%Progress:25.0% Speed(reviews/sec):1486. #Correct:224 #Tested:251 Testing Accuracy:89.2%Progress:25.1% Speed(reviews/sec):1488. #Correct:225 #Tested:252 Testing Accuracy:89.2%Progress:25.2% Speed(reviews/se

Progress:56.5% Speed(reviews/sec):1488. #Correct:503 #Tested:566 Testing Accuracy:88.8%Progress:56.6% Speed(reviews/sec):1488. #Correct:504 #Tested:567 Testing Accuracy:88.8%Progress:56.7% Speed(reviews/sec):1487. #Correct:505 #Tested:568 Testing Accuracy:88.9%Progress:56.8% Speed(reviews/sec):1488. #Correct:506 #Tested:569 Testing Accuracy:88.9%Progress:56.9% Speed(reviews/sec):1488. #Correct:507 #Tested:570 Testing Accuracy:88.9%Progress:57.0% Speed(reviews/sec):1484. #Correct:507 #Tested:571 Testing Accuracy:88.7%Progress:57.1% Speed(reviews/sec):1484. #Correct:508 #Tested:572 Testing Accuracy:88.8%Progress:57.2% Speed(reviews/sec):1483. #Correct:509 #Tested:573 Testing Accuracy:88.8%Progress:57.3% Speed(reviews/sec):1483. #Correct:510 #Tested:574 Testing Accuracy:88.8%Progress:57.4% Speed(reviews/sec):1482. #Correct:511 #Tested:575 Testing Accuracy:88.8%Progress:57.5% Speed(reviews/sec):1482. #Correct:511 #Tested:576 Testing Accuracy:88.7%Progress:57.6% Speed(reviews/se

In [62]:
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=20,polarity_cutoff=0.8,learning_rate=0.01)
mlp.train(reviews[:-1000],labels[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):4459. #Correct:2114 #Trained:2501 Training Accuracy:84.5%
Progress:20.8% Speed(reviews/sec):4204. #Correct:4235 #Trained:5001 Training Accuracy:84.6%
Progress:31.2% Speed(reviews/sec):4049. #Correct:6362 #Trained:7501 Training Accuracy:84.8%
Progress:41.6% Speed(reviews/sec):4074. #Correct:8513 #Trained:10001 Training Accuracy:85.1%
Progress:52.0% Speed(reviews/sec):4031. #Correct:10641 #Trained:12501 Training Accuracy:85.1%
Progress:62.5% Speed(reviews/sec):3995. #Correct:12796 #Trained:15001 Training Accuracy:85.3%
Progress:72.9% Speed(reviews/sec):3941. #Correct:14911 #Trained:17501 Training Accuracy:85.2%
Progress:83.3% Speed(reviews/sec):3954. #Correct:17077 #Trained:20001 Training Accuracy:85.3%
Progress:93.7% Speed(reviews/sec):3925. #Correct:19258 #Trained:22501 Training Accuracy:85.5%
Progress:99.9% Speed(reviews/sec):3903. #Correct:20552 #Trained:24000 Training

In [63]:
mlp.test(reviews[-1000:],labels[-1000:])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:0 #Tested:1 Testing Accuracy:0.0%Progress:0.1% Speed(reviews/sec):1293. #Correct:1 #Tested:2 Testing Accuracy:50.0%Progress:0.2% Speed(reviews/sec):1930. #Correct:2 #Tested:3 Testing Accuracy:66.6%Progress:0.3% Speed(reviews/sec):2181. #Correct:3 #Tested:4 Testing Accuracy:75.0%Progress:0.4% Speed(reviews/sec):2605. #Correct:3 #Tested:5 Testing Accuracy:60.0%Progress:0.5% Speed(reviews/sec):2704. #Correct:4 #Tested:6 Testing Accuracy:66.6%Progress:0.6% Speed(reviews/sec):2645. #Correct:5 #Tested:7 Testing Accuracy:71.4%Progress:0.7% Speed(reviews/sec):2814. #Correct:6 #Tested:8 Testing Accuracy:75.0%Progress:0.8% Speed(reviews/sec):2897. #Correct:7 #Tested:9 Testing Accuracy:77.7%Progress:0.9% Speed(reviews/sec):2719. #Correct:8 #Tested:10 Testing Accuracy:80.0%Progress:1.0% Speed(reviews/sec):2706. #Correct:9 #Tested:11 Testing Accuracy:81.8%Progress:1.1% Speed(reviews/sec):2763. #Correct:10 #Tested:12 Testing Accuracy:83.3%Pr

Progress:23.6% Speed(reviews/sec):4369. #Correct:208 #Tested:237 Testing Accuracy:87.7%Progress:23.7% Speed(reviews/sec):4368. #Correct:209 #Tested:238 Testing Accuracy:87.8%Progress:23.8% Speed(reviews/sec):4374. #Correct:210 #Tested:239 Testing Accuracy:87.8%Progress:23.9% Speed(reviews/sec):4377. #Correct:211 #Tested:240 Testing Accuracy:87.9%Progress:24.0% Speed(reviews/sec):4373. #Correct:212 #Tested:241 Testing Accuracy:87.9%Progress:24.1% Speed(reviews/sec):4375. #Correct:212 #Tested:242 Testing Accuracy:87.6%Progress:24.2% Speed(reviews/sec):4377. #Correct:213 #Tested:243 Testing Accuracy:87.6%Progress:24.3% Speed(reviews/sec):4367. #Correct:214 #Tested:244 Testing Accuracy:87.7%Progress:24.4% Speed(reviews/sec):4366. #Correct:215 #Tested:245 Testing Accuracy:87.7%Progress:24.5% Speed(reviews/sec):4370. #Correct:216 #Tested:246 Testing Accuracy:87.8%Progress:24.6% Speed(reviews/sec):4377. #Correct:217 #Tested:247 Testing Accuracy:87.8%Progress:24.7% Speed(reviews/se

In [64]:
mlp_full = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=0,polarity_cutoff=0,learning_rate=0.01)

In [65]:
mlp_full.train(reviews[:-1000],labels[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):874.8 #Correct:1962 #Trained:2501 Training Accuracy:78.4%
Progress:20.8% Speed(reviews/sec):858.9 #Correct:4002 #Trained:5001 Training Accuracy:80.0%
Progress:31.2% Speed(reviews/sec):859.3 #Correct:6120 #Trained:7501 Training Accuracy:81.5%
Progress:41.6% Speed(reviews/sec):870.3 #Correct:8271 #Trained:10001 Training Accuracy:82.7%
Progress:52.0% Speed(reviews/sec):870.3 #Correct:10431 #Trained:12501 Training Accuracy:83.4%
Progress:62.5% Speed(reviews/sec):874.4 #Correct:12565 #Trained:15001 Training Accuracy:83.7%
Progress:72.9% Speed(reviews/sec):872.1 #Correct:14670 #Trained:17501 Training Accuracy:83.8%
Progress:83.3% Speed(reviews/sec):871.9 #Correct:16833 #Trained:20001 Training Accuracy:84.1%
Progress:93.7% Speed(reviews/sec):870.1 #Correct:19015 #Trained:22501 Training Accuracy:84.5%
Progress:99.9% Speed(reviews/sec):871.2 #Correct:20335 #Trained:24000 Training

In [66]:
def get_most_similar_words(focus = "horrible"):
    most_similar = Counter()

    for word in mlp_full.word2index.keys():
        most_similar[word] = np.dot(mlp_full.weights_0_1[mlp_full.word2index[word]],mlp_full.weights_0_1[mlp_full.word2index[focus]])
    
    return most_similar.most_common()

In [67]:
#get_most_similar_words("excellent")
#get_most_similar_words("terrible")

In [68]:
import matplotlib.colors as colors

words_to_visualize = list()
for word, ratio in pos_neg_ratios.most_common(500):
    if(word in mlp_full.word2index.keys()):
        words_to_visualize.append(word)
    
for word, ratio in list(reversed(pos_neg_ratios.most_common()))[0:500]:
    if(word in mlp_full.word2index.keys()):
        words_to_visualize.append(word)

In [69]:
pos = 0
neg = 0

colors_list = list()
vectors_list = list()
for word in words_to_visualize:
    if word in pos_neg_ratios.keys():
        vectors_list.append(mlp_full.weights_0_1[mlp_full.word2index[word]])
        if(pos_neg_ratios[word] > 0):
            pos+=1
            colors_list.append("#00ff00")
        else:
            neg+=1
            colors_list.append("#000000")
    

In [70]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
words_top_ted_tsne = tsne.fit_transform(vectors_list)

In [71]:
p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="vector T-SNE for most polarized words")

source = ColumnDataSource(data=dict(x1=words_top_ted_tsne[:,0],
                                    x2=words_top_ted_tsne[:,1],
                                    names=words_to_visualize))

p.scatter(x="x1", y="x2", size=8, source=source,color=colors_list)

word_labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(word_labels)

show(p)

# green indicates positive words, black indicates negative words

Supplying a user-defined data source AND iterable values to glyph methods is deprecated.

See https://github.com/bokeh/bokeh/issues/2056 for more information.

  warn(message)
Supplying a user-defined data source AND iterable values to glyph methods is deprecated.

See https://github.com/bokeh/bokeh/issues/2056 for more information.

  warn(message)
