In [1]:
import re, math, collections, itertools
import nltk, nltk.classify.util
from nltk import precision, recall
from nltk.classify import NaiveBayesClassifier, MaxentClassifier, DecisionTreeClassifier
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
from sklearn.svm import LinearSVC

In [2]:
def make_full_dict(words):
    return dict([(word, True) for word in words])

def createFeatures(feature_select):
    #reading pre-labeled input and splitting into lines
    posSentences = open('positive.txt', 'r',encoding="utf8")
    negSentences = open('negative.txt', 'r',encoding="utf8")
    posSentences = re.split(r'\n', posSentences.read())
    negSentences = re.split(r'\n', negSentences.read())

    posFeatures = []
    negFeatures = []
    #http://stackoverflow.com/questions/367155/splitting-a-string-into-words-and-punctuation
    #breaks up the sentences into lists of individual words (as selected by the input mechanism) and appends 'pos' or 'neg' after each list
    for i in posSentences:
        posWords = re.findall(r"[\w']+|[.,!?;]", i)
        posWords = [feature_select(posWords), 'pos']
        posFeatures.append(posWords)
    for i in negSentences:
        negWords = re.findall(r"[\w']+|[.,!?;]", i)
        negWords = [feature_select(negWords), 'neg']
        negFeatures.append(negWords)

    #selects 3/4 of the features to be used for training and 1/4 to be used for testing
    posCutoff = int(math.floor(len(posFeatures)*3/4))
    negCutoff = int(math.floor(len(negFeatures)*3/4))
    trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
    testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]
    
    return trainFeatures,testFeatures

def evaluate(classifier,trainFeatures,testFeatures):
    referenceSets = dict()
    referenceSets['pos'] = set()
    referenceSets['neg'] = set()
    testSets = dict()
    testSets['pos'] = set()
    testSets['neg'] = set()
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)
        
    print ('train on %d instances, test on %d instances' % (len(trainFeatures), len(testFeatures)))
    print ('accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
    print ('pos precision:', precision(referenceSets['pos'], testSets['pos']))
    print ('pos recall:', recall(referenceSets['pos'], testSets['pos']))
    print ('neg precision:', precision(referenceSets['neg'], testSets['neg']))
    print ('neg recall:', recall(referenceSets['neg'], testSets['neg']))
    #classifier.show_most_informative_features(10)

In [3]:
def create_word_scores():
    #splits sentences into lines                                                                                                  
    posSentences = open('positive.txt', 'r')
    negSentences = open('negative.txt', 'r')
    posSentences = re.split(r'\n', posSentences.read())
    negSentences = re.split(r'\n', negSentences.read())

    #creates lists of all positive and negative words                                                                             
    posWords = []
    negWords = []
    for i in posSentences:
        posWord = re.findall(r"[\w']+|[.,!?;]", i)
        posWords.append(posWord)
    for i in negSentences:
        negWord = re.findall(r"[\w']+|[.,!?;]", i)
        negWords.append(negWord)
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()

    for word in posWords:
        word_fd[word.lower()] +=1
        cond_word_fd['pos'][word.lower()] +=1
    for word in negWords:
        word_fd[word.lower()] +=1
        cond_word_fd['neg'][word.lower()] +=1
        
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores


def find_best_words(word_scores, number):
    best_vals = sorted(iter(word_scores.items()), key=lambda w_s: w_s[1], reverse=True)[:number]
    best_words = set([w for w, s in best_vals])
    return best_words

def best_word_features(words):
    return dict([(word, True) for word in words if word in best_words])

In [4]:
nFeat = [10,100,1000,10000]
word_scores = create_word_scores()

In [9]:
import timeit
for num in nFeat:
    best_words = find_best_words(word_scores, num)
    trainFeatures10,testFeatures10 = createFeatures(best_word_features)
    print('Features %d done\n'%num)
    print('NaiveBayes %d classifier\n'%num)
    nbstart = timeit.default_timer()
    NBclassifier10 = NaiveBayesClassifier.train(trainFeatures10)
    nbtime = timeit.default_timer()-nbstart
    evaluate(NBclassifier10,trainFeatures10,testFeatures10)
    print("NB Time used for %d features is %.5f"%(num,nbtime))
    print('Maximum Entropy %d classifier\n'%num)
    mentstart = timeit.default_timer()
    MEclassifier10 = MaxentClassifier.train(trainFeatures10,max_iter=5)
    menttime = timeit.default_timer()-mentstart
    evaluate(MEclassifier10,trainFeatures10,testFeatures10)
    print("MEnt Time used for %d features is %.5f"%(num,menttime))
    print('SVM %d classifier\n'%num)
    svmstart = timeit.default_timer()
    SVMclassifier10 = nltk.classify.SklearnClassifier(LinearSVC()).train(trainFeatures10)
    svmtime = timeit.default_timer()-svmstart
    evaluate(SVMclassifier10,trainFeatures10,testFeatures10)
    print("SVM Time used for %d features is %.5f"%(num,svmtime))

Features 10 done

NaiveBayes 10 classifier

train on 1183970 instances, test on 394657 instances
accuracy: 0.5828985676169434
pos precision: 0.5511850919320496
pos recall: 0.8976132020552279
neg precision: 0.7227492049566838
neg recall: 0.26749259304354883
NB Time used for 10 features is 2.46655
Maximum Entropy 10 classifier

  ==> Training (5 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.501
             2          -0.64167        0.588
             3          -0.62451        0.588
             4          -0.61681        0.588
         Final          -0.61287        0.588
train on 1183970 instances, test on 394657 instances
accuracy: 0.5829872522215493
pos precision: 0.5514122461200659
pos recall: 0.8949657040168063
neg precision: 0.7197330919995137
neg recall: 0.270323470920086
MEnt Time used for 10 features is 261.87135
SVM 10 classifier

train on 1183970 instances, test on 394657 in

In [None]:
trainFeatures,testFeatures = createFeatures(make_full_dict)
NBclassifier = NaiveBayesClassifier.train(trainFeatures)

In [28]:
print ('NaiveBayes classifier')
evaluate(NBclassifier,trainFeatures,testFeatures)

NaiveBayes classifier
train on 1183970 instances, test on 394657 instances
accuracy: 0.7513004963804012
pos precision: 0.8611595593151362
pos recall: 0.5998582601432585
neg precision: 0.6924914317280872
neg recall: 0.9030754089045822
Most Informative Features
            tweeteradder = True              pos : neg    =    487.3 : 1.0
                    Poem = True              pos : neg    =     75.5 : 1.0
              Banksyart2 = True              pos : neg    =     56.9 : 1.0
                    sadd = True              neg : pos    =     49.1 : 1.0
                  Farrah = True              neg : pos    =     48.8 : 1.0
                 saddens = True              neg : pos    =     48.4 : 1.0
             shareholder = True              pos : neg    =     48.2 : 1.0
                     447 = True              neg : pos    =     47.9 : 1.0
                     SAD = True              neg : pos    =     47.0 : 1.0
                 McMahon = True              neg : pos    =     4

In [11]:
nfeat = [10, 20, 30]
for num in nfeat:
    best_words = find_best_words(word_scores, num)
    trainFeatures10,testFeatures10 = createFeatures(best_word_features)
    print('Decision Tree %d classifier'%num)
    dtstart = timeit.default_timer()
    DTclassifier10 = DecisionTreeClassifier.train(trainFeatures10,entropy_cutoff=0.3)
    dttime = timeit.default_timer()-dtstart
    evaluate(DTclassifier10,trainFeatures10,testFeatures10)
    print("DT Time used for %d features is %.5f"%(num,dttime))

Decision Tree 10 classifier
train on 1183970 instances, test on 394657 instances
accuracy: 0.6014843268965203
pos precision: 0.6625308771533283
pos recall: 0.415464830798046
neg precision: 0.573556294985948
neg recall: 0.7879124558626568
DT Time used for 10 features is 193.16014
Decision Tree 20 classifier
train on 1183970 instances, test on 394657 instances
accuracy: 0.6162313097195793
pos precision: 0.6951326909072434
pos recall: 0.4155559492773798
neg precision: 0.5825399637695571
neg recall: 0.8173474978692317
DT Time used for 20 features is 594.99130
Decision Tree 30 classifier
train on 1183970 instances, test on 394657 instances
accuracy: 0.636360181119301
pos precision: 0.7011046762643481
pos recall: 0.47677744311422715
neg precision: 0.6029486898766513
neg recall: 0.7962934778197167
DT Time used for 30 features is 1229.37088


In [13]:
import time
import sys
import numpy as np

# Let's tweak our network from before to model these phenomena
class SentimentNetwork:
    def __init__(self, reviews,labels,hidden_nodes = 10, learning_rate = 0.1):
       
        # set our random number generator 
        np.random.seed(1)
    
        self.pre_process_data(reviews, labels)
        
        self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate)
        
        
    def pre_process_data(self, reviews, labels):
        
        review_vocab = set()
        for review in reviews:
            for word in review.split(" "):
                review_vocab.add(word)
        self.review_vocab = list(review_vocab)
        
        label_vocab = set()
        for label in labels:
            label_vocab.add(label)
        
        self.label_vocab = list(label_vocab)
        
        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)
        
        self.word2index = {}
        for i, word in enumerate(self.review_vocab):
            self.word2index[word] = i
        
        self.label2index = {}
        for i, label in enumerate(self.label_vocab):
            self.label2index[label] = i
         
        
    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        # Set number of nodes in input, hidden and output layers.
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        # Initialize weights
        self.weights_0_1 = np.zeros((self.input_nodes,self.hidden_nodes))
    
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5, 
                                                (self.hidden_nodes, self.output_nodes))
        
        self.learning_rate = learning_rate
        
        self.layer_0 = np.zeros((1,input_nodes))
    
        
    def update_input_layer(self,review):

        # clear out previous state, reset the layer to be all 0s
        self.layer_0 *= 0
        for word in review.split(" "):
            if(word in self.word2index.keys()):
                self.layer_0[0][self.word2index[word]] += 1
                
    def get_target_for_label(self,label):
        if(label == 'POSITIVE'):
            return 1
        else:
            return 0
        
    def sigmoid(self,x):
        return 1 / (1 + np.exp(-x))
    
    
    def sigmoid_output_2_derivative(self,output):
        return output * (1 - output)
    
    def train(self, training_reviews, training_labels):
        
        assert(len(training_reviews) == len(training_labels))
        
        correct_so_far = 0
        perc = 0
        
        start = time.time()
        
        for i in range(len(training_reviews)):
            
            review = training_reviews[i]
            label = training_labels[i]
            
            #### Implement the forward pass here ####
            ### Forward pass ###

            # Input Layer
            self.update_input_layer(review)

            # Hidden layer
            layer_1 = self.layer_0.dot(self.weights_0_1)

            # Output layer
            layer_2 = self.sigmoid(layer_1.dot(self.weights_1_2))

            #### Implement the backward pass here ####
            ### Backward pass ###

            # TODO: Output error
            layer_2_error = layer_2 - self.get_target_for_label(label) # Output layer error is the difference between desired target and actual output.
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)

            # TODO: Backpropagated error
            layer_1_error = layer_2_delta.dot(self.weights_1_2.T) # errors propagated to the hidden layer
            layer_1_delta = layer_1_error # hidden layer gradients - no nonlinearity so it's the same as the error

            # TODO: Update the weights
            self.weights_1_2 -= layer_1.T.dot(layer_2_delta) * self.learning_rate # update hidden-to-output weights with gradient descent step
            self.weights_0_1 -= self.layer_0.T.dot(layer_1_delta) * self.learning_rate # update input-to-hidden weights with gradient descent step

            if(np.abs(layer_2_error) < 0.5):
                correct_so_far += 1
            
            reviews_per_second = i / float(time.time() - start)
            if (i/len(training_reviews)>=(perc+1)*0.1):
                sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
                perc +=1
            if (i==len(training_reviews)-1):
                sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
      
    def test(self, testing_reviews, testing_labels):
        
        correct = 0
        perc = 0
        tt = 0
        tf = 0
        ft = 0
        ff = 0
        start = time.time()
        
        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1
            if(testing_labels[i]=="POSITIVE" and pred=="POSITIVE"):
                tt +=1
            if(testing_labels[i]=="POSITIVE" and pred=="NEGATIVE"):
                tf +=1
            if(testing_labels[i]=="NEGATIVE" and pred=="POSITIVE"):
                ft +=1
            if(testing_labels[i]=="NEGATIVE" and pred=="NEGATIVE"):
                ff +=1
                
            reviews_per_second = i / float(time.time() - start)
            if (i/len(testing_reviews)>=(perc+1)*0.1):
                sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                                 + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                                + "% #Correct:" + str(correct) + " #Tested:" + str(i+1) + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")
                perc +=1
            if (i==len(testing_reviews)-1):
                sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                                 + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                                + "% #Correct:" + str(correct) + " #Tested:" + str(i+1) + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")
                pospre = tt/(tt+ft)
                posrec = tt/(tt+tf)
                negpre = ff/(ff+tf)
                negrec = ff/(ff+ft)
                print("pos precision: %.2f"%pospre)
                print("pos recall: %.2f"%posrec)
                print("neg precision: %.2f"%negpre)
                print("neg recall: %.2f"%negrec)
    def run(self, review):
        
        # Input Layer
        self.update_input_layer(review.lower())

        # Hidden layer
        layer_1 = self.layer_0.dot(self.weights_0_1)

        # Output layer
        layer_2 = self.sigmoid(layer_1.dot(self.weights_1_2))
        
        if(layer_2[0] > 0.5):
            return "POSITIVE"
        else:
            return "NEGATIVE"

In [14]:
def convertnnfeature(features):
    featuresnn = []
    labelsnn = []
    for feat in features:
        str = ''
        for k in feat[0].keys():
            str += k+' '
        featuresnn.append(str)
        if (feat[1]=='pos'):
            labelsnn.append("POSITIVE")
        else:
            labelsnn.append("NEGATIVE")
    return featuresnn,labelsnn

In [17]:
for num in nFeat:
    best_words = find_best_words(word_scores, num)
    trainFeatures10,testFeatures10 = createFeatures(best_word_features)
    trainFeatures10,testFeatures10 = createFeatures(best_word_features)
    trainnnfeatures, trainlabelsnn = convertnnfeature(trainFeatures10)
    testnnfeatures,testlabelsnn = convertnnfeature(testFeatures10)
    combined = list(zip(trainnnfeatures, trainlabelsnn))
    np.random.shuffle(combined)
    trainnnfeatures, trainlabelsnn = zip(*combined)
    print('Features %d done'%num)
    print('Neural Network %d'%num)
    
    mlp = SentimentNetwork(trainnnfeatures,trainlabelsnn,learning_rate=0.001)
    
    nnstart = timeit.default_timer()
    mlp.train(trainnnfeatures,trainlabelsnn)
    nntime = timeit.default_timer()-nnstart
    print("nn train feature %d time is : %.5f"%(num,nntime))
    #mlp.test(testnnfeatures,testlabelsnn)
    print(' ')

Features 10 done
Neural Network 10
Progress:99.9% Speed(reviews/sec):37181 #Correct:726554 #Trained:1183970 Training Accuracy:61.3%nn train feature 10 time is : 31.84232
 
Features 100 done
Neural Network 100
Progress:99.9% Speed(reviews/sec):28997 #Correct:823843 #Trained:1183970 Training Accuracy:69.5%nn train feature 100 time is : 40.82929
 
Features 1000 done
Neural Network 1000
Progress:99.9% Speed(reviews/sec):11539 #Correct:881036 #Trained:1183970 Training Accuracy:74.4%nn train feature 1000 time is : 102.59473
 
Features 10000 done
Neural Network 10000
Progress:99.9% Speed(reviews/sec):1771. #Correct:887319 #Trained:1183970 Training Accuracy:74.9%nn train feature 10000 time is : 668.43656
 


In [21]:
len(word_scores)

708452