In [1]:
# http://andybromberg.com/sentiment-analysis-python/
# Andy Bromberg's Simple Sentiment Analysis System
# Uses data from Pang & Lee (2005)
# Uses a Naive Bayes classifier to train the system
#  NB Updated 2016 for package changes around scores

import re, math, collections, itertools, sys, os
import nltk, nltk.classify.util, nltk.metrics
from nltk.classify import NaiveBayesClassifier
from nltk.classify import DecisionTreeClassifier
from nltk.metrics import BigramAssocMeasures, scores
from nltk.probability import FreqDist, ConditionalFreqDist
from sklearn.svm import SVC
from nltk.classify import SklearnClassifier
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 


__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname("XLect10.Progs")))


def _read_sentences(filename):
    """Read ``filename`` from ``__location__`` as UTF-8 and split it on newlines.

    The file is opened in a ``with`` block so the handle is closed as soon as
    its contents have been read.
    """
    with open(os.path.join(__location__, filename), 'r', encoding='utf8') as handle:
        return re.split(r'\n', handle.read())


def _remove_stop_words(sentences, stop_words):
    """Return ``sentences`` with every token found in ``stop_words`` removed.

    Each sentence is tokenized with ``word_tokenize``; the surviving tokens are
    re-joined with single spaces. The membership test is an exact
    (case-sensitive) string comparison.
    """
    return [
        ' '.join(token for token in word_tokenize(sentence) if token not in stop_words)
        for sentence in sentences
    ]


def _build_instances(sentences, feature_select, label):
    """Turn each sentence into a ``[features, label]`` instance for the classifier."""
    instances = []
    for sentence in sentences:
        # words (apostrophes kept) or a single punctuation mark from . , ! ? ;
        tokens = re.findall(r"[\w']+|[.,!?;]", sentence)
        instances.append([feature_select(tokens), label])
    return instances


def evaluate_features(feature_select):
    """Train a Naive Bayes polarity classifier and print its evaluation.

    Reads 'rt-polarity-neg.txt' and 'rt-polarity-pos.txt' from ``__location__``,
    strips English stop words, converts every sentence to a feature set using
    ``feature_select``, trains on the first three quarters of each class and
    tests on the remainder.

    Args:
        feature_select: callable that receives the list of tokens of one
            sentence and returns the feature set handed to the classifier
            (the callers in this notebook return ``{word: True}`` dicts).

    Returns:
        None. Instance counts, accuracy, per-class precision/recall and the
        ten most informative features are printed.

    Raises:
        OSError: if either corpus file cannot be opened.
    """
    stop_words = set(stopwords.words('english'))

    # reading pre-labeled input, splitting into lines and dropping stop words
    # NOTE(review): if a corpus file ends with a newline, the split yields a
    # trailing empty sentence, which becomes an instance with no features.
    negSentences = _remove_stop_words(_read_sentences('rt-polarity-neg.txt'), stop_words)
    posSentences = _remove_stop_words(_read_sentences('rt-polarity-pos.txt'), stop_words)

    # breaks up the sentences into lists of individual words
    # creates instance structures for classifier
    posFeatures = _build_instances(posSentences, feature_select, 'pos')
    negFeatures = _build_instances(negSentences, feature_select, 'neg')

    # first three quarters of each class train the model, the rest test it
    posCutoff = len(posFeatures) * 3 // 4
    negCutoff = len(negFeatures) * 3 // 4

    trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
    testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]

    #Runs the classifier on the testFeatures
    #classifier = DecisionTreeClassifier.train(trainFeatures)
    classifier = NaiveBayesClassifier.train(trainFeatures)
    #classifier = SklearnClassifier(SVC(), sparse=False).train(trainFeatures)

    #Sets up labels to look at output
    referenceSets = collections.defaultdict(set)  # label -> indices that truly carry it
    testSets = collections.defaultdict(set)       # label -> indices predicted to carry it
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)

    #Outputs
    print('train on %s instances, test on %s instances'% (len(trainFeatures), len(testFeatures)))
    print('accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
    print('pos precision:', scores.precision(referenceSets['pos'], testSets['pos']))
    print('pos recall:', scores.recall(referenceSets['pos'], testSets['pos']))
    print('neg precision:', scores.precision(referenceSets['neg'], testSets['neg']))
    print('neg recall:', scores.recall(referenceSets['neg'], testSets['neg']))
    classifier.show_most_informative_features(10)

def make_full_dict(words):
    """Feature selector that keeps every word: maps each word in ``words`` to True."""
    return dict.fromkeys(words, True)

# Baseline experiment: every token of a sentence is used as a feature.
print('using all words as features')
evaluate_features(make_full_dict)

using all words as features
train on 7998 instances, test on 2666 instances
accuracy: 0.7753188297074268
pos precision: 0.7747005988023952
pos recall: 0.7764441110277569
neg precision: 0.7759398496240602
neg recall: 0.7741935483870968
Most Informative Features
              engrossing = True              pos : neg    =     17.0 : 1.0
                   quiet = True              pos : neg    =     15.7 : 1.0
                mediocre = True              neg : pos    =     13.7 : 1.0
               absorbing = True              pos : neg    =     13.0 : 1.0
                portrait = True              pos : neg    =     12.4 : 1.0
              refreshing = True              pos : neg    =     12.3 : 1.0
               inventive = True              pos : neg    =     12.3 : 1.0
                   flaws = True              pos : neg    =     12.3 : 1.0
                 triumph = True              pos : neg    =     11.7 : 1.0
            refreshingly = True              pos : neg    =     

In [2]:
import nltk
from nltk import FreqDist
from nltk.probability import ConditionalFreqDist
import os
import re
import itertools
from nltk.collocations import BigramAssocMeasures

In [3]:
def create_word_scores():
    """Return the flat token lists of the positive and negative corpora.

    Despite its name this function computes no scores: it only reads
    'rt-polarity-pos.txt' and 'rt-polarity-neg.txt' and tokenizes them.

    Returns:
        tuple: ``(posWords, negWords)``, each a list of every token (word or
        one of the punctuation marks . , ! ? ;) in file order.

    Raises:
        OSError: if either corpus file cannot be opened.
    """
    # dirname of a bare file name is empty, so this resolves to the current working directory
    __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname("XLect10.Progs")))

    def read_tokens(filename):
        # Explicit UTF-8 (matching evaluate_features) instead of the platform
        # default encoding; the with-block closes the handle after reading.
        with open(os.path.join(__location__, filename), 'r', encoding='utf8') as handle:
            lines = re.split(r'\n', handle.read())
        # tokenize line by line, then flatten into one list of tokens
        return list(itertools.chain.from_iterable(
            re.findall(r"[\w']+|[.,!?;]", line) for line in lines
        ))

    posWords = read_tokens('rt-polarity-pos.txt')
    negWords = read_tokens('rt-polarity-neg.txt')
    return (posWords, negWords)

In [4]:
def find_best_words(word_scores, number):
    """Return the set of the ``number`` highest-scoring words.

    ``word_scores`` maps each word to its score. Words are ranked by
    descending score and the leading ``number`` entries are kept.
    """
    ranked = list(word_scores.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return {word for word, _score in ranked[:number]}

In [5]:
def best_word_features(words):
    """Feature selector that keeps only words present in the global ``best_words``.

    ``best_words`` is looked up at call time, so it must be bound at module
    level before this function is used.
    """
    return {word: True for word in words if word in best_words}

In [6]:
# Overall frequency of each lower-cased word across both polarity classes (filled in a later cell).
word_fd = FreqDist()

In [7]:
# Per-class word frequencies, keyed by the condition 'pos' or 'neg' (filled in a later cell).
cond_word_fd = ConditionalFreqDist()

In [8]:
# create_word_scores() returns (positive tokens, negative tokens); keep the pair and unpack it.
wordList = create_word_scores()
posWords, negWords = wordList

In [9]:
# Rebuild both distributions from scratch so that re-running this cell does not
# double every count (the += updates below are cumulative).
word_fd = FreqDist()
cond_word_fd = ConditionalFreqDist()

# Count every token lower-cased: once overall and once under its polarity label.
for word in posWords:
    key = word.lower()
    word_fd[key] += 1
    cond_word_fd['pos'][key] += 1
for word in negWords:
    key = word.lower()
    word_fd[key] += 1
    cond_word_fd['neg'][key] += 1

In [10]:
# Total number of word occurrences counted for each polarity class, and their sum;
# these are the marginals used by the chi-square scoring below.
pos_word_count = cond_word_fd['pos'].N()
neg_word_count = cond_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count

In [11]:
# Score every word by how strongly its occurrence is associated with a polarity class.
# chi_sq receives: the word's count inside the class, the pair
# (word's overall count, total words in the class), and the total word count.
# The positive and negative scores are summed into one informativeness score per word.
word_scores = {}
for word, freq in word_fd.items():
    pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
    neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
    word_scores[word] = pos_score + neg_score

In [12]:
# Vocabulary sizes to try: how many of the top-scoring words are kept as features.
numbers_to_test = [100,15000,30000]
# Tries the best_word_features mechanism with each of the numbers_to_test of features.
# best_words is deliberately rebound at module level on every iteration:
# best_word_features reads it as a global when evaluate_features calls it.
for num in numbers_to_test:
    print('evaluating best',num,'word features')
    best_words = find_best_words(word_scores, num)
    evaluate_features(best_word_features)

evaluating best 100 word features
train on 7998 instances, test on 2666 instances
accuracy: 0.6376594148537135
pos precision: 0.5981808453718566
pos recall: 0.8387096774193549
neg precision: 0.7302383939774153
neg recall: 0.436609152288072
Most Informative Features
              engrossing = True              pos : neg    =     17.0 : 1.0
                mediocre = True              neg : pos    =     13.7 : 1.0
                portrait = True              pos : neg    =     12.4 : 1.0
               inventive = True              pos : neg    =     12.3 : 1.0
                    flat = True              neg : pos    =     11.4 : 1.0
                  boring = True              neg : pos    =     11.3 : 1.0
               beautiful = True              pos : neg    =     10.7 : 1.0
                    warm = True              pos : neg    =     10.6 : 1.0
                  stupid = True              neg : pos    =     10.6 : 1.0
                touching = True              pos : neg    =