# SENTIMENT ANALYSIS

In [236]:
# Code from http://streamhacker.com/2010/05/10/text-classification-sentiment-analysis-naive-bayes-classifier/
import nltk.classify.util, nltk.metrics
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews, stopwords
import collections, itertools
from nltk import scores
import random
from sklearn.cross_validation import train_test_split

import itertools
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

from nltk.probability import FreqDist, ConditionalFreqDist
from numpy import mean
import pandas as pd

In [61]:
random.seed(88)

In [255]:
# Function to evaluate features
def evaluate_classifier_original(featx,                         
                        negtrain,
                        negtest,
                        postrain, 
                        postest,
                        bestwords):
    
    postrain_feats = [(featx(w, bestwords), 'pos') for w in postrain]
    postest_feats  = [(featx(w, bestwords), 'pos') for w in postest]
    negtrain_feats = [(featx(w, bestwords), 'neg') for w in negtrain]
    negtest_feats  = [(featx(w, bestwords), 'neg') for w in negtest]   
    
    trainfeats = negtrain_feats + postrain_feats
    testfeats = negtest_feats + postest_feats     
        
    classifier = NaiveBayesClassifier.train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
 
    for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)
 
    print('accuracy:' + str(nltk.classify.util.accuracy(classifier, testfeats)))
    print('pos precision:'+ str(scores.precision(refsets['pos'], testsets['pos'])))
    print('pos recall:' + str(scores.recall(refsets['pos'], testsets['pos'])))
    print('neg precision:' + str(scores.precision(refsets['neg'], testsets['neg'])))
    print('neg recall:' + str(scores.recall(refsets['neg'], testsets['neg'])))
    classifier.show_most_informative_features()

In [223]:
# Function to evaluate features
def evaluate_classifier(featx,                         
                        negtrain,
                        negtest,
                        postrain, 
                        postest,
                        bestwords):
    
    postrain_feats = [(featx(w, bestwords), 'pos') for w in postrain]
    postest_feats  = [(featx(w, bestwords), 'pos') for w in postest]
    negtrain_feats = [(featx(w, bestwords), 'neg') for w in negtrain]
    negtest_feats  = [(featx(w, bestwords), 'neg') for w in negtest]    
    
    trainfeats = negtrain_feats + postrain_feats
    testfeats = negtest_feats + postest_feats     
        
    classifier = NaiveBayesClassifier.train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
 
    for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)
 
    accuracy = nltk.classify.util.accuracy(classifier, testfeats)
    pos_precision = scores.precision(refsets['pos'], testsets['pos'])
    pos_recall = scores.recall(refsets['pos'], testsets['pos'])
    neg_precision = scores.precision(refsets['neg'], testsets['neg'])
    neg_recall = scores.recall(refsets['neg'], testsets['neg'])
    return accuracy, pos_precision, pos_recall, neg_precision, neg_recall
    

# FEATURES

In [197]:
# Bag of words: All words
def word_feats(words, bestwords):
    return dict([(word, True) for word in words])

In [198]:
# Stopword filtering
stop_set = set(stopwords.words('english')) 
def stopword_filtered_word_feats(words, bestwords, stopset = stop_set):
    return dict([(word, True) for word in words if word not in stopset])

In [199]:
# Bigram Collocations
def bigram_word_feats(words, bestwords, score_fn = BigramAssocMeasures.chi_sq, n = 200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])

In [219]:
# Eliminate Low Information Features
# http://streamhacker.com/2010/06/16/text-classification-sentiment-analysis-eliminate-low-information-features/
def get_best_words(neg_train, pos_train, best_n):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for w in [word for review in neg_train for word in review]:
        word_fd[w.lower()] += 1
        label_word_fd['neg'][w.lower()] += 1

    for word in [word for review in pos_train for word in review]:
        word_fd[word.lower()] += 1
        label_word_fd['pos'][word.lower()] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq( \
            label_word_fd['pos'][word], \
            (freq, pos_word_count), \
            total_word_count)
        neg_score = BigramAssocMeasures.chi_sq( \
            label_word_fd['neg'][word], \
            (freq, neg_word_count), \
            total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), \
                  key=lambda s: s[1], \
                  reverse=True)[:best_n]    
    return set([w for w, s in best])
    
def best_word_feats(words, bestwords):
    return dict([(word, True) for word in words if word in bestwords])

In [220]:
# Best words + bigram
def best_bigram_word_feats(words, 
                           bestwords,
                           score_fn = BigramAssocMeasures.chi_sq, 
                           n = 200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    d = dict([(bigram, True) for bigram in bigrams])
    d.update(best_word_feats(words, bestwords))
    return d

# DEVELOP MODEL

In [65]:
# Split review ids: positive vs negative
neg_ids = movie_reviews.fileids('neg')
pos_ids = movie_reviews.fileids('pos')

In [66]:
# Split review: positive vs negative
neg_words = [movie_reviews.words(fileids=[f]) for f in negids]
pos_words = [movie_reviews.words(fileids=[f]) for f in posids]

In [70]:
# Split data set: developing vs validation
neg_develop, neg_val = train_test_split(neg_words, test_size=0.25)
pos_develop, pos_val = train_test_split(pos_words, test_size=0.25)

In [202]:
# k-fold cross validation
k = 5
results = collections.defaultdict(dict)
models = ['bag_of_words', 'stop_words', 'bigram', \
          'best_words', 'bigram_best_words']

for model_name in models:
    results[model_name]['accuracy'] = []
    results[model_name]['pos_precision'] = []
    results[model_name]['pos_recall'] = []
    results[model_name]['neg_precision'] = []
    results[model_name]['neg_recall'] = []

In [224]:
for i in range(0, k):
    # Split developing data set: training vs testing
    neg_train, neg_test = train_test_split(neg_develop, test_size=1/k)
    pos_train, pos_test = train_test_split(pos_develop, test_size=1/k)    
    best_words = get_best_words(neg_train, pos_train, 10000)
    
    num_train = len(neg_train) + len(pos_train)
    num_test = len(neg_test) + len(pos_test)    
    print(str(i) + '. Train on %d instances, test on %d instances' % (num_train, num_test)) 
    
    model_name = 'bag_of_words'
    print(model_name.upper())   
    accuracy, pos_precision, pos_recall, neg_precision, neg_recall = \
    evaluate_classifier(word_feats,                         
                        negtrain = neg_train,
                        negtest = neg_test,
                        postrain = pos_train, 
                        postest = pos_test,
                        bestwords = best_words)
    results[model_name]['accuracy'].append(accuracy)
    results[model_name]['pos_precision'].append(pos_precision)
    results[model_name]['pos_recall'].append(pos_recall)
    results[model_name]['neg_precision'].append(neg_precision)
    results[model_name]['neg_recall'].append(neg_recall)  
    
    model_name = 'stop_words'
    print(model_name.upper()) 
    accuracy, pos_precision, pos_recall, neg_precision, neg_recall = \
    evaluate_classifier(stopword_filtered_word_feats,                         
                        negtrain = neg_train,
                        negtest = neg_test,
                        postrain = pos_train, 
                        postest = pos_test,
                        bestwords = best_words)
    results[model_name]['accuracy'].append(accuracy)
    results[model_name]['pos_precision'].append(pos_precision)
    results[model_name]['pos_recall'].append(pos_recall)
    results[model_name]['neg_precision'].append(neg_precision)
    results[model_name]['neg_recall'].append(neg_recall)  
    
    model_name = 'bigram'
    print(model_name.upper()) 
    accuracy, pos_precision, pos_recall, neg_precision, neg_recall = \
    evaluate_classifier(bigram_word_feats,                         
                        negtrain = neg_train,
                        negtest = neg_test,
                        postrain = pos_train, 
                        postest = pos_test,
                        bestwords = best_words)
    results[model_name]['accuracy'].append(accuracy)
    results[model_name]['pos_precision'].append(pos_precision)
    results[model_name]['pos_recall'].append(pos_recall)
    results[model_name]['neg_precision'].append(neg_precision)
    results[model_name]['neg_recall'].append(neg_recall) 
    
    model_name = 'best_words'    
    print(model_name.upper()) 
    accuracy, pos_precision, pos_recall, neg_precision, neg_recall = \
    evaluate_classifier(best_word_feats,                        
                        negtrain = neg_train,
                        negtest = neg_test,
                        postrain = pos_train, 
                        postest = pos_test,
                        bestwords = best_words)    
    results[model_name]['accuracy'].append(accuracy)
    results[model_name]['pos_precision'].append(pos_precision)
    results[model_name]['pos_recall'].append(pos_recall)
    results[model_name]['neg_precision'].append(neg_precision)
    results[model_name]['neg_recall'].append(neg_recall) 
    
    model_name = 'bigram_best_words'
    print(model_name.upper()) 
    accuracy, pos_precision, pos_recall, neg_precision, neg_recall = \
    evaluate_classifier(best_bigram_word_feats,                         
                        negtrain = neg_train,
                        negtest = neg_test,
                        postrain = pos_train, 
                        postest = pos_test,
                        bestwords = best_words)
    results[model_name]['accuracy'].append(accuracy)
    results[model_name]['pos_precision'].append(pos_precision)
    results[model_name]['pos_recall'].append(pos_recall)
    results[model_name]['neg_precision'].append(neg_precision)
    results[model_name]['neg_recall'].append(neg_recall) 

Train on 1200 instances, test on 300 instances
0. BAG_OF_WORDS
0. STOP_WORDS
0. BIGRAM
0. BEST_WORDS
0. BIGRAM_BEST_WORDS
Train on 1200 instances, test on 300 instances
1. BAG_OF_WORDS
1. STOP_WORDS
1. BIGRAM
1. BEST_WORDS
1. BIGRAM_BEST_WORDS
Train on 1200 instances, test on 300 instances
2. BAG_OF_WORDS
2. STOP_WORDS
2. BIGRAM
2. BEST_WORDS
2. BIGRAM_BEST_WORDS
Train on 1200 instances, test on 300 instances
3. BAG_OF_WORDS
3. STOP_WORDS
3. BIGRAM
3. BEST_WORDS
3. BIGRAM_BEST_WORDS
Train on 1200 instances, test on 300 instances
4. BAG_OF_WORDS
4. STOP_WORDS
4. BIGRAM
4. BEST_WORDS
4. BIGRAM_BEST_WORDS


In [254]:
# Choose Best Features
best_features = collections.defaultdict(dict)
for model_name in models:
    best_features[model_name]['accuracy'] = mean(results[model_name]['accuracy'])
    best_features[model_name]['pos_precision'] = mean(results[model_name]['pos_precision'])
    best_features[model_name]['pos_recall'] = mean(results[model_name]['pos_recall'])
    best_features[model_name]['neg_precision'] = mean(results[model_name]['neg_precision'])
    best_features[model_name]['neg_recall'] = mean(results[model_name]['neg_recall'])

pd.DataFrame.from_dict({(i): best_features[i]  
                        for i in best_features.keys()}, 
                       orient='index')

Unnamed: 0,pos_precision,neg_recall,pos_recall,neg_precision,accuracy
bag_of_words,0.673313,0.528889,0.965556,0.941721,0.747222
best_words,0.73844,0.669333,0.933333,0.910706,0.801333
bigram,0.749924,0.688889,0.933333,0.912427,0.811111
bigram_best_words,0.787113,0.76,0.886667,0.870391,0.823333
stop_words,0.662581,0.506667,0.965556,0.939124,0.736111


# BEST MODEL: TRAIN ON FULL DEVELOPING DATA SET, TEST ON VALIDATION SET

In [256]:
best_words = get_best_words(neg_develop, pos_develop, 10000)
num_train = len(neg_develop) + len(pos_develop)
num_test = len(neg_val) + len(pos_val)    

print('Train on %d instances, test on %d instances' % (num_train, num_test)) 

model_name = 'bigram_best_words'
print(str(i) + '. ' + model_name.upper()) 
evaluate_classifier_original(best_bigram_word_feats,
                             negtrain = neg_develop,
                             negtest = neg_val,
                             postrain = pos_develop, 
                             postest = pos_val,
                             bestwords = best_words)

Train on 1500 instances, test on 500 instances
4. BIGRAM_BEST_WORDS
accuracy:0.818
pos precision:0.8192771084337349
pos recall:0.816
neg precision:0.8167330677290837
neg recall:0.82
Most Informative Features
               ludicrous = True              neg : pos    =     19.7 : 1.0
      ('even', 'better') = True              pos : neg    =     15.0 : 1.0
             exceptional = True              pos : neg    =     13.7 : 1.0
                   sucks = True              neg : pos    =     13.0 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              philosophy = True              pos : neg    =     12.3 : 1.0
     ('saving', 'grace') = True              neg : pos    =     11.7 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
  ('title', 'character') = True              pos : neg    =     11.7 : 1.0
         ('shows', 'us') = True              pos : neg    =     11.0 : 1.0
