# SENTIMENT ANALYSIS

In [116]:
# Code from http://streamhacker.com/2010/05/10/text-classification-sentiment-analysis-naive-bayes-classifier/
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk import MaxentClassifier
from nltk.corpus import stopwords
import collections, itertools
from nltk import scores, sent_tokenize, word_tokenize, pos_tag

import random
from sklearn.cross_validation import train_test_split

import itertools
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

from nltk.probability import FreqDist, ConditionalFreqDist
from numpy import mean
import pandas as pd
import json

from string import punctuation

In [117]:
random.seed(88)

In [118]:
from sklearn.metrics import accuracy_score

# Function to evaluate features
def evaluate_classifier_original(featx,                         
                        negtrain,
                        negtest,
                        postrain, 
                        postest,
                        bestwords):    
    
    negtrain_feats = [(featx(w, bestwords), 'neg') for w in negtrain]
    negtest_feats  = [(featx(w, bestwords), 'neg') for w in negtest]
    postrain_feats = [(featx(w, bestwords), 'pos') for w in postrain]
    postest_feats  = [(featx(w, bestwords), 'pos') for w in postest]
    
    trainfeats = negtrain_feats + postrain_feats
    testfeats = negtest_feats + postest_feats     
        
    classifier = NaiveBayesClassifier.train(trainfeats)
    # classifier = MaxentClassifier.train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
            
    for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)            
 
    print('accuracy:' + str(nltk.classify.util.accuracy(classifier, testfeats)))
    print('pos precision:'+ str(scores.precision(refsets['pos'], testsets['pos'])))
    print('pos recall:' + str(scores.recall(refsets['pos'], testsets['pos'])))
    print('neg precision:' + str(scores.precision(refsets['neg'], testsets['neg'])))
    print('neg recall:' + str(scores.recall(refsets['neg'], testsets['neg'])))
    classifier.show_most_informative_features()

In [152]:
# Function to evaluate features
def evaluate_classifier(featx,                         
                        negtrain,
                        negtest,
                        postrain, 
                        postest):    
    
    negtrain_feats = [(featx(w), 'neg') for w in negtrain]
    negtest_feats  = [(featx(w), 'neg') for w in negtest]
    postrain_feats = [(featx(w), 'pos') for w in postrain]
    postest_feats  = [(featx(w), 'pos') for w in postest]
    
    trainfeats = negtrain_feats + postrain_feats
    testfeats = negtest_feats + postest_feats     
        
    classifier = NaiveBayesClassifier.train(trainfeats)
    # classifier = MaxentClassifier.train(trainfeats)
    
    actual = collections.defaultdict(set)
    predict = collections.defaultdict(set)
 
    for i, (feats, label) in enumerate(testfeats):
            actual[label].add(i)
            observed = classifier.classify(feats)
            predict[observed].add(i)    
            
    
    tp = len(set.intersection(actual['pos'], predict['pos']))
    tn = len(set.intersection(actual['neg'], predict['neg']))
    fn = len(set.intersection(actual['pos'], predict['neg']))
    fp = len(set.intersection(actual['neg'], predict['pos']))
    
    return tp,tn,fn,fp
    

In [153]:
# Function to evaluate features
def evaluate_classifier2(featx,                         
                        negtrain,
                        negtest,
                        postrain, 
                        postest,
                        bestwords):    
    
    negtrain_feats = [(featx(w, bestwords), 'neg') for w in negtrain]
    negtest_feats  = [(featx(w, bestwords), 'neg') for w in negtest]
    postrain_feats = [(featx(w, bestwords), 'pos') for w in postrain]
    postest_feats  = [(featx(w, bestwords), 'pos') for w in postest]
    
    trainfeats = negtrain_feats + postrain_feats
    testfeats = negtest_feats + postest_feats     
        
    classifier = NaiveBayesClassifier.train(trainfeats)
    # classifier = MaxentClassifier.train(trainfeats)
    
    actual = collections.defaultdict(set)
    predict = collections.defaultdict(set)
 
    for i, (feats, label) in enumerate(testfeats):
            actual[label].add(i)
            observed = classifier.classify(feats)
            predict[observed].add(i)    
    
    tp = len(set.intersection(actual['pos'], predict['pos']))
    tn = len(set.intersection(actual['neg'], predict['neg']))
    fn = len(set.intersection(actual['pos'], predict['neg']))
    fp = len(set.intersection(actual['neg'], predict['pos']))
    
    return tp,tn,fn,fp

# FEATURES

In [131]:
# Bag of words: All words
def word_feats(words):
    return dict([(word, True) for word in words])

In [132]:
# Stopword filtering
stop_set = set(stopwords.words('english')) 

def stopword_filtered_word_feats(words, stopset = stop_set):
    return dict([(word, True) for word in words if word not in stopset])

In [133]:
# Bigram Collocations
def bigram_word_feats(words, score_fn = BigramAssocMeasures.chi_sq, n = 200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])

In [134]:
# Eliminate Low Information Features
# http://streamhacker.com/2010/06/16/text-classification-sentiment-analysis-eliminate-low-information-features/
def get_best_words(neg_train, pos_train, best_n):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for w in [word for review in neg_train for word in review]:
        word_fd[w.lower()] += 1
        label_word_fd['neg'][w.lower()] += 1

    for w in [word for review in pos_train for word in review]:
        word_fd[w.lower()] += 1
        label_word_fd['pos'][w.lower()] += 1

    neg_word_count = label_word_fd['neg'].N()
    pos_word_count = label_word_fd['pos'].N()    
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq( \
            label_word_fd['pos'][word], \
            (freq, pos_word_count), \
            total_word_count)
        neg_score = BigramAssocMeasures.chi_sq( \
            label_word_fd['neg'][word], \
            (freq, neg_word_count), \
            total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), \
                  key = lambda s: s[1], \
                  reverse = True)[:best_n]    
    return set([w for w, s in best])
    
def best_word_feats(words, bestwords):
    return dict([(word, True) for word in words if word in bestwords])

In [135]:
# Best words + bigram
def best_bigram_word_feats(words, 
                           bestwords,
                           score_fn = BigramAssocMeasures.chi_sq, 
                           n = 200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    d = dict([(bigram, True) for bigram in bigrams])
    d.update(best_word_feats(words, bestwords))
    return d

# DEVELOP MODEL

In [136]:
# Load raw data
file_name = '/home/yuri/Dropbox/Compartido/Projects_professional/kpn/data/rawdata_20170620.json'
with open(file_name) as json_data:
    data = json.load(json_data)

In [137]:
df = pd.DataFrame(data)
df.shape, 

((2239, 8),)

In [130]:
df = df.drop_duplicates()
df.shape

(2219, 8)

In [139]:
import re
import string

def remove_punctuation(s):
    s = ''.join([i for i in s if i not in frozenset(punctuation)])
    return s

rem = string.punctuation
pattern = r"[{}]".format(rem)

df['text_nopunct'] = df['text'].str.replace(pattern, ' ')

#df['text_nopunct'] = df['text'].apply(remove_punctuation)
#df['text_nopunct'] = df['text']

In [140]:
pattern;

'[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]'

In [141]:
# Split reviews: negative vs positive vs neutral
# df_neg = df[df.stars == 1]
# df_pos = df[df.stars == 5]

df_neg = df[(df.stars == 1) | (df.stars == 2)]
df_pos = df[(df.stars == 4) | (df.stars == 5)]
df_neu = df[(df.stars == 3)]
df_neg.shape, df_pos.shape, df_neu.shape

((487, 9), (1525, 9), (227, 9))

In [143]:
df_neg.text_nopunct[5];
df_neu.text_nopunct[:20];
df_pos.text_nopunct[:6];

In [144]:
# Split review: positive vs negative
neg_words = [word_tokenize(f) for f in df_neg.text_nopunct]
pos_words = [word_tokenize(f) for f in df_pos.text_nopunct]

In [146]:
len(neg_words[6]), neg_words[10];

In [147]:
# Split data set: developing vs validation
neg_develop, neg_val = train_test_split(neg_words, test_size=0.25)
pos_develop, pos_val = train_test_split(pos_words, test_size=0.25)

In [148]:
len(neg_develop),len(neg_val), len(pos_develop),len(pos_val)

(365, 122, 1143, 382)

In [154]:
# k-fold cross validation
k = 5
results = collections.defaultdict(dict)
models = ['bag_of_words', 'stop_words', 'bigram', \
          'best_words', 'bigram_best_words']

for model_name in models:
    results[model_name]['accuracy'] = []
    results[model_name]['pos_precision'] = []
    results[model_name]['pos_recall'] = []
    results[model_name]['neg_precision'] = []
    results[model_name]['neg_recall'] = []

In [156]:
for i in range(0, k):
    # Split developing data set: training vs testing
    neg_train, neg_test = train_test_split(neg_develop, test_size=1/k)
    pos_train, pos_test = train_test_split(pos_develop, test_size=1/k)    
        
    num_train = len(neg_train) + len(pos_train)
    num_test = len(neg_test) + len(pos_test)    
    print(str(i) + '. Train on %d instances, test on %d instances' % (num_train, num_test)) 
    
    model_name = 'bag_of_words'
    print(model_name.upper())        
    tp,tn,fn,fp = evaluate_classifier(word_feats,                         
                        negtrain = neg_train,
                        negtest = neg_test,
                        postrain = pos_train, 
                        postest = pos_test)
    print(str(tp) + ' ' + str(tn) + ' ' + str(fn) + ' ' + str(fp))

0. Train on 1206 instances, test on 302 instances
BAG_OF_WORDS
54 73 175 0
1. Train on 1206 instances, test on 302 instances
BAG_OF_WORDS
45 73 184 0
2. Train on 1206 instances, test on 302 instances
BAG_OF_WORDS
47 72 182 1
3. Train on 1206 instances, test on 302 instances
BAG_OF_WORDS
38 72 191 1
4. Train on 1206 instances, test on 302 instances
BAG_OF_WORDS
47 72 182 1


In [209]:
negtrain_feats = [(word_feats(w), 'neg') for w in neg_train]
negtest_feats  = [(word_feats(w), 'neg') for w in neg_test]
postrain_feats = [(word_feats(w), 'pos') for w in pos_train]
postest_feats  = [(word_feats(w), 'pos') for w in pos_test]

trainfeats = negtrain_feats + postrain_feats
testfeats = negtest_feats + postest_feats     

classifier = NaiveBayesClassifier.train(trainfeats)
#classifier = MaxentClassifier.train(trainfeats)

actual = collections.defaultdict(set)
predict = collections.defaultdict(set)

for i, (feats, label) in enumerate(testfeats):
        actual[label].add(i)
        observed = classifier.classify(feats)
        predict[observed].add(i)    


tp = set.intersection(actual['pos'], predict['pos'])
tn = set.intersection(actual['neg'], predict['neg'])
fn = set.intersection(actual['pos'], predict['neg'])
fp = set.intersection(actual['neg'], predict['pos'])


#tp,tn,fn,fp,\
len(trainfeats), len(negtrain_feats), len(postrain_feats), \
len(testfeats), len(negtest_feats), len(postest_feats), \
testfeats[33] , classifier.classify(testfeats[33][0])

(1206,
 292,
 914,
 302,
 73,
 229,
 ({'Rebtel': True,
   'connection': True,
   'drops': True,
   'often': True,
   'very': True},
  'neg'),
 'pos')

In [208]:
my_text1 = word_tokenize('I think rebtel is awesome')
my_text2 = word_tokenize('Rebtel is terrible')
my_text3 = word_tokenize('the service sucks')
my_text4 = word_tokenize('the service is good')
my_text5 = word_tokenize('I want my mone back')
my_text6 = word_tokenize('the service is very expensive')
my_text7 = word_tokenize('good deal for online service telephone')

classifier.classify(word_feats(my_text1)),\
classifier.classify(word_feats(my_text2)),\
classifier.classify(word_feats(my_text3)),\
classifier.classify(word_feats(my_text4)),\
classifier.classify(word_feats(my_text5)),\
classifier.classify(word_feats(my_text6)),\
classifier.classify(word_feats(my_text7))

('pos', 'neg', 'pos', 'pos', 'neg', 'pos', 'pos')

In [162]:
best_words = get_best_words(neg_train, pos_train, 2000)
len(best_words), best_words;

In [136]:
for i in range(0, k):
    # Split developing data set: training vs testing
    neg_train, neg_test = train_test_split(neg_develop, test_size=1/k)
    pos_train, pos_test = train_test_split(pos_develop, test_size=1/k)    
    best_words = get_best_words(neg_train, pos_train, 10000)
    
    num_train = len(neg_train) + len(pos_train)
    num_test = len(neg_test) + len(pos_test)    
    print(str(i) + '. Train on %d instances, test on %d instances' % (num_train, num_test)) 
    
    model_name = 'bag_of_words'
    print(model_name.upper())        
    accuracy, pos_precision, pos_recall, neg_precision, neg_recall = \
    evaluate_classifier(word_feats,                         
                        negtrain = neg_train,
                        negtest = neg_test,
                        postrain = pos_train, 
                        postest = pos_test,
                        bestwords = best_words)
    results[model_name]['accuracy'].append(accuracy)
    results[model_name]['pos_precision'].append(pos_precision)
    results[model_name]['pos_recall'].append(pos_recall)
    results[model_name]['neg_precision'].append(neg_precision)
    results[model_name]['neg_recall'].append(neg_recall)
    
    
    model_name = 'stop_words'
    print(model_name.upper())     
    accuracy, pos_precision, pos_recall, neg_precision, neg_recall = \
    evaluate_classifier(stopword_filtered_word_feats,                         
                        negtrain = neg_train,
                        negtest = neg_test,
                        postrain = pos_train, 
                        postest = pos_test,
                        bestwords = best_words)
    results[model_name]['accuracy'].append(accuracy)
    results[model_name]['pos_precision'].append(pos_precision)
    results[model_name]['pos_recall'].append(pos_recall)
    results[model_name]['neg_precision'].append(neg_precision)
    results[model_name]['neg_recall'].append(neg_recall) 
            
    
    model_name = 'bigram'
    print(model_name.upper())     
    accuracy, pos_precision, pos_recall, neg_precision, neg_recall = \
    evaluate_classifier(bigram_word_feats,                         
                        negtrain = neg_train,
                        negtest = neg_test,
                        postrain = pos_train, 
                        postest = pos_test,
                        bestwords = best_words)
    results[model_name]['accuracy'].append(accuracy)
    results[model_name]['pos_precision'].append(pos_precision)
    results[model_name]['pos_recall'].append(pos_recall)
    results[model_name]['neg_precision'].append(neg_precision)
    results[model_name]['neg_recall'].append(neg_recall)
    
                     
    model_name = 'best_words'    
    print(model_name.upper())            
    accuracy, pos_precision, pos_recall, neg_precision, neg_recall = \
    evaluate_classifier(best_word_feats,                        
                        negtrain = neg_train,
                        negtest = neg_test,
                        postrain = pos_train, 
                        postest = pos_test,
                        bestwords = best_words)    
    results[model_name]['accuracy'].append(accuracy)
    results[model_name]['pos_precision'].append(pos_precision)
    results[model_name]['pos_recall'].append(pos_recall)
    results[model_name]['neg_precision'].append(neg_precision)
    results[model_name]['neg_recall'].append(neg_recall) 
    
                    
    model_name = 'bigram_best_words'
    print(model_name.upper())         
    accuracy, pos_precision, pos_recall, neg_precision, neg_recall = \
    evaluate_classifier(best_bigram_word_feats,                         
                        negtrain = neg_train,
                        negtest = neg_test,
                        postrain = pos_train, 
                        postest = pos_test,
                        bestwords = best_words)
    results[model_name]['accuracy'].append(accuracy)
    results[model_name]['pos_precision'].append(pos_precision)
    results[model_name]['pos_recall'].append(pos_recall)
    results[model_name]['neg_precision'].append(neg_precision)
    results[model_name]['neg_recall'].append(neg_recall) 
                    

0. Train on 1194 instances, test on 300 instances
BAG_OF_WORDS
STOP_WORDS
BIGRAM


ZeroDivisionError: float division by zero

In [33]:
results;

In [25]:
# Choose Best Model (Features)
best_features = collections.defaultdict(dict)
for model_name in models:
    best_features[model_name]['accuracy'] = mean(results[model_name]['accuracy'])
    best_features[model_name]['pos_precision'] = mean(results[model_name]['pos_precision'])
    best_features[model_name]['pos_recall'] = mean(results[model_name]['pos_recall'])
    best_features[model_name]['neg_precision'] = mean(results[model_name]['neg_precision'])
    best_features[model_name]['neg_recall'] = mean(results[model_name]['neg_recall'])

pd.DataFrame.from_dict({(i): best_features[i]  
                        for i in best_features.keys()}, 
                       orient='index')

  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,neg_precision,accuracy,neg_recall,pos_precision,pos_recall
bag_of_words,0.280945,0.396,0.994366,0.99245,0.21048
best_words,,,,,
bigram,,,,,
bigram_best_words,,,,,
stop_words,0.329592,0.523333,0.980282,0.985012,0.381659


In [64]:
model_name = 'stop_words'
print(str(i) + '. ' + model_name.upper()) 
ref, pred = evaluate_classifier(stopword_filtered_word_feats,                         
                    negtrain = neg_train,
                    negtest = neg_test,
                    postrain = pos_train, 
                    postest = pos_test,
                    bestwords = best_words)

0. STOP_WORDS


In [65]:
len(set.intersection(ref['pos'], pred['pos'])),\
len(set.intersection(ref['neg'], pred['neg'])),\
len(set.intersection(ref['pos'], pred['neg'])),\
len(set.intersection(ref['neg'], pred['pos']))

(85, 67, 144, 4)

In [67]:
model_name = 'best_words'    
print(model_name.upper())            
ref, pred = evaluate_classifier(
    best_word_feats,                        
    negtrain = neg_train,
    negtest = neg_test,
    postrain = pos_train, 
    postest = pos_test,
    bestwords = best_words) 


BEST_WORDS


In [76]:
len(neg_train), len(neg_test), len(pos_train), len(pos_test)

(282, 71, 912, 229)

In [77]:
model_name = 'best_bigrams'
print(str(i) + '. ' + model_name.upper()) 

#negtrain_feats = [(best_bigram_word_feats(w, best_words), 'neg') for w in neg_train]
#negtest_feats  = [(best_bigram_word_feats(w, best_words), 'neg') for w in neg_test]
postrain_feats = [(best_bigram_word_feats(w, best_words), 'pos') for w in pos_train]
#postest_feats  = [(best_bigram_word_feats(w, best_words), 'pos') for w in pos_test]
  

0. BEST_BIGRAMS


ZeroDivisionError: float division by zero

In [35]:
model_name = 'bigram'
print(model_name.upper())     
evaluate_classifier(
    bigram_word_feats,                         
    negtrain = neg_train,
    negtest = neg_test,
    postrain = pos_train, 
    postest = pos_test,
    bestwords = best_words)

BIGRAM


ZeroDivisionError: float division by zero

# BEST MODEL: TRAIN ON FULL DEVELOPING DATA SET, TEST ON VALIDATION SET

In [60]:
best_words = get_best_words(neg_develop, pos_develop, 10000)
num_train = len(neg_develop) + len(pos_develop)
num_test = len(neg_val) + len(pos_val)    

print('Train on %d instances, test on %d instances' % (num_train, num_test)) 

model_name = 'bigram_best_words'
print(str(i) + '. ' + model_name.upper()) 
evaluate_classifier_original(best_bigram_word_feats,
                             negtrain = neg_develop,
                             negtest = neg_val,
                             postrain = pos_develop, 
                             postest = pos_val,
                             bestwords = best_words)

Train on 1494 instances, test on 499 instances
4. BIGRAM_BEST_WORDS
accuracy:0.43286573146292584
pos precision:0.9803921568627451
pos recall:0.26246719160104987
neg precision:0.29219143576826195
neg recall:0.9830508474576272
Most Informative Features
                       # = True              neg : pos    =     50.5 : 1.0
               contacted = True              neg : pos    =     48.4 : 1.0
                  refund = True              neg : pos    =     38.1 : 1.0
                  saying = True              neg : pos    =     36.8 : 1.0
                response = True              neg : pos    =     36.4 : 1.0
         ('my', 'money') = True              neg : pos    =     34.2 : 1.0
                      20 = True              neg : pos    =     31.2 : 1.0
                  cancel = True              neg : pos    =     31.2 : 1.0
         ('and', 'when') = True              neg : pos    =     29.0 : 1.0
                  unable = True              neg : pos    =     29.0 : 1.0

In [46]:
from sklearn.metrics import accuracy_score
y_pred = [0, 2, 1, 3, 8, 8]
y_true = [0, 1, 2, 3, 7, 5]
accuracy_score(y_true, y_pred)

0.33333333333333331

In [57]:
help(scores.recall)

Help on function recall in module nltk.metrics.scores:

recall(reference, test)
    Given a set of reference values and a set of test values, return
    the fraction of reference values that appear in the test set.
    In particular, return card(``reference`` intersection ``test``)/card(``reference``).
    If ``reference`` is empty, then return None.
    
    :type reference: set
    :param reference: A set of reference values.
    :type test: set
    :param test: A set of values to compare against the reference set.
    :rtype: float or None



In [56]:
help(scores.precision)

Help on function precision in module nltk.metrics.scores:

precision(reference, test)
    Given a set of reference values and a set of test values, return
    the fraction of test values that appear in the reference set.
    In particular, return card(``reference`` intersection ``test``)/card(``test``).
    If ``test`` is empty, then return None.
    
    :type reference: set
    :param reference: A set of reference values.
    :type test: set
    :param test: A set of values to compare against the reference set.
    :rtype: float or None



In [None]:
from textblob import TextBlob
TextBlob('yuri is super awesome and positive').sentiment
zen = TextBlob("Beautiful is better than ugly."
               " Explicit is better than implicit."
               " Simple is better than complex.")
zen2 = TextBlob('This is my first sentence. This is a second one.')
zen.words, zen.sentences, zen2.sentences
for sentence in zen.sentences:
    print(sentence)
    print(sentence.sentiment)
    from textblob import Word

w1 = Word("octopi")
w2 = Word("went")
w1.lemmatize(), w2.lemmatize("v")

In [161]:
TextBlob('reslults').correct()

TextBlob("results")