# Selección de características para análisis de sentimiento

Vamos a explorar ideas de la sesión 2 para la selección de características en el problema de análisis de sentimiento

## Entrenamiento y evaluación

Usamos una función para encapsular el entrenamiento y evaluación. 
 * Por simplificar, asumimos alguna de las cosas que serían mejorables - no usar las características de tests en la selección y entrenamiento
 * Experimentamos con diferentes procedimientos para seleccionar características. El clasificador recibe la función `featx` como parámetro 

In [3]:
import collections
import nltk.metrics
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
 

def evaluate_classifier(featx):
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
 
    negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
 
    negcutoff = len(negfeats)*3/4
    poscutoff = len(posfeats)*3/4
 
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))
 
    classifier = NaiveBayesClassifier.train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
 
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
 
    print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
    print 'pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos'])
    print 'pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos'])
    print 'pos F-measure:', nltk.metrics.f_measure(refsets['pos'], testsets['pos'])
    print 'neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg'])
    print 'neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg'])
    print 'neg F-measure:', nltk.metrics.f_measure(refsets['neg'], testsets['neg'])
    classifier.show_most_informative_features()




## Bag of words - all words

In [4]:
def word_feats(words):
    return dict([(word, True) for word in words])
 
evaluate_classifier(word_feats)

train on 1500 instances, test on 500 instances
accuracy: 0.728
pos precision: 0.651595744681
pos recall: 0.98
pos F-measure: 0.782747603834
neg precision: 0.959677419355
neg recall: 0.476
neg F-measure: 0.636363636364
Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
             uninvolving = True              neg : pos    =     11.7 : 1.0
              astounding = True              pos : neg    =     10.3 : 1.0
             fascination = True              pos : neg    =     10.3 : 1.0
                 idiotic = True              neg : pos    =      9.8 : 1.0


## Filtrado de stopwords

In [7]:
from nltk.corpus import stopwords
stopset = set(stopwords.words('english'))
 
def stopword_filtered_word_feats(words):
    return dict([(word, True) for word in words if word not in stopset])
 
evaluate_classifier(stopword_filtered_word_feats)

train on 1500 instances, test on 500 instances
accuracy: 0.726
pos precision: 0.649867374005
pos recall: 0.98
pos F-measure: 0.781499202552
neg precision: 0.959349593496
neg recall: 0.472
neg F-measure: 0.632707774799
Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
             uninvolving = True              neg : pos    =     11.7 : 1.0
              astounding = True              pos : neg    =     10.3 : 1.0
             fascination = True              pos : neg    =     10.3 : 1.0
                 idiotic = True              neg : pos    =      9.8 : 1.0


## Filtrando a N palabras más frecuentes

In [36]:
max_types = 100000

all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
most_common = set([w for (w,f) in all_words.most_common(max_types)])

def most_common_word_feats(words):
    return dict([(word, True) for word in words if word in most_common and word not in stopset])

evaluate_classifier(most_common_word_feats)

train on 1500 instances, test on 500 instances
accuracy: 0.726
pos precision: 0.649867374005
pos recall: 0.98
pos F-measure: 0.781499202552
neg precision: 0.959349593496
neg recall: 0.472
neg F-measure: 0.632707774799
Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
             uninvolving = True              neg : pos    =     11.7 : 1.0
              astounding = True              pos : neg    =     10.3 : 1.0
             fascination = True              pos : neg    =     10.3 : 1.0
                 idiotic = True              neg : pos    =      9.8 : 1.0


## Filtrando por frecuencia

In [55]:
minfreq = 10

most_frequent = [w for w in set(movie_reviews.words()) if all_words[w] > minfreq]

def most_frequent_word_feats(words):
    return dict([(word, True) for word in words if word in most_frequent and word not in stopset])

evaluate_classifier(most_common_word_feats)

train on 1500 instances, test on 500 instances
accuracy: 0.726
pos precision: 0.649867374005
pos recall: 0.98
pos F-measure: 0.781499202552
neg precision: 0.959349593496
neg recall: 0.472
neg F-measure: 0.632707774799
Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
             uninvolving = True              neg : pos    =     11.7 : 1.0
              astounding = True              pos : neg    =     10.3 : 1.0
             fascination = True              pos : neg    =     10.3 : 1.0
                 idiotic = True              neg : pos    =      9.8 : 1.0


## Bigramas

In [16]:
import itertools

def bigram_feats(words):
    bigrams = nltk.bigrams(words)
    return dict([(word, True) for word in bigrams])

evaluate_classifier(bigram_feats)

train on 1500 instances, test on 500 instances
accuracy: 0.754
pos precision: 0.680911680912
pos recall: 0.956
pos F-measure: 0.79534109817
neg precision: 0.926174496644
neg recall: 0.552
neg F-measure: 0.691729323308
Most Informative Features
      (u'not', u'funny') = True              neg : pos    =     13.7 : 1.0
     (u'matt', u'damon') = True              pos : neg    =     13.7 : 1.0
     (u'and', u'boring') = True              neg : pos    =     13.0 : 1.0
  (u'a', u'wonderfully') = True              pos : neg    =     13.0 : 1.0
    (u'is', u'terrific') = True              pos : neg    =     13.0 : 1.0
    (u'perfect', u'for') = True              pos : neg    =     12.3 : 1.0
       (u'a', u'boring') = True              neg : pos    =     11.7 : 1.0
      (u'the', u'magic') = True              pos : neg    =     11.0 : 1.0
        (u'how', u'bad') = True              neg : pos    =     11.0 : 1.0
          (u'son', u"'") = True              pos : neg    =     11.0 : 1.0


## Bigramas y palabras

In [13]:
import itertools

def word_and_bigram_feats(words):
    bigrams = nltk.bigrams(words)
    return dict([(word, True) for word in itertools.chain(words,bigrams)])

evaluate_classifier(word_and_bigram_feats)

train on 1500 instances, test on 500 instances
accuracy: 0.74
pos precision: 0.664835164835
pos recall: 0.968
pos F-measure: 0.788273615635
neg precision: 0.941176470588
neg recall: 0.512
neg F-measure: 0.663212435233
Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
      (u'not', u'funny') = True              neg : pos    =     13.7 : 1.0
     (u'matt', u'damon') = True              pos : neg    =     13.7 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
     (u'and', u'boring') = True              neg : pos    =     13.0 : 1.0
  (u'a', u'wonderfully') = True              pos : neg    =     13.0 : 1.0
    (u'is', u'terrific') = True              pos : neg    =     13.0 : 1.0
    (u'perfect', u'for') = True              pos : neg    =     12.3 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0


## Seleccion de los bigramas más significativos 

In [60]:
BigramCollocationFinder?

In [62]:
BigramAssocMeasures.chi_sq?

In [58]:
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
 
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
 
evaluate_classifier(bigram_word_feats)


train on 1500 instances, test on 500 instances
accuracy: 0.816
pos precision: 0.753205128205
pos recall: 0.94
pos F-measure: 0.836298932384
neg precision: 0.920212765957
neg recall: 0.692
neg F-measure: 0.7899543379
Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0
     (u'matt', u'damon') = True              pos : neg    =     12.3 : 1.0
        (u'give', u'us') = True              neg : pos    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
             uninvolving = True              neg : pos    =     11.7 : 1.0
  (u'absolutely', u'no') = True              neg : pos    =     10.6 : 1.0


## Selección de las palabras más significativas

In [63]:
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
 

word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()

for word in movie_reviews.words(categories = ['pos']):
    word_fd[word.lower()] += 1
    label_word_fd['pos'][word.lower()] +=1
    
for word in movie_reviews.words(categories = ['neg']):
    word_fd[word.lower()] += 1
    label_word_fd['neg'][word.lower()] += 1

pos_word_count = label_word_fd['pos'].N()
neg_word_count = label_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count

word_scores = {}

for word, freq in word_fd.iteritems():
    pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
        (freq, pos_word_count), total_word_count)
    neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
        (freq, neg_word_count), total_word_count)
    word_scores[word] = pos_score + neg_score

best = sorted(word_scores.iteritems(), key=lambda (w,s): s, reverse=True)[:10000]
bestwords = set([w for w, s in best])    
    
def best_word_feats(words):
    return dict([(word, True) for word in words if word in bestwords])

print 'evaluating best word features'
evaluate_classifier(best_word_feats)
 

evaluating best word features
train on 1500 instances, test on 500 instances
accuracy: 0.932
pos precision: 0.894160583942
pos recall: 0.98
pos F-measure: 0.935114503817
neg precision: 0.977876106195
neg recall: 0.884
neg F-measure: 0.928571428571
Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
             uninvolving = True              neg : pos    =     11.7 : 1.0
             fascination = True              pos : neg    =     10.3 : 1.0
              astounding = True              pos : neg    =     10.3 : 1.0
                 idiotic = True              neg : 

## Selección de los bigramas y las palabras más significativos

In [64]:
def best_bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    d = dict([(bigram, True) for bigram in bigrams])
    d.update(best_word_feats(words))
    return d
 
print 'evaluating best words + bigram chi_sq word features'
evaluate_classifier(best_bigram_word_feats)

evaluating best words + bigram chi_sq word features
train on 1500 instances, test on 500 instances
accuracy: 0.92
pos precision: 0.913385826772
pos recall: 0.928
pos F-measure: 0.920634920635
neg precision: 0.926829268293
neg recall: 0.912
neg F-measure: 0.91935483871
Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0
     (u'matt', u'damon') = True              pos : neg    =     12.3 : 1.0
        (u'give', u'us') = True              neg : pos    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
             uninvolving = True              neg : pos    =     11.7 : 1.0
  (u'absolutely', u'no') = Tru