In [60]:
import nltk
from nltk.corpus import sentence_polarity
import random

In [61]:
sentences = sentence_polarity.sents()
print(len(sentences))
print(type(sentences))
print(sentence_polarity.categories())
for sent in sentences[:4]:
    print(sent)

10662
<class 'nltk.corpus.reader.util.ConcatenatedCorpusView'>
['neg', 'pos']
['simplistic', ',', 'silly', 'and', 'tedious', '.']
["it's", 'so', 'laddish', 'and', 'juvenile', ',', 'only', 'teenage', 'boys', 'could', 'possibly', 'find', 'it', 'funny', '.']
['exploitative', 'and', 'largely', 'devoid', 'of', 'the', 'depth', 'or', 'sophistication', 'that', 'would', 'make', 'watching', 'such', 'a', 'graphic', 'treatment', 'of', 'the', 'crimes', 'bearable', '.']
['[garbus]', 'discards', 'the', 'potential', 'for', 'pathological', 'study', ',', 'exhuming', 'instead', ',', 'the', 'skewed', 'melodrama', 'of', 'the', 'circumstantial', 'situation', '.']


In [62]:
pos_sents = sentence_polarity.sents(categories='pos')
print(len(pos_sents))
neg_sents = sentence_polarity.sents(categories='neg')
print(len(neg_sents))

5331
5331


In [63]:
documents = [(sent, cat) for cat in sentence_polarity.categories() 
        for sent in sentence_polarity.sents(categories=cat)]

In [64]:
print(documents[0])
print(documents[-1])

(['simplistic', ',', 'silly', 'and', 'tedious', '.'], 'neg')
(['provides', 'a', 'porthole', 'into', 'that', 'noble', ',', 'trembling', 'incoherence', 'that', 'defines', 'us', 'all', '.'], 'pos')


In [114]:
random.shuffle(documents)

In [115]:
all_words_list = [word for (sent,cat) in documents for word in sent]
all_words = nltk.FreqDist(all_words_list)
word_items = all_words.most_common(2000)
word_features = [word for (word, freq) in word_items]
# look at the first 50 words in the most frequent list of words
print(word_features[:50])

['.', 'the', ',', 'a', 'and', 'of', 'to', 'is', 'in', 'that', 'it', 'as', 'but', 'with', 'film', 'this', 'for', 'its', 'an', 'movie', "it's", 'be', 'on', 'you', 'not', 'by', 'about', 'more', 'one', 'like', 'has', 'are', 'at', 'from', 'than', '"', 'all', '--', 'his', 'have', 'so', 'if', 'or', 'story', 'i', 'too', 'just', 'who', 'into', 'what']


In [116]:
def document_features(document, word_features):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    return features

In [117]:
featuresets = [(document_features(d,word_features), c) for (d,c) in documents]
train_set, test_set = featuresets[1000:], featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print (nltk.classify.accuracy(classifier, test_set))

0.774


In [93]:
classifier.show_most_informative_features(30)

Most Informative Features
            V_engrossing = True              pos : neg    =     20.5 : 1.0
                V_stupid = True              neg : pos    =     17.5 : 1.0
              V_mediocre = True              neg : pos    =     16.9 : 1.0
                V_beauty = True              pos : neg    =     15.8 : 1.0
               V_generic = True              neg : pos    =     15.5 : 1.0
               V_routine = True              neg : pos    =     14.2 : 1.0
                  V_flat = True              neg : pos    =     14.1 : 1.0
                V_boring = True              neg : pos    =     14.0 : 1.0
             V_inventive = True              pos : neg    =     13.1 : 1.0
            V_refreshing = True              pos : neg    =     13.1 : 1.0
          V_refreshingly = True              pos : neg    =     12.4 : 1.0
                  V_warm = True              pos : neg    =     12.3 : 1.0
              V_powerful = True              pos : neg    =     12.2 : 1.0

In [27]:
SLpath = "subjclueslen1-HLTEMNLP05.tff"

In [28]:
def readSubjectivity(path):
    flexicon = open(path, 'r')
    # initialize an empty dictionary
    sldict = { }
    for line in flexicon:
        fields = line.split()   # default is to split on whitespace
        # split each field on the '=' and keep the second part as the value
        strength = fields[0].split("=")[1]
        word = fields[2].split("=")[1]
        posTag = fields[3].split("=")[1]
        stemmed = fields[4].split("=")[1]
        polarity = fields[5].split("=")[1]
        if (stemmed == 'y'):
            isStemmed = True
        else:
            isStemmed = False
        # put a dictionary entry with the word as the keyword
        #     and a list of the other values
        sldict[word] = [strength, posTag, isStemmed, polarity]
    return sldict

In [29]:
SL = readSubjectivity(SLpath)

In [30]:
SL['absolute']
SL['shabby']

['strongsubj', 'adj', False, 'negative']

In [32]:
strength, posTag, isStemmed, polarity = SL['absolute']
print(polarity)

neutral


In [33]:
def SL_features(document, SL):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
        # count variables for the 4 classes of subjectivity
        weakPos = 0
        strongPos = 0
        weakNeg = 0
        strongNeg = 0
        for word in document_words:
            if word in SL:
                strength, posTag, isStemmed, polarity = SL[word]
                if strength == 'weaksubj' and polarity == 'positive':
                    weakPos += 1
                if strength == 'strongsubj' and polarity == 'positive':
                    strongPos += 1
                if strength == 'weaksubj' and polarity == 'negative':
                    weakNeg += 1
                if strength == 'strongsubj' and polarity == 'negative':
                    strongNeg += 1
                features['positivecount'] = weakPos + (2 * strongPos)
                features['negativecount'] = weakNeg + (2 * strongNeg)      
    return features

In [34]:
SL_featuresets = [(SL_features(d, SL), c) for (d,c) in documents]

In [35]:
print(SL_featuresets[0][0]['positivecount'])
print(SL_featuresets[0][0]['negativecount'])

1
0


In [36]:
SL_featuresets[0][1]

'pos'

In [39]:
train_set, test_set = SL_featuresets[1000:], SL_featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.764

In [40]:
for sent in list(sentences)[:50]:
   for word in sent:
     if (word.endswith("n't")):
       print(sent)

['there', 'is', 'a', 'difference', 'between', 'movies', 'with', 'the', 'courage', 'to', 'go', 'over', 'the', 'top', 'and', 'movies', 'that', "don't", 'care', 'about', 'being', 'stupid']
['a', 'farce', 'of', 'a', 'parody', 'of', 'a', 'comedy', 'of', 'a', 'premise', ',', 'it', "isn't", 'a', 'comparison', 'to', 'reality', 'so', 'much', 'as', 'it', 'is', 'a', 'commentary', 'about', 'our', 'knowledge', 'of', 'films', '.']
['i', "didn't", 'laugh', '.', 'i', "didn't", 'smile', '.', 'i', 'survived', '.']
['i', "didn't", 'laugh', '.', 'i', "didn't", 'smile', '.', 'i', 'survived', '.']
['most', 'of', 'the', 'problems', 'with', 'the', 'film', "don't", 'derive', 'from', 'the', 'screenplay', ',', 'but', 'rather', 'the', 'mediocre', 'performances', 'by', 'most', 'of', 'the', 'actors', 'involved']
['the', 'lack', 'of', 'naturalness', 'makes', 'everything', 'seem', 'self-consciously', 'poetic', 'and', 'forced', '.', '.', '.', "it's", 'a', 'pity', 'that', "[nelson's]", 'achievement', "doesn't", 'match'

In [41]:
negationwords = ['no', 'not', 'never', 'none', 'nowhere', 'nothing', 'noone', 'rather', 'hardly', 'scarcely', 'rarely', 'seldom', 'neither', 'nor']

In [42]:
def NOT_features(document, word_features, negationwords):
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = False
        features['V_NOT{}'.format(word)] = False
    # go through document words in order
    for i in range(0, len(document)):
        word = document[i]
        if ((i + 1) < len(document)) and ((word in negationwords) or (word.endswith("n't"))):
            i += 1
            features['V_NOT{}'.format(document[i])] = (document[i] in word_features)
        else:
            features['V_{}'.format(word)] = (word in word_features)
    return features

In [45]:
NOT_featuresets = [(NOT_features(d, word_features, negationwords), c) for (d, c) in documents]

In [47]:
print(NOT_featuresets[0][0]['V_NOTcare'])
print(NOT_featuresets[0][0]['V_always'])

False
False


In [51]:
train_set, test_set = NOT_featuresets[200:], NOT_featuresets[:200]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.75


In [53]:
stopwords = nltk.corpus.stopwords.words('english')
len(stopwords)

179

In [55]:
newstopwords = [word for word in stopwords if word not in negationwords]
len(newstopwords)

176

In [118]:
new_all_words_list = [word for word in all_words_list if word not in newstopwords]
new_all_words = nltk.FreqDist(new_all_words_list)
new_word_items = new_all_words.most_common(2000)
new_word_features = [word for (word,count) in new_word_items]
print(new_word_features[:30])

['.', ',', 'film', 'movie', 'not', 'one', 'like', '"', '--', 'story', 'no', 'much', 'even', 'good', 'comedy', 'time', 'characters', 'little', 'way', 'funny', 'make', 'enough', 'never', 'makes', 'may', 'us', 'work', 'best', 'bad', 'director']


In [119]:
featuresets2 = [(document_features(d,new_word_features), c) for (d,c) in documents]
train_set2, test_set2 = featuresets2[1000:], featuresets2[:1000]
classifier2 = nltk.NaiveBayesClassifier.train(train_set2)
print (nltk.classify.accuracy(classifier2, test_set2))

0.755


In [58]:
classifier.show_most_informative_features(30)

Most Informative Features
                  V_flat = True              neg : pos    =     24.3 : 1.0
            V_engrossing = True              pos : neg    =     21.7 : 1.0
               V_generic = True              neg : pos    =     17.0 : 1.0
              V_mediocre = True              neg : pos    =     17.0 : 1.0
               V_routine = True              neg : pos    =     16.3 : 1.0
            V_unexpected = True              pos : neg    =     15.7 : 1.0
             V_inventive = True              pos : neg    =     15.0 : 1.0
                V_boring = True              neg : pos    =     14.7 : 1.0
            V_refreshing = True              pos : neg    =     14.3 : 1.0
              V_haunting = True              pos : neg    =     13.0 : 1.0
          V_refreshingly = True              pos : neg    =     13.0 : 1.0
                  V_warm = True              pos : neg    =     13.0 : 1.0
                    V_90 = True              neg : pos    =     13.0 : 1.0

In [120]:
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '