In [1]:
import nltk

In [2]:
def gender_features(word):
    return {'last_letter': word[-1]}
gender_features('Shrek')

{'last_letter': 'k'}

In [3]:
from nltk.corpus import names
labeled_names = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')])

In [4]:
import random
random.shuffle(labeled_names)

In [5]:
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [7]:
featuresets

[({'last_letter': u'a'}, 'female'),
 ({'last_letter': u'o'}, 'female'),
 ({'last_letter': u'l'}, 'female'),
 ({'last_letter': u't'}, 'male'),
 ({'last_letter': u'e'}, 'female'),
 ({'last_letter': u'e'}, 'female'),
 ({'last_letter': u'r'}, 'male'),
 ({'last_letter': u'a'}, 'female'),
 ({'last_letter': u'n'}, 'male'),
 ({'last_letter': u'a'}, 'female'),
 ({'last_letter': u'k'}, 'male'),
 ({'last_letter': u'n'}, 'male'),
 ({'last_letter': u'a'}, 'female'),
 ({'last_letter': u'l'}, 'female'),
 ({'last_letter': u'e'}, 'female'),
 ({'last_letter': u'h'}, 'female'),
 ({'last_letter': u'l'}, 'male'),
 ({'last_letter': u'm'}, 'female'),
 ({'last_letter': u't'}, 'male'),
 ({'last_letter': u'a'}, 'female'),
 ({'last_letter': u'a'}, 'female'),
 ({'last_letter': u'n'}, 'female'),
 ({'last_letter': u's'}, 'male'),
 ({'last_letter': u'n'}, 'male'),
 ({'last_letter': u't'}, 'female'),
 ({'last_letter': u'd'}, 'male'),
 ({'last_letter': u'y'}, 'male'),
 ({'last_letter': u'd'}, 'male'),
 ({'last_letter'

In [8]:
print(nltk.classify.accuracy(classifier, test_set))

0.728


In [9]:
classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = u'a'           female : male   =     35.7 : 1.0
             last_letter = u'k'             male : female =     31.8 : 1.0
             last_letter = u'f'             male : female =     27.7 : 1.0
             last_letter = u'p'             male : female =     12.6 : 1.0
             last_letter = u'm'             male : female =     10.8 : 1.0


In [10]:
def gender_features2(name):
    features = {}
    features["first_letter"] = name[0].lower()
    features["last_letter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = name.lower().count(letter)
        features["has({})".format(letter)] = (letter in name.lower())
    return features

In [11]:
gender_features2('John') 

{'count(a)': 0,
 'count(b)': 0,
 'count(c)': 0,
 'count(d)': 0,
 'count(e)': 0,
 'count(f)': 0,
 'count(g)': 0,
 'count(h)': 1,
 'count(i)': 0,
 'count(j)': 1,
 'count(k)': 0,
 'count(l)': 0,
 'count(m)': 0,
 'count(n)': 1,
 'count(o)': 1,
 'count(p)': 0,
 'count(q)': 0,
 'count(r)': 0,
 'count(s)': 0,
 'count(t)': 0,
 'count(u)': 0,
 'count(v)': 0,
 'count(w)': 0,
 'count(x)': 0,
 'count(y)': 0,
 'count(z)': 0,
 'first_letter': 'j',
 'has(a)': False,
 'has(b)': False,
 'has(c)': False,
 'has(d)': False,
 'has(e)': False,
 'has(f)': False,
 'has(g)': False,
 'has(h)': True,
 'has(i)': False,
 'has(j)': True,
 'has(k)': False,
 'has(l)': False,
 'has(m)': False,
 'has(n)': True,
 'has(o)': True,
 'has(p)': False,
 'has(q)': False,
 'has(r)': False,
 'has(s)': False,
 'has(t)': False,
 'has(u)': False,
 'has(v)': False,
 'has(w)': False,
 'has(x)': False,
 'has(y)': False,
 'has(z)': False,
 'last_letter': 'n'}

In [12]:
featuresets = [(gender_features2(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.786


In [13]:
train_names = labeled_names[1500:]
devtest_names = labeled_names[500:1500]
test_names = labeled_names[:500]

In [16]:
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
test_set = [(gender_features(n), gender) for (n, gender) in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.768


In [17]:
errors = []
for (name, tag) in devtest_names:
     guess = classifier.classify(gender_features(name))
     if guess != tag:
         errors.append( (tag, guess, name) )

In [18]:
for (tag, guess, name) in sorted(errors):
     print('correct={:<8} guess={:<8s} name={:<30}'.format(tag, guess, name))

correct=female   guess=male     name=Ag                            
correct=female   guess=male     name=Aigneis                       
correct=female   guess=male     name=Alison                        
correct=female   guess=male     name=Alyson                        
correct=female   guess=male     name=Anabel                        
correct=female   guess=male     name=Arlyn                         
correct=female   guess=male     name=Astrid                        
correct=female   guess=male     name=Averil                        
correct=female   guess=male     name=Beitris                       
correct=female   guess=male     name=Bess                          
correct=female   guess=male     name=Bliss                         
correct=female   guess=male     name=Brett                         
correct=female   guess=male     name=Brook                         
correct=female   guess=male     name=Caroljean                     
correct=female   guess=male     name=Cat        

In [19]:
def gender_features(word):
     return {'suffix1': word[-1:],
             'suffix2': word[-2:]}

In [20]:
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.777


In [21]:
train_set

[({'suffix1': u'n', 'suffix2': u'in'}, 'female'),
 ({'suffix1': u'e', 'suffix2': u'ie'}, 'female'),
 ({'suffix1': u'n', 'suffix2': u'in'}, 'female'),
 ({'suffix1': u'e', 'suffix2': u'te'}, 'female'),
 ({'suffix1': u'e', 'suffix2': u'ie'}, 'male'),
 ({'suffix1': u's', 'suffix2': u'ns'}, 'female'),
 ({'suffix1': u'e', 'suffix2': u'ie'}, 'female'),
 ({'suffix1': u'e', 'suffix2': u'ie'}, 'female'),
 ({'suffix1': u'y', 'suffix2': u'ey'}, 'female'),
 ({'suffix1': u'y', 'suffix2': u'oy'}, 'male'),
 ({'suffix1': u'p', 'suffix2': u'ip'}, 'male'),
 ({'suffix1': u'n', 'suffix2': u'in'}, 'female'),
 ({'suffix1': u'e', 'suffix2': u'ne'}, 'female'),
 ({'suffix1': u'a', 'suffix2': u'na'}, 'female'),
 ({'suffix1': u'e', 'suffix2': u'ie'}, 'female'),
 ({'suffix1': u'l', 'suffix2': u'el'}, 'male'),
 ({'suffix1': u'e', 'suffix2': u'le'}, 'female'),
 ({'suffix1': u'a', 'suffix2': u'na'}, 'female'),
 ({'suffix1': u'e', 'suffix2': u'ie'}, 'female'),
 ({'suffix1': u'e', 'suffix2': u'ye'}, 'female'),
 ({'suff

## Doc Classification

In [22]:
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

In [24]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]

In [25]:
def document_features(document): 
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [30]:
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [31]:
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(5)

0.7
Most Informative Features
          contains(sans) = True              neg : pos    =      9.0 : 1.0
     contains(uplifting) = True              pos : neg    =      8.6 : 1.0
  contains(effortlessly) = True              pos : neg    =      7.8 : 1.0
    contains(mediocrity) = True              neg : pos    =      7.7 : 1.0
          contains(hugo) = True              pos : neg    =      7.0 : 1.0


In [35]:
def pos_features(sentence, i, history): # [_consec-pos-tag-features]
     features = {"suffix(1)": sentence[i][-1:],
                 "suffix(2)": sentence[i][-2:],
                 "suffix(3)": sentence[i][-3:]}
     if i == 0:
         features["prev-word"] = "<START>"
         features["prev-tag"] = "<START>"
     else:
         features["prev-word"] = sentence[i-1]
         features["prev-tag"] = history[i-1]
     return features

class ConsecutivePosTagger(nltk.TaggerI): # [_consec-pos-tagger]

    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = pos_features(untagged_sent, i, history)
                train_set.append( (featureset, tag) )
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = pos_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)

In [37]:
from nltk.corpus import brown
tagged_sents = brown.tagged_sents(categories='news')
size = int(len(tagged_sents) * 0.1)
train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
tagger = ConsecutivePosTagger(train_sents)
print(tagger.evaluate(test_sents))

0.798052851182


In [38]:
posts = nltk.corpus.nps_chat.xml_posts()[:10000]

In [43]:
def dialogue_act_features(post):
     features = {}
     for word in nltk.word_tokenize(post):
         features['contains({})'.format(word.lower())] = True
     return features

In [45]:
featuresets = [(dialogue_act_features(post.text), post.get('class'))
                for post in posts]
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.668
