1. Supervised Classification

In [12]:
# Gender Identification

def gender_features(word):
    return {'last_letter':word[-1], "length": len(word), "first_letter": word[0]}

gender_features("Shrek")

{'last_letter': 'k', 'length': 5, 'first_letter': 'S'}

In [13]:
from nltk.corpus import names
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
[(name, 'female') for name in names.words('female.txt')])

import random
random.shuffle(labeled_names)
labeled_names

[('Bobbie', 'male'),
 ('Saxon', 'male'),
 ('Oralie', 'female'),
 ('Issie', 'female'),
 ('Kessiah', 'female'),
 ('Ramonda', 'female'),
 ('Saunders', 'male'),
 ('Golda', 'female'),
 ('Aubine', 'female'),
 ('Aileen', 'female'),
 ('Yvonne', 'female'),
 ('Francis', 'male'),
 ('Maye', 'female'),
 ('Edouard', 'male'),
 ('Claudina', 'female'),
 ('Evaleen', 'female'),
 ('Prasad', 'male'),
 ('Karole', 'female'),
 ('Terencio', 'male'),
 ('Korella', 'female'),
 ('Scarface', 'male'),
 ('Ambrosia', 'female'),
 ('Pauline', 'female'),
 ('Eugene', 'male'),
 ('Gay', 'male'),
 ('Joly', 'female'),
 ('Nerissa', 'female'),
 ('Gerhard', 'male'),
 ('Vikki', 'female'),
 ('Merissa', 'female'),
 ('Yehudit', 'female'),
 ('Johny', 'male'),
 ('Theodora', 'female'),
 ('Gladi', 'female'),
 ('Corey', 'male'),
 ('Joey', 'female'),
 ('Ozzy', 'male'),
 ('Zuzana', 'female'),
 ('Athene', 'female'),
 ('Fiorenze', 'female'),
 ('Nahum', 'male'),
 ('April', 'female'),
 ('Chastity', 'female'),
 ('Juliana', 'female'),
 ('Britney

In [14]:
import nltk

featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [15]:
classifier.classify(gender_features("Neo"))

'male'

In [16]:
classifier.classify(gender_features("Maria"))

'female'

In [17]:
classifier.classify(gender_features("Anri"))

'female'

In [18]:
print(nltk.classify.accuracy(classifier, test_set))

0.772


In [20]:
classifier.show_most_informative_features(20)

Most Informative Features
             last_letter = 'a'            female : male   =     37.0 : 1.0
             last_letter = 'k'              male : female =     31.1 : 1.0
             last_letter = 'f'              male : female =     16.6 : 1.0
             last_letter = 'p'              male : female =     12.5 : 1.0
             last_letter = 'v'              male : female =     11.2 : 1.0
             last_letter = 'd'              male : female =      9.1 : 1.0
             last_letter = 'm'              male : female =      9.1 : 1.0
             last_letter = 'o'              male : female =      8.1 : 1.0
             last_letter = 'r'              male : female =      6.7 : 1.0
             last_letter = 'z'              male : female =      5.6 : 1.0
             last_letter = 'w'              male : female =      5.4 : 1.0
            first_letter = 'W'              male : female =      5.0 : 1.0
             last_letter = 'g'              male : female =      4.6 : 1.0

In [21]:
from nltk.classify import apply_features
train_set = apply_features(gender_features, labeled_names[500:])
test_set = apply_features(gender_features, labeled_names[:500])

In [25]:
# Choosing the right features

def gender_features2(name):
    features = {}
    features["first_letter"] = name[0].lower()
    features["last_letter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = name.lower().count(letter)
        features["has({})".format(letter)] = (letter in name.lower())
    return features

gender_features2("John")

{'first_letter': 'j',
 'last_letter': 'n',
 'count(a)': 0,
 'has(a)': False,
 'count(b)': 0,
 'has(b)': False,
 'count(c)': 0,
 'has(c)': False,
 'count(d)': 0,
 'has(d)': False,
 'count(e)': 0,
 'has(e)': False,
 'count(f)': 0,
 'has(f)': False,
 'count(g)': 0,
 'has(g)': False,
 'count(h)': 1,
 'has(h)': True,
 'count(i)': 0,
 'has(i)': False,
 'count(j)': 1,
 'has(j)': True,
 'count(k)': 0,
 'has(k)': False,
 'count(l)': 0,
 'has(l)': False,
 'count(m)': 0,
 'has(m)': False,
 'count(n)': 1,
 'has(n)': True,
 'count(o)': 1,
 'has(o)': True,
 'count(p)': 0,
 'has(p)': False,
 'count(q)': 0,
 'has(q)': False,
 'count(r)': 0,
 'has(r)': False,
 'count(s)': 0,
 'has(s)': False,
 'count(t)': 0,
 'has(t)': False,
 'count(u)': 0,
 'has(u)': False,
 'count(v)': 0,
 'has(v)': False,
 'count(w)': 0,
 'has(w)': False,
 'count(x)': 0,
 'has(x)': False,
 'count(y)': 0,
 'has(y)': False,
 'count(z)': 0,
 'has(z)': False}

In [26]:
featuresets = [(gender_features2(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.792


In [27]:
# Once a initial feature set is chosen, a very productive method for 
# refining features is to do error analysis

train_names = labeled_names[1500:]
devtest_names = labeled_names[500:1500]
test_names = labeled_names[:500]

train_set = [(gender_features2(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features2(n), gender) for (n, gender) in devtest_names]
test_set = [(gender_features2(n), gender) for (n, gender) in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.782


In [28]:
errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append((tag, guess, name))
errors

[('female', 'male', 'Kathleen'),
 ('female', 'male', 'Eran'),
 ('female', 'male', 'Oprah'),
 ('male', 'female', 'Gustave'),
 ('female', 'male', 'Barbey'),
 ('male', 'female', 'Rikki'),
 ('female', 'male', 'Adelind'),
 ('male', 'female', 'Duane'),
 ('female', 'male', 'Aubrey'),
 ('female', 'male', 'Delly'),
 ('female', 'male', 'Katey'),
 ('female', 'male', 'Gill'),
 ('female', 'male', 'Brook'),
 ('female', 'male', 'Gaby'),
 ('male', 'female', 'Rodolphe'),
 ('female', 'male', 'Aubry'),
 ('male', 'female', 'Ajai'),
 ('male', 'female', 'Robbie'),
 ('female', 'male', 'Hetty'),
 ('female', 'male', 'Gwyneth'),
 ('female', 'male', 'Doralynn'),
 ('female', 'male', 'Mady'),
 ('female', 'male', 'Ceciley'),
 ('female', 'male', 'Loren'),
 ('female', 'male', 'Charleen'),
 ('female', 'male', 'Cammy'),
 ('female', 'male', 'Jacklin'),
 ('female', 'male', 'Evey'),
 ('female', 'male', 'Mag'),
 ('female', 'male', 'Ginger'),
 ('female', 'male', 'Audrey'),
 ('female', 'male', 'Margalo'),
 ('male', 'female',

In [29]:
def gender_features3(name):
    features = {}
    features["first_letter"] = name[0].lower()
    features["suffix1"] = name[-1].lower()
    features["suffix2"] = name[-2:].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = name.lower().count(letter)
        features["has({})".format(letter)] = (letter in name.lower())
    return features

featuresets = [(gender_features3(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.814
