In [6]:
from nltk.corpus import names
import random
import pandas as pd
import numpy as np
import nltk

# 1. classify a name's gender

In [7]:
# Fixed seed so the shuffle -- and therefore every train/dev/test slice taken
# from labeled_names later on -- is identical after Restart Kernel -> Run All.
SHUFFLE_SEED = 42

# Pair every name in the NLTK names corpus with the label of the file it
# was read from.
labeled_names = (
    [(name, 'male') for name in names.words('male.txt')]
    + [(name, 'female') for name in names.words('female.txt')]
)

# Shuffle in place with a dedicated, seeded generator. The list is built as
# all 'male' entries followed by all 'female' entries, so prefix slices would
# otherwise be skewed toward a single label.
random.Random(SHUFFLE_SEED).shuffle(labeled_names)

In [8]:
# Total number of (name, gender) pairs in labeled_names.
len(labeled_names)

7944

In [9]:
def gender_features(word):
    """Return the feature dict for a name: its final character.

    Args:
        word: The name to featurize.

    Returns:
        A dict with the single key 'last_letter' mapped to the last
        character of ``word`` (case is left untouched). An empty ``word``
        maps to '' instead of raising IndexError.
    """
    # Slice rather than index so empty input does not raise.
    return {'last_letter': word[-1:]}

# Sanity check: keep the call as the cell's last expression so the displayed
# value is the function's actual return value, not a hand-typed literal.
gender_features('Shrek')

{'last_letter': 'k'}

In [10]:
# Turn every (name, gender) pair into a (feature-dict, gender) pair.
featuresets = [(gender_features(name), label) for name, label in labeled_names]

# Hold out the first 1000 examples for testing; train on everything else.
test_set = featuresets[:1000]
train_set = featuresets[1000:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [11]:
# Accuracy of the trained classifier on the held-out test set.
print(nltk.classify.accuracy(classifier, test_set))

0.752


In [14]:
# Print the 5 most informative features together with their label ratios.
classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = 'a'            female : male   =     35.1 : 1.0
             last_letter = 'k'              male : female =     27.7 : 1.0
             last_letter = 'f'              male : female =     15.6 : 1.0
             last_letter = 'p'              male : female =     11.6 : 1.0
             last_letter = 'd'              male : female =     11.1 : 1.0


In [12]:
# Predict a label for one name, featurized with gender_features.
classifier.classify(gender_features('Neo'))

'male'

In [13]:
# Predict a label for one name, featurized with gender_features.
classifier.classify(gender_features('Trinity'))


'female'

## When working with large corpora, you can split the train and test sets as follows to save memory

In [15]:
from nltk.classify import apply_features

# Same 1000-example hold-out as before, but the feature sets are produced by
# apply_features from the raw (name, gender) pairs instead of a list
# comprehension.
test_set = apply_features(gender_features, labeled_names[:1000])
train_set = apply_features(gender_features, labeled_names[1000:])

## 1.2 Another way to classify a name's gender

In [16]:
def gender_features2(name):
    """Build a richer, case-insensitive feature dict for a name.

    Args:
        name: The name to featurize.

    Returns:
        A dict containing:
          * 'first_letter' / 'last_letter': the lowercased first and last
            characters of ``name`` ('' for an empty name).
          * 'count(x)' and 'has(x)' for each letter x in a-z: how many times
            the letter occurs in the lowercased name, and whether it occurs
            at all.
    """
    # Lowercase once instead of twice per alphabet letter inside the loop.
    lowered = name.lower()
    features = {
        # Slices (not indexes) so an empty name yields '' rather than raising
        # IndexError.
        "first_letter": name[:1].lower(),
        "last_letter": name[-1:].lower(),
    }
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        occurrences = lowered.count(letter)
        features["count({})".format(letter)] = occurrences
        # A single character is "in" the string exactly when its count is > 0.
        features["has({})".format(letter)] = occurrences > 0
    return features

In [18]:
#gender_features2('John') 

In [19]:
# Featurize every labeled name with the richer gender_features2 extractor.
featuresets = [(gender_features2(name), label) for name, label in labeled_names]

# Hold out the first 500 examples for testing; train on the remainder.
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.794


In [22]:
# featuresets is a list; each element is a (feature-dict, gender) tuple
type(featuresets[0])

tuple

## 1.3 error analysis

In [23]:
# Split the labeled names into three non-overlapping slices: the first 500
# for the final test set, the next 1000 for the development (dev-test) set,
# and everything after that for training.
test_names = labeled_names[:500]
devtest_names = labeled_names[500:1500]
train_names = labeled_names[1500:]

In [24]:
# Featurize each of the three splits with gender_features.
train_set = [(gender_features(name), label) for name, label in train_names]
devtest_set = [(gender_features(name), label) for name, label in devtest_names]
test_set = [(gender_features(name), label) for name, label in test_names]

# Fit on the training split only, then report accuracy on the dev-test split.
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.769


In [26]:
# Collect every dev-test name the classifier gets wrong as a
# (correct label, predicted label, name) tuple.
errors = []
for name, tag in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess == tag:
        continue  # correct prediction, nothing to record
    errors.append((tag, guess, name))

In [28]:
#errors

[('female', 'male', 'Cass'),
 ('male', 'female', 'Skelly'),
 ('female', 'male', 'Lurleen'),
 ('male', 'female', 'Yancy'),
 ('male', 'female', 'Samuele'),
 ('female', 'male', 'Jennifer'),
 ('female', 'male', 'Bryn'),
 ('female', 'male', 'Fern'),
 ('female', 'male', 'Bo'),
 ('female', 'male', 'Jazmin'),
 ('male', 'female', 'Corey'),
 ('male', 'female', 'Dennie'),
 ('male', 'female', 'Greggory'),
 ('female', 'male', 'Keren'),
 ('female', 'male', 'Karin'),
 ('female', 'male', 'Starlin'),
 ('female', 'male', 'Jill'),
 ('female', 'male', 'Jackelyn'),
 ('male', 'female', 'Levy'),
 ('male', 'female', 'Allie'),
 ('male', 'female', 'Lesley'),
 ('male', 'female', 'Reggy'),
 ('male', 'female', 'Danie'),
 ('female', 'male', 'Robinet'),
 ('female', 'male', 'Sean'),
 ('female', 'male', 'Chantal'),
 ('female', 'male', 'Gennifer'),
 ('female', 'male', 'Beryl'),
 ('male', 'female', 'Ash'),
 ('male', 'female', 'Towny'),
 ('female', 'male', 'Lynett'),
 ('male', 'female', 'Mace'),
 ('male', 'female', 'Serg

In [27]:
# Each field is left-aligned and padded to a fixed width (8, 8 and 30
# characters) so the sorted errors print as aligned columns.
row_format = 'correct={:<8} guess={:<8s} name={:<30}'
for (tag, guess, name) in sorted(errors):
    print(row_format.format(tag, guess, name))

correct=female   guess=male     name=Abagael                       
correct=female   guess=male     name=Abigail                       
correct=female   guess=male     name=Alex                          
correct=female   guess=male     name=Alisun                        
correct=female   guess=male     name=Alleen                        
correct=female   guess=male     name=Ambur                         
correct=female   guess=male     name=Angel                         
correct=female   guess=male     name=Annabel                       
correct=female   guess=male     name=Ardys                         
correct=female   guess=male     name=Arleen                        
correct=female   guess=male     name=Arlen                         
correct=female   guess=male     name=Astrix                        
correct=female   guess=male     name=Berget                        
correct=female   guess=male     name=Beryl                         
correct=female   guess=male     name=Bidget     

## 1.4 Improve the extracted features

In [30]:
# Use the last letter and the last two letters of the name as features.
def gender_features(word):
    """Return the one- and two-character suffixes of a name as features.

    NOTE: this reuses the name ``gender_features`` of the earlier last-letter
    extractor, so once this cell has run, every later call (including a
    re-run of an earlier cell) uses this version.

    Slicing is used, so a name shorter than two characters simply yields
    the whole name for 'suffix2', and an empty name yields '' for both.
    """
    last_one = word[-1:]
    last_two = word[-2:]
    return {'suffix1': last_one, 'suffix2': last_two}

In [31]:
# Re-featurize the train and dev-test names with the current gender_features
# definition, retrain, and score on the dev-test split again.
train_set = [(gender_features(name), label) for name, label in train_names]
devtest_set = [(gender_features(name), label) for name, label in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)

print(nltk.classify.accuracy(classifier, devtest_set))

0.778
