<a href="https://colab.research.google.com/github/teddydavidson245/NLTK_Gender_Classifier/blob/main/NLTK_Gender_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
def gender_features(word):
    return {'last_letter': word[-1]}
gender_features('Teddy')


{'last_letter': 'y'}

In [2]:
import random
import nltk
nltk.download('names')
from nltk.corpus import names
labeled_names = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')])

random.shuffle(labeled_names)

featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]


[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Unzipping corpora/names.zip.


In [3]:
len(featuresets)

7944

In [4]:
train_set, test_set, dev_set = featuresets[1588:], featuresets[:794], featuresets[794:1588]

In [5]:
classifier = nltk.NaiveBayesClassifier.train(dev_set)

In [6]:
print(classifier.classify(gender_features('Teddy')))

print(classifier.classify(gender_features('Terry')))

female
female


In [7]:
nltk.classify.accuracy(classifier, dev_set)

0.7758186397984886

In [8]:
classifier.show_most_informative_features(10)

Most Informative Features
             last_letter = 'a'            female : male   =     21.2 : 1.0
             last_letter = 'k'              male : female =     10.8 : 1.0
             last_letter = 'o'              male : female =     10.0 : 1.0
             last_letter = 'd'              male : female =      8.7 : 1.0
             last_letter = 'g'              male : female =      5.7 : 1.0
             last_letter = 'r'              male : female =      5.1 : 1.0
             last_letter = 'w'              male : female =      4.5 : 1.0
             last_letter = 'h'              male : female =      3.2 : 1.0
             last_letter = 't'              male : female =      3.1 : 1.0
             last_letter = 'i'            female : male   =      2.9 : 1.0


In [9]:
# from nltk.classify import apply_features
# train_set = apply_features(gender_features, labeled_names[500:])
# test_set = apply_features(gender_features, labeled_names[:500])

# """When working with large corpora, constructing a single list that contains the features of every instance can use up a large amount of memory. 
# In these cases, use the function nltk.classify.apply_features, which returns an object that acts like a list but does not store all the feature sets in memory"""

### Checking whether other features can help us distinguish the words better

1. Name Length

In [10]:
def gender_features2(word):
    return {'name_length': len(word)}
gender_features2('Teddy')

{'name_length': 5}

In [11]:
featuresets2 = [(gender_features2(n), gender) for (n, gender) in labeled_names]
train_set2, test_set2, dev_set2 = featuresets2[1588:], featuresets2[:794], featuresets2[794:1588]
classifier2 = nltk.NaiveBayesClassifier.train(train_set2)

print(classifier2.classify(gender_features2('Teddy')))

print(classifier2.classify(gender_features2('Michael')))

female
female


In [12]:
nltk.classify.accuracy(classifier2, dev_set)

0.6612090680100756

In [13]:
classifier2.show_most_informative_features(10)

Most Informative Features
             name_length = 2                male : female =      2.8 : 1.0
             name_length = 3                male : female =      2.0 : 1.0
             name_length = 12               male : female =      1.3 : 1.0
             name_length = 10             female : male   =      1.3 : 1.0
             name_length = 9              female : male   =      1.3 : 1.0
             name_length = 4                male : female =      1.3 : 1.0
             name_length = 11             female : male   =      1.2 : 1.0
             name_length = 7              female : male   =      1.1 : 1.0
             name_length = 8              female : male   =      1.1 : 1.0
             name_length = 5              female : male   =      1.1 : 1.0


2. First letter

In [14]:
def gender_features3(word):
    return {'first_letter': word[0]}
gender_features3('Teddy')

{'first_letter': 'T'}

In [15]:
featuresets3 = [(gender_features3(n), gender) for (n, gender) in labeled_names]
train_set3, test_set3, dev_set3 = featuresets3[1588:], featuresets3[:794], featuresets3[794:1588]
classifier3 = nltk.NaiveBayesClassifier.train(train_set3)

print(classifier3.classify(gender_features3('Teddy')))

print(classifier3.classify(gender_features3('Michael')))

female
female


In [16]:
nltk.classify.accuracy(classifier3, dev_set3)

0.6712846347607053

In [17]:
classifier3.show_most_informative_features(10)

Most Informative Features
            first_letter = 'W'              male : female =      4.8 : 1.0
            first_letter = 'U'              male : female =      2.6 : 1.0
            first_letter = 'X'              male : female =      2.4 : 1.0
            first_letter = 'K'            female : male   =      2.4 : 1.0
            first_letter = 'Q'              male : female =      2.2 : 1.0
            first_letter = 'H'              male : female =      2.2 : 1.0
            first_letter = 'L'            female : male   =      1.8 : 1.0
            first_letter = 'T'              male : female =      1.6 : 1.0
            first_letter = 'Z'              male : female =      1.6 : 1.0
            first_letter = 'Y'              male : female =      1.6 : 1.0


3. First and Last letter

In [18]:
def gender_features4(word):
     features = {}
     features["first_letter"] = word[0].lower()
     features["last_letter"] = word[-1].lower()
     return features
gender_features4('Teddy')

{'first_letter': 't', 'last_letter': 'y'}

In [19]:
featuresets4 = [(gender_features4(n), gender) for (n, gender) in labeled_names]
train_set4, test_set4, dev_set4 = featuresets4[1588:], featuresets4[:794], featuresets4[794:1588]
classifier4 = nltk.NaiveBayesClassifier.train(train_set4)

print(classifier4.classify(gender_features4('Teddy')))

print(classifier4.classify(gender_features4('Michael')))

male
female


In [20]:
nltk.classify.accuracy(classifier4, dev_set4)

0.7858942065491183

In [21]:
nltk.classify.accuracy(classifier4, test_set4)

0.7959697732997482

In [22]:
classifier4.show_most_informative_features(10)

Most Informative Features
             last_letter = 'k'              male : female =     35.8 : 1.0
             last_letter = 'a'            female : male   =     33.3 : 1.0
             last_letter = 'f'              male : female =     14.1 : 1.0
             last_letter = 'm'              male : female =     10.8 : 1.0
             last_letter = 'd'              male : female =      9.5 : 1.0
             last_letter = 'v'              male : female =      8.9 : 1.0
             last_letter = 'o'              male : female =      7.9 : 1.0
             last_letter = 'p'              male : female =      7.6 : 1.0
             last_letter = 'r'              male : female =      7.5 : 1.0
             last_letter = 'w'              male : female =      6.3 : 1.0


In [23]:
from nltk.metrics import *
import collections



In [24]:
# trainsets = collections.defaultdict(set)
# testsets = collections.defaultdict(set)

# for i, (feats, label) in enumerate(test_set4):
#     trainsets[label].add(i)
#     observed = classifier4.classify(feats)
#     testsets[observed].add(i)

In [25]:
# print( 'Precision:', nltk.metrics.precision(trainsets, testsets) )
# print( 'Recall:', nltk.metrics.recall(trainsets, testsets) )

In [26]:
def precision_recall(classifier, test_dict):
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(test_dict):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
    print ('male precision:', precision(refsets['male'], testsets['male']))
    print ('male recall:', recall(refsets['male'], testsets['male']))
    print ('male F-measure:', f_measure(refsets['male'], testsets['male']))
    print ('female precision:', precision(refsets['female'], testsets['female']))
    print ('female recall:', recall(refsets['female'], testsets['female']))
    print ('female F-measure:', f_measure(refsets['female'], testsets['female']))

In [27]:
precision_recall(classifier4,test_set4)

male precision: 0.7193675889328063
male recall: 0.6666666666666666
male F-measure: 0.6920152091254752
female precision: 0.8317929759704251
female recall: 0.8637236084452975
female F-measure: 0.8474576271186439


In [28]:
classifier5 = nltk.DecisionTreeClassifier.train(train_set4)

In [29]:
nltk.classify.accuracy(classifier5, dev_set4)

0.792191435768262

In [30]:
nltk.classify.accuracy(classifier5, test_set4)

0.7871536523929471

In [31]:
precision_recall(classifier5, test_set4)

male precision: 0.7
male recall: 0.6666666666666666
male F-measure: 0.6829268292682926
female precision: 0.8295880149812734
female recall: 0.8502879078694817
female F-measure: 0.8398104265402844


In [49]:
errors = []
for (name, tag) in test_set2:
    guess = classifier2.classify(gender_features2(name))
    if guess != tag:         
      errors.append((tag, guess, name))

In [None]:
for (tag, guess, name) in errors:
  print('correct={} guess={} length={}'.format(tag, guess, name))