1. Supervised Classification

In [1]:
# Gender Identification

def gender_features(word):
    return {'last_letter':word[-1], "length": len(word), "first_letter": word[0]}

gender_features("Shrek")

{'last_letter': 'k', 'length': 5, 'first_letter': 'S'}

In [2]:
from nltk.corpus import names
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
[(name, 'female') for name in names.words('female.txt')])

import random
random.shuffle(labeled_names)
labeled_names

[('Rodge', 'male'),
 ('Simeon', 'male'),
 ('Ivie', 'female'),
 ('Flore', 'female'),
 ('Cindra', 'female'),
 ('Marris', 'female'),
 ('Trevor', 'male'),
 ('Andrea', 'male'),
 ('Hakim', 'male'),
 ('Maynard', 'male'),
 ('Danie', 'male'),
 ('Idalia', 'female'),
 ('Anjanette', 'female'),
 ('Sheilah', 'female'),
 ('Nina', 'female'),
 ('Neddy', 'male'),
 ('Yolanda', 'female'),
 ('Cinda', 'female'),
 ('Connie', 'male'),
 ('Roland', 'male'),
 ('Brandy', 'male'),
 ('Merla', 'female'),
 ('Annecorinne', 'female'),
 ('Miquela', 'female'),
 ('Dionis', 'male'),
 ('Marcelle', 'female'),
 ('Melva', 'female'),
 ('Josie', 'female'),
 ('Candace', 'female'),
 ('Latia', 'female'),
 ('Bettie', 'female'),
 ('Anne-Corinne', 'female'),
 ('Pamela', 'female'),
 ('Evan', 'male'),
 ('Toinette', 'female'),
 ('Daniele', 'female'),
 ('Aretha', 'female'),
 ('Templeton', 'male'),
 ('Eberhard', 'male'),
 ('Moyra', 'female'),
 ('Eolanda', 'female'),
 ('Roberta', 'female'),
 ('Clinten', 'male'),
 ('Eugenie', 'female'),
 ('B

In [3]:
import nltk

featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [4]:
classifier.classify(gender_features("Neo"))

'male'

In [5]:
classifier.classify(gender_features("Maria"))

'female'

In [6]:
classifier.classify(gender_features("Anri"))

'female'

In [7]:
print(nltk.classify.accuracy(classifier, test_set))

0.748


In [8]:
classifier.show_most_informative_features(20)

Most Informative Features
             last_letter = 'a'            female : male   =     38.4 : 1.0
             last_letter = 'k'              male : female =     31.3 : 1.0
             last_letter = 'f'              male : female =     24.3 : 1.0
             last_letter = 'v'              male : female =     18.7 : 1.0
             last_letter = 'p'              male : female =     10.5 : 1.0
             last_letter = 'm'              male : female =      9.8 : 1.0
             last_letter = 'd'              male : female =      9.3 : 1.0
             last_letter = 'o'              male : female =      8.3 : 1.0
             last_letter = 'r'              male : female =      7.0 : 1.0
             last_letter = 'w'              male : female =      6.2 : 1.0
             last_letter = 'g'              male : female =      5.2 : 1.0
            first_letter = 'W'              male : female =      4.7 : 1.0
             last_letter = 'z'              male : female =      4.6 : 1.0

In [9]:
from nltk.classify import apply_features
train_set = apply_features(gender_features, labeled_names[500:])
test_set = apply_features(gender_features, labeled_names[:500])

In [10]:
# Choosing the right features

def gender_features2(name):
    features = {}
    features["first_letter"] = name[0].lower()
    features["last_letter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = name.lower().count(letter)
        features["has({})".format(letter)] = (letter in name.lower())
    return features

gender_features2("John")

{'first_letter': 'j',
 'last_letter': 'n',
 'count(a)': 0,
 'has(a)': False,
 'count(b)': 0,
 'has(b)': False,
 'count(c)': 0,
 'has(c)': False,
 'count(d)': 0,
 'has(d)': False,
 'count(e)': 0,
 'has(e)': False,
 'count(f)': 0,
 'has(f)': False,
 'count(g)': 0,
 'has(g)': False,
 'count(h)': 1,
 'has(h)': True,
 'count(i)': 0,
 'has(i)': False,
 'count(j)': 1,
 'has(j)': True,
 'count(k)': 0,
 'has(k)': False,
 'count(l)': 0,
 'has(l)': False,
 'count(m)': 0,
 'has(m)': False,
 'count(n)': 1,
 'has(n)': True,
 'count(o)': 1,
 'has(o)': True,
 'count(p)': 0,
 'has(p)': False,
 'count(q)': 0,
 'has(q)': False,
 'count(r)': 0,
 'has(r)': False,
 'count(s)': 0,
 'has(s)': False,
 'count(t)': 0,
 'has(t)': False,
 'count(u)': 0,
 'has(u)': False,
 'count(v)': 0,
 'has(v)': False,
 'count(w)': 0,
 'has(w)': False,
 'count(x)': 0,
 'has(x)': False,
 'count(y)': 0,
 'has(y)': False,
 'count(z)': 0,
 'has(z)': False}

In [11]:
featuresets = [(gender_features2(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.768


In [12]:
# Once a initial feature set is chosen, a very productive method for 
# refining features is to do error analysis

train_names = labeled_names[1500:]
devtest_names = labeled_names[500:1500]
test_names = labeled_names[:500]

train_set = [(gender_features2(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features2(n), gender) for (n, gender) in devtest_names]
test_set = [(gender_features2(n), gender) for (n, gender) in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.777


In [13]:
errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append((tag, guess, name))
errors

[('female', 'male', 'Sharon'),
 ('female', 'male', 'Cris'),
 ('female', 'male', 'Edy'),
 ('male', 'female', 'Vite'),
 ('male', 'female', 'Lyle'),
 ('male', 'female', 'Roice'),
 ('female', 'male', 'Joly'),
 ('female', 'male', 'Buffy'),
 ('female', 'male', 'Bethany'),
 ('male', 'female', 'Giorgi'),
 ('male', 'female', 'Alfonse'),
 ('female', 'male', 'Loreen'),
 ('female', 'male', 'Dido'),
 ('female', 'male', 'Courtney'),
 ('female', 'male', 'Teryl'),
 ('female', 'male', 'Inger'),
 ('female', 'male', 'Theo'),
 ('female', 'male', 'Roxy'),
 ('female', 'male', 'Rosamund'),
 ('female', 'male', 'Stacy'),
 ('female', 'male', 'Ruth'),
 ('male', 'female', 'Dennie'),
 ('female', 'male', 'Karmen'),
 ('male', 'female', 'Vale'),
 ('male', 'female', 'Lawerence'),
 ('female', 'male', 'Siobhan'),
 ('female', 'male', 'Billy'),
 ('female', 'male', 'Sharyl'),
 ('male', 'female', 'Olle'),
 ('female', 'male', 'Nariko'),
 ('female', 'male', 'Courtenay'),
 ('female', 'male', 'Gwen'),
 ('male', 'female', 'Jerem

In [14]:
def gender_features3(name):
    features = {}
    features["first_letter"] = name[0].lower()
    features["suffix1"] = name[-1].lower()
    features["suffix2"] = name[-2:].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = name.lower().count(letter)
        features["has({})".format(letter)] = (letter in name.lower())
    return features

featuresets = [(gender_features3(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.78


In [16]:
# Document Classification

from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
            for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

In [18]:
# Define feature extractor for documents

all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains{}'.format(word)] = (word in document_words)
    return features

print(document_features(movie_reviews.words('pos/cv957_8737.txt')))

{'containsplot': True, 'contains:': True, 'containstwo': True, 'containsteen': False, 'containscouples': False, 'containsgo': False, 'containsto': True, 'containsa': True, 'containschurch': False, 'containsparty': False, 'contains,': True, 'containsdrink': False, 'containsand': True, 'containsthen': True, 'containsdrive': False, 'contains.': True, 'containsthey': True, 'containsget': True, 'containsinto': True, 'containsan': True, 'containsaccident': False, 'containsone': True, 'containsof': True, 'containsthe': True, 'containsguys': False, 'containsdies': False, 'containsbut': True, 'containshis': True, 'containsgirlfriend': True, 'containscontinues': False, 'containssee': False, 'containshim': True, 'containsin': True, 'containsher': False, 'containslife': False, 'containshas': True, 'containsnightmares': False, 'containswhat': True, "contains'": True, 'containss': True, 'containsdeal': False, 'contains?': False, 'containswatch': True, 'containsmovie': True, 'contains"': True, 'conta

In [20]:
featuresets = [(document_features(d), c) for (d, c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [21]:
print(nltk.classify.accuracy(classifier, test_set))

0.78


In [22]:
classifier.show_most_informative_features(5)

Most Informative Features
   containsunimaginative = True              neg : pos    =      8.3 : 1.0
          containsturkey = True              neg : pos    =      8.1 : 1.0
      containsschumacher = True              neg : pos    =      7.4 : 1.0
          containssuvari = True              neg : pos    =      7.0 : 1.0
          containsshoddy = True              neg : pos    =      7.0 : 1.0


In [24]:
# Part of speech tagging
# We can tain a classifier to work out which suffixes are most informative, instead of 
# using hand crafted regexes

from nltk.corpus import brown
suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] += 1
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1

common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(100)]
print(common_suffixes)

['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of', 'the', 'y', 'r', 'to', 'in', 'f', 'o', 'ed', 'nd', 'is', 'on', 'l', 'g', 'and', 'ng', 'er', 'as', 'ing', 'h', 'at', 'es', 'or', 're', 'it', '``', 'an', "''", 'm', ';', 'i', 'ly', 'ion', 'en', 'al', '?', 'nt', 'be', 'hat', 'st', 'his', 'th', 'll', 'le', 'ce', 'by', 'ts', 'me', 've', "'", 'se', 'ut', 'was', 'for', 'ent', 'ch', 'k', 'w', 'ld', '`', 'rs', 'ted', 'ere', 'her', 'ne', 'ns', 'ith', 'ad', 'ry', ')', '(', 'te', '--', 'ay', 'ty', 'ot', 'p', 'nce', "'s", 'ter', 'om', 'ss', ':', 'we', 'are', 'c', 'ers', 'uld', 'had', 'so', 'ey']


In [26]:
def pos_features(word):
    features = {}
    for suffix in common_suffixes:
        features['contains{}'.format(suffix)] = word.lower().endswith(suffix)
    return features

pos_features("tehe")


{'containse': True,
 'contains,': False,
 'contains.': False,
 'containss': False,
 'containsd': False,
 'containst': False,
 'containshe': True,
 'containsn': False,
 'containsa': False,
 'containsof': False,
 'containsthe': False,
 'containsy': False,
 'containsr': False,
 'containsto': False,
 'containsin': False,
 'containsf': False,
 'containso': False,
 'containsed': False,
 'containsnd': False,
 'containsis': False,
 'containson': False,
 'containsl': False,
 'containsg': False,
 'containsand': False,
 'containsng': False,
 'containser': False,
 'containsas': False,
 'containsing': False,
 'containsh': False,
 'containsat': False,
 'containses': False,
 'containsor': False,
 'containsre': False,
 'containsit': False,
 'contains``': False,
 'containsan': False,
 "contains''": False,
 'containsm': False,
 'contains;': False,
 'containsi': False,
 'containsly': False,
 'containsion': False,
 'containsen': False,
 'containsal': False,
 'contains?': False,
 'containsnt': False,
 'con

In [28]:
tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(n), g) for (n, g) in tagged_words]
featuresets[:10]

[({'containse': True,
   'contains,': False,
   'contains.': False,
   'containss': False,
   'containsd': False,
   'containst': False,
   'containshe': True,
   'containsn': False,
   'containsa': False,
   'containsof': False,
   'containsthe': True,
   'containsy': False,
   'containsr': False,
   'containsto': False,
   'containsin': False,
   'containsf': False,
   'containso': False,
   'containsed': False,
   'containsnd': False,
   'containsis': False,
   'containson': False,
   'containsl': False,
   'containsg': False,
   'containsand': False,
   'containsng': False,
   'containser': False,
   'containsas': False,
   'containsing': False,
   'containsh': False,
   'containsat': False,
   'containses': False,
   'containsor': False,
   'containsre': False,
   'containsit': False,
   'contains``': False,
   'containsan': False,
   "contains''": False,
   'containsm': False,
   'contains;': False,
   'containsi': False,
   'containsly': False,
   'containsion': False,
   'conta

In [32]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.DecisionTreeClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.6270512182993535

In [33]:
print(classifier.pseudocode(depth=4))

if containsthe == False: 
  if contains, == False: 
    if containss == False: 
      if contains. == False: return '.'
      if contains. == True: return '.'
    if containss == True: 
      if containsis == False: return 'PP$'
      if containsis == True: return 'BEZ'
  if contains, == True: return ','
if containsthe == True: return 'AT'



In [34]:
# Exploring Context

def pos_features(sentence, i):
    features = {"suffix(1)" : sentence[i][-1:],
                "suffix(2)" : sentence[i][-2:],
                "suffix(3)" : sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    return features

pos_features(brown.sents()[0], 8)

{'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion', 'prev-word': 'an'}

In [36]:
tagged_sents = brown.tagged_sents(categories='news')
featuresets = []
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    for i, (word, tag) in enumerate(tagged_sent):
        featuresets.append((pos_features(untagged_sent, i), tag))

size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.7891596220785678

In [37]:
# Sequence classification

def pos_features(sentence, i, history):
    features = {"suffix(1)" : sentence[i][-1:],
                "suffix(2)" : sentence[i][-2:],
                "suffix(3)" : sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
        features["prev-tag"] = "<START"
    else:
        features["prev-word"] = sentence[i - 1]
        features["prev-tag"] = history[i - 1]
    return features

class ConsecutivePosTagger(nltk.TaggerI):
    
    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = pos_features(untagged_sent, i, history)
                history.append(tag)
                train_set.append((featureset, tag))
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)
    
    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = pos_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)

tagged_sents = brown.tagged_sents(categories="news")
size = int(len(tagged_sents) * 0.1)
train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
tagger = ConsecutivePosTagger(train_sents)
print(tagger.evaluate(test_sents))

0.7980528511821975


In [38]:
# 2 Further examples of Supervised Classification

In [40]:
# Sentence Segmentation

sents = nltk.corpus.treebank_raw.sents()
tokens = []
boundaries = set()
offset = 0
for sent in sents:
    tokens.extend(sent)
    offset += len(sent)
    boundaries.add(offset - 1)

In [41]:
tokens

['.',
 'START',
 'Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'Nov',
 '.',
 '29',
 '.',
 'Mr',
 '.',
 'Vinken',
 'is',
 'chairman',
 'of',
 'Elsevier',
 'N',
 '.',
 'V',
 '.,',
 'the',
 'Dutch',
 'publishing',
 'group',
 '.',
 '.',
 'START',
 'Rudolph',
 'Agnew',
 ',',
 '55',
 'years',
 'old',
 'and',
 'former',
 'chairman',
 'of',
 'Consolidated',
 'Gold',
 'Fields',
 'PLC',
 ',',
 'was',
 'named',
 'a',
 'nonexecutive',
 'director',
 'of',
 'this',
 'British',
 'industrial',
 'conglomerate',
 '.',
 '.',
 'START',
 'A',
 'form',
 'of',
 'asbestos',
 'once',
 'used',
 'to',
 'make',
 'Kent',
 'cigarette',
 'filters',
 'has',
 'caused',
 'a',
 'high',
 'percentage',
 'of',
 'cancer',
 'deaths',
 'among',
 'a',
 'group',
 'of',
 'workers',
 'exposed',
 'to',
 'it',
 'more',
 'than',
 '30',
 'years',
 'ago',
 ',',
 'researchers',
 'reported',
 '.',
 'The',
 'asbestos',
 'fiber',
 ',',
 'crocidolit

In [43]:
def punct_features(tokens, i):
    return {
        'next-word-capitalized': tokens[i+1][0].isupper(),
        'prev-word': tokens[i - 1].lower(),
        'punct': tokens[i],
        'prev-word-is-one-char': len(tokens[i-1]) == 1
    }

featuresets = [(punct_features(tokens, i), (i in boundaries)) for i in range(1, len(tokens) - 1) if tokens[i] in '.?!']
featuresets

[({'next-word-capitalized': False,
   'prev-word': 'nov',
   'punct': '.',
   'prev-word-is-one-char': False},
  False),
 ({'next-word-capitalized': True,
   'prev-word': '29',
   'punct': '.',
   'prev-word-is-one-char': False},
  True),
 ({'next-word-capitalized': True,
   'prev-word': 'mr',
   'punct': '.',
   'prev-word-is-one-char': False},
  False),
 ({'next-word-capitalized': True,
   'prev-word': 'n',
   'punct': '.',
   'prev-word-is-one-char': True},
  False),
 ({'next-word-capitalized': False,
   'prev-word': 'group',
   'punct': '.',
   'prev-word-is-one-char': False},
  True),
 ({'next-word-capitalized': True,
   'prev-word': '.',
   'punct': '.',
   'prev-word-is-one-char': True},
  False),
 ({'next-word-capitalized': False,
   'prev-word': 'conglomerate',
   'punct': '.',
   'prev-word-is-one-char': False},
  True),
 ({'next-word-capitalized': True,
   'prev-word': '.',
   'punct': '.',
   'prev-word-is-one-char': True},
  False),
 ({'next-word-capitalized': True,
   'pr

In [46]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.936026936026936

In [57]:
def segment_sentences(words):
    start = 0
    sents = []
    for i, word in enumerate(words[:-1]):
        if word in '.?!' and classifier.classify(punct_features(words, i)) == True:
            sents.append(words[start:i+1])
            start = i + 1
    if start < len(words):
        sents.append(words[start:])
    return sents

from nltk.tokenize import word_tokenize
#segment_sentences()
words = word_tokenize("what do you thinks you are doing? U.S.A is not a company but a country! Come and visit!")

In [58]:
segment_sentences(words)

[['what', 'do', 'you', 'thinks', 'you', 'are', 'doing', '?'],
 ['U.S.A', 'is', 'not', 'a', 'company', 'but', 'a', 'country', '!'],
 ['Come', 'and', 'visit', '!']]

In [62]:
# Identifying Dialogue Act Types

posts = nltk.corpus.nps_chat.xml_posts()[:10000]

def dialogue_act_features(post):
    features = {}
    for word in nltk.word_tokenize(post):
        features["contains({})".format(word.lower())] = True
    return features

featuresets = [(dialogue_act_features(post.text), post.get('class')) for post in posts]
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.668


In [None]:
# 3. Evaluation
# accuracy, precision recall f-score cross validation