# Module 5: Natural Language Processing
## Lecture 3: Text Classification

* Classification is a topic in machine learning.
* Classification is supervised, meaning you need to provide correct classification labels.
* Data should be partitioned into training and test data.
* There are many classification models, but only naive Bayes will be used in this lecture.
* More details about machine learning will be covered in the next module.

# References for This Lecture
* NLTK Book, Ch. 6
    * http://www.nltk.org/book/ch06.html
    * Sections 1.1, 1.3, 1.4, 2.1

# Gender Classification

In [1]:
# names can be clasified into male and female names
# what features should be used?
# let's start with the last letter of the name
def gender_features(word):
    return {'last_letter': word[-1]}

gender_features('John')

{'last_letter': 'n'}

In [4]:
# NLTK contains lists of male and female names
# load and shuffle them
import nltk
from nltk.corpus import names
import random

labeled_names = ([(name, 'male') for name in names.words('male.txt')] + 
                 [(name, 'female') for name in names.words('female.txt')])
random.shuffle(labeled_names)
labeled_names[:10]

[('Elianore', 'female'),
 ('Kata', 'female'),
 ('Anet', 'female'),
 ('Rici', 'female'),
 ('Baird', 'male'),
 ('Joao', 'male'),
 ('Peri', 'female'),
 ('Carsten', 'male'),
 ('Hallam', 'male'),
 ('Lonny', 'male')]

In [6]:
# it's necessary to partition the data into
# training and test data
# use the training data to train a naive Bayes classifier
# and evaluate the classifier on the test data
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.782

In [1]:
# you can apply the classifier to a particular name as well
# it correctly classifies my name!
classifier.classify(gender_features('Jim'))

NameError: name 'classifier' is not defined

In [13]:
# you can also examine which feature values were most useful
# likelihood ratios are displayed
classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = 'a'            female : male   =     33.5 : 1.0
             last_letter = 'k'              male : female =     31.8 : 1.0
             last_letter = 'f'              male : female =     16.4 : 1.0
             last_letter = 'p'              male : female =     12.4 : 1.0
             last_letter = 'v'              male : female =     10.4 : 1.0


# Document Classification

In [11]:
# it's possible to classify documents into categories
# let's classify the first nursing note of each ICU stay
# according to ICU mortality
# here is the SQL query
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

In [13]:
# run the query and extract data from MIMIC
documents[0]

(['all',
  'feature',
  'film',
  'directors',
  'who',
  'cut',
  'their',
  'teeth',
  'on',
  'music',
  'videos',
  ',',
  'please',
  'raise',
  'your',
  'hands',
  '.',
  'thank',
  'you',
  'for',
  'identifying',
  'yourselves',
  ';',
  'now',
  'would',
  'you',
  'all',
  'please',
  'go',
  'away',
  '.',
  'your',
  'influence',
  'has',
  'to',
  'rank',
  'as',
  'one',
  'of',
  'the',
  'most',
  'annoying',
  'trends',
  'in',
  'filmmaking',
  'in',
  'the',
  'last',
  'decade',
  ',',
  'and',
  'it',
  'shows',
  'no',
  'sign',
  'of',
  'abating',
  'any',
  'time',
  'soon',
  '.',
  'it',
  'isn',
  "'",
  't',
  'just',
  'the',
  'strobe',
  'light',
  'quality',
  'of',
  'your',
  'twenty',
  '-',
  'cuts',
  '-',
  'per',
  '-',
  'minute',
  'editing',
  ',',
  'or',
  'the',
  'numbing',
  'over',
  '-',
  'use',
  'of',
  'popular',
  'music',
  'artists',
  'on',
  'the',
  'soundtracks',
  'which',
  'makes',
  'me',
  'suspicious',
  'of',
  'any',

In [29]:
# again, the most difficult step is feature extraction
# let's use the presence of the 1000 most common words
import re

movie_tokens = []
for (review, rating) in documents:
    movie_tokens += review

# apply raw text processing we studied in lecture 1
movie_tokens = [nltk.PorterStemmer().stem(token) for token in movie_tokens 
             if len(token) > 1 and
             not re.search(r'^\*\*.+$', token) and 
             not re.search(r'^.+\*\*$', token) and                 
             not re.search(r'^[0-9]+', token)]    
    
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words.keys())[:2000]
word_features[:20]


['plot',
 ':',
 'two',
 'teen',
 'couples',
 'go',
 'to',
 'a',
 'church',
 'party',
 ',',
 'drink',
 'and',
 'then',
 'drive',
 '.',
 'they',
 'get',
 'into',
 'an']

In [30]:
# feature extractor
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

In [39]:
# train a naive Bayes classifier and evaluate it on test data
featuresets = [(document_features(review), rating) for (review, rating) in documents]
train_set, test_set = featuresets[:200], featuresets[200:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.7033333333333334

In [40]:
# most important features
classifier.show_most_informative_features(5)

Most Informative Features
      contains(terrible) = True              neg : pos    =      9.0 : 1.0
         contains(awful) = True              neg : pos    =      8.3 : 1.0
       contains(stories) = True              pos : neg    =      7.7 : 1.0
             contains(4) = True              neg : pos    =      7.6 : 1.0
         contains(heads) = True              pos : neg    =      7.1 : 1.0


# POS Tagging with Classification

In [41]:
# instead of manually creating a POS tagger
# it's possible to train a classifier to learn suffix patterns
# let's extract common suffixes first
from nltk.corpus import brown

suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] += 1
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1

common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(100)]
common_suffixes[:10]

['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of']

In [42]:
# define a feature extractor
# that indicates whether the given word ends with 
# one of the common suffixes
def pos_features(word):
    features = {}
    for suffix in common_suffixes:
        features['endswith({})'.format(suffix)] = word.lower().endswith(suffix)
    return features

In [43]:
# train a naive Bayes classifier and evaluate
tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(n), g) for (n,g) in tagged_words]
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.5636001989060169

In [36]:
# see how the classifier performs for a specific word
classifier.classify(pos_features('health'))

u'ABX'

# Sentence Segmentation with Classification

In [21]:
# setence segmentation essentially looks for 
# sentence-ending punctuation
# which can be learned using machine learning
# first we need segmented data
import nltk
sents = nltk.corpus.treebank_raw.sents()
tokens = []
boundaries = set()
offset = 0
for sent in sents:
    tokens.extend(sent)
    offset += len(sent)
    boundaries.add(offset-1)

In [22]:
# tokens contains tokens from individual sentences
tokens[:10]

[u'.',
 u'START',
 u'Pierre',
 u'Vinken',
 u',',
 u'61',
 u'years',
 u'old',
 u',',
 u'will']

In [35]:
# boundaries contains indexes of sentence-boundary tokens
print boundaries

set([99783, 1, 90116, 16389, 40968, 81929, 24587, 16396, 73741, 8207, 32784, 93292, 20, 64158, 57366, 8221, 36869, 89832, 84656, 24611, 36, 53254, 38, 98345, 73771, 8236, 64178, 8238, 49201, 73785, 49210, 67344, 16445, 64, 66, 32835, 101750, 93021, 91489, 24649, 16459, 34146, 65615, 36618, 82001, 57426, 49235, 8276, 90197, 32855, 41050, 8285, 73824, 90128, 32868, 38246, 102, 16487, 65640, 98410, 82029, 24688, 49265, 41075, 16502, 32887, 97056, 92863, 52651, 65662, 8319, 24704, 57474, 58731, 134, 43713, 41097, 41099, 49292, 32911, 39619, 79214, 65686, 57495, 8346, 65200, 8348, 32930, 163, 24740, 41126, 57511, 49320, 73897, 16554, 96238, 45833, 65712, 41137, 57522, 16568, 99701, 90304, 98497, 82114, 8390, 199, 49352, 57545, 98475, 16587, 211, 24788, 65749, 32982, 75295, 32804, 76495, 41180, 81957, 57569, 49378, 50555, 228, 57382, 51921, 98537, 65770, 237, 8430, 87571, 94248, 33010, 6867, 80206, 73973, 57590, 90359, 54019, 8442, 21887, 86058, 96675, 258, 24836, 49413, 41004, 16650, 65803,

In [31]:
# extract the following features
def punct_features(tokens, i):
    return {'next-word-capitalized': tokens[i+1][0].isupper(),
            'prev-word': tokens[i-1].lower(),
            'punct': tokens[i],
            'prev-word-is-one-char': len(tokens[i-1]) == 1}

In [32]:
# extract features and use them 
# to train and evaluate a naive Bayes classifier
featuresets = [(punct_features(tokens, i), (i in boundaries))
               for i in range(1, len(tokens)-1)
               if tokens[i] in '.?!']
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.936026936026936

# Module 5 Closing Remarks
* There is a lot more to NLP
    * Such as analyzing sentence structure, grammar, etc
    * If interested, read the rest of the NLTK book
* This lecture is a nice segue to Module 6 on machine learning
* This module was important for the course project