In [1]:
import nltk
from collections import defaultdict
import re
from nltk.corpus import *
import random
from nltk.classify import apply_features

__☼ Using any of the three classifiers described in this chapter, and any features you can think of, build the best name gender classifier you can. Begin by splitting the Names Corpus into three subsets: 500 words for the test set, 500 words for the dev-test set, and the remaining 6900 words for the training set. Then, starting with the example name gender classifier, make incremental improvements. Use the dev-test set to check your progress. Once you are satisfied with your classifier, check its final performance on the test set. How does the performance on the test set compare to the performance on the dev-test set? Is this what you'd expect?__

In [4]:
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                [(name, 'female') for name in names.words('female.txt')])
random.shuffle(labeled_names)

In [13]:
train_set = labeled_names[:len(labeled_names)-1000]
dev_set = labeled_names[-1000:-500]
test_set = labeled_names[-500:]

In [14]:
print("Train set is {}\nDev set is {}\nTest set is {}".format(len(train_set), len(dev_set), len(test_set)))

Train set is 6944
Dev set is 500
Test set is 500


In [124]:
def gender_features(name):
    features = {}
    features["first_letter"] = name[0].lower()
    features["last_letter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = name.lower().count(letter)
        features["has({})".format(letter)] = (letter in name.lower())
    features["vowels"] = len(re.findall(r'[aeoui]', name.lower()))  
    features["consonants"] = len(re.findall(r'[aeoui]', name.lower())) - len(name)
    features["last_two"] = name[-2:].lower()
    features["last_three"] = name[-3:].lower()
    features["length"] = len(name)
    features["double"] = len(re.findall(r'(\w)\1', name))
    
    return features

In [125]:
train_data = apply_features(gender_features, train_set)
dev_data = apply_features(gender_features, dev_set)
test_data = apply_features(gender_features, test_set)

In [126]:
model_gender = nltk.NaiveBayesClassifier.train(train_data)

In [127]:
# First model from the book had an accuracy of 0.74
# After adding features "vowel", "consonant" the accuracy is 0.748
# After adding feature "last_two", "last_three" the accuracy is 0.774, and 7/10 of most infomrative features is "last_two"
# Accuracy 0.78 after "length" and "double" features
nltk.classify.accuracy(model_gender, dev_data)

0.78

In [128]:
model_gender.show_most_informative_features(10)

Most Informative Features
                last_two = 'na'           female : male   =     95.4 : 1.0
                last_two = 'la'           female : male   =     70.4 : 1.0
                last_two = 'ia'           female : male   =     53.5 : 1.0
             last_letter = 'k'              male : female =     43.7 : 1.0
             last_letter = 'a'            female : male   =     35.0 : 1.0
                last_two = 'sa'           female : male   =     31.1 : 1.0
                last_two = 'us'             male : female =     27.3 : 1.0
                last_two = 'ch'             male : female =     26.0 : 1.0
                last_two = 'do'             male : female =     24.9 : 1.0
              last_three = 'ana'          female : male   =     24.3 : 1.0


In [129]:
nltk.classify.accuracy(model_gender, test_data)

0.776

__☼ The Senseval 2 Corpus contains data intended to train word-sense disambiguation classifiers. It contains data for four words: hard, interest, line, and serve. Choose one of these four words, and load the corresponding data:__

 	
> from nltk.corpus import senseval

> instances = senseval.instances('hard.pos')

> size = int(len(instances) * 0.1)

> train_set, test_set = instances[size:], instances[:size]

Using this dataset, build a classifier that predicts the correct sense tag for a given instance. See the corpus HOWTO at https://www.nltk.org/_modules/nltk/corpus/reader/senseval.html for information on using the instance objects returned by the Senseval 2 Corpus.

In [178]:
instances = senseval.instances('hard.pos')
size = int(len(instances) * 0.1)
idx = list(range(len(instances)))
random.shuffle(idx)

In [179]:
# random.shuffle does not directly shuffle a senseval instances, so I did it this way. 
# Maybe, I will come back later and do it in a more elegant manner
shuffled = []
for id in idx:
    shuffled.append(instances[id])

In [184]:
train_hard, test_hard = shuffled[size:], shuffled[:size]

In [185]:
len(train_hard), len(test_hard)

(3900, 433)

In [193]:
#some sanity check
test = set()
train = set()
for ins in train_hard:
    train.add(ins.senses)

for ins in test_hard:
    test.add(ins.senses)

test, train    # Yes, each set has all three senses of the word "hard"

({('HARD1',), ('HARD2',), ('HARD3',)}, {('HARD1',), ('HARD2',), ('HARD3',)})

In [208]:
def hard_features(instance):
    features = {}
    features["tag_0"] = instance.context[instance.position][1] 
    if instance.position !=0 :
        features["tag_0-1"] = instance.context[instance.position-1][1]
        features["word_0-1"] = instance.context[instance.position-1][0]
    else:
        features["tag_0-1"] = "<START>"
        features["word_0-1"] = "<START>"
        
    if instance.position != len(instance.context) - 1:
        features["tag_0+1"] = instance.context[instance.position+1][1]
        features["word_0+1"] = instance.context[instance.position+1][0]
    else:
        features["tag_0+1"] = "<END>"
        features["word_0+1"] = "<END>"
    
    return features

In [209]:
train_hard_data = [(hard_features(ins), ins.senses) for ins in train_hard]
test_hard_data = [(hard_features(ins), ins.senses) for ins in test_hard]

In [210]:
model_hard = nltk.NaiveBayesClassifier.train(train_hard_data)

In [211]:
# When trained only on tags 1 step away the target word, the accuracy was around 77%
# yet adding words 1 step away the target word increased accuracy up to 90%
nltk.classify.accuracy(model_hard, test_hard_data)

0.9006928406466512

__☼ Using the movie review document classifier discussed in this chapter, generate a list of the 30 features that the classifier finds to be most informative. Can you explain why these particular features are informative? Do you find any of them surprising?__

In [223]:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

In [246]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    for bigram in nltk.bigrams(document):
        features['bigrams({})'.format(bigram)] = (bigram in nltk.bigrams(document))
    return features

In [247]:
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set_movie, test_set_movie = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set_movie)

In [248]:
classifier.show_most_informative_features(30)

Most Informative Features
bigrams(('not', 'funny')) = True              neg : pos    =     17.8 : 1.0
bigrams(('is', 'terrific')) = True              pos : neg    =     16.9 : 1.0
bigrams(('and', 'boring')) = True              neg : pos    =     14.4 : 1.0
   contains(outstanding) = True              pos : neg    =     13.8 : 1.0
bigrams(('a', 'boring')) = True              neg : pos    =     13.1 : 1.0
 bigrams(('&', 'robin')) = True              neg : pos    =     12.4 : 1.0
bigrams(('batman', '&')) = True              neg : pos    =     12.4 : 1.0
bigrams(('quite', 'frankly')) = True              neg : pos    =     12.4 : 1.0
bigrams(('.', 'cameron')) = True              pos : neg    =     12.3 : 1.0
 bigrams(('our', 'own')) = True              pos : neg    =     12.3 : 1.0
bigrams(('works', 'well')) = True              pos : neg    =     12.3 : 1.0
  bigrams(('-', 'note')) = True              neg : pos    =     11.7 : 1.0
  bigrams(('be', 'fun')) = True              neg : pos    = 

In [249]:
# adding bigrams increased accuracy from 75% yo 78%
# as there are cases when unigrams do not work well
# for example, the first informative feature
# I could also try adding more features and maybe working with trigrams
# but I want to finish the book asap. Later, I will come back to this question

nltk.classify.accuracy(classifier, test_set_movie)

0.78

The features (based on unigrams) do not seem to be surprising as the majority of them are positive or negative words such as "wonderful"/"superb" or "poorly"/"mess". The only surprising thing, yet still expected from a model trained with feature "contain_word" is that there are some names of actors in the list. 

The features after adding bigrams are similar to those of unigrams, as both contain names of actors. Yet, I am surprised that "batman &" got a negative tag! In addition there are some bigrams that do not really make sense out of context, such as "our own" or "- note" 

__☼ Select one of the classification tasks described in this chapter, such as name gender detection, document classification, part-of-speech tagging, or dialog act classification. Using the same training and test data, and the same feature extractor, build three classifiers for the task: a decision tree, a naive Bayes classifier, and a Maximum Entropy classifier. Compare the performance of the three classifiers on your selected task. How do you think that your results might be different if you used a different feature extractor?__

In [2]:
posts = nltk.corpus.nps_chat.xml_posts()[:10000]

def dialogue_act_features(post):
    features = {}
    for word in nltk.word_tokenize(post):
        features['contains({})'.format(word.lower())] = True
    return features

featuresets = [(dialogue_act_features(post.text), post.get('class')) for post in posts]
size = int(len(featuresets) * 0.1)
train_set_dialogue, test_set_dialogue = featuresets[size:], featuresets[:size]

In [260]:
classifier_bayes = nltk.NaiveBayesClassifier.train(train_set_dialogue)

In [261]:
classifier_maxent = nltk.MaxentClassifier.train(train_set_dialogue)

  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -2.70805        0.050
             2          -1.25297        0.847
             3          -0.92154        0.881
             4          -0.74995        0.898
             5          -0.63694        0.910
             6          -0.55420        0.918
             7          -0.49058        0.924
             8          -0.44065        0.929
             9          -0.40112        0.932
            10          -0.36951        0.937
            11          -0.34385        0.940
            12          -0.32262        0.943
            13          -0.30470        0.946
            14          -0.28932        0.948
            15          -0.27594        0.950
            16          -0.26415        0.952
            17          -0.25366        0.953
            18          -0.24425        0.954
            19          -0.23575        0.956
 

In [263]:
# classifier_tree = nltk.DecisionTreeClassifier.train(train_set_dialogue)  # takes a very long time to train

In [264]:
print("For Naive Bayes the accuracy is", nltk.classify.accuracy(classifier_bayes, test_set_dialogue))
# print("For Decision Tree the accuracy is", nltk.classify.accuracy(classifier_tree, test_set_dialogue))
print("For Maximum Entropy the accuracy is", nltk.classify.accuracy(classifier_maxent, test_set_dialogue))

For Naive Bayes the accuracy is 0.667
For Maximum Entropy the accuracy is 0.711


In [14]:
# Since tree training takes a lot of time in nltk, I will use sklearn's Tree with the same data 
import numpy as np

X = [post.text for post in posts]
y = [post.get("class") for post in posts]

In [24]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

X_train, X_test, y_train, y_test = X[:size], X[size:], y[:size], y[size:]
cv = CountVectorizer()  # no ngrams ot other params since with nltk I did not use them
le = LabelEncoder()

X_train_cv = cv.fit_transform(X_train)
y_train_le = le.fit_transform(y_train)

In [16]:
tree = DecisionTreeClassifier()
tree.fit(X_train_cv, y_train_le)

DecisionTreeClassifier()

In [18]:
tree.score(cv.transform(X_test), le.transform(y_test))

0.6365555555555555

In [20]:
print(classification_report(tree.predict(cv.transform(X_test)), le.transform(y_test)))

              precision    recall  f1-score   support

           0       0.32      0.20      0.25       295
           1       0.51      0.87      0.64       101
           2       0.00      0.00      0.00       126
           3       0.08      0.09      0.08       120
           4       0.92      0.35      0.50      2494
           5       0.00      0.00      0.00        29
           6       0.81      0.93      0.87      1100
           7       0.00      0.00      0.00         0
           8       0.02      0.06      0.02        34
           9       0.45      0.64      0.52      1922
          10       0.91      0.99      0.95      1987
          11       0.45      0.33      0.38        91
          12       0.75      0.73      0.74       455
          13       0.30      0.53      0.38        55
          14       0.21      0.50      0.30       191

    accuracy                           0.64      9000
   macro avg       0.38      0.41      0.38      9000
weighted avg       0.72   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [76]:
# Just having some fin with diiferent feature extractions and models

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier

tfidf = TfidfVectorizer(ngram_range = (1,4), analyzer = "char_wb")
X_train_tf = tfidf.fit_transform(X_train)

my_classifier = OneVsRestClassifier(SGDClassifier(loss = "log", penalty = "l1", alpha = 0.00001, random_state = 42))
my_classifier.fit(X_train_tf, y_train_le)

my_classifier.score(tfidf.transform(X_test), le.transform(y_test))

0.7714444444444445

__☼ The synonyms "strong" and "powerful" pattern differently (try combining them with "chip" and "sales"). What features are relevant in this distinction? Build a classifier that predicts when each word should be used__

In [78]:
reuter = reuters.sents()
len([sent for sent in reuter if "strong" in sent]), len([sent for sent in reuter if "powerful" in sent])  # not enough data in Reuters

(370, 18)

In [79]:
brown_ = brown.sents()
len([sent for sent in brown_ if "strong" in sent]), len([sent for sent in brown_ if "powerful" in sent])  # not enough data in Brown either

(188, 62)

In [80]:
guten = gutenberg.sents()
len([sent for sent in guten if "strong" in sent]), len([sent for sent in guten if "powerful" in sent])

(569, 64)

In [81]:
# So, I can combine these three corpuses to have some data to train on
strong = [sent for sent in reuter if "strong" in sent] + [sent for sent in brown_ if "strong" in sent] + [sent for sent in guten if "strong" in sent]
powerful = [sent for sent in reuter if "powerful" in sent] + [sent for sent in brown_ if "powerful" in sent] + [sent for sent in guten if "powerful" in sent]

In [82]:
len(strong), len(powerful)

(1127, 144)

In [83]:
# Not all of these corpuses have tagged sents, so for now I will base my classifier on the words only
# Later I will use a trained tagger to tag Reuters and Gutenberg, and train classifier based on the tags of the following word

def feats(sent):
    features = {}
    if "strong" in sent:
        features["next_word"] = sent[sent.index("strong") + 1]
        features["prev_word"] = sent[sent.index("strong") - 1]
    else:
        features["next_word"] = sent[sent.index("powerful") + 1]
        features["prev_word"] = sent[sent.index("powerful") - 1]
    return features

In [84]:
data = [(feats(sent), "strong") for sent in strong] + [(feats(sent), "powerful") for sent in powerful]
random.shuffle(data)
n = int(len(data) * 0.9)

trainData, testData = data[:n], data[n:]

In [85]:
my_model = nltk.NaiveBayesClassifier.train(trainData)
nltk.classify.accuracy(my_model, testData)  # I am quite surprised to see an accuracy of 88% on test data, given the size of my dataset

0.921875

In [86]:
# Let us see the informative features it uses
my_model.show_most_informative_features(20)

Most Informative Features
               prev_word = 'more'         powerf : strong =     19.3 : 1.0
               prev_word = 'most'         powerf : strong =     19.3 : 1.0
               next_word = '.'            strong : powerf =      7.1 : 1.0
               next_word = 'central'      powerf : strong =      5.4 : 1.0
               next_word = 'influence'    powerf : strong =      5.4 : 1.0
               prev_word = 'O'            powerf : strong =      4.2 : 1.0
               prev_word = 'sufficiently' powerf : strong =      4.2 : 1.0
               prev_word = 'them'         powerf : strong =      4.2 : 1.0
               prev_word = 'unusually'    powerf : strong =      4.2 : 1.0
               prev_word = 'what'         powerf : strong =      4.2 : 1.0
               prev_word = 'this'         powerf : strong =      4.2 : 1.0
               next_word = 'as'           strong : powerf =      3.4 : 1.0
               prev_word = 'very'         strong : powerf =      3.3 : 1.0

Okay, now I get why the accuracy is high. The word _powerful_ in comparative needs _more_ before it and _most_ in superlative.

__◑ The dialog act classifier assigns labels to individual posts, without considering the context in which the post is found. However, dialog acts are highly dependent on context, and some sequences of dialog act are much more likely than others. For example, a ynQuestion dialog act is much more likely to be answered by a yanswer than by a greeting. Make use of this fact to build a consecutive classifier for labeling dialog acts. Be sure to consider what features might be useful. See the code for the consecutive classifier for part-of-speech tags in 1.7 to get some ideas.__

In [111]:
featureset_consecutive = []

for idx, post in enumerate(list(posts)):
    feature_consecutive = {}
    for word in nltk.word_tokenize(post.text):
        feature_consecutive['contains({})'.format(word.lower())] = True
    if idx != 0: feature_consecutive["prev_post"] = posts[idx-1].get("class")
    featureset_consecutive.append((feature_consecutive, posts[idx].get('class')))
    
size = int(len(featureset_consecutive) * 0.1)
train_set_consecutive, test_set_consecutive = featureset_consecutive[size:], featureset_consecutive[:size]

In [112]:
classifier_consecutive = nltk.NaiveBayesClassifier.train(train_set_consecutive)

In [113]:
nltk.classify.accuracy(classifier_consecutive, test_set_consecutive)

0.653

In [116]:
classifier_consecutive.show_most_informative_features(20)

Most Informative Features
            contains(hi) = True            Greet : System =    408.2 : 1.0
             contains(>) = True            Other : System =    384.6 : 1.0
         contains(empty) = True            Other : System =    339.4 : 1.0
          contains(part) = True           System : Statem =    302.0 : 1.0
           contains(brb) = True              Bye : Statem =    300.4 : 1.0
            contains(no) = True           nAnswe : System =    262.3 : 1.0
             contains(<) = True            Other : Greet  =    249.2 : 1.0
           contains(yes) = True           yAnswe : Emotio =    242.5 : 1.0
             contains(0) = True            Other : Statem =    199.4 : 1.0
           contains(are) = True           whQues : System =    198.6 : 1.0
            contains(na) = True           ynQues : System =    167.4 : 1.0
            contains(ok) = True           Accept : System =    165.6 : 1.0
           contains(lol) = True           Emotio : System =    154.3 : 1.0

__◑ Word features can be very useful for performing document classification, since the words that appear in a document give a strong indication about what its semantic content is. However, many words occur very infrequently, and some of the most informative words in a document may never have occurred in our training data. One solution is to make use of a lexicon, which describes how different words relate to one another. Using WordNet lexicon, augment the movie review document classifier presented in this chapter to use features that generalize the words that appear in a document, making it more likely that they will match words found in the training data.__

In [142]:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

In [157]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_lemmas = [nltk.stem.WordNetLemmatizer().lemmatize(w) for w in list(all_words)[:2000]]

def document_features_lemmas(document):
    document_words = set([nltk.stem.WordNetLemmatizer().lemmatize(w) for w in document])
    features = {}
    for word in word_lemmas:
        features['contains({})'.format(word)] = (word in document_words)
    for bigram in nltk.bigrams(document):
        features['bigrams({})'.format(bigram)] = (bigram in nltk.bigrams(document_words))
        
    return features

In [158]:
featuresets_lemmas = [(document_features_lemmas(d), c) for (d,c) in documents]
train_set_movie_lemmas, test_set_movie_lemmas = featuresets_lemmas[100:], featuresets_lemmas[:100]
classifier_lemmas = nltk.NaiveBayesClassifier.train(train_set_movie_lemmas)

In [159]:
# Compared to 0.78 when thew ords were not lemmatized. I also want to see waht happens if I use PorterStemmer
nltk.classify.accuracy(classifier_lemmas, test_set_movie_lemmas)

0.84

In [160]:
classifier_lemmas.show_most_informative_features()

Most Informative Features
bigrams(('is', 'terrific')) = False             pos : neg    =     17.2 : 1.0
bigrams(('not', 'funny')) = False             neg : pos    =     16.8 : 1.0
bigrams(('the', 'political')) = False             pos : neg    =     16.5 : 1.0
bigrams(('and', 'boring')) = False             neg : pos    =     14.2 : 1.0
 bigrams(('our', 'own')) = False             pos : neg    =     13.1 : 1.0
bigrams(('well', 'worth')) = False             pos : neg    =     12.5 : 1.0
bigrams(('insult', 'to')) = False             neg : pos    =     12.2 : 1.0
bigrams(('fairy', 'tale')) = False             pos : neg    =     11.8 : 1.0
bigrams(('quite', 'frankly')) = False             neg : pos    =     11.5 : 1.0
bigrams(('.', 'cameron')) = False             pos : neg    =     11.1 : 1.0


__★ The PP Attachment Corpus is a corpus describing prepositional phrase attachment decisions. Each instance in the corpus is encoded as a PPAttachment object__

Select only the instances where inst.attachment is N. Using this sub-corpus, build a classifier that attempts to predict which preposition is used to connect a given pair of nouns. For example, given the pair of nouns "team" and "researchers," the classifier should predict the preposition "of". See the corpus HOWTO at https://www.nltk.org/howto/corpus.html#ppattach or https://www.nltk.org/_modules/nltk/corpus/reader/ppattach.html for more information on using the PP attachment corpus.

In [155]:
noun_pp = [pp for pp in ppattach.attachments("training") if pp.attachment == "N"]

In [161]:
noun_pp[:5]

[PPAttachment(sent='1', verb='is', noun1='chairman', prep='of', noun2='N.V.', attachment='N'),
 PPAttachment(sent='2', verb='named', noun1='director', prep='of', noun2='conglomerate', attachment='N'),
 PPAttachment(sent='3', verb='caused', noun1='percentage', prep='of', noun2='deaths', attachment='N'),
 PPAttachment(sent='9', verb='is', noun1='asbestos', prep='in', noun2='products', attachment='N'),
 PPAttachment(sent='12', verb='led', noun1='team', prep='of', noun2='researchers', attachment='N')]

In [182]:
def pp_features(inst):
    features = {}
    features["N1"] = nltk.stem.WordNetLemmatizer().lemmatize(inst.noun1)    # the only informative feature available is N1
    # features["N2"] = nltk.stem.WordNetLemmatizer().lemmatize(inst.noun2)  # it does not affect the accuracy much
    # features["V"] = nltk.stem.WordNetLemmatizer().lemmatize(inst.verb)    # it deacrses accuracy
    return features

In [183]:
featureset_pp = [(pp_features(inst), inst.prep) for inst in noun_pp]
size_pp = int(len(featureset_pp) * 0.9)
train_pp, test_pp = featureset_pp[:size], featureset_pp[size:]
my_model_pp = nltk.NaiveBayesClassifier.train(train_pp)

In [184]:
# For higher accuracy I need more information than just two noun words before and after
nltk.classify.accuracy(my_model_pp, test_pp)

0.5575266092245311

In [185]:
my_model_pp.show_most_informative_features()

Most Informative Features
                      N1 = 'stake'            in : of     =     11.8 : 1.0
                      N1 = 'interest'         in : of     =      6.8 : 1.0
                      N1 = 'executive'      with : of     =      6.1 : 1.0
                      N1 = '%'                of : to     =      5.7 : 1.0
                      N1 = 'million'          in : for    =      5.6 : 1.0
                      N1 = 'one'              of : for    =      4.6 : 1.0
                      N1 = 'stock'            in : of     =      4.4 : 1.0
                      N1 = 'increase'         in : of     =      3.4 : 1.0
                      N1 = 'billion'          in : of     =      3.2 : 1.0
                      N1 = 'growth'           in : of     =      3.1 : 1.0


__★ Suppose you wanted to automatically generate a prose description of a scene, and already had a word to uniquely describe each entity, such as _the jar_, and simply wanted to decide whether to use "in" or "on" in relating various items, e.g. _the book is in the cupboard_ vs _the book is on the shelf_. Explore this issue by looking at corpus data; writing programs as needed.__

a.		in the car versus on the train

b.		in town versus on campus

c.		in the picture versus on the screen

d.		in Macbeth versus on Letterman



In [205]:
# I did not quite get the question, so I will just make a model to predict "in" or "on" given a word
corpora = brown.words() + reuters.words()
in_on_tuples = [(w, corpora[idx + 1], corpora[idx+2]) for idx, w in enumerate(list(corpora)) if (w == "in" or w == "on") and (corpora[idx+1] in ["the", "a", "an"])]
in_on_tuples += [(w, corpora[idx + 1]) for idx, w in enumerate(list(corpora)) if (w == "in" or w == "on") and (corpora[idx+1] not in ["the", "a", "an"])]

In [207]:
in_on_tuples[:10]

[('in', 'the', 'hard-fought'),
 ('in', 'the', 'election'),
 ('on', 'a', 'number'),
 ('in', 'the', 'Fulton'),
 ('in', 'the', 'state'),
 ('in', 'the', 'future'),
 ('on', 'the', 'Fulton'),
 ('in', 'the', 'appointment'),
 ('in', 'a', 'manner'),
 ('on', 'the', 'petition')]

In [210]:
def feature_in_on(tuple_):
    features = {}
    if len(tuple_) == 2:
        features["word"] = tuple_[1]
    if len(tuple_) == 3:
        features["word"] = tuple_[2]
    return features

In [211]:
featureset_in_on = [(feature_in_on(t), t[0]) for t in in_on_tuples]
size_in_on = int(len(featureset_in_on) * 0.9)
train_in_on, test_in_on = featureset_in_on[:size], featureset_in_on[size:]

In [212]:
model_in_on = nltk.NaiveBayesClassifier.train(train_in_on)

In [213]:
nltk.classify.accuracy(model_in_on, test_in_on)

0.7435837571916951

In [214]:
model_in_on.show_most_informative_features()

Most Informative Features
                    word = 'past'             in : on     =      4.7 : 1.0
                    word = 'Congo'            in : on     =      3.9 : 1.0
                    word = 'national'         on : in     =      3.8 : 1.0
                    word = "President's"      on : in     =      3.8 : 1.0
                    word = 'subject'          on : in     =      3.8 : 1.0
                    word = 'last'             in : on     =      3.5 : 1.0
                    word = 'world'            in : on     =      3.5 : 1.0
                    word = 'market'           on : in     =      2.9 : 1.0
                    word = 'matter'           on : in     =      2.7 : 1.0
                    word = 'second'           on : in     =      2.7 : 1.0


In [216]:
model_in_on.classify({"word": "car"})

'in'