## Working with word suffix and tagged_word

In [1]:
# Load and prepare the dataset
import nltk
from nltk.corpus import brown
import random

In [2]:
#Brown is another database 

In [3]:
# nltk.FreqDist() function is used to find the frequency of words within a text. It returns a dictionary. 
suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] += 1
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1

In [4]:
#notice this
"love"[-3:]

'ove'

In [5]:
#notice nword[-3:] = "ies", and within current suffix_fdist there are 3708 times "ies" or {"ies":3708} in the dict
nword = "categories"
suffix_fdist[nword[-3:]]

3708

In [6]:
suffix_fdist["oe"]

146

In [7]:
suffix_fdist

FreqDist({'e': 202946, ',': 175002, '.': 152999, 's': 128722, 'd': 105687, 't': 94459, 'he': 92084, 'n': 87889, 'a': 74912, 'of': 72978, ...})

In [8]:
# .most_common(100)function to find the most common 100 suffix
common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(100)]

In [9]:
print(common_suffixes)

['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of', 'the', 'y', 'r', 'to', 'in', 'f', 'o', 'ed', 'nd', 'is', 'on', 'l', 'g', 'and', 'ng', 'er', 'as', 'ing', 'h', 'at', 'es', 'or', 're', 'it', '``', 'an', "''", 'm', ';', 'i', 'ly', 'ion', 'en', 'al', '?', 'nt', 'be', 'hat', 'st', 'his', 'th', 'll', 'le', 'ce', 'by', 'ts', 'me', 've', "'", 'se', 'ut', 'was', 'for', 'ent', 'ch', 'k', 'w', 'ld', '`', 'rs', 'ted', 'ere', 'her', 'ne', 'ns', 'ith', 'ad', 'ry', ')', '(', 'te', '--', 'ay', 'ty', 'ot', 'p', 'nce', "'s", 'ter', 'om', 'ss', ':', 'we', 'are', 'c', 'ers', 'uld', 'had', 'so', 'ey']


In [10]:
#build feature dict
def pos_features(word):
    features = {}
    for suffix in common_suffixes:
        features['endswith({})'.format(suffix)] = word.lower().endswith(suffix)
        return features

In [13]:
# #brown has 15 categories
# brown.categories()

In [12]:
browndocuments = [(list(brown.words(fileid)), category)
              for category in brown.categories()
              for fileid in brown.fileids(category)]

random.shuffle(browndocuments)

In [17]:
brown_featuresets = [([pos_features(w) for w in d], c) for (d,c) in browndocuments]
train_set, test_set = brown_featuresets[100:], brown_featuresets[:100] #80% train 20% test, total 500

In [20]:
train_set[0]

([{'endswith(e)': False},
  {'endswith(e)': True},
  {'endswith(e)': False},
  {'endswith(e)': False},
  {'endswith(e)': False},
  {'endswith(e)': False},
  {'endswith(e)': False},
  {'endswith(e)': False},
  {'endswith(e)': False},
  {'endswith(e)': True},
  {'endswith(e)': True},
  {'endswith(e)': False},
  {'endswith(e)': False},
  {'endswith(e)': False},
  {'endswith(e)': False},
  {'endswith(e)': False},
  {'endswith(e)': False},
  {'endswith(e)': True},
  {'endswith(e)': True},
  {'endswith(e)': False},
  {'endswith(e)': False},
  {'endswith(e)': False},
  {'endswith(e)': False},
  {'endswith(e)': False},
  {'endswith(e)': False},
  {'endswith(e)': False},
  {'endswith(e)': False},
  {'endswith(e)': False},
  {'endswith(e)': False},
  {'endswith(e)': False},
  {'endswith(e)': False},
  {'endswith(e)': False},
  {'endswith(e)': False},
  {'endswith(e)': False},
  {'endswith(e)': False},
  {'endswith(e)': False},
  {'endswith(e)': False},
  {'endswith(e)': False},
  {'endswith(e)':

## Tagged_word and nltk.word_tokenize

In [32]:
#word tagging, is a kind of special coding for words
text = nltk.word_tokenize("And now for something completely different")
nltk.pos_tag(text)

[('And', 'CC'),
 ('now', 'RB'),
 ('for', 'IN'),
 ('something', 'NN'),
 ('completely', 'RB'),
 ('different', 'JJ')]

In [33]:
text

['And', 'now', 'for', 'something', 'completely', 'different']

In [34]:
#word tag explanation 
nltk.help.upenn_tagset('RB')

RB: adverb
    occasionally unabatingly maddeningly adventurously professedly
    stirringly prominently technologically magisterially predominately
    swiftly fiscally pitilessly ...


## working with nltk brown database

In [37]:
# notice pos_features(n) is a function we have defined before
tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(n), g) for (n,g) in tagged_words]

In [38]:
tagged_words

[('The', 'AT'), ('Fulton', 'NP-TL'), ...]

In [40]:
featuresets[10]

({'endswith(e)': False}, 'NP$')

In [41]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]

In [42]:
classifier = nltk.DecisionTreeClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.17135753356539035

In [43]:
classifier.classify(pos_features('The'))

'AT'

## working with brown sentences

In [44]:
def pos_features2(sentence, i): 
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    return features

In [45]:
# it means brown has 57340 sentences
len(brown.sents())

57340

In [65]:
brown.sents()[0]

['The',
 'Fulton',
 'County',
 'Grand',
 'Jury',
 'said',
 'Friday',
 'an',
 'investigation',
 'of',
 "Atlanta's",
 'recent',
 'primary',
 'election',
 'produced',
 '``',
 'no',
 'evidence',
 "''",
 'that',
 'any',
 'irregularities',
 'took',
 'place',
 '.']

In [66]:
 pos_features2(brown.sents()[0], 6) # the 6 is the 6th word "Friday" in brown.sents()[0] list

{'suffix(1)': 'y', 'suffix(2)': 'ay', 'suffix(3)': 'day', 'prev-word': 'said'}

In [67]:
tagged_sents = brown.tagged_sents(categories='news')
featuresets = []
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    for i, (word, tag) in enumerate(tagged_sent):
        featuresets.append( (pos_features2(untagged_sent, i), tag) )

In [68]:
tagged_sents

[[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')], [('The', 'AT'), ('jury', 'NN'), ('further', 'RBR'), ('said', 'VBD'), ('in', 'IN'), ('term-end', 'NN'), ('presentments', 'NNS'), ('that', 'CS'), ('the', 'AT'), ('City', 'NN-TL'), ('Executive', 'JJ-TL'), ('Committee', 'NN-TL'), (',', ','), ('which', 'WDT'), ('had', 'HVD'), ('over-all', 'JJ'), ('charge', 'NN'), ('of', 'IN'), ('the', 'AT'), ('election', 'NN'), (',', ','), ('``', '``'), ('deserves', 'VBZ'), ('the', 'AT'), ('praise', 'NN'), ('and', 'CC'), ('thanks', 'NNS'), ('of', 'IN'), ('the', 'AT'), ('City', 'NN-TL'), ('of', 'IN-TL'), ('Atlant

In [69]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [70]:
nltk.classify.accuracy(classifier, test_set)

0.7891596220785678

In [71]:
def pos_features3(sentence, i, history):
    features = {"suffix(1)": sentence[i][-1:],
                 "suffix(2)": sentence[i][-2:],
                 "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
        features["prev-tag"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
        features["prev-tag"] = history[i-1]
    return features

In [55]:
class ConsecutivePosTagger(nltk.TaggerI): 

    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = pos_features3(untagged_sent, i, history)
                train_set.append( (featureset, tag) )
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = pos_features3(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)

In [56]:
tagged_sents = brown.tagged_sents(categories='news')
size = int(len(tagged_sents) * 0.1)
train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
tagger = ConsecutivePosTagger(train_sents)
print(tagger.evaluate(test_sents))

0.7980528511821975


## sample function of Enumerate()

In [57]:
# index from 0 and element of list into tuple to be in a list
l1 = ["eat","sleep","repeat"]
print(list(enumerate(l1)))

[(0, 'eat'), (1, 'sleep'), (2, 'repeat')]


In [58]:
en = list(enumerate(l1))
type(en[1])

tuple