## Machine Learning

In [5]:
from collections import Counter
import math, random

In [6]:
def split_data(data, prob):
    """split data into fractions [prob, 1 - prob]"""
    results = [], []
    for row in data:
        results[0 if random.random() < prob else 1].append(row)
    return results

In [7]:
def train_test_split(x, y, test_pct):
    data = list(zip(x, y))                        # pair corresponding values
    train, test = split_data(data, 1 - test_pct)  # split the dataset of pairs
    x_train, y_train = list(zip(*train))          # magical un-zip trick
    x_test, y_test = list(zip(*test))
    return x_train, x_test, y_train, y_test

In [8]:
## The below code gives error as there is no prediction model implemented
## to get the test and train values.
model = SomeKindOfModel()
x_train, x_test, y_train, y_test = train_test_split(xs, ys, 0.33)
model.train(x_train, y_train)
performance = model.test(x_test, y_test)

NameError: name 'SomeKindOfModel' is not defined

In [9]:
def accuracy(tp, fp, fn, tn):
    correct = tp + tn
    total = tp + fp + fn + tn
    return correct / total

In [10]:
print("accuracy(70, 4930, 13930, 981070)", accuracy(70, 4930, 13930, 981070))

accuracy(70, 4930, 13930, 981070) 0.98114


In [11]:
def precision(tp, fp, fn, tn):
    return tp / (tp + fp)

In [12]:
print("precision(70, 4930, 13930, 981070)", precision(70, 4930, 13930, 981070))

precision(70, 4930, 13930, 981070) 0.014


In [13]:
def recall(tp, fp, fn, tn):
    return tp / (tp + fn)

In [14]:
print("recall(70, 4930, 13930, 981070)", recall(70, 4930, 13930, 981070))

recall(70, 4930, 13930, 981070) 0.005


In [15]:
def f1_score(tp, fp, fn, tn):
    p = precision(tp, fp, fn, tn)
    r = recall(tp, fp, fn, tn)

    return 2 * p * r / (p + r)

In [16]:
print("f1_score(70, 4930, 13930, 981070)", f1_score(70, 4930, 13930, 981070))

f1_score(70, 4930, 13930, 981070) 0.00736842105263158


## Naive Bayes

In [69]:
from collections import Counter, defaultdict
from machine_learning import split_data
import math, random, re, glob

In [70]:
def tokenize(message):
    message = message.lower()                       # convert to lowercase
    all_words = re.findall("[a-z0-9']+", message)   # extract the words
    return set(all_words) 

In [71]:
def count_words(training_set):
    """training set consists of pairs (message, is_spam)"""
    counts = defaultdict(lambda: [0, 0])
    for message, is_spam in training_set:
        for word in tokenize(message):
            counts[word][0 if is_spam else 1] += 1
    return counts

In [72]:
def word_probabilities(counts, total_spams, total_non_spams, k=0.5):
    """turn the word_counts into a list of triplets
    w, p(w | spam) and p(w | ~spam)"""
    return [(w,
             (spam + k) / (total_spams + 2 * k),
             (non_spam + k) / (total_non_spams + 2 * k))
             for w, (spam, non_spam) in counts.items()]

In [73]:
def spam_probability(word_probs, message):
    message_words = tokenize(message)
    log_prob_if_spam = log_prob_if_not_spam = 0.0
    for word, prob_if_spam, prob_if_not_spam in word_probs:

        # for each word in the message,
        # add the log probability of seeing it
        if word in message_words:
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(prob_if_not_spam)

        # for each word that's not in the message
        # add the log probability of _not_ seeing it
        else:
            log_prob_if_spam += math.log(1.0 - prob_if_spam)
            log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)

    prob_if_spam = math.exp(log_prob_if_spam)
    prob_if_not_spam = math.exp(log_prob_if_not_spam)
    return prob_if_spam / (prob_if_spam + prob_if_not_spam)

In [22]:
class NaiveBayesClassifier:

    def __init__(self, k=0.5):
        self.k = k
        self.word_probs = []

    def train(self, training_set):

        # count spam and non-spam messages
        num_spams = len([is_spam
                         for message, is_spam in training_set
                         if is_spam])
        num_non_spams = len(training_set) - num_spams

        # run training data through our "pipeline"
        word_counts = count_words(training_set)
        self.word_probs = word_probabilities(word_counts,
                                             num_spams,
                                             num_non_spams,
                                             self.k)

    def classify(self, message):
        return spam_probability(self.word_probs, message)

In [23]:
import glob, re
# modify the path with wherever you've put the files
path = r"C:\spam\*\*"
data = []
# glob.glob returns every filename that matches the wildcarded path
for fn in glob.glob(path):
    is_spam = "ham" not in fn
    with open(fn,'r') as file:
        for line in file:
            if line.startswith("Subject:"):
                # remove the leading "Subject: " and keep what's left
                subject = re.sub(r"^Subject: ", "", line).strip()
                data.append((subject, is_spam))


In [77]:
random.seed(0) # just so you get the same answers as me
train_data, test_data = split_data(data, 0.75)
classifier = NaiveBayesClassifier()
classifier.train(train_data)

In [25]:
# triplets (subject, actual is_spam, predicted spam probability)
classified = [(subject, is_spam, classifier.classify(subject))
 for subject, is_spam in test_data]
# assume that spam_probability > 0.5 corresponds to spam prediction
# and count the combinations of (actual is_spam, predicted is_spam)
counts = Counter((is_spam, spam_probability > 0.5)
 for _, is_spam, spam_probability in classified)

In [26]:
def get_subject_data(path):

    data = []

    # regex for stripping out the leading "Subject:" and any spaces after it
    subject_regex = re.compile(r"^Subject:\s+")

    # glob.glob returns every filename that matches the wildcarded path
    for fn in glob.glob(path):
        is_spam = "ham" not in fn

        with open(fn,'r',encoding='ISO-8859-1') as file:
            for line in file:
                if line.startswith("Subject:"):
                    subject = subject_regex.sub("", line).strip()
                    data.append((subject, is_spam))

    return data

In [27]:
def p_spam_given_word(word_prob):
    word, prob_if_spam, prob_if_not_spam = word_prob
    return prob_if_spam / (prob_if_spam + prob_if_not_spam)

In [28]:
words = sorted(classifier.word_probs, key=p_spam_given_word)
spammiest_words = words[-5:]
hammiest_words = words[:5]

In [29]:
def train_and_test_model(path):

    data = get_subject_data(path)
    random.seed(0)      # just so you get the same answers as me
    train_data, test_data = split_data(data, 0.75)

    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    classified = [(subject, is_spam, classifier.classify(subject))
              for subject, is_spam in test_data]

    counts = Counter((is_spam, spam_probability > 0.5) # (actual, predicted)
                     for _, is_spam, spam_probability in classified)

    print(counts)

    classified.sort(key=lambda row: row[2])
    spammiest_hams = list(filter(lambda row: not row[1], classified))[-5:]
    hammiest_spams = list(filter(lambda row: row[1], classified))[:5]

    print("spammiest_hams", spammiest_hams)
    print("hammiest_spams", hammiest_spams)

    words = sorted(classifier.word_probs, key=p_spam_given_word)

    spammiest_words = words[-5:]
    hammiest_words = words[:5]

    print("spammiest_words", spammiest_words)
    print("hammiest_words", hammiest_words)

In [76]:
train_and_test_model(r"C:\Users\shash\Anaconda ipyb files\spam\spam/*")

Counter({(True, True): 134})
spammiest_hams []
hammiest_spams [('Life Insurance - Why Pay More?', True, 1.0), ('[ILUG] Guaranteed to lose 10-12 lbs in 30 days 10.206', True, 1.0), ('FORTUNE 500 COMPANY HIRING, AT HOME REPS.', True, 1.0), ('^^^^^Cell Phone Belt Clips $1.95^^^^^^                           18070', True, 1.0), ('FREE Cell Phone + $50 Cash Back!', True, 1.0)]
spammiest_words [('ilug', 0.09054054054054055, 0.5), ('you', 0.09594594594594595, 0.5), ('the', 0.10405405405405406, 0.5), ('for', 0.10405405405405406, 0.5), ('your', 0.1472972972972973, 0.5)]
hammiest_words [('150', 0.004054054054054054, 0.5), ('user', 0.004054054054054054, 0.5), ('name', 0.004054054054054054, 0.5), ('sites', 0.004054054054054054, 0.5), ('password', 0.004054054054054054, 0.5)]


In [31]:
def drop_final_s(word):
    return re.sub("s$", "", word)

### Decision Trees

In [2]:
from collections import Counter, defaultdict
from functools import partial
import math, random

In [3]:
def entropy(class_probabilities):
    """given a list of class probabilities, compute the entropy"""
    return sum(-p * math.log(p, 2) for p in class_probabilities if p)

In [4]:
def class_probabilities(labels):
    total_count = len(labels)
    return [count / total_count
            for count in Counter(labels).values()]

In [5]:
def data_entropy(labeled_data):
    labels = [label for _, label in labeled_data]
    probabilities = class_probabilities(labels)
    return entropy(probabilities)

In [6]:
def partition_entropy(subsets):
    """find the entropy from this partition of data into subsets"""
    total_count = sum(len(subset) for subset in subsets)

    return sum( data_entropy(subset) * len(subset) / total_count
                for subset in subsets )

In [7]:
inputs = [
 ({'level':'Senior', 'lang':'Java', 'tweets':'no', 'phd':'no'}, False),
 ({'level':'Senior', 'lang':'Java', 'tweets':'no', 'phd':'yes'}, False),
 ({'level':'Mid', 'lang':'Python', 'tweets':'no', 'phd':'no'}, True),
 ({'level':'Junior', 'lang':'Python', 'tweets':'no', 'phd':'no'}, True),
 ({'level':'Junior', 'lang':'R', 'tweets':'yes', 'phd':'no'}, True),
 ({'level':'Junior', 'lang':'R', 'tweets':'yes', 'phd':'yes'}, False),
 ({'level':'Mid', 'lang':'R', 'tweets':'yes', 'phd':'yes'}, True),
 ({'level':'Senior', 'lang':'Python', 'tweets':'no', 'phd':'no'}, False),
 ({'level':'Senior', 'lang':'R', 'tweets':'yes', 'phd':'no'}, True),
 ({'level':'Junior', 'lang':'Python', 'tweets':'yes', 'phd':'no'}, True),
 ({'level':'Senior', 'lang':'Python', 'tweets':'yes', 'phd':'yes'}, True),
 ({'level':'Mid', 'lang':'Python', 'tweets':'no', 'phd':'yes'}, True),
 ({'level':'Mid', 'lang':'Java', 'tweets':'yes', 'phd':'no'}, True),
 ({'level':'Junior', 'lang':'Python', 'tweets':'no', 'phd':'yes'}, False)
]

In [13]:
def partition_by(inputs, attribute):
    """each input is a pair (attribute_dict, label).
    returns a dict : attribute_value -> inputs"""
    groups = defaultdict(list)
    for input in inputs:
        key = input[0][attribute] # get the value of the specified attribute
        groups[key].append(input) # then add this input to the correct list
    return groups

In [14]:
def partition_entropy_by(inputs, attribute):
    """computes the entropy corresponding to the given partition"""
    partitions = partition_by(inputs, attribute)
    return partition_entropy(partitions.values())

In [15]:
for key in ['level','lang','tweets','phd']:
    print (key, partition_entropy_by(inputs, key))

level 0.6935361388961919
lang 0.8601317128547441
tweets 0.7884504573082896
phd 0.8921589282623617


In [16]:
senior_inputs = [(input, label)
 for input, label in inputs if input["level"] == "Senior"]

In [18]:
for key in ['lang', 'tweets', 'phd']:
    print (key, partition_entropy_by(senior_inputs, key))

lang 0.4
tweets 0.0
phd 0.9509775004326938


In [19]:
def classify(tree, input):
    """classify the input using the given decision tree"""

    # if this is a leaf node, return its value
    if tree in [True, False]:
        return tree

    # otherwise find the correct subtree
    attribute, subtree_dict = tree

    subtree_key = input.get(attribute)  # None if input is missing attribute

    if subtree_key not in subtree_dict: # if no subtree for key,
        subtree_key = None              # we'll use the None subtree

    subtree = subtree_dict[subtree_key] # choose the appropriate subtree
    return classify(subtree, input)     # and use it to classify the input

In [20]:
def build_tree_id3(inputs, split_candidates=None):

    # if this is our first pass,
    # all keys of the first input are split candidates
    if split_candidates is None:
        split_candidates = inputs[0][0].keys()

    # count Trues and Falses in the inputs
    num_inputs = len(inputs)
    num_trues = len([label for item, label in inputs if label])
    num_falses = num_inputs - num_trues

    if num_trues == 0:                  # if only Falses are left
        return False                    # return a "False" leaf

    if num_falses == 0:                 # if only Trues are left
        return True                     # return a "True" leaf

    if not split_candidates:            # if no split candidates left
        return num_trues >= num_falses  # return the majority leaf

    # otherwise, split on the best attribute
    best_attribute = min(split_candidates,
        key=partial(partition_entropy_by, inputs))

    partitions = partition_by(inputs, best_attribute)
    new_candidates = [a for a in split_candidates
                      if a != best_attribute]

    # recursively build the subtrees
    subtrees = { attribute : build_tree_id3(subset, new_candidates)
                 for attribute, subset in partitions.items() }

    subtrees[None] = num_trues > num_falses # default case

    return (best_attribute, subtrees)

In [21]:
tree = build_tree_id3(inputs)

In [28]:
tree

('level',
 {'Senior': ('tweets', {'no': False, 'yes': True, None: False}),
  'Mid': True,
  'Junior': ('phd', {'no': True, 'yes': False, None: True}),
  None: True})

In [22]:
classify(tree, { "level" : "Junior",
 "lang" : "Java",
 "tweets" : "yes",
 "phd" : "no"} ) 

True

In [23]:
classify(tree, { "level" : "Junior",
 "lang" : "Java",
 "tweets" : "yes",
 "phd" : "yes"} ) 

False

In [24]:
classify(tree, { "level" : "Intern" } ) 

True

In [25]:
classify(tree, { "level" : "Senior" } ) 

False

In [26]:
def forest_classify(trees, input):
    votes = [classify(tree, input) for tree in trees]
    vote_counts = Counter(votes)
    return vote_counts.most_common(1)[0][0]

### Leraning To Classify text

In [37]:
def gender_features(word):
    return {'last_letter': word[-1]}

In [38]:
gender_features('Shrek')

{'last_letter': 'k'}

In [39]:
from nltk.corpus import names

In [40]:
import random

In [41]:
import nltk

In [42]:
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
[(name, 'female') for name in names.words('female.txt')])

In [43]:
random.shuffle(labeled_names)

In [44]:
labeled_names[:10]

[('Lissie', 'female'),
 ('Danya', 'female'),
 ('Welby', 'male'),
 ('Antone', 'male'),
 ('Bert', 'female'),
 ('Erek', 'male'),
 ('Gayleen', 'female'),
 ('Rory', 'female'),
 ('Donelle', 'female'),
 ('Felicle', 'female')]

In [45]:
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]

In [46]:
featuresets[:10]

[({'last_letter': 'e'}, 'female'),
 ({'last_letter': 'a'}, 'female'),
 ({'last_letter': 'y'}, 'male'),
 ({'last_letter': 'e'}, 'male'),
 ({'last_letter': 't'}, 'female'),
 ({'last_letter': 'k'}, 'male'),
 ({'last_letter': 'n'}, 'female'),
 ({'last_letter': 'y'}, 'female'),
 ({'last_letter': 'e'}, 'female'),
 ({'last_letter': 'e'}, 'female')]

In [41]:
train_set, test_set = featuresets[500:], featuresets[:500]

In [44]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [45]:
classifier.classify(gender_features('Neo'))

'male'

In [46]:
classifier.classify(gender_features('Trinity'))

'female'

In [54]:
classifier.classify(gender_features('Varun'))

'male'

In [55]:
classifier.classify(gender_features('Rohan'))

'male'

In [57]:
classifier.classify(gender_features('Sai'))

'female'

In [56]:
print(nltk.classify.accuracy(classifier, test_set))

0.734


In [58]:
classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = 'k'              male : female =     44.7 : 1.0
             last_letter = 'a'            female : male   =     38.3 : 1.0
             last_letter = 'f'              male : female =     16.0 : 1.0
             last_letter = 'p'              male : female =     11.9 : 1.0
             last_letter = 'v'              male : female =     11.3 : 1.0


In [59]:
from nltk.classify import apply_features

In [60]:
train_set = apply_features(gender_features, labeled_names[500:])

In [61]:
test_set = apply_features(gender_features, labeled_names[:500])

In [62]:
def gender_features2(name):
    features = {}
    features["first_letter"] = name[0].lower()
    features["last_letter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = name.lower().count(letter)
        features["has({})".format(letter)] = (letter in name.lower())
    return features

In [63]:
gender_features2('John')

{'first_letter': 'j',
 'last_letter': 'n',
 'count(a)': 0,
 'has(a)': False,
 'count(b)': 0,
 'has(b)': False,
 'count(c)': 0,
 'has(c)': False,
 'count(d)': 0,
 'has(d)': False,
 'count(e)': 0,
 'has(e)': False,
 'count(f)': 0,
 'has(f)': False,
 'count(g)': 0,
 'has(g)': False,
 'count(h)': 1,
 'has(h)': True,
 'count(i)': 0,
 'has(i)': False,
 'count(j)': 1,
 'has(j)': True,
 'count(k)': 0,
 'has(k)': False,
 'count(l)': 0,
 'has(l)': False,
 'count(m)': 0,
 'has(m)': False,
 'count(n)': 1,
 'has(n)': True,
 'count(o)': 1,
 'has(o)': True,
 'count(p)': 0,
 'has(p)': False,
 'count(q)': 0,
 'has(q)': False,
 'count(r)': 0,
 'has(r)': False,
 'count(s)': 0,
 'has(s)': False,
 'count(t)': 0,
 'has(t)': False,
 'count(u)': 0,
 'has(u)': False,
 'count(v)': 0,
 'has(v)': False,
 'count(w)': 0,
 'has(w)': False,
 'count(x)': 0,
 'has(x)': False,
 'count(y)': 0,
 'has(y)': False,
 'count(z)': 0,
 'has(z)': False}

In [64]:
featuresets = [(gender_features2(n), gender) for (n, gender) in labeled_names]

In [65]:
train_set, test_set = featuresets[500:], featuresets[:500]

In [66]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [67]:
print(nltk.classify.accuracy(classifier, test_set))

0.75


In [68]:
train_names = labeled_names[1500:]

In [69]:
devtest_names = labeled_names[500:1500]

In [70]:
test_names = labeled_names[:500]

In [71]:
train_set = [(gender_features(n), gender) for (n, gender) in train_names]

In [72]:
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]

In [73]:
test_set = [(gender_features(n), gender) for (n, gender) in test_names]

In [74]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [75]:
print(nltk.classify.accuracy(classifier, devtest_set))

0.777


In [76]:
errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append( (tag, guess, name) )

In [77]:
for (tag, guess, name) in sorted(errors):
     print('correct={:<8} guess={:<8s} name={:<30}'.format(tag, guess, name))

correct=female   guess=male     name=Adriaens                      
correct=female   guess=male     name=Aeriel                        
correct=female   guess=male     name=Aidan                         
correct=female   guess=male     name=Ailyn                         
correct=female   guess=male     name=Aleen                         
correct=female   guess=male     name=Alexis                        
correct=female   guess=male     name=Alleen                        
correct=female   guess=male     name=Allis                         
correct=female   guess=male     name=Angel                         
correct=female   guess=male     name=Avrit                         
correct=female   guess=male     name=Babs                          
correct=female   guess=male     name=Brooks                        
correct=female   guess=male     name=Caril                         
correct=female   guess=male     name=Carilyn                       
correct=female   guess=male     name=Carin      

In [78]:
def gender_features(word):
     return {'suffix1': word[-1:],
             'suffix2': word[-2:]}

In [79]:
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [80]:
print(nltk.classify.accuracy(classifier, devtest_set))

0.79


In [81]:
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

In [82]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [83]:
print(document_features(movie_reviews.words('pos/cv957_8737.txt')))

{'contains(plot)': True, 'contains(:)': True, 'contains(two)': True, 'contains(teen)': False, 'contains(couples)': False, 'contains(go)': False, 'contains(to)': True, 'contains(a)': True, 'contains(church)': False, 'contains(party)': False, 'contains(,)': True, 'contains(drink)': False, 'contains(and)': True, 'contains(then)': True, 'contains(drive)': False, 'contains(.)': True, 'contains(they)': True, 'contains(get)': True, 'contains(into)': True, 'contains(an)': True, 'contains(accident)': False, 'contains(one)': True, 'contains(of)': True, 'contains(the)': True, 'contains(guys)': False, 'contains(dies)': False, 'contains(but)': True, 'contains(his)': True, 'contains(girlfriend)': True, 'contains(continues)': False, 'contains(see)': False, 'contains(him)': True, 'contains(in)': True, 'contains(her)': False, 'contains(life)': False, 'contains(has)': True, 'contains(nightmares)': False, 'contains(what)': True, "contains(')": True, 'contains(s)': True, 'contains(deal)': False, 'contains

In [84]:
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [85]:
print(nltk.classify.accuracy(classifier, test_set))

0.77


In [86]:
classifier.show_most_informative_features(5)

Most Informative Features
 contains(unimaginative) = True              neg : pos    =      7.6 : 1.0
        contains(welles) = True              neg : pos    =      7.6 : 1.0
     contains(atrocious) = True              neg : pos    =      7.0 : 1.0
          contains(mena) = True              neg : pos    =      7.0 : 1.0
        contains(shoddy) = True              neg : pos    =      7.0 : 1.0


In [56]:
from nltk.corpus import brown

In [88]:
suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] += 1
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1

In [89]:
common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(100)]

In [90]:
print(common_suffixes)

['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of', 'the', 'y', 'r', 'to', 'in', 'f', 'o', 'ed', 'nd', 'is', 'on', 'l', 'g', 'and', 'ng', 'er', 'as', 'ing', 'h', 'at', 'es', 'or', 're', 'it', '``', 'an', "''", 'm', ';', 'i', 'ly', 'ion', 'en', 'al', '?', 'nt', 'be', 'hat', 'st', 'his', 'th', 'll', 'le', 'ce', 'by', 'ts', 'me', 've', "'", 'se', 'ut', 'was', 'for', 'ent', 'ch', 'k', 'w', 'ld', '`', 'rs', 'ted', 'ere', 'her', 'ne', 'ns', 'ith', 'ad', 'ry', ')', '(', 'te', '--', 'ay', 'ty', 'ot', 'p', 'nce', "'s", 'ter', 'om', 'ss', ':', 'we', 'are', 'c', 'ers', 'uld', 'had', 'so', 'ey']


In [91]:
def pos_features(word):
    features = {}
    for suffix in common_suffixes:
        features['endswith({})'.format(suffix)] = word.lower().endswith(suffix)
    return features

In [92]:
tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(n), g) for (n,g) in tagged_words]

In [93]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]

In [95]:
classifier = nltk.DecisionTreeClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.6270512182993535

In [96]:
classifier.classify(pos_features('cats'))

'NNS'

In [97]:
print(classifier.pseudocode(depth=4))

if endswith(the) == False: 
  if endswith(,) == False: 
    if endswith(s) == False: 
      if endswith(.) == False: return '.'
      if endswith(.) == True: return '.'
    if endswith(s) == True: 
      if endswith(is) == False: return 'PP$'
      if endswith(is) == True: return 'BEZ'
  if endswith(,) == True: return ','
if endswith(the) == True: return 'AT'



In [100]:
def pos_features(sentence, i):
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    return features

In [101]:
pos_features(brown.sents()[0], 8)

{'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion', 'prev-word': 'an'}

In [102]:
tagged_sents = brown.tagged_sents(categories='news')

In [103]:
featuresets = []
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    for i, (word, tag) in enumerate(tagged_sent):
        featuresets.append( (pos_features(untagged_sent, i), tag) )

In [104]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [105]:
nltk.classify.accuracy(classifier, test_set)

0.7891596220785678

In [106]:
 def pos_features(sentence, i, history):
     features = {"suffix(1)": sentence[i][-1:],
                 "suffix(2)": sentence[i][-2:],
                 "suffix(3)": sentence[i][-3:]}
     if i == 0:
         features["prev-word"] = "<START>"
         features["prev-tag"] = "<START>"
     else:
         features["prev-word"] = sentence[i-1]
         features["prev-tag"] = history[i-1]
     return features

class ConsecutivePosTagger(nltk.TaggerI):

    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = pos_features(untagged_sent, i, history)
                train_set.append( (featureset, tag) )
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = pos_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)

In [107]:
tagged_sents = brown.tagged_sents(categories='news')

In [108]:
size = int(len(tagged_sents) * 0.1)
train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
tagger = ConsecutivePosTagger(train_sents)

In [109]:
print(tagger.evaluate(test_sents))

0.7980528511821975


In [110]:
sents = nltk.corpus.treebank_raw.sents()
tokens = []
boundaries = set()
offset = 0
for sent in sents:
    tokens.extend(sent)
    offset += len(sent)
    boundaries.add(offset-1)

In [111]:
def punct_features(tokens, i):
    return {'next-word-capitalized': tokens[i+1][0].isupper(),
            'prev-word': tokens[i-1].lower(),
            'punct': tokens[i],
            'prev-word-is-one-char': len(tokens[i-1]) == 1}

In [112]:
featuresets = [(punct_features(tokens, i), (i in boundaries))
               for i in range(1, len(tokens)-1)
               if tokens[i] in '.?!']

In [113]:
size = int(len(featuresets) * 0.1)

In [114]:
train_set, test_set = featuresets[size:], featuresets[:size]

In [115]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [116]:
nltk.classify.accuracy(classifier, test_set)

0.936026936026936

In [117]:
def segment_sentences(words):
    start = 0
    sents = []
    for i, word in enumerate(words):
        if word in '.?!' and classifier.classify(punct_features(words, i)) == True:
            sents.append(words[start:i+1])
            start = i+1
    if start < len(words):
        sents.append(words[start:])
    return sents

In [118]:
posts = nltk.corpus.nps_chat.xml_posts()[:10000]

In [119]:
def dialogue_act_features(post):
    features = {}
    for word in nltk.word_tokenize(post):
        features['contains({})'.format(word.lower())] = True
    return features

In [120]:
featuresets = [(dialogue_act_features(post.text), post.get('class'))
               for post in posts]

In [121]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]

In [122]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [123]:
print(nltk.classify.accuracy(classifier, test_set))

0.668


In [124]:
def rte_features(rtepair):
    extractor = nltk.RTEFeatureExtractor(rtepair)
    features = {}
    features['word_overlap'] = len(extractor.overlap('word'))
    features['word_hyp_extra'] = len(extractor.hyp_extra('word'))
    features['ne_overlap'] = len(extractor.overlap('ne'))
    features['ne_hyp_extra'] = len(extractor.hyp_extra('ne'))
    return features

In [130]:
rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33]

In [131]:
extractor = nltk.RTEFeatureExtractor(rtepair)

In [132]:
print(extractor.text_words)

{'Co', 'representing', 'Russia', 'that', 'terrorism.', 'four', 'fledgling', 'central', 'meeting', 'republics', 'Soviet', 'Asia', 'was', 'Shanghai', 'association', 'SCO', 'former', 'fight', 'at', 'binds', 'operation', 'Parviz', 'Iran', 'together', 'China', 'Davudi', 'Organisation'}


In [133]:
print(extractor.hyp_words)

{'member', 'China', 'SCO.'}


In [134]:
print(extractor.overlap('word'))

set()


In [135]:
print(extractor.overlap('ne'))

{'China'}


In [136]:
print(extractor.hyp_extra('word'))

{'member'}


In [137]:
import random
from nltk.corpus import brown

In [138]:
tagged_sents = list(brown.tagged_sents(categories='news'))
random.shuffle(tagged_sents)
size = int(len(tagged_sents) * 0.1)
train_set, test_set = tagged_sents[size:], tagged_sents[:size]

In [139]:
file_ids = brown.fileids(categories='news')
size = int(len(file_ids) * 0.1)
train_set = brown.tagged_sents(file_ids[size:])
test_set = brown.tagged_sents(file_ids[:size])

In [171]:
train_set = brown.tagged_sents(categories='news')
test_set = brown.tagged_sents(categories='fiction')

In [189]:
from nltk.tokenize import word_tokenize
all_words = set(word.lower() for passage in train_set[0] for word in word_tokenize(passage[0]))
t = [({word: (word in word_tokenize(x[0])) for word in all_words}, x[1]) for x in train_set[0]]
t

[({'took': False,
   'election': False,
   'atlanta': False,
   'that': False,
   'an': False,
   'no': False,
   '.': False,
   'county': False,
   "''": False,
   'irregularities': False,
   'primary': False,
   'jury': False,
   'place': False,
   "'s": False,
   'friday': False,
   'grand': False,
   'investigation': False,
   'said': False,
   'of': False,
   'fulton': False,
   'produced': False,
   'any': False,
   'the': False,
   'evidence': False,
   'recent': False,
   '``': False},
  'AT'),
 ({'took': False,
   'election': False,
   'atlanta': False,
   'that': False,
   'an': False,
   'no': False,
   '.': False,
   'county': False,
   "''": False,
   'irregularities': False,
   'primary': False,
   'jury': False,
   'place': False,
   "'s": False,
   'friday': False,
   'grand': False,
   'investigation': False,
   'said': False,
   'of': False,
   'fulton': False,
   'produced': False,
   'any': False,
   'the': False,
   'evidence': False,
   'recent': False,
   '``': F

In [47]:
## The above command gives an error so finding an alternate code:
train_set = featuresets[:1900]
test_set = featuresets[1900:]

In [183]:
classifier = nltk.NaiveBayesClassifier.train(t)
classifier

<nltk.classify.naivebayes.NaiveBayesClassifier at 0x1c16bbe8908>

In [197]:
## with the classifiers as input we get the below error
print('Accuracy: {:4.2f}'.format(nltk.classify.accuracy(classifier, test_set)))

ValueError: not enough values to unpack (expected 2, got 1)

In [49]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [51]:
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, test_set))*100)

Classifier accuracy percent: 75.90999338186631


In [52]:
def tag_list(tagged_sents):
    return [tag for sent in tagged_sents for (word, tag) in sent]

In [53]:
def apply_tagger(tagger, corpus):
    return [tagger.tag(nltk.tag.untag(sent)) for sent in corpus]

In [57]:
gold = tag_list(brown.tagged_sents(categories='editorial'))
test = tag_list(apply_tagger(t2, brown.tagged_sents(categories='editorial')))
cm = nltk.ConfusionMatrix(gold, test)
print(cm.pretty_format(sort_by_count=True, show_percents=True, truncate=9))

NameError: name 't2' is not defined

In [58]:
## As t2 varibale is not defined executing the below code
gold = tag_list(brown.tagged_sents(categories='editorial'))
test = tag_list(brown.tagged_sents(categories='editorial'))
cm = nltk.ConfusionMatrix(gold, test)
print(cm.pretty_format(sort_by_count=True, show_percents=True, truncate=9))

    |                                         N                      |
    |      N      I      A      J             N             V      N |
    |      N      N      T      J      .      S      ,      B      P |
----+----------------------------------------------------------------+
 NN | <12.5%>     .      .      .      .      .      .      .      . |
 IN |      . <10.1%>     .      .      .      .      .      .      . |
 AT |      .      .  <8.6%>     .      .      .      .      .      . |
 JJ |      .      .      .  <5.8%>     .      .      .      .      . |
  . |      .      .      .      .  <4.9%>     .      .      .      . |
NNS |      .      .      .      .      .  <4.8%>     .      .      . |
  , |      .      .      .      .      .      .  <4.4%>     .      . |
 VB |      .      .      .      .      .      .      .  <3.5%>     . |
 NP |      .      .      .      .      .      .      .      .  <3.1%>|
----+----------------------------------------------------------------+
(row =

In [59]:
import math
def entropy(labels):
    freqdist = nltk.FreqDist(labels)
    probs = [freqdist.freq(l) for l in freqdist]
    return -sum(p * math.log(p,2) for p in probs)

In [60]:
print(entropy(['male', 'male', 'male', 'male']))

-0.0


In [61]:
print(entropy(['male', 'female', 'male', 'male']))

0.8112781244591328


In [62]:
print(entropy(['female', 'male', 'female', 'male']))

1.0


In [63]:
print(entropy(['female', 'female', 'male', 'female']))

0.8112781244591328


In [64]:
print(entropy(['female', 'female', 'female', 'female'])) 

-0.0
