# Goal: Predict the poets based on their poems
- Given 2 poems, build and train a model that could classify a line of poem belong to which poet.

# Loop through each file, save each line to a list, save the labels too

In [1]:
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/edgar_allan_poe.txt
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt

File ‘edgar_allan_poe.txt’ already there; not retrieving.

File ‘robert_frost.txt’ already there; not retrieving.



In [2]:
import numpy as np
import matplotlib.pyplot as plt
import string
from sklearn.model_selection import train_test_split

In [3]:
input_files = [
    'edgar_allan_poe.txt',
    'robert_frost.txt',
]

In [4]:
!head edgar_allan_poe.txt

LO! Death hath rear'd himself a throne
In a strange city, all alone,
Far down within the dim west
Where the good, and the bad, and the worst, and the best,
Have gone to their eternal rest.
 
There shrines, and palaces, and towers
Are not like any thing of ours
Oh no! O no! ours never loom
To heaven with that ungodly gloom!


In [5]:
!head robert_frost.txt

Two roads diverged in a yellow wood,
And sorry I could not travel both
And be one traveler, long I stood
And looked down one as far as I could
To where it bent in the undergrowth; 

Then took the other, as just as fair,
And having perhaps the better claim
Because it was grassy and wanted wear,
Though as for that the passing there


In [6]:
# collect data into lists
input_texts = []
labels = []


# loop through the files, use enumerate to give us the index. Edgar will be 0, Robert will be 1
for label, f in enumerate(input_files):
    print(f'{f} corresponds to label {label}')


    for line in open(f):
        # using rstrip to get rid of "\n" at the beginning of each line
        line = line.rstrip().lower()
        # because some lines are empty, run this only if the line is not empty
        if line:
            # removing punctuation. String.punctuation is to identify the punc, and maketrans to map the punc to 'None'
            line = line.translate(str.maketrans('', '', string.punctuation))
            # append the texts and labels to our lists
            input_texts.append(line)
            labels.append(label)


edgar_allan_poe.txt corresponds to label 0
robert_frost.txt corresponds to label 1


# Train test split

In [7]:
train_text, test_text, Ytrain, Ytest = train_test_split(input_texts, labels)

In [8]:
len(Ytrain), len(Ytest)

(1615, 539)

In [9]:
train_text

['with desperate energy t hath beaten down',
 'and i cried it was surely october',
 'too lofty and original to rage',
 'to ask if there is some mistake',
 'as the stardials hinted of morn ',
 'to seek a shelter in some happier star',
 'on my grave is growing or grown',
 'from their throats',
 'no',
 'the uncommonly deep snow has made him think',
 'hath ever told or is it of a thought',
 'the sacred sun of all who weeping bless thee',
 'that i journeyed  i journeyed down here ',
 'among the raspberries and hew and shape it',
 'i hope if he is where he sees me now',
 'they have best right to be heard in this place',
 'now that theyve got it settled whose i be',
 'last night was one of her nights out shes kiting',
 'and tell me whether',
 'the very hours are breathing low',
 'o i care not that my earthly lot',
 'o god on my funereal mind',
 'something i must have learned riding in trains',
 'i cant keep track of other peoples daughters',
 'the crib he slept in and as sures youre born',
 '

In [10]:
Ytrain

[0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


# Creating mapping from unique word to uniqe integer index

In [11]:
idx = 1
# the key unk is mapped to 0
word2idx = {'<unk>': 0}

# Loop through data, tokenize each line
# Assign each word to an integer

In [12]:
# populate word2idx
for text in train_text:
    tokens = text.split()

    for token in tokens:
        # check if the token is already in our dictionary
        if token not in word2idx:
            # assign the index to the current token. the 'token' as keys, and index as values
            word2idx[token] = idx
            idx += 1

In [13]:
word2idx

{'<unk>': 0,
 'with': 1,
 'desperate': 2,
 'energy': 3,
 't': 4,
 'hath': 5,
 'beaten': 6,
 'down': 7,
 'and': 8,
 'i': 9,
 'cried': 10,
 'it': 11,
 'was': 12,
 'surely': 13,
 'october': 14,
 'too': 15,
 'lofty': 16,
 'original': 17,
 'to': 18,
 'rage': 19,
 'ask': 20,
 'if': 21,
 'there': 22,
 'is': 23,
 'some': 24,
 'mistake': 25,
 'as': 26,
 'the': 27,
 'stardials': 28,
 'hinted': 29,
 'of': 30,
 'morn': 31,
 'seek': 32,
 'a': 33,
 'shelter': 34,
 'in': 35,
 'happier': 36,
 'star': 37,
 'on': 38,
 'my': 39,
 'grave': 40,
 'growing': 41,
 'or': 42,
 'grown': 43,
 'from': 44,
 'their': 45,
 'throats': 46,
 'no': 47,
 'uncommonly': 48,
 'deep': 49,
 'snow': 50,
 'has': 51,
 'made': 52,
 'him': 53,
 'think': 54,
 'ever': 55,
 'told': 56,
 'thought': 57,
 'sacred': 58,
 'sun': 59,
 'all': 60,
 'who': 61,
 'weeping': 62,
 'bless': 63,
 'thee': 64,
 'that': 65,
 'journeyed': 66,
 'here': 67,
 'among': 68,
 'raspberries': 69,
 'hew': 70,
 'shape': 71,
 'hope': 72,
 'he': 73,
 'where': 74,
 

In [14]:
# this will determine the size of our Markov matrix, and initial state distribution
len(word2idx)

2497

# Convert each line of text (the samples) into integer lists

In [15]:
train_text_int = []
test_text_int = []


for text in train_text:
    tokens = text.split()
    # map each token to its corresponding text
    line_as_int = [word2idx[token] for token in tokens]
    train_text_int.append(line_as_int)

for text in test_text:
    tokens = text.split()
    line_as_int = [word2idx.get(token, 0) for token in tokens]
    test_text_int.append(line_as_int)

In [16]:
train_text_int

[[1, 2, 3, 4, 5, 6, 7],
 [8, 9, 10, 11, 12, 13, 14],
 [15, 16, 8, 17, 18, 19],
 [18, 20, 21, 22, 23, 24, 25],
 [26, 27, 28, 29, 30, 31],
 [18, 32, 33, 34, 35, 24, 36, 37],
 [38, 39, 40, 23, 41, 42, 43],
 [44, 45, 46],
 [47],
 [27, 48, 49, 50, 51, 52, 53, 54],
 [5, 55, 56, 42, 23, 11, 30, 33, 57],
 [27, 58, 59, 30, 60, 61, 62, 63, 64],
 [65, 9, 66, 9, 66, 7, 67],
 [68, 27, 69, 8, 70, 8, 71, 11],
 [9, 72, 21, 73, 23, 74, 73, 75, 76, 77],
 [78, 79, 80, 81, 18, 82, 83, 35, 84, 85],
 [77, 65, 86, 87, 11, 88, 89, 9, 82],
 [90, 91, 12, 92, 30, 93, 94, 95, 96, 97],
 [8, 98, 76, 99],
 [27, 100, 101, 102, 103, 104],
 [105, 9, 106, 107, 65, 39, 108, 109],
 [105, 110, 38, 39, 111, 112],
 [113, 9, 114, 79, 115, 116, 35, 117],
 [9, 118, 119, 120, 30, 121, 122, 123],
 [27, 124, 73, 125, 35, 8, 26, 126, 127, 128],
 [65, 129, 130, 131],
 [132, 133, 134, 27, 135],
 [65, 136, 76, 137, 27, 138, 26, 139, 26, 140],
 [141, 18, 142, 143, 144, 145, 136, 27, 146],
 [147, 148, 149, 150, 15, 151, 98, 27, 152, 153

# Build A and pi matrices to represent the Markov model
# Add one smoothing

In [17]:
# define vocab size
V = len(word2idx)

# b/c we are going to use add one smoothing, ones are the intial fake counts for each initial word and each transition
A0 = np.ones((V, V))
pi0 = np.ones(V)

A1 = np.ones((V, V))
pi1 = np.ones(V)

A0

# Fit the data

In [18]:
# compute counts for A and pi
def compute_counts(text_as_int, A, pi):

    for tokens in text_as_int:
        last_idx = None
        for idx in tokens:
            if last_idx is None:
                # it's the first word in the sentence
                pi[idx] += 1
            else:
                # the last word exists, so count a transition from one word to the next
                A[last_idx, idx] += 1
            
            last_idx = idx

In [19]:
# compute count for class 0
compute_counts([t for t, y in zip(train_text_int, Ytrain) if y==0], A0, pi0)
# compute count for class 1
compute_counts([t for t, y in zip(train_text_int, Ytrain) if y==1], A1, pi1)

In [20]:
# normalize A and pi so they are valid probability matries
A0 /= A0.sum(axis=1, keepdims=True)
# no need for keepdims since pi is 1D array
pi0 /= pi0.sum()

A1 /= A1.sum(axis=1, keepdims=True)
pi1 /= pi1.sum()

In [21]:
# log A and pi since we don't need actual probs
logA0 = np.log(A0)
logpi0 = np.log(pi0)

logA1 = np.log(A1)
logpi1 = np.log(pi1)

In [22]:
# compute priors
count0 = sum(y == 0 for y in Ytrain)   
# how many samples belong to class 1
count1 = sum(y == 1 for y in Ytrain)

total = len(Ytrain)

# compute the proportion of each class, in total of Ytrain
p0 = count0 / total
p1 = count1 / total

logp0 = np.log(p0)
logp1 = np.log(p1)
p0, p1
# there are more p1 than p0
# we should not use the maximum likelihood method
# instead we should look at the posterior, which corresponds to MAP method

(0.3238390092879257, 0.6761609907120743)

# Train Markov model for each class (Edgar Allan Poe/ Robert Frost). Train on lines of poems.
# Write a function to compute the posterior for each class, given an input
# Take the argmax (highest probability score) over the posteriors to get the predicted class

In [23]:
# build a classifier
class Classifier:
    def __init__(self, logAs, logpis, logpriors):
        self.logAs = logAs
        self.logpis = logpis
        self.logpriors = logpriors
        self.K = len(logpriors)  # number of classes, 2 in our case
    
    def _compute_log_likelihood(self, input_, class_):
        # retrieving log As and log pis, by indexing our lists by the class
        logA = self.logAs[class_]
        logpi = self.logpis[class_]

        last_idx = None
        logprob = 0
        for idx in input_:
            if last_idx is None:
                # index log pi, add the result to logprob
                logprob += logpi[idx]
            else:
                logprob += logA[last_idx, idx]
            
            last_idx = idx

        return logprob
    
    def predict(self, inputs):
        # initialize an array to store the predictions
        predictions = np.zeros(len(inputs))
        for i, input_ in enumerate(inputs):
            # compute the posteriors for each class
            posteriors = [self._compute_log_likelihood(input_, c) + self.logpriors[c] \
                          for c in range(self.K)]
            
            pred = np.argmax(posteriors)
            # store the prediction in our array of predictions at index i
            predictions[i] = pred
        return predictions

# Make predictions for both train/ test sets

In [24]:
clf = Classifier([logA0, logA1], [logpi0, logpi1], [logp0, logp1])

In [25]:
Ptrain = clf.predict(train_text_int)
print(f'Train accuracy: {np.mean(Ptrain == Ytrain)}')

Train accuracy: 0.9944272445820433


In [26]:
Ptest = clf.predict(test_text_int)
print(f'Test accuracy: {np.mean(Ptest == Ytest)}')

Test accuracy: 0.8033395176252319


# Compute accuracy for train/test

In [27]:
from sklearn.metrics import confusion_matrix, f1_score

cm = confusion_matrix(Ytrain, Ptrain)
cm

In [28]:
cm_test = confusion_matrix(Ytest, Ptest)
cm_test

# Check for class imbalance. If imbalance, check confusion matrix and F1-score to see which one has incorrect prediction most often

In [29]:
f1_score(Ytrain, Ptrain)

0.9958960328317373

In [30]:
f1_score(Ytest, Ptest)

0.8626943005181348