In [1]:
## sklearn.feature_extraction.text has vectorizers in there. The vectorizers are
## bag of words and tf-idf essentially. There is one option to do bag of words vectorization
## with hashing.

from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np

In [2]:
# COMPILE DOCUMENTS

In [3]:
doc1 = 'the cat in the hat'

In [4]:
doc2 = 'the cat in the tree'

In [5]:
doc3 = 'the cat ate my hat'

In [6]:
doc4 = 'the cat in the hat the cat in the hat the cat in the hat'

In [7]:
documents = [doc1, doc2, doc3, doc4]

In [8]:
# FEATURIZE DOCUMENTS

In [9]:
# This just splits on spaces and returns all of the words as a list. 
 
vocabulary = [word for doc in documents for word in doc.split(' ')]

# Remove duplicates in the vocabulary. 

vocabulary = sorted(list(set(vocabulary)))

In [10]:
print 'Vocabulary (features):',vocabulary

Vocabulary (features): ['ate', 'cat', 'hat', 'in', 'my', 'the', 'tree']


In [11]:



def vectorize(doc, vocabulary):
    # It just splits the string on a space and then counts the number of times that terms happen. 
    bag_of_words = Counter(doc.split(' '))
    # The counter object will be translated into a vector.
    # And the next line creates a numpy vector with space for those entries. 
    doc_vector = np.zeros(len(vocabulary))
    # The next for loop is very interesting. Although the doc may have words that are not in 
    # the vocabulary, it is only for words in the vocabulary that the vector is made. 
    # word_index is an integer and indexes the doc_vector. 
    for word_index, word in enumerate(vocabulary):
        if word in bag_of_words:
            doc_vector[word_index] += bag_of_words[word]
    return doc_vector

In [228]:
# The -1 is sort of a place holder. The first number, 1, says how many rows we will have. 
# the -1 means that we don't know how many columns a priori but that the function is to 
# fill in that information after it has created the vector. 

doc1_vectorized = vectorize(doc1, vocabulary).reshape(1, -1)
doc2_vectorized = vectorize(doc2, vocabulary).reshape(1, -1)
doc3_vectorized = vectorize(doc3, vocabulary).reshape(1, -1)
doc4_vectorized = vectorize(doc4, vocabulary).reshape(1, -1)

# vstack() combines numpy vectors into a matrix. 
tf_matrix = np.vstack((doc1_vectorized,
                       doc2_vectorized,
                       doc3_vectorized,
                       doc4_vectorized))

In [13]:
print 'features:',vocabulary
print '"%s":'%doc1, tf_matrix[0]
print '"%s":'%doc2, tf_matrix[1]
print '"%s":'%doc3, tf_matrix[2]
print '"%s":\n'%doc4, '    ', tf_matrix[3]
print
print 'feature matrix:'
print tf_matrix

features: ['ate', 'cat', 'hat', 'in', 'my', 'the', 'tree']
"the cat in the hat": [ 0.  1.  1.  1.  0.  2.  0.]
"the cat in the tree": [ 0.  1.  0.  1.  0.  2.  1.]
"the cat ate my hat": [ 1.  1.  1.  0.  1.  1.  0.]
"the cat in the hat the cat in the hat the cat in the hat":
     [ 0.  3.  3.  3.  0.  6.  0.]

feature matrix:
[[ 0.  1.  1.  1.  0.  2.  0.]
 [ 0.  1.  0.  1.  0.  2.  1.]
 [ 1.  1.  1.  0.  1.  1.  0.]
 [ 0.  3.  3.  3.  0.  6.  0.]]


In [14]:
# sklearn can do this for you
count_vectorizer = CountVectorizer(stop_words=None,
                                  vocabulary=vocabulary)

# The count_vectorizer has a sparse representation by default. todense() turns it into a dense matrix.
feature_matrix = count_vectorizer.fit_transform([doc1]).todense()

In [15]:
print 'Vectorize:',doc1
print 'sklearn result',feature_matrix
print 'our result',vectorize(doc1, vocabulary)
print
print 'feature matrix'
print count_vectorizer.fit_transform(documents).todense()

Vectorize: the cat in the hat
sklearn result [[0 1 1 1 0 2 0]]
our result [ 0.  1.  1.  1.  0.  2.  0.]

feature matrix
[[0 1 1 1 0 2 0]
 [0 1 0 1 0 2 1]
 [1 1 1 0 1 1 0]
 [0 3 3 3 0 6 0]]


In [16]:
# COMPARE DOCUMENT FEATURES

In [None]:
# COSINE SIMILARITY COMPARISON

In [229]:
print 'Compare "%s" \nwith "%s"'%(doc1, doc2)
print cosine_similarity(doc1_vectorized, doc2_vectorized)

Compare "the cat in the hat" 
with "the cat in the tree"
[[ 0.85714286]]


In [230]:
print 'Compare "%s" \nwith "%s"'%(doc1, doc3)
print cosine_similarity(doc1_vectorized, doc3_vectorized)

Compare "the cat in the hat" 
with "the cat ate my hat"
[[ 0.6761234]]


In [231]:
# EUCLIDEAN DISTANCE COMPARISON
print 'Compare "%s" \nwith "%s"'%(doc1, doc2)
eu = euclidean_distances

def normalize(v):
    return v/np.linalg.norm(v)

v1 = normalize(doc1_vectorized)
v2 = normalize(doc2_vectorized)

print eu(v1,v2)


Compare "the cat in the hat" 
with "the cat in the tree"
[[ 0.53452248]]


# Poetry search engine

In [232]:
# This box contains code for creating a corpus of poems. 

frost_poem = \
"""The Road Not Taken 
Robert Frost 
1874-1963

U.S., New, England
Time & Brevity, Nature, Landscapes & Pastorals, Living, Midlife, Fall

Rhymed Stanza

Two roads diverged in a yellow wood, 
And sorry I could not travel both 
And be one traveler, long I stood 
And looked down one as far as I could 
To where it bent in the undergrowth; 

Then took the other, as just as fair, 
And having perhaps the better claim, 
Because it was grassy and wanted wear; 
Though as for that the passing there 
Had worn them really about the same, 

And both that morning equally lay 
In leaves no step had trodden black. 
Oh, I kept the first for another day! 
Yet knowing how way leads on to way, 
I doubted if I should ever come back. 

I shall be telling this with a sigh 
Somewhere ages and ages hence: 
Two roads diverged in a wood, and I-- 
I took the one less traveled by, 
And that has made all the difference."""



class Poem:
    
    def __init__(self,poem):
        self.title = poem[0].strip()
        self.author = poem[1].strip()
        self.region = poem[4].strip()
        self.lifespan = poem[2].strip()
        self.topics = [topic.strip() for topic in poem[5].split(',')]
        self.type = poem[7]
        self.text = "".join(poem[9:])
        self.clean_text = ""
        
        
    def clean(self):
        # Get rid of line breaks. 
        x = self.text.split("\n")
        x = [i.strip() for i in x]
        self.clean_text = " ".join(x)
        
        # Lower case.
        self.clean_text = self.clean_text.lower()
        
        # Remove punctuations
        punctuations='.:,;!()?\'"-=[]'
        for p in punctuations:
            self.clean_text = self.clean_text.replace(p,'')
            
        # Remove spaces. 
        while "  " in self.clean_text:
            self.clean_text = self.clean_text.replace("  "," ")
    
    def lemmatize(self):
        
        # Lemmatize.
        from nltk.stem import WordNetLemmatizer
        wordnet_lemmatizer = WordNetLemmatizer()
        temp=""
        for word in self.clean_text.split():
            temp=temp+" "+wordnet_lemmatizer.lemmatize(word)
        
        self.clean_text = temp.strip() # get rid of white space. 
        
    def stem(self):
        from nltk.stem.porter import PorterStemmer
        porter = PorterStemmer()
        temp=""
        for word in self.clean_text.split():
            temp=temp+" "+porter.stem(word)
        
        self.clean_text = temp.strip() # get rid of white space. 
        
        
        
 
import os

# Load the corpus of poems from a subdirectory. 
poetry_corpus = []
for filename in os.listdir('./poems'):
    if filename[-3:] == 'txt':
        f = open('./poems/'+filename)
        poem = Poem(f.readlines())
        f.close()
        poetry_corpus.append(poem)
 
# Preprocess the text
for poem in poetry_corpus:
    poem.clean()
    temp = poem.clean_text
    poem.lemmatize()
    poem.stem()
    #if temp!=poem.clean_text:
    #    print temp[0:30]
    #    print poem.clean_text[0:30]+"\n"
print poetry_corpus[10].text
poetry_corpus[10].clean()    
print poetry_corpus[10].clean_text

"Hope" is the thing with feathers - 
That perches in the soul - 
And sings the tune without the words - 
And never stops - at all - 

And sweetest - in the Gale - is heard - 
And sore must be the storm - 
That could abash the little Bird 
That kept so many warm - 

I've heard it in the chillest land - 
And on the strangest Sea - 
Yet - never - in Extremity, 
It asked a crumb - of me.

hope is the thing with feathers that perches in the soul and sings the tune without the words and never stops at all and sweetest in the gale is heard and sore must be the storm that could abash the little bird that kept so many warm ive heard it in the chillest land and on the strangest sea yet never in extremity it asked a crumb of me 


In [54]:
# Remove stop words
from nltk.corpus import stopwords
#stopwords.words('english')

## Bag of words and cosine similarity. 
### Finding the poem most like Hope is the Thing with Feathers.

In [233]:
for poem in poetry_corpus:
    poem.clean()
    poem.lemmatize()
    #poem.stem()

documents = []
vocabulary = set()
for poem in poetry_corpus:
    documents.append(poem.clean_text)
    for word in poem.clean_text.split():
        vocabulary.add(word)    

count_vectorizer = CountVectorizer(stop_words=stopwords.words('english'),
                                  vocabulary=vocabulary)



In [234]:


# gives us a matrix with rows being the keys of words.
count_vectorizer.fit(documents)
vectorized_corpus = count_vectorizer.transform(documents).todense()

# gives us a dictionary with keys as words and values as keys.
#print count_vectorizer.vocabulary_

print vectorized_corpus

[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]


In [235]:
roads_index = 10

search_results=list()
for i,docvec in enumerate(vectorized_corpus):
    sim = cosine_similarity(vectorized_corpus[roads_index],docvec)
    search_results.append([i,sim])
    
search_results = sorted(search_results,key=lambda x:-x[1])
print search_results[1]
print poetry_corpus[search_results[1][0]].text



[3138, array([[ 0.24452905]])]
Three lovely notes he whistled, too soft to be heard
 If others sang; but others never sang
 In the great beech-wood all that May and June.
 No one saw him: I alone could hear him
 Though many listened. Was it but four years
 Ago? or five? He never came again.
  
 Oftenest when I heard him I was alone,
 Nor could I ever make another hear.
 La-la-la! he called, seeming far-off--
 As if a cock crowed past the edge of the world,
 As if the bird or I were in a dream.
 Yet that he travelled through the trees and sometimes
 Neared me, was plain, though somehow distant still
 He sounded. All the proof is--I told men
 What I had heard.
  
                                    I never knew a voice,
 Man, beast, or bird, better than this. I told
 The naturalists; but neither had they heard
 Anything like the notes that did so haunt me,
 I had them clear by heart and have them still.
 Four years, or five, have made no difference. Then
 As now that La-la-la! was bodile

## TF-IDF and cosine similarity. Finding the poem most like Two Roads Diverged in Yellow Wood.

In [236]:


tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'),
                                  vocabulary=vocabulary)
tfidf_matrix = tfidf_vectorizer.fit_transform(documents).todense()

In [239]:

roads_index = 10
search_results=list()
for i,docvec in enumerate(tfidf_matrix):
    sim = cosine_similarity(tfidf_matrix[roads_index],docvec)
    search_results.append([i,sim])
    
search_results = sorted(search_results,key=lambda x:-x[1])
print search_results[1]
print poetry_corpus[search_results[1][0]].text

[3267, array([[ 0.14775207]])]
The palm at the end of the mind,
Beyond the last thought, rises
In the bronze decor,

A gold-feathered bird
Sings in the palm, without human meaning,
Without human feeling, a foreign song.

You know then that it is not the reason 
That makes us happy or unhappy. 
The bird sings. Its feathers shine.

The palm stands on the edge of space. 
The wind moves slowly in the branches. 
The bird's fire-fangled feathers dangle down.



In [247]:
# Compare the best matching poem with the worst matching poem.
print poetry_corpus[search_results[-1][0]].text

Father, where do the wild swans go?
         Far, far. Ceaselessly winging,
         Their necks outstraining, they haste them singing
         Far, far. Whither, none may know.

Father, where do the cloud-ships go?
         Far, far. The winds pursue them,
         And over the shining heaven strew them
         Far, far. Whither, none may know.

Father, where do the days all go?
         Far, far. Each runs and races--
         No one can catch them, they leave no traces--
         Far, far. Whither, none may know.

But father, we--where do we then go?
         Far, far. Our dim eyes veiling,
         With bended head we go sighing, wailing
         Far, far. Whither none may know.



## Generating poetry with n-grams

In order to generate poetry using an n gram we first have to count the number of times certain combinations of words occur and then find the one the one which maximizes the probability. We should use the cleaned version of the text to do this. 



In [96]:
for poem in poetry_corpus:
    poem.clean()


In [97]:
def ngram(n):
    words = list()
    for poem in poetry_corpus:
        x = poem.clean_text.split()
        for i in range(len(x)-(n-1)):
            if '' not in x[i:i+n]:
                words.append(",".join(x[i:i+n]))

    return Counter(words)

bigram = ngram(2)
trigram = ngram(3)
fourgram = ngram(4)
bigram.most_common(5)
    

[('of,the', 4297),
 ('in,the', 4046),
 ('and,the', 2870),
 ('to,the', 2208),
 ('on,the', 1922)]

Now that we have a dictionary with counts in them and bigrams, we can use this to generate poetry. We seed it with the word "the" and then choose the word combo that has highest probability.

In [98]:
import random

def most_likely_word(seed,gram,n):
    """ input: list, string, int"""
    best = -1
    best_word=""
    for i in gram:
        if i.split(',')[0:n-1]==seed:
            if gram[i]>best:
                best = gram[i]
                best_word = i.split(',')[-1]
    if best == -1:
        n=len(gram.keys())
        rand = random.randint(0,n)
        m=0
        for i in gram:
            if m == rand:
                return i.split(',')[-1]
            m += 1
        
    return best_word
            

In [99]:
print "the"
print most_likely_word(['the'],bigram,2)
print most_likely_word(['the','world'],trigram,3)

the
world
and


In [100]:
def generate_poem(seed,length,gram):
    """input: list,int,Counter"""
    n = len(seed)+1
    poem = seed
    for i in xrange(length):
        poem.append(most_likely_word(poem[1-n:],gram,n))
    return " ".join(poem)


In [101]:
generate_poem(['the'],30,bigram)

'the world and the world and the world and the world and the world and the world and the world and the world and the world and the world and the'

In [102]:
generate_poem(['the','world'],30,trigram)

'the world and all the world and all the world and all the world and all the world and all the world and all the world and all the world and all'

In [103]:
generate_poem(['the','world','is'],40,fourgram)

'the world is all his own mischance mute with a glassy countenance did she look to camelot and as the bird it left no trace in the heaven of your face in your stupidity i found the sweet hush after a sweet sound'

In [104]:
incorpus = False
for poem in poetry_corpus:
    if "in your stupidity i found the sweet hush after a sweet sound" in poem.clean_text:
        incorpus = True

print incorpus
    

True


Our model is over fit. It is just regenerating the same poetry that is in the text, which is not what we want. We want new poems. For fix that, we'll have to do something like randomize the output. 

In [114]:
def generate_random_poem(seed,length,grams):
    """input: list,int,Counter"""
    from random import randint
    n = len(seed)+1
    poem = seed
    for i in xrange(length):
        m = randint(1,(n-1)**2) # 4-grams more likley than 3-grams, etc...
        if m<=4:
            m=3
        elif m<=9:
            m=4
        poem.append(most_likely_word(poem[1-m:],grams[m-2],m))
    return " ".join(poem)

In [225]:
generate_random_poem(['the','world','is'],50,[bigram,trigram,fourgram])

'the world is all i can see the face of the sea and the sky and the sun and the rain a weary heart went thankful to rest and what the waur am i gin a body kiss a body meet a body meet a body meet a body comin thro the rye'

### Naive Bayes classifier. 
 

In [128]:
import numpy as np
X = np.random.randint(5, size=(6, 100))
y = np.array([1, 2, 3, 4, 5, 6])
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X, y)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
print(clf.predict(X[2:3]))

[3]


97


[('Living', 1276),
 ('Nature', 921),
 ('Relationships', 901),
 ('Love', 817),
 ('Social Commentaries', 784),
 ('Arts & Sciences', 620),
 ('Religion', 515),
 ('Death', 462),
 ('Time & Brevity', 361),
 ('Romantic Love', 322)]

array([ 0.,  0.,  0.,  0.,  0.])

Naive Bayes' can help us predict a category. The poem types are not mutually exclusive and are therefor not categories. A poem may have multiple types. Hence we cannot use Naive Bayes to predict the type of a poem. What we can do though, is predict whether a poem is a particular type or not. Say, if we wanted to tell if a poem was about nature. 

In [174]:
topic_word_count = {"Nature":{},"Not Nature":{}}
for poem in poetry_corpus:
    if 'Nature' in poem.topics:
        topic="Nature"
    else:
        topic="Not Nature"
        
    #count the words
    words = poem.clean_text.split()
    for word in words:
        if word in topic_word_count[topic]:
            topic_word_count[topic][word]+=1
        else:
            topic_word_count[topic][word]=1

topic_count={}
for topic in topic_word_count:
    count=0
    for word in topic_word_count[topic]:
        count+=topic_word_count[topic][word]
    topic_count[topic]=count
     
total_count = 0
for topic in topic_count:
    total_count += topic_count[topic]
    
import numpy as np
def get_nb(doc,topic):
    score=np.log(1.0 * topic_count[topic]/total_count) #the base probability.
    for word in doc.split():
        if word in topic_word_count[topic]:
            score += np.log(1.0*topic_word_count[topic][word]/topic_count[topic])
        else:
            score += np.log(0.001)
    return score


        


In [204]:
def is_it_nature(doc):
    score = (get_nb(doc,"Nature")-get_nb(doc,"Not Nature"))
    if score > 0:
        print "Yes", np.exp(score)
    else:
        print "No", np.exp(score)
        
is_it_nature("orange")

Yes 2.66666666667
