In [2]:
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np

In [3]:
# COMPILE DOCUMENTS

In [4]:
# Reference document: the comparison cells below measure other documents against this one.
doc1 = "the cat in the hat"

In [5]:
# Same as doc1 except for the final word ('tree' instead of 'hat').
doc2 = "the cat in the tree"

In [6]:
# Shares 'the', 'cat' and 'hat' with doc1 but adds 'ate' and 'my'.
doc3 = "the cat ate my hat"

In [7]:
# The doc1 sentence repeated three times: same word proportions, three times the counts.
doc4 = ' '.join(['the cat in the hat'] * 3)

In [8]:
# The corpus, in a fixed order: tf_matrix below stacks the document vectors in this same order.
documents = [doc1, doc2, doc3, doc4]

In [9]:
# FEATURIZE DOCUMENTS

In [24]:

# The vocabulary is every distinct space-delimited token in the corpus, sorted
# so that each word maps to a stable feature (column) index.
vocabulary = sorted({token
                     for text in documents
                     for token in text.split(' ')})

In [10]:
# Preview the sorted tokens of a single document (doc1). The original cell
# referenced `document`, which is not defined at this point on a fresh kernel.
sorted(doc1.split(' '))

In [25]:
# Each vocabulary entry is one feature; its list position is the column index in the vectors below.
print 'Vocabulary (features):',vocabulary

Vocabulary (features): ['ate', 'cat', 'hat', 'in', 'my', 'the', 'tree']


In [26]:
def vectorize(doc, vocabulary):
    """Turn a document into a bag-of-words count vector.

    Parameters
    ----------
    doc : str
        Document text; tokens are obtained with ``doc.split(' ')``.
    vocabulary : sequence of str
        Feature words; position ``i`` of the result counts ``vocabulary[i]``.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``len(vocabulary)`` holding, for each
        vocabulary word, how many times it occurs in ``doc``. Tokens that are
        not in ``vocabulary`` are ignored.
    """
    token_counts = Counter(doc.split(' '))
    # Counter yields 0 for words it has not seen, so no membership test is needed.
    return np.array([token_counts[term] for term in vocabulary], dtype=float)

In [27]:
# Vectorize every document against the shared vocabulary; the individual
# vectors are kept under their own names because later cells compare them.
(doc1_vectorized,
 doc2_vectorized,
 doc3_vectorized,
 doc4_vectorized) = (vectorize(text, vocabulary)
                     for text in (doc1, doc2, doc3, doc4))

# Term-frequency matrix: one row per document, one column per vocabulary word.
tf_matrix = np.vstack([doc1_vectorized,
                       doc2_vectorized,
                       doc3_vectorized,
                       doc4_vectorized])

In [28]:
print 'features:',vocabulary
print '"%s":'%doc1, tf_matrix[0]
print '"%s":'%doc2, tf_matrix[1]
print '"%s":'%doc3, tf_matrix[2]
print '"%s":\n'%doc4, '    ', tf_matrix[3]
print
print 'feature matrix:'
print tf_matrix

features: ['ate', 'cat', 'hat', 'in', 'my', 'the', 'tree']
"the cat in the hat": [ 0.  1.  1.  1.  0.  2.  0.]
"the cat in the tree": [ 0.  1.  0.  1.  0.  2.  1.]
"the cat ate my hat": [ 1.  1.  1.  0.  1.  1.  0.]
"the cat in the hat the cat in the hat the cat in the hat":
     [ 0.  3.  3.  3.  0.  6.  0.]

feature matrix:
[[ 0.  1.  1.  1.  0.  2.  0.]
 [ 0.  1.  0.  1.  0.  2.  1.]
 [ 1.  1.  1.  0.  1.  1.  0.]
 [ 0.  3.  3.  3.  0.  6.  0.]]


In [29]:
# sklearn can do this for you.
# The hand-built vocabulary is passed in so the columns line up with vectorize().
count_vectorizer = CountVectorizer(stop_words=None,
                                   vocabulary=vocabulary)
# toarray() returns a plain ndarray; todense() returns np.matrix, which NumPy
# no longer recommends and which behaves differently under indexing and `*`.
feature_matrix = count_vectorizer.fit_transform([doc1]).toarray()

In [30]:
print 'Vectorize:',doc1
print 'sklearn result',feature_matrix
print 'our result',vectorize(doc1, vocabulary)
print
print 'feature matrix'
print count_vectorizer.fit_transform(documents).todense()

Vectorize: the cat in the hat
sklearn result [[0 1 1 1 0 2 0]]
our result [ 0.  1.  1.  1.  0.  2.  0.]

feature matrix
[[0 1 1 1 0 2 0]
 [0 1 0 1 0 2 1]
 [1 1 1 0 1 1 0]
 [0 3 3 3 0 6 0]]


In [31]:
# COMPARE DOCUMENT FEATURES

In [32]:
# EUCLIDEAN DISTANCE COMPARISON
print 'Compare "%s" \nwith "%s"'%(doc1, doc2)
print euclidean_distances(doc1_vectorized, doc2_vectorized)

Compare "the cat in the hat" 
with "the cat in the tree"
[[ 1.41421356]]


In [33]:
print 'Compare "%s" \nwith "%s"'%(doc1, doc3)
print euclidean_distances(doc1_vectorized, doc3_vectorized)

Compare "the cat in the hat" 
with "the cat ate my hat"
[[ 2.]]


In [34]:
print 'Compare "%s" \nwith "%s"'%(doc1, doc4)
print euclidean_distances(doc1_vectorized, doc4_vectorized)

Compare "the cat in the hat" 
with "the cat in the hat the cat in the hat the cat in the hat"
[[ 5.29150262]]


In [35]:
# COSINE SIMILARITY COMPARISON

In [36]:
print 'Compare "%s" \nwith "%s"'%(doc1, doc4)
print cosine_similarity(doc1_vectorized, doc4_vectorized)

Compare "the cat in the hat" 
with "the cat in the hat the cat in the hat the cat in the hat"
[[ 1.]]


In [37]:
print 'Compare "%s" \nwith "%s"'%(doc1, doc3)
print cosine_similarity(doc1_vectorized, doc3_vectorized)

Compare "the cat in the hat" 
with "the cat ate my hat"
[[ 0.6761234]]


In [38]:
print 'Compare "%s" \nwith "%s"'%(doc1, doc2)
print cosine_similarity(doc1_vectorized, doc2_vectorized)

Compare "the cat in the hat" 
with "the cat in the tree"
[[ 0.85714286]]


In [39]:
# SEARCH ENGINE

In [40]:
# Free-text search query; 'what' is not one of the vocabulary words, so it
# contributes nothing to the query vector.
query = "the cat ate what?"

In [41]:
query_vectorized = count_vectorizer.fit_transform([query]).todense()
print "Query:",query
print "Vectorized query:",query_vectorized

Query: the cat ate what?
Vectorized query: [[1 1 0 0 0 1 0]]


In [42]:
for doc, tf_doc in zip(documents, tf_matrix):
    print doc, cosine_similarity(query_vectorized, tf_doc)

the cat in the hat [[ 0.65465367]]
the cat in the tree [[ 0.65465367]]
the cat ate my hat [[ 0.77459667]]
the cat in the hat the cat in the hat the cat in the hat [[ 0.65465367]]


In [43]:
def tfidf(total_frequencies): # total_frequencies is a bag of words feature matrix
    """Re-weight a term-frequency matrix by inverse document frequency.

    Each column j is scaled by ``idf[j] = log(N / df[j])`` where ``N`` is the
    number of documents (rows) and ``df[j]`` is the number of rows in which
    term j has a count greater than zero.

    Parameters
    ----------
    total_frequencies : array-like, shape (n_documents, n_terms)
        Bag-of-words count matrix, one row per document.

    Returns
    -------
    numpy.ndarray, shape (n_documents, n_terms)
        ``total_frequencies * idf``. A term that occurs in no document gets an
        idf of 0 (instead of dividing by zero), so its column stays all-zero
        rather than turning into nan.
    """
    total_frequencies = np.asarray(total_frequencies)
    number_of_documents = total_frequencies.shape[0]
    # Document frequency must come from the matrix that was passed in, not
    # from a notebook-level global, or the function silently mis-weights (or
    # fails on) any other input.
    number_of_documents_with_term = (total_frequencies > 0).sum(axis=0)
    # Clamp the divisor to 1 so never-seen terms do not divide by zero; their
    # idf is then forced to 0 by the np.where mask.
    safe_divisor = np.maximum(number_of_documents_with_term, 1)
    idf = np.where(number_of_documents_with_term > 0,
                   np.log(float(number_of_documents) / safe_divisor),
                   0.0)
    return total_frequencies * idf

In [44]:
print "TF converted to TFIDF"
tfidf_matrix = tfidf(tf_matrix)
for index, document in enumerate(documents):
    print document
    for word, tf, tfidf_ in zip(vocabulary,
                                tf_matrix[index],
                                tfidf_matrix[index]):
        print word, tf, tfidf_
    print 


TF converted to TFIDF
the cat in the hat
ate 0.0 0.0
cat 1.0 0.0
hat 1.0 0.287682072452
in 1.0 0.287682072452
my 0.0 0.0
the 2.0 0.0
tree 0.0 0.0

the cat in the tree
ate 0.0 0.0
cat 1.0 0.0
hat 0.0 0.0
in 1.0 0.287682072452
my 0.0 0.0
the 2.0 0.0
tree 1.0 1.38629436112

the cat ate my hat
ate 1.0 1.38629436112
cat 1.0 0.0
hat 1.0 0.287682072452
in 0.0 0.0
my 1.0 1.38629436112
the 1.0 0.0
tree 0.0 0.0

the cat in the hat the cat in the hat the cat in the hat
ate 0.0 0.0
cat 3.0 0.0
hat 3.0 0.863046217355
in 3.0 0.863046217355
my 0.0 0.0
the 6.0 0.0
tree 0.0 0.0



In [45]:
# scikit-learn's TF-IDF, restricted to the same hand-built vocabulary so the
# columns line up with tf_matrix.
tfidf_vectorizer = TfidfVectorizer(stop_words=None,
                                   vocabulary=vocabulary)
# NOTE: this rebinds tfidf_matrix, replacing the result of the hand-rolled
# tfidf() computed earlier. The values are not expected to match it, because
# TfidfVectorizer applies its own idf smoothing and row normalization by
# default (see its documentation).
# toarray() yields a plain ndarray rather than np.matrix.
tfidf_matrix = tfidf_vectorizer.fit_transform(documents).toarray()

In [46]:
# COSINE SIMILARITY ON TF AND TFIDF
for index, document in enumerate(documents[1:]):
    print '"%s" compared with "%s"'%(documents[0], documents[index+1])
    print 'TF cosine similarity:', cosine_similarity(tf_matrix[0],
                                                     tf_matrix[index+1])
    print 'TF-IDF cosine similarity:', cosine_similarity(tfidf_matrix[0],
                                                         tfidf_matrix[index+1])
    print

"the cat in the hat" compared with "the cat in the tree"
TF cosine similarity: [[ 0.85714286]]
TF-IDF cosine similarity: [[ 0.72060317]]

"the cat in the hat" compared with "the cat ate my hat"
TF cosine similarity: [[ 0.6761234]]
TF-IDF cosine similarity: [[ 0.48303532]]

"the cat in the hat" compared with "the cat in the hat the cat in the hat the cat in the hat"
TF cosine similarity: [[ 1.]]
TF-IDF cosine similarity: [[ 1.]]

