In [1]:
import re

# Toy corpus: three book titles. The first two cover a similar subject,
# the third one is about something else.
docs = [
    "Natural Language Processing with Python",
    "Handbook of Natural Language Processing",
    "Learning IPython for Interactive Computing and Data Visualization",
]

# Tokenize each title into lowercase word tokens.
tokenized_documents = []
for title in docs:
    tokenized_documents.append(re.findall(r'\w+', title.lower()))
tokenized_documents

[['natural', 'language', 'processing', 'with', 'python'],
 ['handbook', 'of', 'natural', 'language', 'processing'],
 ['learning',
  'ipython',
  'for',
  'interactive',
  'computing',
  'and',
  'data',
  'visualization']]

In [2]:
# The lexicon is the alphabetically sorted set of distinct tokens found in
# any document. set().union(*...) merges the token lists directly instead
# of concatenating them pairwise with sum(..., []), which copies the
# growing list on every step.
lexicon = sorted(set().union(*tokenized_documents))

# our lexicon or vocabulary looks like this.

lexicon

['and',
 'computing',
 'data',
 'for',
 'handbook',
 'interactive',
 'ipython',
 'language',
 'learning',
 'natural',
 'of',
 'processing',
 'python',
 'visualization',
 'with']

In [3]:
from collections import OrderedDict

# One zero-valued slot per lexicon term, kept in lexicon order.
vector_template = OrderedDict.fromkeys(lexicon, 0)

# This is what the empty vector template looks like.

vector_template

OrderedDict([('and', 0),
             ('computing', 0),
             ('data', 0),
             ('for', 0),
             ('handbook', 0),
             ('interactive', 0),
             ('ipython', 0),
             ('language', 0),
             ('learning', 0),
             ('natural', 0),
             ('of', 0),
             ('processing', 0),
             ('python', 0),
             ('visualization', 0),
             ('with', 0)])

In [4]:
import copy
from collections import Counter

# Number of documents each token appears in. Computed once up front so the
# corpus is not rescanned for every token of every document; set() makes a
# token count at most once per document.
docs_containing = Counter(
    token for _doc_tokens in tokenized_documents for token in set(_doc_tokens)
)

doc_tfidf_vectors = []
for doc_tokens in tokenized_documents:
    vec = copy.copy(vector_template)
    token_counts = Counter(doc_tokens)
    for key, value in token_counts.items():
        # The count is divided by the vocabulary size, which is the same
        # constant for every document. It only rescales each vector, and a
        # constant scale factor does not change a cosine similarity.
        tf = value / len(lexicon)
        # Every key comes from a document of this corpus, so its document
        # count is at least 1 and the division is safe.
        idf = len(tokenized_documents) / docs_containing[key]
        vec[key] = tf * idf
    doc_tfidf_vectors.append(vec)

# and our document vectors are the following

doc_tfidf_vectors

[OrderedDict([('and', 0),
              ('computing', 0),
              ('data', 0),
              ('for', 0),
              ('handbook', 0),
              ('interactive', 0),
              ('ipython', 0),
              ('language', 0.1),
              ('learning', 0),
              ('natural', 0.1),
              ('of', 0),
              ('processing', 0.1),
              ('python', 0.2),
              ('visualization', 0),
              ('with', 0.2)]),
 OrderedDict([('and', 0),
              ('computing', 0),
              ('data', 0),
              ('for', 0),
              ('handbook', 0.2),
              ('interactive', 0),
              ('ipython', 0),
              ('language', 0.1),
              ('learning', 0),
              ('natural', 0.1),
              ('of', 0.2),
              ('processing', 0.1),
              ('python', 0),
              ('visualization', 0),
              ('with', 0)]),
 OrderedDict([('and', 0.2),
              ('computing', 0.2),
              ('da

In [5]:
import math

def cosine_sim(vec1, vec2):
    """Return the cosine similarity of two term-weight vectors.

    Both arguments are mappings whose values are the vector components.
    Components are paired by position, so the two mappings are expected to
    list the same terms in the same order.

    Returns 0.0 when either vector has zero magnitude (for example a
    vector whose components are all zero) instead of raising
    ZeroDivisionError.
    """
    vec1 = list(vec1.values())
    vec2 = list(vec2.values())
    dot_prod = sum(a * b for a, b in zip(vec1, vec2))
    mag_1 = math.sqrt(sum(x**2 for x in vec1))
    mag_2 = math.sqrt(sum(x**2 for x in vec2))
    denominator = mag_1 * mag_2
    if denominator == 0:
        # A zero vector has no direction; treat it as similar to nothing.
        return 0.0
    return dot_prod / denominator

# Let's compare the first document to the other two. As expected, the
# first and second documents have some similarity, while the first and
# third have 0 similarity since they don't share any words.

for other_index in (1, 2):
    print(cosine_sim(doc_tfidf_vectors[0], doc_tfidf_vectors[other_index]))

0.2727272727272727
0.0


In [6]:
# Pairwise similarity table: the cell at row r, column c is the cosine
# similarity of document r and document c, rounded to two decimals.
# The header is built from the corpus size rather than hard-coded.
print("", *range(len(doc_tfidf_vectors)), sep='\t')
for r, doc1 in enumerate(doc_tfidf_vectors):
    row = [round(cosine_sim(doc1, doc2), 2) for doc2 in doc_tfidf_vectors]
    # Each cell, including the last one, is followed by a tab.
    print(r, *row, sep='\t', end='\t\n')

# And let's compare every document to the other documents
# in our toy corpus.

	0	1	2
0	1.0	0.27	0.0	
1	0.27	1.0	0.0	
2	0.0	0.0	1.0	


In [7]:
# Finally, let's query the model with a new document (another book title).
query = "IPython Interactive Computing and Visualization Cookbook"
tokens = re.findall(r'\w+', query.lower())
token_counts = Counter(tokens)
query_vec = copy.copy(vector_template)

for term, count in token_counts.items():
    docs_containing_key = sum(1 for _doc in tokenized_documents if term in _doc)
    # Terms that occur in no corpus document keep no weight in the vector.
    if docs_containing_key > 0:
        tf = count / len(tokens)  # normalized by the query's token count
        idf = len(tokenized_documents) / docs_containing_key
        query_vec[term] = tf * idf

# Unsurprisingly, this new book title is only similar
# to the last document in our collection.

for doc_index in (0, 1, 2):
    print(cosine_sim(query_vec, doc_tfidf_vectors[doc_index]))

0.0
0.0
0.7905694150420947
