In [1]:
import re

docs = ["The faster Harry got to the store, the faster and faster Harry would get home.",
        "Harry is hairy and faster than Jill.",
        "Jill is not as hairy as Harry."]

doc_tokens = [re.findall(r'\w+', d.lower()) for d in docs]
doc_tokens

[['the',
  'faster',
  'harry',
  'got',
  'to',
  'the',
  'store',
  'the',
  'faster',
  'and',
  'faster',
  'harry',
  'would',
  'get',
  'home'],
 ['harry', 'is', 'hairy', 'and', 'faster', 'than', 'jill'],
 ['jill', 'is', 'not', 'as', 'hairy', 'as', 'harry']]

In [2]:
lexicon = sorted(set(sum(doc_tokens, [])))
lexicon

['and',
 'as',
 'faster',
 'get',
 'got',
 'hairy',
 'harry',
 'home',
 'is',
 'jill',
 'not',
 'store',
 'than',
 'the',
 'to',
 'would']

In [3]:
from collections import OrderedDict

vector_template = OrderedDict((token, 0) for token in lexicon)
vector_template

OrderedDict([('and', 0),
             ('as', 0),
             ('faster', 0),
             ('get', 0),
             ('got', 0),
             ('hairy', 0),
             ('harry', 0),
             ('home', 0),
             ('is', 0),
             ('jill', 0),
             ('not', 0),
             ('store', 0),
             ('than', 0),
             ('the', 0),
             ('to', 0),
             ('would', 0)])

In [4]:
import copy
from collections import Counter

doc_tfidf_vectors = []
for doc in docs:
    vec = copy.copy(vector_template)
    tokens = re.findall(r'\w+', doc.lower())
    token_counts = Counter(tokens)
    for key, value in token_counts.items():
        docs_containing_key = 0
        for _doc in docs:
            if key in _doc:
                docs_containing_key += 1
        tf = value / len(lexicon)
        if docs_containing_key:
            idf = len(docs) / docs_containing_key
        else:
            idf = 0
        vec[key] = tf * idf
    doc_tfidf_vectors.append(vec)

doc_tfidf_vectors

[OrderedDict([('and', 0.09375),
              ('as', 0),
              ('faster', 0.28125),
              ('get', 0.1875),
              ('got', 0.1875),
              ('hairy', 0),
              ('harry', 0.0),
              ('home', 0.1875),
              ('is', 0),
              ('jill', 0),
              ('not', 0),
              ('store', 0.1875),
              ('than', 0),
              ('the', 0.5625),
              ('to', 0.1875),
              ('would', 0.1875)]),
 OrderedDict([('and', 0.09375),
              ('as', 0),
              ('faster', 0.09375),
              ('get', 0),
              ('got', 0),
              ('hairy', 0.09375),
              ('harry', 0.0),
              ('home', 0),
              ('is', 0.09375),
              ('jill', 0.0),
              ('not', 0),
              ('store', 0),
              ('than', 0.1875),
              ('the', 0),
              ('to', 0),
              ('would', 0)]),
 OrderedDict([('and', 0),
              ('as', 0.125),
     

In [5]:
import math

def cosine_sim(vec1, vec2):
    vec1 = [val for val in vec1.values()]
    vec2 = [val for val in vec2.values()]
    dot_prod = 0
    for i, v in enumerate(vec1):
        dot_prod += v * vec2[i]
    mag_1 = math.sqrt(sum([x**2 for x in vec1]))
    mag_2 = math.sqrt(sum([x**2 for x in vec2]))
    return dot_prod / (mag_1 * mag_2)

cosine_sim(doc_tfidf_vectors[2], doc_tfidf_vectors[0]), cosine_sim(doc_tfidf_vectors[2], doc_tfidf_vectors[1])

(0.0, 0.253546276418555)

In [6]:
print("\t0\t1\t2")
for r, doc1 in enumerate(doc_tfidf_vectors):
    print(r, end='\t')
    for c, doc2 in enumerate(doc_tfidf_vectors):
        print(round(cosine_sim(doc1, doc2), 2), end='\t')
    print()

	0	1	2
0	1.0	0.17	0.0	
1	0.17	1.0	0.25	
2	0.0	0.25	1.0	


In [10]:
query = "How long does it take to get to the store?"
query_vec = copy.copy(vector_template)
tokens = re.findall(r'\w+', query)
token_counts = Counter(tokens)

for key, value in token_counts.items():
    docs_containing_key = 0
    for _doc in docs:
        if key in _doc.lower():
            docs_containing_key += 1
    if docs_containing_key == 0:
        continue
    tf = value / len(tokens)
    idf = len(docs) / docs_containing_key
    query_vec[key] = tf * idf

print(cosine_sim(query_vec, doc_tfidf_vectors[0]))
print(cosine_sim(query_vec, doc_tfidf_vectors[1]))
print(cosine_sim(query_vec, doc_tfidf_vectors[2]))

0.6324555320336759
0.0
0.0
