In [1]:
import numpy as np
from scipy.spatial import distance

# Build the term -> index vocabulary for a collection of documents
def compileVocab(D):
    """Enumerate every distinct term that occurs in a collection of documents.

    Each document is tokenized by splitting on single space characters, and
    every previously unseen term is assigned the next free integer index
    (first new term -> 0, second -> 1, ...). This happens at indexing time.

    Args:
        D: iterable of document strings.

    Returns:
        dict mapping each term to its integer index, in order of first
        appearance.
    """
    term_ids = {}
    for doc in D:
        for term in doc.split(' '):
            # setdefault leaves already-indexed terms untouched, so the size of
            # the dict is always the next unused index.
            term_ids.setdefault(term, len(term_ids))
    return term_ids

# Turn a string of text into a TF vector. Also done at indexing time
def vectorize(txt, vocab):
    """Turn a string of text into a term-frequency (TF) vector.

    The text is tokenized by splitting on single space characters, the same
    way compileVocab tokenizes documents.

    Args:
        txt: the text to vectorize.
        vocab: dict mapping term -> integer index, as built by compileVocab.

    Returns:
        Integer numpy array of length len(vocab); entry i holds the number of
        occurrences in txt of the term whose index is i. Terms that are not in
        vocab are ignored, so text made up only of unknown terms yields an
        all-zero vector.
    """
    vec = np.zeros(len(vocab), dtype=int)
    for w in txt.split(' '):
        # Out-of-vocabulary terms (e.g. a query word that occurs in no indexed
        # document) have no dimension in this vector space; skip them instead
        # of raising KeyError.
        idx = vocab.get(w)
        if idx is not None:
            vec[idx] += 1
    return vec

# Rank a collection of documents relative to a query
def rank(Q, D):
    """Rank a collection of documents by cosine distance to a query.

    Query and documents are turned into term-frequency vectors over a shared
    vocabulary and compared with scipy's cosine distance.

    Args:
        Q: query string.
        D: iterable of document strings.

    Returns:
        List of (document, distance) tuples sorted by ascending distance, i.e.
        best match first. Documents with equal distance keep their input order
        because the sort is stable.
    """
    # Materialize once so D can be any iterable, not only a re-iterable list.
    docs = list(D)

    # Index the documents first and the query last: document terms keep the
    # indices they would get without the query, and query terms that occur in
    # no document get their own dimension instead of raising KeyError in
    # vectorize.
    vocab = compileVocab(docs + [Q])
    v_q = vectorize(Q, vocab)

    # Compute cosine distance between the query and every document
    ranking = [(d, distance.cosine(v_q, vectorize(d, vocab))) for d in docs]

    ranking.sort(key=lambda x: x[1])
    return ranking
        

In [2]:
# Demo: rank three short documents against a two-term query.
Q = 'french bulldog'
D = ['the french revolution was a period of upheaval in france', 
     'the french bulldog is a small breed of domestic dog', 
     'french is a very french language spoken by the french']
# rank() returns (document, cosine distance) pairs sorted by ascending distance.
print(rank(Q, D))

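# A smaller cosine distance means a better match. The document that repeats
# 'french' three times ranks first, ahead of the one that actually mentions
# 'french bulldog', because terms are weighted by raw term frequency (TF) only
# and not by TF-IDF.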

[('french is a very french language spoken by the french', 0.4696699141100894), ('the french bulldog is a small breed of domestic dog', 0.5527864045000421), ('the french revolution was a period of upheaval in france', 0.7763932022500211)]
