In [12]:
import numpy
import scipy
import gensim

[Way to install gensim](https://stackoverflow.com/questions/38739250/how-to-install-gensim-on-windows)

In [15]:
# List the names exposed by the gensim package.
gensim_names = dir(gensim)
print(gensim_names)

['__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '__version__', 'corpora', 'interfaces', 'logger', 'logging', 'matutils', 'models', 'parsing', 'scripts', 'similarities', 'summarization', 'topic_coherence', 'utils']


In [20]:
# Toy corpus: each string is one document.
raw_documents = [
    "I'm taking the show on the road.",
    "My socks are a force multiplier.",
    "I am the barber who cuts everyone's hair who doesn't cut their own.",
    "Legend has it that the mind is a mad monkey.",
    "I make my own fun.",
]
num_documents = len(raw_documents)
print("Number of documents:", num_documents)

Number of documents: 5


In [32]:
from nltk.tokenize import word_tokenize
# Tokenize each raw document and lower-case every resulting token.
gen_docs = [list(map(str.lower, word_tokenize(text)))
            for text in raw_documents]
print(gen_docs)

[['i', "'m", 'taking', 'the', 'show', 'on', 'the', 'road', '.'], ['my', 'socks', 'are', 'a', 'force', 'multiplier', '.'], ['i', 'am', 'the', 'barber', 'who', 'cuts', 'everyone', "'s", 'hair', 'who', 'does', "n't", 'cut', 'their', 'own', '.'], ['legend', 'has', 'it', 'that', 'the', 'mind', 'is', 'a', 'mad', 'monkey', '.'], ['i', 'make', 'my', 'own', 'fun', '.']]


* **We will create a dictionary from the list of tokenized documents.**
* **A dictionary maps every unique token (words and punctuation alike) to an integer id.**

In [47]:
# Build the token <-> integer-id mapping from the tokenized documents.
dictionary = gensim.corpora.Dictionary(gen_docs)

# Look-ups work in both directions: id -> token, and token -> id via token2id.
print(dictionary[5])
print(dictionary.token2id['show'])

vocab_size = len(dictionary)
print('Number of words in dictionary:', vocab_size)

# Dump the whole vocabulary, one "id token" pair per line.
for token_id in range(vocab_size):
    print(token_id, dictionary[token_id])

show
5
Number of words in dictionary: 36
0 'm
1 .
2 i
3 on
4 road
5 show
6 taking
7 the
8 a
9 are
10 force
11 multiplier
12 my
13 socks
14 's
15 am
16 barber
17 cut
18 cuts
19 does
20 everyone
21 hair
22 n't
23 own
24 their
25 who
26 has
27 is
28 it
29 legend
30 mad
31 mind
32 monkey
33 that
34 fun
35 make


**Now we will create a corpus. A corpus is a list of bags of words. A bag-of-words representation for a document just lists the number of times each word occurs in the document.**

In [63]:
# doc2bow converts one tokenized document into its bag-of-words form.
# Each document becomes a list of (token id, count) tuples:
#   - the first number is the id of the token in the dictionary,
#   - the second number is how many times it occurs in that document.
corpus = list(map(dictionary.doc2bow, gen_docs))
print(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 2)], [(1, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1)], [(1, 1), (2, 1), (7, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 2)], [(1, 1), (7, 1), (8, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1)], [(1, 1), (2, 1), (12, 1), (23, 1), (34, 1), (35, 1)]]


**Now we create a tf-idf model from the corpus.**

Learn more about [tf-idf](http://www.tfidf.com/)

In [65]:
# Fit a tf-idf model on the bag-of-words corpus.
tf_idf = gensim.models.TfidfModel(corpus)
print(tf_idf)

# num_nnz in the repr above is the number of non-zero entries, i.e. the
# (token id, count) pairs summed over all documents -- not the raw token
# count. Recompute it by hand as a cross-check.
s = sum(len(bow) for bow in corpus)

print(s)

TfidfModel(num_docs=5, num_nnz=47)
47


**Now we will create a similarity measure object in tf-idf space.**

In [79]:
# Applying the model to the whole corpus does not display the weighted
# vectors; the expression evaluates to a TransformedCorpus wrapper object.
tf_idf[corpus]

<gensim.interfaces.TransformedCorpus at 0x712d87aac8>

In [97]:
# Build a similarity index over the tf-idf vectors of the corpus.
# output_prefix is the filename prefix used when index shards are written to
# disk. Passing None lets gensim choose a temporary-file prefix; the previous
# value, an empty list, is not a valid prefix and would produce shard files
# literally named "[].0" in the working directory.
# num_features is the vocabulary size, i.e. the width of every vector.
sims = gensim.similarities.Similarity(
    output_prefix=None,
    corpus=tf_idf[corpus],
    num_features=len(dictionary),
)
print(sims)

Similarity index with 5 documents in 0 shards (stored under [])


**Now we create a query document and convert it to tf-idf.**

In [98]:
# Tokenize and lower-case the query the same way the corpus documents were.
query_text = "Socks are a force for good."
query_doc = [token.lower() for token in word_tokenize(query_text)]
print(query_doc)

# Map the query tokens onto the existing dictionary ids.
query_doc_bow = dictionary.doc2bow(query_doc)
print(query_doc_bow)

# Re-weight the bag-of-words counts with the fitted tf-idf model.
query_doc_tf_idf = tf_idf[query_doc_bow]
print(query_doc_tf_idf)

['socks', 'are', 'a', 'force', 'for', 'good', '.']
[(1, 1), (8, 1), (9, 1), (10, 1), (13, 1)]
[(8, 0.31226270667960454), (9, 0.5484803253891997), (10, 0.5484803253891997), (13, 0.5484803253891997)]


In [106]:
doc = "Socks are a force for good."
# Tokenize the query with word_tokenize, exactly as the corpus documents were
# tokenized when the dictionary was built. A plain split(' ') leaves
# punctuation glued to words ("good."), so such tokens could never match a
# dictionary entry.
vec_bow = dictionary.doc2bow([token.lower() for token in word_tokenize(doc)])
# Re-weight the bag-of-words counts with the fitted tf-idf model.
vec_tf_idf = tf_idf[vec_bow]
vec_tf_idf

[(8, 0.31226270667960454),
 (9, 0.5484803253891997),
 (10, 0.5484803253891997),
 (13, 0.5484803253891997)]

In [108]:
# Build a similarity index over the tf-idf corpus.
# num_features is given explicitly (the dictionary size) so the index width
# always matches the vocabulary instead of being inferred by scanning the
# corpus, and so it agrees with how the Similarity index is built.
index = gensim.similarities.MatrixSimilarity(tf_idf[corpus],
                                             num_features=len(dictionary))

In [110]:
# Query the index with the tf-idf vector of the query document; the result
# holds one similarity score per corpus document, in corpus order.
# NOTE(review): this rebinds `sims`, which an earlier cell used for the
# Similarity index object.
sims = index[vec_tf_idf]
sims

array([0.        , 0.84565616, 0.        , 0.06124881, 0.        ],
      dtype=float32)

In [111]:
# Rank the corpus documents from most to least similar to the query, as
# (document index, score) pairs.
# The scores are recomputed from the index rather than read back from `sims`,
# so this cell gives the same result however many times it is run; sorting
# `sims` into itself fails on a second run because it then holds tuples.
sims = sorted(enumerate(index[vec_tf_idf]),
              key=lambda item: item[1],
              reverse=True)
print(sims)

[(1, 0.84565616), (3, 0.061248805), (0, 0.0), (2, 0.0), (4, 0.0)]


In [112]:
# Show the original documents again so the document indices in the ranking
# can be matched to their text.
raw_documents

["I'm taking the show on the road.",
 'My socks are a force multiplier.',
 "I am the barber who cuts everyone's hair who doesn't cut their own.",
 'Legend has it that the mind is a mad monkey.',
 'I make my own fun.']