In [1]:
import warnings
import numpy as np
from gensim import corpora, models, similarities
warnings.filterwarnings('ignore')

# Create sample corpus and dictionary

In [2]:
# Sample documents: nine short titles used as the toy corpus for every model below.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

In [3]:
# Stop words: common function words that are removed from every document
# before the dictionary is built.
stoplist = {word for word in 'for a of the and to in'.split()}
print(stoplist)

{'in', 'and', 'for', 'to', 'the', 'of', 'a'}


In [4]:
# Tokenise every document: lower-case it, split on whitespace and keep only
# the words that are not in the stop list.
texts = [
    list(filter(lambda token: token not in stoplist, sentence.lower().split()))
    for sentence in documents
]
print(texts)

[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'], ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'management', 'system'], ['system', 'human', 'system', 'engineering', 'testing', 'eps'], ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'], ['generation', 'random', 'binary', 'unordered', 'trees'], ['intersection', 'graph', 'paths', 'trees'], ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'], ['graph', 'minors', 'survey']]


In [5]:
# Build the gensim Dictionary (word <--> integer id table) from the tokenised texts.
dictionary = corpora.Dictionary(texts)
# Show the dictionary summary first, then its token -> id mapping.
print(dictionary)
print(dictionary.token2id)

Dictionary(35 unique tokens: ['perceived', 'trees', 'paths', 'quasi', 'iv']...)
{'perceived': 17, 'trees': 22, 'paths': 28, 'quasi': 31, 'iv': 33, 'generation': 23, 'survey': 11, 'widths': 30, 'ordering': 29, 'user': 8, 'relation': 19, 'graph': 27, 'management': 14, 'system': 7, 'abc': 3, 'intersection': 26, 'interface': 1, 'human': 2, 'opinion': 10, 'computer': 4, 'lab': 6, 'minors': 32, 'engineering': 15, 'unordered': 21, 'binary': 25, 'measurement': 18, 'machine': 0, 'time': 9, 'random': 24, 'eps': 13, 'applications': 5, 'testing': 16, 'well': 34, 'response': 12, 'error': 20}


In [6]:
# Convert a new document to its (id, count) bag-of-words form using the dictionary.
print(dictionary.doc2bow("Human computer management".lower().split()))

[(2, 1), (4, 1), (14, 1)]


In [7]:
# Create the corpus: one (id, count) bag-of-words list per tokenised document.
corpus = [dictionary.doc2bow(text) for text in texts]
print(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)], [(4, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)], [(1, 1), (7, 1), (8, 1), (13, 1), (14, 1)], [(2, 1), (7, 2), (13, 1), (15, 1), (16, 1)], [(8, 1), (9, 1), (12, 1), (17, 1), (18, 1), (19, 1), (20, 1)], [(21, 1), (22, 1), (23, 1), (24, 1), (25, 1)], [(22, 1), (26, 1), (27, 1), (28, 1)], [(22, 1), (27, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1)], [(11, 1), (27, 1), (32, 1)]]


# Create TF-IDF model for weighted vectors

In [8]:
# Fit a TF-IDF model on the corpus; indexing the model with a bag-of-words
# vector returns that vector re-weighted by TF-IDF.
tfidf = models.TfidfModel(corpus) 
# Hand-written example vector: ids 0 and 1, each with a count of 1.
doc_bow = [(0, 1), (1, 1)]
tfidf[doc_bow]

[(0, 0.8251824121072071), (1, 0.5648663441460566)]

In [9]:
# Print the TF-IDF-weighted vector of every document in the corpus.
for x in tfidf[corpus]:
    print(x)

[(0, 0.4301019571350565), (1, 0.2944198962221451), (2, 0.2944198962221451), (3, 0.4301019571350565), (4, 0.2944198962221451), (5, 0.4301019571350565), (6, 0.4301019571350565)]
[(4, 0.3726494271826947), (7, 0.27219160459794917), (8, 0.27219160459794917), (9, 0.3726494271826947), (10, 0.5443832091958983), (11, 0.3726494271826947), (12, 0.3726494271826947)]
[(1, 0.438482464916089), (7, 0.32027755044706185), (8, 0.32027755044706185), (13, 0.438482464916089), (14, 0.6405551008941237)]
[(2, 0.3449874408519962), (7, 0.5039733231394895), (13, 0.3449874408519962), (15, 0.5039733231394895), (16, 0.5039733231394895)]
[(8, 0.21953536176370683), (9, 0.30055933182961736), (12, 0.30055933182961736), (17, 0.43907072352741366), (18, 0.43907072352741366), (19, 0.43907072352741366), (20, 0.43907072352741366)]
[(21, 0.48507125007266594), (22, 0.24253562503633297), (23, 0.48507125007266594), (24, 0.48507125007266594), (25, 0.48507125007266594)]
[(22, 0.31622776601683794), (26, 0.6324555320336759), (27, 0.3

# Topic model(LDA)

In [10]:
# Fit a 3-topic LDA model on the bag-of-words corpus.
# random_state pins the model's random initialisation so that a fresh
# "Restart & Run All" reproduces the same topics (the value itself is arbitrary).
lda = models.ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=3, random_state=0)
# Print every topic returned by show_topics().
for topic in lda.show_topics():
    print(topic)
# Print the result of applying the model to each document of the corpus.
for topics_per_document in lda[corpus]:
    print(topics_per_document)



(0, '0.061*system + 0.060*user + 0.060*trees + 0.060*survey + 0.060*graph + 0.046*computer + 0.046*interface + 0.035*response + 0.035*eps + 0.035*unordered')
(1, '0.063*system + 0.052*human + 0.037*minors + 0.037*trees + 0.036*graph + 0.036*user + 0.036*time + 0.036*iv + 0.036*eps + 0.036*ordering')
(2, '0.032*graph + 0.031*survey + 0.031*user + 0.031*system + 0.031*trees + 0.030*minors + 0.030*intersection + 0.029*time + 0.029*interface + 0.029*paths')
[(0, 0.056904655603499645), (1, 0.89963036822034737), (2, 0.043464976176153025)]
[(0, 0.91118353415640574), (1, 0.046230040182272072), (2, 0.042586425661322279)]
[(0, 0.88120725743272066), (1, 0.062026445816163726), (2, 0.056766296751115553)]
[(0, 0.052262479930604906), (1, 0.89911066481035962), (2, 0.048626855259035581)]
[(0, 0.046355471420264978), (1, 0.91082265877905577), (2, 0.042821869800679264)]
[(0, 0.8862337455431063), (1, 0.056774455085898269), (2, 0.0569917993709953)]
[(0, 0.86195035587696844), (1, 0.069693068829925237), (2, 0

In [11]:
# Project the first document of the corpus into the LDA topic space.
print(lda[corpus[0]])

[(0, 0.056922226933856707), (1, 0.89961278283157864), (2, 0.043464990234564641)]


## Similarity calculation by LDA

In [12]:
# Build a similarity index over the LDA representation of every document,
# then query it with the LDA representation of the first document.
index = similarities.MatrixSimilarity(lda[corpus])
sims = index[lda[corpus[0]]]
# Show (document position, similarity) pairs, including the query document itself.
print(list(enumerate(sims)))



[(0, 1.0), (1, 0.11569257), (2, 0.13581519), (3, 0.99997038), (4, 0.99992341), (5, 0.12950063), (6, 0.14657225), (7, 0.99981445), (8, 0.1776873)]


# Word2Vec

In [13]:
# Train a Word2Vec model (size=100) on the tokenised documents;
# min_count=1 is passed because every word in this tiny corpus is rare.
model = models.word2vec.Word2Vec(texts, size=100, min_count=1)
print(model)
# List what most_similar() returns for 'machine', one "word score" pair per line.
out = model.most_similar(positive=[u'machine'])
for word, score in out:
    print(word, score)

Word2Vec(vocab=35, size=100, alpha=0.025)
computer 0.24336141347885132
unordered 0.19552487134933472
trees 0.19459959864616394
time 0.16224011778831482
intersection 0.1320515275001526
perceived 0.12837675213813782
quasi 0.12015747278928757
opinion 0.11418915539979935
survey 0.09854245185852051
testing 0.09425602108240128


In [14]:
# Similarity between two words.
model.similarity('human', 'machine')

0.088144269635882011

In [15]:
# Word arithmetic: human + machine - management = ...?
# topn=1 asks for a single result.
model.most_similar(positive=['human', 'machine'], negative=['management'], topn=1)

[('response', 0.2094278335571289)]

In [16]:
# Show the vector representation of a single word.
model['human']

array([  4.03621560e-03,  -1.12776889e-03,  -2.70737638e-03,
        -5.28433418e-04,   2.60211038e-03,   9.53729847e-04,
         3.83250951e-03,  -1.87926169e-04,   1.37907965e-03,
         4.53122426e-03,  -3.99733381e-03,   3.74906725e-04,
         1.51365087e-03,   2.42908625e-03,   8.26373172e-04,
        -4.08146158e-03,   3.85995349e-03,   1.98068633e-03,
        -9.78420838e-04,  -4.98752249e-03,   4.76452842e-04,
        -6.89192035e-04,   9.50184476e-05,   4.55490453e-03,
        -4.83759353e-03,  -2.50132009e-03,  -3.38290608e-03,
        -1.84357655e-03,  -2.84379092e-03,  -1.23974343e-03,
        -1.71634916e-03,   4.69436450e-03,  -4.46483353e-03,
        -4.72780969e-03,   3.52653768e-03,  -4.06282907e-03,
         3.09269712e-03,   1.19773776e-03,   8.99195904e-04,
        -3.89312534e-03,  -2.15449394e-03,   4.85087512e-04,
        -1.38260715e-04,   3.29889753e-03,  -3.44559969e-03,
        -3.99467535e-04,  -7.12820794e-04,  -4.72263526e-03,
        -4.13687946e-03,

In [17]:
# Vector representation of a whole text: simply the element-wise average
# (axis=0) of the vectors of its words.
np.mean(model[texts[0]], axis=0)

array([ -1.21747982e-03,  -5.98347164e-04,  -7.44400895e-04,
        -4.24063939e-04,   1.08058746e-04,  -7.97799148e-04,
         6.76415511e-04,  -4.90673003e-04,   2.94549129e-04,
         8.47728734e-05,  -2.05539051e-03,  -6.16764650e-04,
         1.00845157e-03,  -7.69111037e-04,   2.21019913e-03,
        -2.06974358e-03,  -2.60475295e-04,  -4.84288001e-04,
         3.15664045e-04,  -4.33850830e-04,  -1.36917713e-03,
        -2.42086662e-05,  -6.14502351e-04,   1.42709631e-03,
        -1.21736713e-03,   2.66548246e-04,  -1.97036425e-04,
        -1.54024560e-03,  -1.00586086e-03,  -2.87082348e-05,
        -1.44869206e-03,   3.38991318e-04,  -2.33746460e-03,
        -5.71915763e-04,  -5.00007358e-04,  -3.71419446e-04,
         2.40033507e-04,   1.43597357e-03,  -4.65652905e-04,
        -1.29254453e-03,  -9.98286181e-04,   9.67176748e-04,
         1.74276181e-03,  -3.53550931e-05,  -8.21346301e-04,
        -1.77060149e-03,   8.81072890e-04,   1.40109376e-04,
         4.32525005e-04,

# Doc2Vec

In [18]:
# I'm not sure why this kinds of classes are implemented in the package...
class LabeledListSentence(object):
    """Expose a collection of token lists as labelled sentences for Doc2Vec.

    Each entry of ``words_list`` is wrapped in a
    ``models.doc2vec.LabeledSentence`` whose only label is ``'text_<i>'``,
    where ``<i>`` is the entry's position in ``words_list``.
    """

    def __init__(self, words_list):
        """Keep a reference to ``words_list`` (an iterable of token lists)."""
        self.words_list = words_list

    def _labeled(self, position, words):
        """Wrap ``words`` in a LabeledSentence labelled ``'text_<position>'``."""
        return models.doc2vec.LabeledSentence(words, ['text_{0}'.format(position)])

    def __getitem__(self, index):
        """Return the labelled sentence(s) at ``index``.

        ``index`` may be an int (negative values count from the end) or a
        slice, exactly as when indexing a list. An out-of-range int raises
        ``IndexError``.
        """
        # For any iterable that is not a list/tuple, keep the generic path:
        # materialise all labelled sentences and index that list.
        if not isinstance(self.words_list, (list, tuple)):
            return list(self)[index]

        # Lists and tuples can be indexed directly, so only the requested
        # entries are wrapped instead of rebuilding the whole list on every
        # lookup.
        size = len(self.words_list)
        if isinstance(index, slice):
            return [self._labeled(i, self.words_list[i])
                    for i in range(*index.indices(size))]
        if index < 0:
            # Normalise so the label carries the real (non-negative) position.
            index += size
        if not 0 <= index < size:
            raise IndexError('list index out of range')
        return self._labeled(index, self.words_list[index])

    def __iter__(self):
        """Yield one labelled sentence per entry, in order."""
        for i, words in enumerate(self.words_list):
            yield self._labeled(i, words)

In [19]:
# Wrap the tokenised texts so that each one carries a 'text_<i>' label.
labeled_texts = LabeledListSentence(texts)
# NOTE: this rebinds `model`, which held the Word2Vec model in the cells above.
# alpha and min_alpha are passed the same value (0.025).
model = models.doc2vec.Doc2Vec(labeled_texts, min_count=1, alpha=0.025, min_alpha=0.025)
# Print what docvecs.most_similar() returns when queried with docvecs[1].
print(model.docvecs.most_similar([model.docvecs[1]]))

[(1, 1.0), (3, 0.07125711441040039), (2, 0.015766317024827003), (8, -0.00950068049132824), (4, -0.021924331784248352), (7, -0.03319110721349716), (5, -0.03922884911298752), (6, -0.04244283586740494), (0, -0.07505106180906296)]


# Useful references
- https://radimrehurek.com/gensim/index.html
- http://hivecolor.com/id/58
- http://qiita.com/okappy/items/e16639178ba85edfee72
- http://qiita.com/yasunori/items/31a23eb259482e4824e2
- http://qiita.com/shima_x/items/196e8d823412e45680e9
- http://tjo.hatenablog.com/entry/2014/06/19/233949
- http://rare-technologies.com/word2vec-tutorial/