In [1]:
import warnings
warnings.filterwarnings('ignore')

# Create sample corpus and dictionary

In [2]:
from gensim import corpora, models, similarities
# Toy corpus: nine short document titles used by every example below.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

In [3]:
# Stop words: common function words that are dropped before building the vocabulary.
stoplist = {word for word in 'for a of the and to in'.split()}
print(stoplist)

{'and', 'for', 'a', 'of', 'the', 'in', 'to'}


In [4]:
# Tokenise each document: lowercase it, split on whitespace, and drop stop words.
texts = [
    [token for token in sentence.lower().split() if token not in stoplist]
    for sentence in documents
]
print(texts)

[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'], ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'management', 'system'], ['system', 'human', 'system', 'engineering', 'testing', 'eps'], ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'], ['generation', 'random', 'binary', 'unordered', 'trees'], ['intersection', 'graph', 'paths', 'trees'], ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'], ['graph', 'minors', 'survey']]


In [5]:
# Build the vocabulary from the tokenised documents; `token2id` exposes the
# token -> integer id table.
dictionary = corpora.Dictionary(texts)
print(dictionary, dictionary.token2id, sep='\n')

Dictionary(35 unique tokens: ['minors', 'trees', 'paths', 'human', 'binary']...)
{'minors': 34, 'trees': 24, 'paths': 26, 'human': 1, 'binary': 22, 'random': 21, 'opinion': 8, 'user': 9, 'testing': 16, 'management': 14, 'measurement': 17, 'computer': 0, 'machine': 2, 'relation': 20, 'lab': 4, 'iv': 33, 'survey': 12, 'generation': 25, 'well': 31, 'time': 7, 'widths': 29, 'quasi': 30, 'abc': 3, 'error': 19, 'ordering': 32, 'graph': 28, 'response': 10, 'perceived': 18, 'intersection': 27, 'engineering': 15, 'eps': 13, 'unordered': 23, 'applications': 5, 'system': 11, 'interface': 6}


In [6]:
# Express an arbitrary document as (token id, count) pairs using the dictionary.
query_tokens = "Human computer management".lower().split()
query_bow = dictionary.doc2bow(query_tokens)
print(query_bow)

[(0, 1), (1, 1), (14, 1)]


In [7]:
# Build the corpus: one (token id, count) vector per tokenised document.
corpus = list(map(dictionary.doc2bow, texts))
print(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)], [(0, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)], [(6, 1), (9, 1), (11, 1), (13, 1), (14, 1)], [(1, 1), (11, 2), (13, 1), (15, 1), (16, 1)], [(7, 1), (9, 1), (10, 1), (17, 1), (18, 1), (19, 1), (20, 1)], [(21, 1), (22, 1), (23, 1), (24, 1), (25, 1)], [(24, 1), (26, 1), (27, 1), (28, 1)], [(24, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1)], [(12, 1), (28, 1), (34, 1)]]


# Create TF-IDF model for weighted vector

In [8]:
# Fit a TF-IDF model on the bag-of-words corpus. Indexing the model with a
# bag-of-words vector returns the corresponding weighted vector.
tfidf = models.TfidfModel(corpus)
doc_bow = [(token_id, 1) for token_id in (0, 1)]
tfidf[doc_bow]

[(0, 0.7071067811865476), (1, 0.7071067811865476)]

In [9]:
# Print the TF-IDF-weighted vector of every document in the corpus.
for weighted_doc in tfidf[corpus]:
    print(weighted_doc)

[(0, 0.2944198962221451), (1, 0.2944198962221451), (2, 0.4301019571350565), (3, 0.4301019571350565), (4, 0.4301019571350565), (5, 0.4301019571350565), (6, 0.2944198962221451)]
[(0, 0.3726494271826947), (7, 0.3726494271826947), (8, 0.5443832091958983), (9, 0.27219160459794917), (10, 0.3726494271826947), (11, 0.27219160459794917), (12, 0.3726494271826947)]
[(6, 0.438482464916089), (9, 0.32027755044706185), (11, 0.32027755044706185), (13, 0.438482464916089), (14, 0.6405551008941237)]
[(1, 0.3449874408519962), (11, 0.5039733231394895), (13, 0.3449874408519962), (15, 0.5039733231394895), (16, 0.5039733231394895)]
[(7, 0.30055933182961736), (9, 0.21953536176370683), (10, 0.30055933182961736), (17, 0.43907072352741366), (18, 0.43907072352741366), (19, 0.43907072352741366), (20, 0.43907072352741366)]
[(21, 0.48507125007266594), (22, 0.48507125007266594), (23, 0.48507125007266594), (24, 0.24253562503633297), (25, 0.48507125007266594)]
[(24, 0.31622776601683794), (26, 0.6324555320336759), (27, 0

# Topic model (LDA)

In [10]:
# Fit a 3-topic LDA model on the bag-of-words corpus.
# LDA is stochastic, so the random state is pinned: without it the topics and
# per-document mixtures printed below change on every fresh run.
lda = models.ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=3, random_state=0)

# Most heavily weighted words of each learned topic.
for topic in lda.show_topics():
    print(topic)

# Topic mixture, as (topic id, probability) pairs, of every document in the corpus.
for topics_per_document in lda[corpus]:
    print(topics_per_document)



(0, '0.062*system + 0.061*graph + 0.058*human + 0.037*trees + 0.037*minors + 0.036*eps + 0.036*time + 0.035*response + 0.035*measurement + 0.035*perceived')
(1, '0.087*trees + 0.051*graph + 0.050*random + 0.050*generation + 0.050*unordered + 0.050*binary + 0.049*ordering + 0.049*iv + 0.049*minors + 0.049*quasi')
(2, '0.097*user + 0.097*system + 0.056*survey + 0.055*computer + 0.055*response + 0.055*interface + 0.055*eps + 0.055*time + 0.054*management + 0.054*opinion')
[(0, 0.90902847546723375), (1, 0.043984891726608355), (2, 0.046986632806157884)]
[(0, 0.045190777308922694), (1, 0.04204965574509055), (2, 0.9127595669459867)]
[(0, 0.059481657183272606), (1, 0.056190244784453912), (2, 0.88432809803227352)]
[(0, 0.89507739984606638), (1, 0.048159158978324706), (2, 0.056763441175608879)]
[(0, 0.90476058228775469), (1, 0.042191285162119278), (2, 0.053048132550126123)]
[(0, 0.056394704603308511), (1, 0.88756012105227844), (2, 0.056045174344413085)]
[(0, 0.85026168187101059), (1, 0.082190066

In [11]:
# Project the first document of the corpus into LDA topic space.
first_doc_topics = lda[corpus[0]]
print(first_doc_topics)

[(0, 0.9090002653953051), (1, 0.043985250236135455), (2, 0.047014484368559449)]


## Similarity calculation by LDA

In [12]:
# Index every document by its LDA topic vector.
# num_features is given explicitly: LDA returns sparse vectors that may omit
# low-probability topics, so letting the index guess the width from the data
# could yield fewer dimensions than the model actually has.
index = similarities.MatrixSimilarity(lda[corpus], num_features=lda.num_topics)

# Query the index with the first document's topic vector.
sims = index[lda[corpus[0]]]

# Show the similarity to every document, including the query document itself.
print(list(enumerate(sims)))



[(0, 1.0), (1, 0.1029003), (2, 0.12119333), (3, 0.99991703), (4, 0.99997473), (5, 0.11444242), (6, 0.99847388), (7, 0.092569277), (8, 0.99564171)]


# Word2Vec

In [13]:
# Train word embeddings on the tokenised documents.
# - Vector dimensionality is left at gensim's default (100): the keyword is
#   `size` in gensim < 4 and `vector_size` in gensim >= 4, so omitting it keeps
#   this cell runnable on both.
# - A fixed seed plus a single worker thread removes thread-scheduling jitter
#   between runs (full determinism may also need PYTHONHASHSEED to be set).
model = models.word2vec.Word2Vec(texts, min_count=1, seed=1, workers=1)
print(model)

# gensim >= 4 exposes similarity queries only on `model.wv`; releases that
# predate `wv` provide them on the model object itself.
word_vectors = getattr(model, 'wv', model)

# Nearest neighbours of 'machine' as "word score" lines.
for word, score in word_vectors.most_similar(positive=['machine']):
    print(word, score)

Word2Vec(vocab=35, size=100, alpha=0.025)
perceived 0.257527232170105
management 0.20598620176315308
testing 0.14812245965003967
unordered 0.14332260191440582
generation 0.1272101253271103
minors 0.1088174507021904
engineering 0.10216177999973297
random 0.08155420422554016
trees 0.06890177726745605
response 0.06308461725711823


In [14]:
# Similarity between two words.
# Query through `model.wv` when it exists (required by gensim >= 4, where
# `Word2Vec.similarity` was removed); fall back to the model on older releases.
getattr(model, 'wv', model).similarity('human', 'machine')

-0.061809501778046678

In [15]:
# Best single match for words close to 'human' and 'machine' but far from 'management'.
# Query through `model.wv` when it exists (required by gensim >= 4, where
# `Word2Vec.most_similar` was removed); fall back to the model on older releases.
getattr(model, 'wv', model).most_similar(positive=['human', 'machine'], negative=['management'], topn=1)

[('generation', 0.1547522246837616)]

# [TODO] Doc2Vec

# Useful references
- https://radimrehurek.com/gensim/index.html
- http://hivecolor.com/id/58
- http://qiita.com/okappy/items/e16639178ba85edfee72
- http://qiita.com/yasunori/items/31a23eb259482e4824e2
- http://tjo.hatenablog.com/entry/2014/06/19/233949
- http://rare-technologies.com/word2vec-tutorial/