In [1]:
from gensim import corpora
from collections import defaultdict
from pprint import pprint  # pretty-printer

# Nine short document titles that serve as the toy corpus.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Lower-case each title, split it on whitespace and drop the stop words.
stoplist = set('for a of the and to in'.split())
texts = []
for document in documents:
    kept_words = [word for word in document.lower().split() if word not in stoplist]
    texts.append(kept_words)

# Count how often every surviving token occurs across the whole corpus.
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Discard tokens that occur only once in the corpus.
texts = [
    [token for token in text if frequency[token] > 1]
    for text in texts
]
pprint(texts)

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]


In [2]:
# Assign an integer id to every distinct token found in `texts`.
dictionary = corpora.Dictionary(texts)
# Store the dictionary on disk; a later cell loads it back from this path.
dictionary.save('/tmp/deerwester.dict')
# Show the dictionary summary, then the full token -> id mapping.
print(dictionary, dictionary.token2id, sep='\n')

Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)
{'computer': 0, 'human': 1, 'interface': 2, 'response': 3, 'survey': 4, 'system': 5, 'time': 6, 'user': 7, 'eps': 8, 'trees': 9, 'graph': 10, 'minors': 11}


In [3]:
# Convert every tokenized document into its bag-of-words vector.
corpus = list(map(dictionary.doc2bow, texts))
# Store the vectors on disk; a later cell loads them back from this path.
corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)
print(corpus)

[[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (5, 1), (7, 1), (8, 1)], [(1, 1), (5, 2), (8, 1)], [(3, 1), (6, 1), (7, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(4, 1), (10, 1), (11, 1)]]


In [4]:
# https://radimrehurek.com/gensim/tut1.html
from gensim import corpora, models, similarities
from six import iteritems

# The toy corpus written out as literal sparse (token_id, weight) vectors.
corpus = [[(0, 1.0), (1, 1.0), (2, 1.0)],
          [(2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (8, 1.0)],
          [(1, 1.0), (3, 1.0), (4, 1.0), (7, 1.0)],
          [(0, 1.0), (4, 2.0), (7, 1.0)],
          [(3, 1.0), (5, 1.0), (6, 1.0)],
          [(9, 1.0)],
          [(9, 1.0), (10, 1.0)],
          [(9, 1.0), (10, 1.0), (11, 1.0)],
          [(8, 1.0), (10, 1.0), (11, 1.0)]]

# Collect statistics about all tokens, reading the file one line at a time.
# The `with` block guarantees the file handle is closed once the dictionary
# has consumed the generator (the bare open() used before was never closed).
with open('mycorpus.txt') as corpus_file:
    dictionary = corpora.Dictionary(line.lower().split() for line in corpus_file)

# Ids of stop words present in the dictionary (`stoplist` comes from the first cell).
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
# Ids of tokens whose document frequency is exactly one.
once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1]
dictionary.filter_tokens(stop_ids + once_ids)  # remove stop words and words that appear only once
dictionary.compactify()  # remove gaps in id sequence after words that were removed
print(dictionary)

Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)


In [5]:
# A two-document corpus: one document with a single entry, one empty document.
corpus = [[(1, 0.5)], []]

# Write the same corpus once per on-disk format, in the order listed.
for corpus_format, target_path in (
    (corpora.MmCorpus, '/tmp/corpus.mm'),
    (corpora.SvmLightCorpus, '/tmp/corpus.svmlight'),
    (corpora.BleiCorpus, '/tmp/corpus.lda-c'),
    (corpora.LowCorpus, '/tmp/corpus.low'),
):
    corpus_format.serialize(target_path, corpus)

In [6]:
# Open the corpus stored at '/tmp/corpus.mm'.
mm_path = '/tmp/corpus.mm'
corpus = corpora.MmCorpus(mm_path)
# Inspection helpers, left disabled:
# print(corpus)
# print(list(corpus))
# Write the loaded corpus back out through BleiCorpus.
blei_path = '/tmp/corpus.lda-c'
corpora.BleiCorpus.serialize(blei_path, corpus)

In [7]:
# https://radimrehurek.com/gensim/tut2.html
import logging, os
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from gensim import corpora, models, similarities

# Both files are loaded below, so both must exist; checking only the
# dictionary file would let the corpus load fail when the .mm file is missing.
if os.path.exists("/tmp/deerwester.dict") and os.path.exists("/tmp/deerwester.mm"):
    dictionary = corpora.Dictionary.load('/tmp/deerwester.dict')
    corpus = corpora.MmCorpus('/tmp/deerwester.mm')
    print("Used files generated from first tutorial")
else:
    print("Please run first tutorial to generate data set")

2018-02-03 06:08:33,841 : INFO : loading Dictionary object from /tmp/deerwester.dict
2018-02-03 06:08:33,843 : INFO : loaded /tmp/deerwester.dict
2018-02-03 06:08:33,846 : INFO : loaded corpus index from /tmp/deerwester.mm.index
2018-02-03 06:08:33,847 : INFO : initializing corpus reader from /tmp/deerwester.mm
2018-02-03 06:08:33,849 : INFO : accepted corpus with 9 documents, 12 features, 28 non-zero entries


Used files generated from first tutorial


In [8]:
# Step 1 -- initialize a TF-IDF model from the bag-of-words corpus.
tfidf = models.TfidfModel(corpus)

# Step 2 -- use the model to transform a single hand-written vector.
doc_bow = [(0, 1), (1, 1)]
print(tfidf[doc_bow])

# Apply the same transformation to the whole corpus and show each document.
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
    print(doc)

# Initialize a 2-topic LSI transformation on top of the TF-IDF vectors.
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
# Double wrapper over the original corpus: bow -> tfidf -> fold-in-lsi.
corpus_lsi = lsi[corpus_tfidf]

2018-02-03 06:08:33,866 : INFO : collecting document frequencies
2018-02-03 06:08:33,870 : INFO : PROGRESS: processing document #0
2018-02-03 06:08:33,873 : INFO : calculating IDF weights for 9 documents and 11 features (28 matrix non-zeros)
2018-02-03 06:08:33,878 : INFO : using serial LSI version on this node
2018-02-03 06:08:33,880 : INFO : updating model with new documents
2018-02-03 06:08:33,882 : INFO : preparing a new chunk of documents
2018-02-03 06:08:33,884 : INFO : using 100 extra samples and 2 power iterations
2018-02-03 06:08:33,886 : INFO : 1st phase: constructing (12, 102) action matrix
2018-02-03 06:08:33,887 : INFO : orthonormalizing (12, 102) action matrix
2018-02-03 06:08:33,891 : INFO : 2nd phase: running dense svd on (12, 9) matrix
2018-02-03 06:08:33,894 : INFO : computing the final decomposition
2018-02-03 06:08:33,896 : INFO : keeping 2 factors (discarding 47.565% of energy spectrum)
2018-02-03 06:08:33,898 : INFO : processed documents up to #9
2018-02-03 06:08:

[(0, 0.7071067811865476), (1, 0.7071067811865476)]
[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
[(0, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555), (6, 0.44424552527467476), (7, 0.3244870206138555)]
[(2, 0.5710059809418182), (5, 0.4170757362022777), (7, 0.4170757362022777), (8, 0.5710059809418182)]
[(1, 0.49182558987264147), (5, 0.7184811607083769), (8, 0.49182558987264147)]
[(3, 0.6282580468670046), (6, 0.6282580468670046), (7, 0.45889394536615247)]
[(9, 1.0)]
[(9, 0.7071067811865475), (10, 0.7071067811865475)]
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]
[(4, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]


In [9]:
lsi.print_topics(2)

# Iterating executes both the bow->tfidf and the tfidf->lsi transformations on the fly.
for doc in corpus_lsi:
    print(doc)

# Save the model and load it straight back (same pattern for tfidf, lda, ...).
lsi_path = '/tmp/model.lsi'
lsi.save(lsi_path)
lsi = models.LsiModel.load(lsi_path)

2018-02-03 06:08:33,913 : INFO : topic #0(1.594): 0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"response" + 0.060*"time" + 0.058*"user" + 0.049*"computer" + 0.035*"interface"
2018-02-03 06:08:33,915 : INFO : topic #1(1.476): -0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"response" + -0.320*"time" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"
2018-02-03 06:08:33,921 : INFO : saving Projection object under /tmp/model.lsi.projection, separately None
2018-02-03 06:08:33,927 : INFO : saved /tmp/model.lsi.projection
2018-02-03 06:08:33,928 : INFO : saving LsiModel object under /tmp/model.lsi, separately None
2018-02-03 06:08:33,930 : INFO : not storing attribute projection
2018-02-03 06:08:33,931 : INFO : not storing attribute dispatcher
2018-02-03 06:08:33,933 : INFO : saved /tmp/model.lsi
2018-02-03 06:08:33,934 : INFO : loading LsiModel object from /tmp/model.lsi


[(0, 0.0660078339609029), (1, -0.5200703306361849)]
[(0, 0.1966759285914242), (1, -0.7609563167700055)]
[(0, 0.08992639972446287), (1, -0.7241860626752509)]
[(0, 0.07585847652178014), (1, -0.6320551586003428)]
[(0, 0.10150299184980074), (1, -0.5737308483002964)]
[(0, 0.7032108939378318), (1, 0.16115180214025732)]
[(0, 0.8774787673119837), (1, 0.1675890686465932)]
[(0, 0.9098624686818583), (1, 0.14086553628718912)]
[(0, 0.6165825350569281), (1, -0.05392907566389463)]


2018-02-03 06:08:33,937 : INFO : loading id2word recursively from /tmp/model.lsi.id2word.* with mmap=None
2018-02-03 06:08:33,939 : INFO : setting ignored attribute projection to None
2018-02-03 06:08:33,940 : INFO : setting ignored attribute dispatcher to None
2018-02-03 06:08:33,942 : INFO : loaded /tmp/model.lsi
2018-02-03 06:08:33,943 : INFO : loading LsiModel object from /tmp/model.lsi.projection
2018-02-03 06:08:33,945 : INFO : loaded /tmp/model.lsi.projection


In [10]:
# https://radimrehurek.com/gensim/tut3.html
from gensim import corpora
from gensim import models
from gensim import similarities

# Load the dictionary and corpus produced by the first tutorial
# ("From strings to vectors").
dictionary = corpora.Dictionary.load('/tmp/deerwester.dict')
corpus = corpora.MmCorpus('/tmp/deerwester.mm')
print(corpus)

2018-02-03 06:08:33,956 : INFO : loading Dictionary object from /tmp/deerwester.dict
2018-02-03 06:08:33,958 : INFO : loaded /tmp/deerwester.dict
2018-02-03 06:08:33,960 : INFO : loaded corpus index from /tmp/deerwester.mm.index
2018-02-03 06:08:33,961 : INFO : initializing corpus reader from /tmp/deerwester.mm
2018-02-03 06:08:33,963 : INFO : accepted corpus with 9 documents, 12 features, 28 non-zero entries


MmCorpus(9 documents, 12 features, 28 non-zero entries)


In [11]:
# Fit a 2-topic LSI model directly on the bag-of-words corpus.
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)

# Build the query: lower-case, split on whitespace, convert to bag-of-words.
doc = "Human computer interaction"
query_tokens = doc.lower().split()
vec_bow = dictionary.doc2bow(query_tokens)

# Convert the query to LSI space.
vec_lsi = lsi[vec_bow]
print(vec_lsi)

2018-02-03 06:08:33,977 : INFO : using serial LSI version on this node
2018-02-03 06:08:33,979 : INFO : updating model with new documents
2018-02-03 06:08:33,981 : INFO : preparing a new chunk of documents
2018-02-03 06:08:33,983 : INFO : using 100 extra samples and 2 power iterations
2018-02-03 06:08:33,984 : INFO : 1st phase: constructing (12, 102) action matrix
2018-02-03 06:08:33,986 : INFO : orthonormalizing (12, 102) action matrix
2018-02-03 06:08:33,990 : INFO : 2nd phase: running dense svd on (12, 9) matrix
2018-02-03 06:08:33,991 : INFO : computing the final decomposition
2018-02-03 06:08:33,993 : INFO : keeping 2 factors (discarding 43.156% of energy spectrum)
2018-02-03 06:08:33,994 : INFO : processed documents up to #9
2018-02-03 06:08:33,995 : INFO : topic #0(3.341): 0.644*"system" + 0.404*"user" + 0.301*"eps" + 0.265*"time" + 0.265*"response" + 0.240*"computer" + 0.221*"human" + 0.206*"survey" + 0.198*"interface" + 0.036*"graph"
2018-02-03 06:08:33,997 : INFO : topic #1(2

[(0, 0.4618210045327159), (1, 0.07002766527899992)]


In [12]:
# Build the similarity index over the LSI-transformed corpus, then save it
# and load it back from disk.
index_path = '/tmp/deerwester.index'
index = similarities.MatrixSimilarity(lsi[corpus])
index.save(index_path)
index = similarities.MatrixSimilarity.load(index_path)

# Perform a similarity query against the corpus.
sims = index[vec_lsi]
scored = list(enumerate(sims))
print(scored)

# Sorted (document number, similarity score) 2-tuples, highest score first.
sims = sorted(scored, key=lambda pair: pair[1], reverse=True)
print(sims)

2018-02-03 06:08:34,016 : INFO : creating matrix with 9 documents and 2 features
2018-02-03 06:08:34,020 : INFO : saving MatrixSimilarity object under /tmp/deerwester.index, separately None
2018-02-03 06:08:34,022 : INFO : saved /tmp/deerwester.index
2018-02-03 06:08:34,023 : INFO : loading MatrixSimilarity object from /tmp/deerwester.index
2018-02-03 06:08:34,025 : INFO : loaded /tmp/deerwester.index


[(0, 0.998093), (1, 0.93748635), (2, 0.9984453), (3, 0.9865886), (4, 0.90755945), (5, -0.12416792), (6, -0.10639259), (7, -0.09879463), (8, 0.05004177)]
[(2, 0.9984453), (0, 0.998093), (3, 0.9865886), (1, 0.93748635), (4, 0.90755945), (8, 0.05004177), (7, -0.09879463), (6, -0.10639259), (5, -0.12416792)]


In [13]:
# https://rare-technologies.com/word2vec-tutorial/
import gensim
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Two tiny pre-tokenized sentences.
other_sentences = [phrase.split() for phrase in ('first sentence', 'second sentence')]
# Train word2vec on them; min_count=1 is passed so that no word is dropped for being rare.
model = gensim.models.Word2Vec(other_sentences, min_count=1)

2018-02-03 06:08:34,040 : INFO : collecting all words and their counts
2018-02-03 06:08:34,041 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-02-03 06:08:34,043 : INFO : collected 3 word types from a corpus of 4 raw words and 2 sentences
2018-02-03 06:08:34,043 : INFO : Loading a fresh vocabulary
2018-02-03 06:08:34,045 : INFO : min_count=1 retains 3 unique words (100% of original 3, drops 0)
2018-02-03 06:08:34,046 : INFO : min_count=1 leaves 4 word corpus (100% of original 4, drops 0)
2018-02-03 06:08:34,048 : INFO : deleting the raw counts dictionary of 3 items
2018-02-03 06:08:34,050 : INFO : sample=0.001 downsamples 3 most-common words
2018-02-03 06:08:34,051 : INFO : downsampling leaves estimated 0 word corpus (5.7% of prior 4)
2018-02-03 06:08:34,053 : INFO : estimated required memory for 3 words and 100 dimensions: 3900 bytes
2018-02-03 06:08:34,054 : INFO : resetting layer weights
2018-02-03 06:08:34,056 : INFO : training model with 3 workers o

In [14]:
class MySentences(object):
    """Restartable iterable over the lines of every file in a directory.

    Each iteration pass lists ``dirname`` and yields one list of
    whitespace-separated tokens per line of every regular file found there.

    Parameters
    ----------
    dirname : str
        Path of the directory whose files are read.
    """

    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            path = os.path.join(self.dirname, fname)
            # Sub-directories (for example Jupyter's .ipynb_checkpoints)
            # cannot be opened as text files, so they are skipped.
            if not os.path.isfile(path):
                continue
            # The context manager closes each file as soon as its lines are
            # exhausted instead of leaking the handle.
            with open(path) as handle:
                for line in handle:
                    yield line.split()
 
# Read every line under 'yezheng_test/' into a list of token lists. list()
# materialises the whole corpus in memory rather than streaming it.
sentence_reader = MySentences('yezheng_test/')
some_sentences = list(sentence_reader)
# Train a Word2Vec model on those sentences with the library defaults.
model = gensim.models.Word2Vec(some_sentences)

2018-02-03 06:08:34,088 : INFO : collecting all words and their counts
2018-02-03 06:08:34,090 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-02-03 06:08:34,096 : INFO : collected 2694 word types from a corpus of 6891 raw words and 363 sentences
2018-02-03 06:08:34,097 : INFO : Loading a fresh vocabulary
2018-02-03 06:08:34,101 : INFO : min_count=5 retains 177 unique words (6% of original 2694, drops 2517)
2018-02-03 06:08:34,102 : INFO : min_count=5 leaves 2239 word corpus (32% of original 6891, drops 4652)
2018-02-03 06:08:34,104 : INFO : deleting the raw counts dictionary of 2694 items
2018-02-03 06:08:34,105 : INFO : sample=0.001 downsamples 118 most-common words
2018-02-03 06:08:34,106 : INFO : downsampling leaves estimated 1136 word corpus (50.8% of prior 2239)
2018-02-03 06:08:34,107 : INFO : estimated required memory for 177 words and 100 dimensions: 230100 bytes
2018-02-03 06:08:34,109 : INFO : resetting layer weights
2018-02-03 06:08:34,114 : 

In [15]:
model = gensim.models.Word2Vec(iter=1)  # an empty model, no training yet
model.build_vocab(some_sentences)  # can be a non-repeatable, 1-pass generator
# total_examples must describe the corpus handed to train(). model.corpus_count
# was set by build_vocab() on `some_sentences`, so it does not match the number
# of sentences in `other_sentences`.
# NOTE(review): the recorded output of this cell is 0 -- presumably because the
# tokens of `other_sentences` are absent from the vocabulary built on
# `some_sentences`; confirm that training on a different corpus is intended.
model.train(other_sentences, total_examples=len(other_sentences), epochs=model.iter)
# You must specify an explicit epochs count. The usual value is epochs=model.iter.
# https://github.com/llSourcell/word_vectors_game_of_thrones-LIVE/issues/7

2018-02-03 06:08:34,144 : INFO : collecting all words and their counts
2018-02-03 06:08:34,146 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-02-03 06:08:34,149 : INFO : collected 2694 word types from a corpus of 6891 raw words and 363 sentences
2018-02-03 06:08:34,150 : INFO : Loading a fresh vocabulary
2018-02-03 06:08:34,153 : INFO : min_count=5 retains 177 unique words (6% of original 2694, drops 2517)
2018-02-03 06:08:34,154 : INFO : min_count=5 leaves 2239 word corpus (32% of original 6891, drops 4652)
2018-02-03 06:08:34,157 : INFO : deleting the raw counts dictionary of 2694 items
2018-02-03 06:08:34,158 : INFO : sample=0.001 downsamples 118 most-common words
2018-02-03 06:08:34,159 : INFO : downsampling leaves estimated 1136 word corpus (50.8% of prior 2239)
2018-02-03 06:08:34,160 : INFO : estimated required memory for 177 words and 100 dimensions: 230100 bytes
2018-02-03 06:08:34,161 : INFO : resetting layer weights
2018-02-03 06:08:34,165 : 

0

In [16]:
# import time
# T0 = time.time()
# import os, re
# class MySentences(object):
#     def __init__(self, dirname):
#         self.dirname = dirname
 
#     def __iter__(self):
#         for fname in os.listdir(self.dirname):
#             for line in open(os.path.join(self.dirname, fname)):
#                 yield line.split()
# sentences = list(MySentences('yezheng_test/'))
# vocab = set()
# for s in sentences:
#     Ltemp = [re.split('[0-9.]+',w)[0] for w in s if not '::' == w]
#     Ltemp = [w for w in Ltemp if len(w) >0]
#     for ele in Ltemp[1:]: vocab.add(ele)
# print(len(vocab))

# with open("Vocab.txt",'w') as f:
#     for w in vocab: f.write(w+'\n')
# print(time.time() - T0)
# print(len(vocab))

In [1]:
import time
T0 = time.time()  # start timestamp; the elapsed time is printed at the end of this cell
import os, re
class MySentences(object):
    """Restartable iterable over the lines of every file in a directory.

    Each iteration pass lists ``dirname`` and yields one list of
    whitespace-separated tokens per line of every regular file found there.

    Parameters
    ----------
    dirname : str
        Path of the directory whose files are read.
    """

    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            path = os.path.join(self.dirname, fname)
            # Sub-directories (for example Jupyter's .ipynb_checkpoints)
            # cannot be opened as text files, so they are skipped.
            if not os.path.isfile(path):
                continue
            # The context manager closes each file as soon as its lines are
            # exhausted instead of leaking the handle.
            with open(path) as handle:
                for line in handle:
                    yield line.split()
sentences = list(MySentences('yezheng_test/'))


def _first_column_tokens(path):
    """Return the set of first whitespace-separated fields of a vector file.

    The first line of the file is skipped (presumably a header row -- confirm
    against the data files). Blank lines are ignored instead of raising
    IndexError.
    """
    tokens = set()
    with open(path) as fread:
        fread.readline()  # skip the first line
        for line in fread:
            fields = line.split()
            if fields:
                tokens.add(fields[0])
    return tokens


# Union of the words found in both filtered vector files.
vocab = set()
for vector_file in ("data/coocvec-500mostfreq-window-3.vec.filter",
                    "data/GoogleNews-vectors-negative300.filter"):
    vocab |= _first_column_tokens(vector_file)

# Write one word per line. Sorting makes the file identical from run to run,
# since set iteration order is not stable across interpreter sessions.
with open("Vocab2.txt", 'w') as f:
    for w in sorted(vocab):
        f.write(w + '\n')
print(len(vocab))
print(time.time() - T0)

0.23245024681091309


In [3]:
import numpy as np
from sklearn import datasets
from sklearn import metrics
from sklearn.cluster import SpectralClustering

# 500 six-dimensional samples drawn around 5 centers with a fixed seed.
X, y = datasets.make_blobs(n_samples=500, n_features=6, centers=5, cluster_std=[0.4, 0.3, 0.4, 0.3, 0.4], random_state=11)

# Cluster with SpectralClustering's default settings.
y_pred = SpectralClustering().fit_predict(X)

# Newer scikit-learn releases expose the correctly spelled
# `calinski_harabasz_score` and no longer ship the misspelled
# `calinski_harabaz_score`; older releases only have the misspelled name.
# Prefer the correct spelling and fall back so the cell runs on both.
score_fn = getattr(metrics, "calinski_harabasz_score", None) or metrics.calinski_harabaz_score
print("Calinski-Harabasz Score", score_fn(X, y_pred))

Calinski-Harabasz Score 14907.099436228207


In [4]:
# Show the cluster label that fit_predict assigned to each sample.
print(y_pred)

[3 6 6 0 1 1 2 0 1 1 5 3 1 1 1 0 1 2 2 2 4 4 5 2 4 7 7 7 6 3 3 2 1 6 2 3 3
 0 4 7 7 2 2 1 0 4 3 3 2 4 4 5 0 7 5 2 4 4 7 0 3 4 4 4 2 6 7 4 4 7 4 2 2 5
 6 2 0 2 7 3 1 7 4 4 7 6 1 6 1 3 0 1 1 4 7 2 2 1 3 6 7 3 0 5 2 7 5 5 6 2 1
 5 7 0 0 4 3 7 7 1 3 1 2 6 6 5 5 2 3 7 4 0 6 4 2 2 2 2 6 2 4 3 4 4 4 4 4 3
 0 2 4 3 2 5 0 4 2 2 2 4 1 2 3 7 4 3 0 0 2 4 4 1 2 7 7 2 7 4 4 0 7 1 7 1 2
 6 5 1 4 2 4 2 4 1 0 5 1 0 5 2 4 4 5 4 3 0 4 6 0 4 2 3 2 1 0 3 4 4 7 4 2 2
 3 2 5 5 1 3 6 2 1 2 5 4 2 7 1 2 6 1 4 1 2 2 1 7 2 5 0 0 3 1 0 4 1 2 4 2 2
 3 2 3 2 2 2 4 5 5 7 1 4 4 7 5 2 1 2 5 6 5 7 4 1 2 3 2 4 0 3 4 3 4 4 2 4 4
 4 0 2 5 3 2 3 6 3 2 2 3 6 7 1 4 1 7 2 4 2 2 2 1 3 0 2 1 2 5 5 2 7 4 1 0 2
 1 1 7 3 0 7 3 5 6 6 3 4 7 2 4 7 4 3 6 2 4 6 1 3 2 0 4 2 5 4 0 7 0 3 4 1 7
 4 3 6 3 4 0 4 5 5 5 3 5 1 1 4 2 4 7 5 4 1 2 6 0 4 1 6 2 3 2 2 4 4 1 5 4 5
 6 1 0 6 4 1 2 5 4 7 7 4 4 4 0 7 1 5 4 2 1 3 5 2 0 3 6 5 2 0 3 6 3 1 4 2 2
 6 4 5 4 4 5 7 1 6 4 1 0 7 6 5 2 7 4 2 2 1 4 7 7 7 7 6 0 6 0 4 4 7 0 5 2 3
 5 2 6 6 6 2 2 5 2 4 4 0 

In [5]:
# Report the size of the `vocab` set built in the vocabulary-export cell.
# NOTE(review): the recorded output below is a NameError for 'Vocab' (capital V),
# which does not match this code -- presumably stale output from an earlier
# edit; re-run the cell to confirm.
print(len(vocab))

NameError: name 'Vocab' is not defined