In [0]:
import nltk
# Download the 'punkt' tokenizer package into the local nltk_data directory.
# NOTE(review): presumably needed for the `word_tokenize` import in the next
# cell, but no cell visible here actually calls it — confirm it is still required.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
from nltk.tokenize import word_tokenize
import re
import collections
import pickle
import numpy as np
from gensim.models.keyedvectors import KeyedVectors
from gensim.test.utils import get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec


In [0]:
# Install paramiko into the running environment.
# NOTE(review): no cell visible here imports paramiko — confirm it is still
# needed, and consider pinning the version so re-runs are reproducible.
!pip install paramiko


Collecting paramiko
[?25l  Downloading https://files.pythonhosted.org/packages/cf/ae/94e70d49044ccc234bfdba20114fa947d7ba6eb68a2e452d89b920e62227/paramiko-2.4.2-py2.py3-none-any.whl (193kB)
[K    100% |████████████████████████████████| 194kB 7.9MB/s 
[?25hCollecting cryptography>=1.5 (from paramiko)
[?25l  Downloading https://files.pythonhosted.org/packages/5b/12/b0409a94dad366d98a8eee2a77678c7a73aafd8c0e4b835abea634ea3896/cryptography-2.6.1-cp34-abi3-manylinux1_x86_64.whl (2.3MB)
[K    100% |████████████████████████████████| 2.3MB 15.9MB/s 
[?25hCollecting bcrypt>=3.1.3 (from paramiko)
[?25l  Downloading https://files.pythonhosted.org/packages/d0/79/79a4d167a31cc206117d9b396926615fa9c1fdbd52017bcced80937ac501/bcrypt-3.1.6-cp34-abi3-manylinux1_x86_64.whl (55kB)
[K    100% |████████████████████████████████| 61kB 27.8MB/s 
[?25hCollecting pynacl>=1.0.1 (from paramiko)
[?25l  Downloading https://files.pythonhosted.org/packages/27/15/2cd0a203f318c2240b42cd9dd13c931ddd61067809fee3

In [0]:
import logging

# Emit INFO-level records prefixed with a timestamp and the level name, so the
# library progress messages show up under the cells that trigger them.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [0]:
# Toy corpus: nine short titles, one string per document.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

In [0]:
from pprint import pprint
from collections import defaultdict

In [0]:
# Common words that carry no topical meaning and are dropped from every document.
stoplist = set('for a of the and to in'.split())

# Lower-case each document, split it on whitespace, and keep the non-stop words.
texts = [
    [token for token in lowered.split() if token not in stoplist]
    for lowered in (document.lower() for document in documents)
]

In [0]:
# Display the tokenised documents (lower-cased, stop words removed).
texts

[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'],
 ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'management', 'system'],
 ['system', 'human', 'system', 'engineering', 'testing', 'eps'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['intersection', 'graph', 'paths', 'trees'],
 ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'],
 ['graph', 'minors', 'survey']]

In [0]:
# Token -> occurrence count; a key that has not been seen yet starts at 0.
frequency = defaultdict(int)

In [0]:
# Count how often each token occurs across all documents.
# The counter is rebuilt here so that re-running this cell starts from zero
# instead of adding a second round of counts on top of the previous run.
frequency = defaultdict(int)
for text in texts:
  for token in text:
    frequency[token] += 1

In [0]:
# Display the per-token occurrence counts.
frequency

defaultdict(int,
            {'abc': 1,
             'applications': 1,
             'binary': 1,
             'computer': 2,
             'engineering': 1,
             'eps': 2,
             'error': 1,
             'generation': 1,
             'graph': 3,
             'human': 2,
             'interface': 2,
             'intersection': 1,
             'iv': 1,
             'lab': 1,
             'machine': 1,
             'management': 1,
             'measurement': 1,
             'minors': 2,
             'opinion': 1,
             'ordering': 1,
             'paths': 1,
             'perceived': 1,
             'quasi': 1,
             'random': 1,
             'relation': 1,
             'response': 2,
             'survey': 2,
             'system': 4,
             'testing': 1,
             'time': 2,
             'trees': 3,
             'unordered': 1,
             'user': 3,
             'well': 1,
             'widths': 1})

In [0]:
# Keep only the tokens that occur more than once across the whole corpus.
# Note: this rebinds `texts`, so the unfiltered token lists are no longer available.
texts = [
    [word for word in doc_tokens if frequency[word] > 1]
    for doc_tokens in texts
]

In [0]:
# Pretty-print the documents after the frequency filter above.
pprint(texts)

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]


In [0]:
from gensim import corpora

In [0]:
# Build the token -> integer id mapping from the filtered token lists.
dictionary = corpora.Dictionary(texts)

2019-04-26 07:20:51,339 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2019-04-26 07:20:51,341 : INFO : built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)


In [0]:
# Persist the dictionary to 'deerwester.dict' in the working directory,
# then print its summary.
dictionary.save('deerwester.dict')
print(dictionary)

2019-04-26 07:21:37,659 : INFO : saving Dictionary object under deerwester.dict, separately None
2019-04-26 07:21:37,663 : INFO : saved deerwester.dict


Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)


In [0]:
# Show which integer id was assigned to each token.
print(dictionary.token2id)

{'computer': 0, 'human': 1, 'interface': 2, 'response': 3, 'survey': 4, 'system': 5, 'time': 6, 'user': 7, 'eps': 8, 'trees': 9, 'graph': 10, 'minors': 11}


In [0]:
# Convert an unseen document into (token_id, count) pairs.
# 'interaction' is not in the dictionary, so it does not appear in the result;
# only 'human' and 'computer' are counted.
new_doc = 'Human computer interaction'
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)

[(0, 1), (1, 1)]


In [0]:
# Turn every document into its bag-of-words vector (held fully in memory),
# write the result to 'deerwester.mm' in Matrix Market format, and print it.
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('deerwester.mm',corpus)
print(corpus)

2019-04-26 07:45:34,687 : INFO : storing corpus in Matrix Market format to deerwester.mm
2019-04-26 07:45:34,693 : INFO : saving sparse matrix to deerwester.mm
2019-04-26 07:45:34,695 : INFO : PROGRESS: saving document #0
2019-04-26 07:45:34,698 : INFO : saved 9x12 matrix, density=25.926% (28/108)
2019-04-26 07:45:34,699 : INFO : saving MmCorpus index to deerwester.mm.index


[[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (5, 1), (7, 1), (8, 1)], [(1, 1), (5, 2), (8, 1)], [(3, 1), (6, 1), (7, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(4, 1), (10, 1), (11, 1)]]


In [0]:
# Download the sample corpus file 'mycorpus.txt' that later cells read line by line.
!wget https://radimrehurek.com/gensim/mycorpus.txt

--2019-04-26 07:54:01--  https://radimrehurek.com/gensim/mycorpus.txt
Resolving radimrehurek.com (radimrehurek.com)... 104.28.21.65, 104.28.20.65, 2606:4700:30::681c:1541, ...
Connecting to radimrehurek.com (radimrehurek.com)|104.28.21.65|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 437 [text/plain]
Saving to: ‘mycorpus.txt’


2019-04-26 07:54:01 (67.1 MB/s) - ‘mycorpus.txt’ saved [437/437]



In [0]:
class MyCorpus(object):
  """Stream bag-of-words vectors from a text file, one document per line.

  The file is read lazily on every iteration, so the whole corpus never has
  to be held in memory, and the object can be iterated more than once.

  Args:
    path: Text file to read; each line is treated as one document.
      Defaults to 'mycorpus.txt'.
    bow_dictionary: Object with a ``doc2bow(tokens)`` method used to convert
      each tokenised line. When None (the default), the notebook-level
      ``dictionary`` variable is looked up at iteration time.
  """

  def __init__(self, path='mycorpus.txt', bow_dictionary=None):
    self.path = path
    self.bow_dictionary = bow_dictionary

  def __iter__(self):
    """Yield ``doc2bow`` of the lower-cased, whitespace-split tokens of each line."""
    # Resolve the global lazily so that MyCorpus() keeps following whatever
    # `dictionary` is bound to when iteration starts.
    lookup = dictionary if self.bow_dictionary is None else self.bow_dictionary
    # The context manager closes the file when iteration finishes or the
    # generator is closed early.
    with open(self.path) as corpus_file:
      for line in corpus_file:
        yield lookup.doc2bow(line.lower().split())

In [0]:
# Creating the corpus object does not read the file; printing it shows only
# the default object repr, not the documents.
corpus_memory_friendly = MyCorpus()
print(corpus_memory_friendly)

<__main__.MyCorpus object at 0x7fe59c86a240>


In [0]:
# Iterating is what actually reads the file: one bag-of-words vector is
# produced and printed per line.
for vector in corpus_memory_friendly:
  print(vector)

[(0, 1), (1, 1), (2, 1)]
[(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
[(2, 1), (5, 1), (7, 1), (8, 1)]
[(1, 1), (5, 2), (8, 1)]
[(3, 1), (6, 1), (7, 1)]
[(9, 1)]
[(9, 1), (10, 1)]
[(9, 1), (10, 1), (11, 1)]
[(4, 1), (10, 1), (11, 1)]


In [0]:
from six import iteritems

In [0]:
# Rebuild the dictionary straight from the file, streaming one line at a time.
# The file is opened in a `with` block so the handle is closed as soon as the
# dictionary has been built.
# Note: this rebinds `dictionary`, which any code reading the global sees from now on.
with open('mycorpus.txt') as corpus_file:
  dictionary = corpora.Dictionary(line.lower().split() for line in corpus_file)

2019-04-26 07:57:06,780 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2019-04-26 07:57:06,782 : INFO : built Dictionary(42 unique tokens: ['abc', 'applications', 'computer', 'for', 'human']...) from 9 documents (total 69 corpus positions)


In [0]:
# Collect the ids of the stop words that actually occur in the dictionary.
stop_ids = []
for stopword in stoplist:
  if stopword in dictionary.token2id:
    stop_ids.append(dictionary.token2id[stopword])

In [0]:
# Ids of tokens whose document frequency is exactly one.
once_ids = [
    tokenid
    for tokenid, docfreq in dictionary.dfs.items()
    if docfreq == 1
]

# Remove the stop words and the single-document tokens from the dictionary,
# then compact it and print the resulting summary.
dictionary.filter_tokens(stop_ids + once_ids)
dictionary.compactify()
print(dictionary)

Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)
