## Imports

In [1]:
import logging
from collections import defaultdict
from pprint import pprint

from gensim import corpora

# Emit timestamped INFO-level log records so library progress messages
# show up in the cell output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [2]:
# Toy corpus: nine short titles, one string per document.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

### Basic Preprocessing
In this case, all we are doing is lowercasing, excluding words in a small set of stopwords, and then excluding words that appear only once in the whole corpus.

In [3]:
# Words to drop before any counting happens.
stoplist = set('for a of the and to in'.split())

# Lowercase each document, split on whitespace, and keep non-stopwords.
texts = [
    [token for token in raw.lower().split() if token not in stoplist]
    for raw in documents
]

In [4]:
# Inspect the first tokenized document (lowercased, stopwords removed).
texts[0]

['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications']

In [5]:
# Count occurrences of every token across all tokenized documents.
frequency = defaultdict(int)
for tokens in texts:
    for word in tokens:
        frequency[word] = frequency[word] + 1

In [6]:
# Keep only tokens that occur more than once in the corpus.
texts = [
    [word for word in tokens if frequency[word] > 1]
    for tokens in texts
]

pprint(texts)

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]


### Create Text Dictionary
It is generally a good idea to keep track of your unique words. You can build this dictionary from any corpus.

In [7]:
# Build the token -> integer id mapping from the filtered, tokenized documents.
text_dictionary = corpora.Dictionary(texts)
# Persist the mapping to disk.
# NOTE(review): the recorded log output shows '../data/deerwester.dict' rather
# than this path; presumably the path was edited after the last run. Re-run to confirm.
text_dictionary.save('../../data/deerwester.dict')
# Printing the dictionary gives a short summary of its unique tokens.
print(text_dictionary)

2018-06-23 11:41:37,681 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-06-23 11:41:37,683 : INFO : built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)
2018-06-23 11:41:37,685 : INFO : saving Dictionary object under ../data/deerwester.dict, separately None
2018-06-23 11:41:37,689 : INFO : saved ../data/deerwester.dict


Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)


In [8]:
# Show the full token -> id mapping held by the dictionary.
print(text_dictionary.token2id)

{'computer': 0, 'human': 1, 'interface': 2, 'response': 3, 'survey': 4, 'system': 5, 'time': 6, 'user': 7, 'eps': 8, 'trees': 9, 'graph': 10, 'minors': 11}


### Convert New Doc To Vectors
This is based on the dictionary we have already created. If a word doesn't exist in the dictionary, it is ignored during vectorization.

Important Note:
`doc2bow` simply maps each word that already exists in the dictionary to its id and pairs that id with the word's frequency within the document.

In [9]:
new_doc = "Human computer interaction"

# Tokenize the same way as the training documents: lowercase, whitespace split.
new_tokens = new_doc.lower().split()
new_vec = text_dictionary.doc2bow(new_tokens)

# "interaction" is not in the dictionary, so it does not appear in the vector.
print(new_vec)

[(0, 1), (1, 1)]


In [10]:
# Convert every tokenized document to its bag-of-words vector.
corpus = list(map(text_dictionary.doc2bow, texts))

# Store the vectors on disk in Matrix Market format for later reuse.
corpora.MmCorpus.serialize('../../data/deerwester.mm', corpus=corpus)

pprint(corpus)

2018-06-23 11:41:37,728 : INFO : storing corpus in Matrix Market format to ../data/deerwester.mm
2018-06-23 11:41:37,734 : INFO : saving sparse matrix to ../data/deerwester.mm
2018-06-23 11:41:37,736 : INFO : PROGRESS: saving document #0
2018-06-23 11:41:37,740 : INFO : saved 9x12 matrix, density=25.926% (28/108)
2018-06-23 11:41:37,743 : INFO : saving MmCorpus index to ../data/deerwester.mm.index


[[(0, 1), (1, 1), (2, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(2, 1), (5, 1), (7, 1), (8, 1)],
 [(1, 1), (5, 2), (8, 1)],
 [(3, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(4, 1), (10, 1), (11, 1)]]


### Memory Efficiency
Given a dataset with millions of rows, you want to feed it to a model without loading everything into memory at once. You can do this by creating a class with a basic `__iter__` method that reads the data one line at a time.

In [11]:
class MyCorpus:
    """Stream a text file as bag-of-words vectors, one document per line.

    The file is read lazily while iterating, so only one line is held at a
    time. Each line is lowercased, split on whitespace and passed to the
    dictionary's ``doc2bow``.

    Parameters
    ----------
    path : str, optional
        Text file with one document per line. Defaults to the notebook's
        test data file.
    dictionary : object with a ``doc2bow(tokens)`` method, optional
        Dictionary used to vectorize each line. When omitted, the
        notebook-level ``text_dictionary`` is looked up at iteration time,
        so ``MyCorpus()`` behaves as it always has.
    """

    def __init__(self, path='../../data/test_data.txt', dictionary=None):
        self.path = path
        self.dictionary = dictionary

    def __iter__(self):
        """Yield one bag-of-words vector per line of the file."""
        # Resolve the fallback lazily so the global only needs to exist
        # when no dictionary was passed in explicitly.
        dictionary = text_dictionary if self.dictionary is None else self.dictionary
        # The context manager closes the file once iteration finishes (or the
        # generator is closed early) instead of leaking the handle.
        with open(self.path) as lines:
            for line in lines:
                yield dictionary.doc2bow(line.lower().split())

In [12]:
# Nothing is read from disk until the loop starts pulling vectors.
test_memory_corpus = MyCorpus()

for bow_vector in test_memory_corpus:
    print(bow_vector)

[(0, 1), (1, 1), (2, 1)]
[(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
[(2, 1), (5, 1), (7, 1), (8, 1)]
[(1, 1), (5, 2), (8, 1)]
[(3, 1), (6, 1), (7, 1)]
[(9, 1)]
[(9, 1), (10, 1)]
[(9, 1), (10, 1), (11, 1)]
[(4, 1), (10, 1), (11, 1)]
