## Train an LDA model

- download data from ..
- use spacy to tokenize and leave only nouns
- train a gensim dictionary
- train gensim LDA

In [1]:
import csv
# Raise the csv module's maximum allowed field size to 100,000,000 characters.
# Presumably needed because the 'content' column holds full article text, which
# can exceed the module's default limit -- confirm against the data.
csv.field_size_limit(100000000)

import glob
import os
import sys
from toolz.functoolz import compose

In [2]:
# turn on logging to see progress
# NOTE(review): this is set before the cranial imports that follow; presumably
# the log level is read when cranial is imported -- confirm before reordering.
os.environ['CRANIAL_LOGLEVEL'] = "INFO"

from cranial.re_iter import ReMap, ReChain, ReFilter, Progress, ReBatch, DiskCache, ReZip
from cranial.models.spacy_tokenizers import SpacyWrapper
from cranial.models.gensim_models import GensimDictionary, GensimTFIDF, GensimLDA

### Get a data files list 

In [3]:
# Collect the input CSV files. glob returns matches in arbitrary
# (filesystem-dependent) order, so sort them to keep the file order -- and
# therefore the row order of everything derived from it -- stable across runs.
files = sorted(glob.glob('data/*.csv'))
files

['data/articles1.csv', 'data/articles2.csv', 'data/articles3.csv']

Let's check the header

In [4]:
# Show the column names of the first file. The header is parsed with the csv
# module (the same parser used to read the rows) instead of a manual split on
# ',', so quoted column names are displayed exactly as they will appear as
# dictionary keys. An empty file prints an empty list.
with open(files[0], newline='') as f:
    print(next(csv.reader(f), []))

['', 'id', 'title', 'publication', 'author', 'date', 'year', 'month', 'url', 'content']


### Define helper functions

In [5]:
def read_csv(fname, encoding=None):
    """Read a csv file and lazily yield each data row as a dictionary.

    The first row of the file is used as the header; every following row is
    zipped with it, so each yielded dict maps column name -> cell string.
    Cells beyond the header length (or header names beyond the row length)
    are dropped by ``zip``.

    Parameters
    ----------
    fname : str
        Path of the csv file to read.
    encoding : str, optional
        Text encoding passed to ``open``. ``None`` (the default) keeps the
        platform default encoding.

    Yields
    ------
    dict
        One dictionary per data row. An empty file yields nothing.
    """
    # newline='' is what the csv module expects: it lets the parser handle line
    # endings itself, so newlines embedded in quoted fields are kept intact.
    with open(fname, newline='', encoding=encoding) as f:
        reader = csv.reader(f)
        # A default avoids StopIteration escaping the generator on an empty
        # file (which Python 3.7+ turns into a RuntimeError).
        header = next(reader, None)
        if header is None:
            return
        for line in reader:
            yield dict(zip(header, line))
            
def to_tokens_list(doc):
    """Return the lemmas of all non-stop-word nouns in ``doc``, in order.

    ``doc`` is any iterable of token objects exposing ``pos_``, ``is_stop``
    and ``lemma_`` attributes.
    """
    lemmas = []
    for token in doc:
        # skip anything that is not a noun, and nouns that are stop words
        if token.pos_ != 'NOUN' or token.is_stop:
            continue
        lemmas.append(token.lemma_)
    return lemmas

### Instantiate spacy model

Start the spaCy model with the in and out fields defined: each data point is a dictionary, and only the text in its "content" field needs to be tokenized.

Alternatively, if each data point were a plain text string, the in and out fields could be left as None.
```python
spacy_tokenizer = SpacyWrapper(lang='en', batch_size=1000)
```

In [6]:
# Each record is a dict: the text under 'content' is parsed and the result is
# stored under 'doc'.
spacy_tokenizer = SpacyWrapper(
    lang='en',
    in_field='content',
    out_field='doc',
    batch_size=1000,
)

2018-07-05T17:31:52PDT - spacy_tokenizers.py - INFO - loading spacy...


#### Define transformations of iterators

In [7]:
# file names tranformed into iterators of rows in each file
# each file name is transformed into an iterator over the rows of that file
per_file_rows = ReMap(read_csv, files)

# the per-file row iterators are chained into one stream of records
records = ReChain(per_file_rows, name='chain rows from files')

# spacy adds a 'doc' key to each transformed row, which contains the
# spacy-parsed document
parsed_records = spacy_tokenizer.itransform(records)

# report how many rows have been transformed
out = Progress(parsed_records, max_period=5000, name='OUT')

In [8]:
# convert into a list of tokens
# reduce every parsed record to its list of tokens
token_lists = ReMap(lambda record: to_tokens_list(record['doc']), out)

# cache each row on disk so later passes do not re-run the upstream steps
# (spacy is computationally expensive)
tokens = DiskCache(token_lists)

### Instantiate gensim dictionary and train it

In [9]:
# settings handed to the GensimDictionary wrapper
dictionary_params = dict(
    no_below_raw=0,
    no_above_raw=1.,
    max_n_raw=100000,
    no_below=10,
    no_above=0.1,
    max_n=10000,
    dict_filter_every=50000,
)

# build the dictionary from the token stream; train() returns the object that
# is used from here on
gensim_dict = GensimDictionary(dictionary_params).train(tokens)

2018-07-05T17:31:53PDT - gensim_models.py - INFO - Init gensim dictionary with params:
{'no_below_raw': 0, 'no_above_raw': 1.0, 'max_n_raw': 100000, 'no_below': 10, 'no_above': 0.1, 'max_n': 10000, 'dict_filter_every': 50000}
2018-07-05T17:31:53PDT - gensim_models.py - INFO - Building gensim dictionary...
2018-07-05T17:31:53PDT - re_iter.py - INFO - Disk Cache:	Start iter number 1
2018-07-05T17:31:53PDT - re_iter.py - INFO - Disk Cache:	Saving iterable to 4476141b-8009-4d46-a7c7-d8707b251d1c
2018-07-05T17:31:53PDT - re_iter.py - INFO - reMap:	Start iter number 1
2018-07-05T17:31:53PDT - re_iter.py - INFO - OUT:	Start iter number 1
2018-07-05T17:31:53PDT - re_iter.py - INFO - reGenerate:	Start iter number 1
2018-07-05T17:31:53PDT - re_iter.py - INFO - chain rows from files:	Start iter number 1
2018-07-05T17:31:53PDT - re_iter.py - INFO - reMap:	Start iter number 1
2018-07-05T17:31:53PDT - re_iter.py - INFO - chain rows from files:	Start iter number 2
2018-07-05T17:31:53PDT - re_iter.py 

#### Convert tokens into Bag-of-Words representation

In [10]:
# Wrap the token stream in the trained dictionary's transform to get
# bag-of-words documents. Judging by the logs, nothing is iterated here; the
# work appears to start only when a downstream consumer pulls from `bow`.
bow = gensim_dict.itransform(tokens)

### Instantiate and train gensim LDA model

In [11]:
# LDA settings and the id -> token vocabulary taken from the trained dictionary
lda_settings = {'num_topics': 100}
id_to_token = gensim_dict.state.model.id2token

# NOTE(review): no random seed is visibly set, so topic numbering may differ
# between runs -- consider passing one if the wrapper supports it.
g_lda = GensimLDA(lda_params=lda_settings, id2word=id_to_token).train(bow)

2018-07-05T23:39:39PDT - gensim_models.py - INFO - Init gensim LDA with params:
{'num_topics': 100}
2018-07-05T23:39:39PDT - re_iter.py - INFO - GensimDictionary:	Start iter number 1
2018-07-05T23:39:39PDT - re_iter.py - INFO - Disk Cache:	Start iter number 2
2018-07-05T23:39:39PDT - re_iter.py - INFO - Disk Cache:	Reading saved iterable from 4476141b-8009-4d46-a7c7-d8707b251d1c
2018-07-05T23:39:57PDT - re_iter.py - INFO - Disk Cache:	Finished iter number 2	total items: 142570	total time: 17.6 sec
2018-07-05T23:39:57PDT - re_iter.py - INFO - GensimDictionary:	Finished iter number 1	total items: 142570	total time: 17.6 sec
2018-07-05T23:39:57PDT - re_iter.py - INFO - GensimDictionary:	Start iter number 2
2018-07-05T23:39:57PDT - re_iter.py - INFO - Disk Cache:	Start iter number 3
2018-07-05T23:39:57PDT - re_iter.py - INFO - Disk Cache:	Reading saved iterable from 4476141b-8009-4d46-a7c7-d8707b251d1c
  diff = np.log(self.expElogbeta)
2018-07-05T23:41:04PDT - re_iter.py - INFO - Disk Cach

## Convert BOW representation to LDA sparse vectors and join with original data

In [12]:
# LDA sparse vectors computed from the bag-of-words documents
vectors = g_lda.itransform(bow)

# pair every original record with its vector
record_vector_pairs = ReZip(records, vectors)

# merge each pair into one dict: the vector goes under 'lda', followed by the
# record's own keys (which would win if a record already had an 'lda' key)
final = ReMap(lambda pair: {'lda': pair[1], **pair[0]}, record_vector_pairs)

In [13]:
# trigger all these final calculations
# iterate the whole pipeline once and keep the resulting records in a list
final = [record for record in final]

2018-07-05T23:41:31PDT - re_iter.py - INFO - reMap:	Start iter number 1
2018-07-05T23:41:31PDT - re_iter.py - INFO - re-zip:	Start iter number 1
2018-07-05T23:41:31PDT - re_iter.py - INFO - chain rows from files:	Start iter number 3
2018-07-05T23:41:31PDT - re_iter.py - INFO - reMap:	Start iter number 3
2018-07-05T23:41:31PDT - re_iter.py - INFO - GensimLDA:	Start iter number 1
2018-07-05T23:41:31PDT - re_iter.py - INFO - GensimDictionary:	Start iter number 3
2018-07-05T23:41:31PDT - re_iter.py - INFO - Disk Cache:	Start iter number 4
2018-07-05T23:41:31PDT - re_iter.py - INFO - Disk Cache:	Reading saved iterable from 4476141b-8009-4d46-a7c7-d8707b251d1c
2018-07-05T23:45:07PDT - re_iter.py - INFO - reMap:	Finished iter number 3	total items: 3	total time: 215.9 sec
2018-07-05T23:45:07PDT - re_iter.py - INFO - chain rows from files:	Finished iter number 3	total items: 142570	total time: 215.9 sec
2018-07-05T23:45:07PDT - re_iter.py - INFO - re-zip:	Finished iter number 1	total items: 142

### Look at results

In [14]:
# inspect one enriched record (tenth from the end): original fields plus 'lda'
final[-10]

{'lda': [(14, 0.07707414031028748),
  (18, 0.48351725935935974),
  (21, 0.2209157645702362),
  (33, 0.15834416449069977),
  (55, 0.05055266246199608)],
 '': '146023',
 'id': '218073',
 'title': 'What U.S. Muslims fear from Trump',
 'publication': 'Washington Post',
 'author': 'Naureen Shah',
 'date': '2016-12-30',
 'year': '2016.0',
 'month': '12.0',
 'url': 'https://web.archive.org/web/20161231004909/https://www.washingtonpost.com/opinions/gen-kelly-has-talked-about-human-rights-will-trump-listen/2016/12/30/ebabbcea-c928-11e6-bf4b-2c064d32a4bf_story.html\n',
 'content': '   Naureen Shah is director of security and human rights at Amnesty International USA.    The Obama administration is dismantling a homeland security program created to track immigrants from   countries in an attempt to prevent   Donald Trump from fulfilling his campaign promise to create a Muslim registry. As an American Muslim and human rights advocate, I am hoping against hope that retired Gen. John F. Kelly, the h

In [15]:
# show the three highest-weighted topics of the record displayed above
for topic_id in (18, 21, 33):
    print(g_lda.state.model.print_topic(topic_id))

0.045*"immigration" + 0.036*"immigrant" + 0.019*"border" + 0.015*"migrant" + 0.012*"deportation" + 0.011*"crime" + 0.011*"enforcement" + 0.008*"asylum" + 0.008*"citizen" + 0.007*"refugee"
0.020*"rule" + 0.020*"bill" + 0.016*"judge" + 0.015*"ban" + 0.011*"ruling" + 0.010*"legislation" + 0.010*"governor" + 0.008*"lawmaker" + 0.008*"justice" + 0.007*"regulation"
0.017*"march" + 0.013*"protest" + 0.012*"town" + 0.012*"hall" + 0.012*"senator" + 0.012*"activist" + 0.010*"corruption" + 0.008*"protester" + 0.008*"crowd" + 0.007*"dinner"
