## Topic model inputs
- Based on the gensim author-topic model tutorial: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/atmodel_tutorial.ipynb

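**Purpose**: load the selected documents from Elasticsearch, pre-process them, and create the dictionary and corpus inputs (for titles, leads, and full texts) used by the topic models.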

In [1]:
import os
import pandas as pd
import pickle
from collections import defaultdict

In [2]:
# matplotlib's logger still emits records even though logging_config.yaml sets
# disable_existing_loggers=yes (see https://stackoverflow.com/a/51529172/7016397).
# Workaround: manually raise matplotlib's level to WARNING *before* the project
# logger is created, so the order of the statements in this cell matters.
import logging
logging.getLogger('matplotlib').setLevel(logging.WARNING)

from usrightmedia.shared.loggers import get_logger
# Notebook-level logger; `filename` presumably names the log file after this
# notebook -- TODO confirm in usrightmedia.shared.loggers.get_logger.
LOGGER = get_logger(filename = '01-topic-model-inputs', logger_type='main')

In [3]:
# Import only the names this notebook uses rather than a wildcard import, so it
# is clear where INPUTS_DIR and the helper functions come from and nothing else
# silently lands in the notebook namespace.
from usrightmedia.shared.topics_utils import (
    INPUTS_DIR,
    preprocess_docs,
    save_corpus,
    save_dictionary,
)

### 1. Load documents from Elasticsearch

In [4]:
from inca import Inca

# Handle to the INCA instance whose database holds the scraped articles.
myinca = Inca()

# Elasticsearch bool query with a single `term` filter: select only the
# documents whose `should_include` field is True.
query = {"query": {"bool": {"filter": [{"term": {"should_include": True}}]}}}

# Iterable over the matching hits; it is consumed once by the collection loop.
docs = myinca.database.document_generator(query)



In [5]:
# Index-aligned accumulators filled while iterating over the hits.
# Three alternative document representations are collected:
#   option 1 -> titles, option 2 -> leads, option 3 -> texts
titles, leads, texts = [], [], []
# Metadata for each document: source doctype and document id.
doctypes, ids = [], []

In [6]:
# Collect, for every hit, the three alternative document representations
# (title, lead, full text) plus its doctype and id. The five lists stay
# index-aligned because each iteration appends exactly once to each of them.
for doc in docs:
    source = doc["_source"]
    title = source["title"]
    text = source["article_maintext"]  # as scraped by news-please

    # Lead = title followed by the first 100 whitespace-separated words of the
    # article body. The explicit space separator keeps the title's last word
    # from fusing with the body's first word into a single bogus token.
    lead_words = text.split()[:100]
    lead = title + " " + " ".join(lead_words)

    titles.append(title)
    leads.append(lead)
    texts.append(text)

    doctypes.append(source["doctype"])
    ids.append(doc["_id"])

100%|██████████| 173504/173504 [01:19<00:00, 2191.11it/s]


In [7]:
# One row per document; the columns come from the index-aligned lists built
# while iterating over the Elasticsearch hits.
df_data_inputs = pd.DataFrame(
    dict(
        doc_id=ids,
        doctype=doctypes,
        title=titles,
        lead=leads,
        article_maintext=texts,
    )
)

In [8]:
# Show the assembled frame as the cell output to eyeball the collected columns.
df_data_inputs

Unnamed: 0,doc_id,doctype,title,lead,article_maintext
0,Breitbart_1554255474,breitbart,"Victoria's Secret Launches Transgender, Plus-S...","Victoria's Secret Launches Transgender, Plus-S...",Victoria’s Secret chose the middle of a worldw...
1,Breitbart_1553551951,breitbart,Coronavirus: California College Student Delive...,Coronavirus: California College Student Delive...,A college student is helping elderly people in...
2,Breitbart_1553156914,breitbart,Never Trump Leader Rick Wilson Mocks Melania T...,Never Trump Leader Rick Wilson Mocks Melania T...,First lady Melania Trump will appear in public...
3,Breitbart_1554255466,breitbart,Richard Burr Urges Ethics Probe as Criticism G...,Richard Burr Urges Ethics Probe as Criticism G...,Sen. Richard Burr (R-NC) is urging the Senate ...
4,Breitbart_1554086370,breitbart,'No Exceptions': Philippines Bans All Foreigne...,'No Exceptions': Philippines Bans All Foreigne...,The Philippines stopped issuing visas to forei...
...,...,...,...,...,...
173499,WashingtonExaminer_984965802,washingtonexaminer,FISA surveillance bill barely advances in clos...,FISA surveillance bill barely advances in clos...,The Senate just barely advanced legislation on...
173500,WashingtonExaminer_985026406,washingtonexaminer,Byron York: Comey told Congress FBI agents did...,Byron York: Comey told Congress FBI agents did...,"In March 2017, then-FBI Director James Comey b..."
173501,WashingtonExaminer_985090666,washingtonexaminer,DHS secretary on Trump's 'shithole' remark: 'I...,DHS secretary on Trump's 'shithole' remark: 'I...,Homeland Security Secretary Kirstjen Nielsen t...
173502,WashingtonExaminer_985177337,washingtonexaminer,Oil group chief boasts of America's 'energy ab...,Oil group chief boasts of America's 'energy ab...,The American Petroleum Institute boasted Tuesd...


In [9]:
%%time
# Persist the raw inputs frame as a pickle inside INPUTS_DIR.
inputs_pickle_path = os.path.join(INPUTS_DIR, 'df_data_inputs.pkl')
with open(inputs_pickle_path, 'wb') as pkl_file:
    pickle.dump(df_data_inputs, pkl_file)

CPU times: user 1.01 s, sys: 784 ms, total: 1.8 s
Wall time: 1.79 s


### 2. Pre-process documents

- Manually install `en_core_web_lg` into virtual environment (https://github.com/explosion/spaCy/issues/4297)

```
conda activate usrightmedia

cd ~/work/us-right-media-dev/usrightmedia/data

wget -c https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.0.0/en_core_web_lg-3.0.0.tar.gz

pip install ~/work/us-right-media-dev/usrightmedia/data/en_core_web_lg-3.0.0.tar.gz
```

- Not using `en_core_web_trf` (the transformer-based model) because it has a token limit of 512 (https://github.com/explosion/spaCy/issues/6939)


In [10]:
%%time
# Pre-process the title strings (docs option 1). The 'titles' label and
# INPUTS_DIR are passed to the helper -- presumably to name and persist its
# output; TODO confirm in topics_utils.preprocess_docs.
docs_titles = preprocess_docs(titles, 'titles', INPUTS_DIR)

CPU times: user 4min 8s, sys: 111 ms, total: 4min 8s
Wall time: 4min 8s


In [11]:
%%time
# Pre-process the leads (docs option 2: title plus the first 100 words of the
# article body). Recorded run: roughly 32 minutes wall time.
docs_leads = preprocess_docs(leads, 'leads', INPUTS_DIR)

CPU times: user 32min 4s, sys: 17 s, total: 32min 21s
Wall time: 32min 22s


In [12]:
%%time
# Pre-process the full article texts (docs option 3). This is the slowest step
# of the notebook: the recorded run took about 2h 50min wall time.
docs_texts = preprocess_docs(texts, 'texts', INPUTS_DIR)

CPU times: user 2h 28min 20s, sys: 21min 40s, total: 2h 50min
Wall time: 2h 50min 1s


### 3. Create dictionary

In [13]:
%%time
# Dictionary for the pre-processed titles; its len() is what the summary
# reports as the number of unique tokens. The helper presumably also writes the
# dictionary under INPUTS_DIR -- TODO confirm in topics_utils.save_dictionary.
dict_titles = save_dictionary(docs_titles, 'titles', INPUTS_DIR)

CPU times: user 1.33 s, sys: 369 µs, total: 1.33 s
Wall time: 1.33 s


In [14]:
%%time
# Dictionary for the pre-processed leads; its len() is what the summary
# reports as the number of unique tokens for the 'leads' version.
dict_leads = save_dictionary(docs_leads, 'leads', INPUTS_DIR)

CPU times: user 6.32 s, sys: 248 µs, total: 6.32 s
Wall time: 6.32 s


In [15]:
%%time
# Dictionary for the pre-processed full texts; its len() is what the summary
# reports as the number of unique tokens for the 'texts' version.
dict_texts = save_dictionary(docs_texts, 'texts', INPUTS_DIR)

CPU times: user 25.9 s, sys: 16.2 ms, total: 25.9 s
Wall time: 25.9 s


### 4. Create corpus

In [16]:
%%time
# save_corpus returns a pair; going by the variable names these are presumably
# the bag-of-words corpus and its TF-IDF-weighted counterpart for the titles
# -- TODO confirm in topics_utils.save_corpus.
corp_titles, corp_tfidf_titles = save_corpus(dict_titles, docs_titles, 'titles', INPUTS_DIR)

CPU times: user 6.88 s, sys: 143 ms, total: 7.02 s
Wall time: 7.02 s


In [17]:
%%time
# Corpus pair for the leads, built from the leads dictionary and the
# pre-processed leads (presumably plain and TF-IDF-weighted -- TODO confirm).
corp_leads, corp_tfidf_leads = save_corpus(dict_leads, docs_leads, 'leads', INPUTS_DIR)

CPU times: user 33.9 s, sys: 509 ms, total: 34.4 s
Wall time: 34.4 s


In [18]:
%%time
# Corpus pair for the full texts, built from the texts dictionary and the
# pre-processed texts (presumably plain and TF-IDF-weighted -- TODO confirm).
corp_texts, corp_tfidf_texts = save_corpus(dict_texts, docs_texts, 'texts', INPUTS_DIR)

CPU times: user 2min 7s, sys: 1.77 s, total: 2min 8s
Wall time: 2min 8s


### 5. Summary

In [19]:
def print_input_summary(label, dictionary, corpus):
    """Print a short size report for one version of the topic-model inputs.

    Args:
        label: Name of the docs version shown in the report header.
        dictionary: Sized collection of tokens; its length is reported as the
            number of unique tokens.
        corpus: Sized collection of documents; its length is reported as the
            number of documents.

    Returns:
        None. The report is written to standard output.
    """
    rule = "-" * 120
    print(rule)
    print("DOCS VERSION: {}".format(label))
    print("Number of unique tokens: {}".format(len(dictionary)))
    print("Number of documents: {}".format(len(corpus)))

In [20]:
# Report dictionary and corpus sizes for each of the three docs versions.
for summary_label, summary_dict, summary_corpus in (
    ('titles', dict_titles, corp_titles),
    ('leads', dict_leads, corp_leads),
    ('texts', dict_texts, corp_texts),
):
    print_input_summary(summary_label, summary_dict, summary_corpus)

------------------------------------------------------------------------------------------------------------------------
DOCS VERSION: titles
Number of unique tokens: 4768
Number of documents: 173504
------------------------------------------------------------------------------------------------------------------------
DOCS VERSION: leads
Number of unique tokens: 16387
Number of documents: 173504
------------------------------------------------------------------------------------------------------------------------
DOCS VERSION: texts
Number of unique tokens: 33833
Number of documents: 173504
