## Topic model inputs
- based off of https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/atmodel_tutorial.ipynb

**Purpose**: create corpus

In [1]:
import os
import pandas as pd
import pickle
from collections import defaultdict

In [2]:
# matplotlib is logged even though disable_existing_loggers=yes in logging_config.yaml
# https://stackoverflow.com/a/51529172/7016397
# workaround is to manually set the level before creating my logger
import logging
logging.getLogger('matplotlib').setLevel(logging.WARNING)

from usrightmedia.shared.loggers import get_logger
LOGGER = get_logger(filename = '01-topic-model-inputs', logger_type='main')

In [3]:
from usrightmedia.shared.topics_utils import *

### 1. Load documents from Elasticsearch

In [4]:
from inca import Inca
myinca = Inca()

query = {
    "query": {
        "bool": {
            "filter": [
                {"term": {"should_include": True}}
            ]
        }
    }
}

docs = myinca.database.document_generator(query)



In [5]:
titles = [] # docs option 1
leads = [] # docs option 2
texts = [] # docs option 3
doctypes = []
ids = []

In [6]:
for n, doc in enumerate(docs):
    title = doc["_source"]["title"]
    text = doc["_source"]["article_maintext"] # as scraped by news-please
    
    lead = text.split()[0:100]
    lead = ' '.join(lead) 
    lead = title + lead
    
    titles.append(title)
    leads.append(lead)
    texts.append(text)
    
    doctypes.append(doc["_source"]['doctype'])
    ids.append(doc['_id'])

100%|██████████| 727748/727748 [04:43<00:00, 2569.45it/s]


In [7]:
df_data_inputs = pd.DataFrame({"doc_id": ids,
                               "doctype": doctypes,
                               "title": titles,
                               "lead": leads,
                               "article_maintext": texts})

In [8]:
df_data_inputs

Unnamed: 0,doc_id,doctype,title,lead,article_maintext
0,AmericanRenaissance_1128638341,americanrenaissance,Congresswoman Hopes Reparations Bill is Path t...,Congresswoman Hopes Reparations Bill is Path t...,"Nicholas Ballasy, PJ Media, December 30, 2018\..."
1,Breitbart_621129461,breitbart,Portrait of Ronald Reagan Defaced During Break...,Portrait of Ronald Reagan Defaced During Break...,Someone vandalized a portrait of former Presid...
2,Breitbart_1483020896,breitbart,Study: Opioid Deaths Rise in Towns Where U.S. ...,Study: Opioid Deaths Rise in Towns Where U.S. ...,Opioid deaths sharply rise in American communi...
3,Breitbart_1483567174,breitbart,"Uber, Postmates Sue California to Stop Gig Wor...","Uber, Postmates Sue California to Stop Gig Wor...",Ride-sharing giant Uber and courier service Po...
4,AmericanRenaissance_1812166693,americanrenaissance,Dark Money Behemoth That Hosts BLM Foundation ...,Dark Money Behemoth That Hosts BLM Foundation ...,"Joe Schoffstall, Washington Free Beacon, Decem..."
...,...,...,...,...,...
727743,WashingtonExaminer_999923116,washingtonexaminer,"White House, DHS rip Joe Scarborough for compa...","White House, DHS rip Joe Scarborough for compa...",The White House and Department of Homeland Sec...
727744,WashingtonExaminer_999923435,washingtonexaminer,The 50 years since MLK's assassination,The 50 years since MLK's assassinationFifty ye...,"Fifty years ago this evening, the Rev. Dr. Mar..."
727745,WashingtonExaminer_999951831,washingtonexaminer,Mika Brzezinski says Trump is upset he can't w...,Mika Brzezinski says Trump is upset he can't w...,“Morning Joe” cohost Mika Brzezinski said some...
727746,WashingtonExaminer_999952161,washingtonexaminer,First person sentenced in Robert Mueller's Rus...,First person sentenced in Robert Mueller's Rus...,A federal judge on Tuesday sentenced the first...


In [9]:
%%time
with open(os.path.join(INPUTS_DIR, 'df_data_inputs.pkl'), 'wb') as file:
    pickle.dump(df_data_inputs, file)

CPU times: user 4.49 s, sys: 3.15 s, total: 7.64 s
Wall time: 7.64 s


### 2. Pre-process documents

- Manually install `en_core_web_lg` into virtual environment (https://github.com/explosion/spaCy/issues/4297)

```
conda activate usrightmedia

python -m spacy download en_core_web_lg
```

- not using trf (transformer-based model) because it has token limit of 512 (https://github.com/explosion/spaCy/issues/6939)


In [10]:
%%time
docs_titles = preprocess_docs(titles, 'titles', INPUTS_DIR)

CPU times: user 18min 24s, sys: 481 ms, total: 18min 24s
Wall time: 18min 24s


In [11]:
%%time
docs_leads = preprocess_docs(leads, 'leads', INPUTS_DIR)

CPU times: user 2h 23min 56s, sys: 17.1 s, total: 2h 24min 13s
Wall time: 2h 24min 13s


In [12]:
%%time
docs_texts = preprocess_docs(texts, 'texts', INPUTS_DIR)

CPU times: user 10h 50min 29s, sys: 1h 57min 23s, total: 12h 47min 52s
Wall time: 12h 47min 50s


### 3. Create dictionary

In [13]:
%%time
dict_titles = save_dictionary(docs_titles, 'titles', INPUTS_DIR)

CPU times: user 6.05 s, sys: 1.71 ms, total: 6.05 s
Wall time: 6.05 s


In [14]:
%%time
dict_leads = save_dictionary(docs_leads, 'leads', INPUTS_DIR)

CPU times: user 31.1 s, sys: 15 ms, total: 31.2 s
Wall time: 31.2 s


In [15]:
%%time
dict_texts = save_dictionary(docs_texts, 'texts', INPUTS_DIR)

CPU times: user 2min 4s, sys: 17.7 ms, total: 2min 4s
Wall time: 2min 4s


### 4. Create corpus

In [16]:
%%time
corp_titles, corp_tfidf_titles = save_corpus(dict_titles, docs_titles, 'titles', INPUTS_DIR)

CPU times: user 35.1 s, sys: 577 ms, total: 35.7 s
Wall time: 35.7 s


In [17]:
%%time
corp_leads, corp_tfidf_leads = save_corpus(dict_leads, docs_leads, 'leads', INPUTS_DIR)

CPU times: user 2min 31s, sys: 2.06 s, total: 2min 33s
Wall time: 2min 33s


In [18]:
%%time
corp_texts, corp_tfidf_texts = save_corpus(dict_texts, docs_texts, 'texts', INPUTS_DIR)

CPU times: user 9min 10s, sys: 7.43 s, total: 9min 17s
Wall time: 9min 17s


### 5. Summary

In [19]:
def print_input_summary(label, dictionary, corpus):
    print("-"*120)
    print(f"DOCS VERSION: {label}")
    print(f'Number of unique tokens: {len(dictionary)}')
    print(f'Number of documents: {len(corpus)}')

In [20]:
print_input_summary('titles', dict_titles, corp_titles)
print_input_summary('leads', dict_leads, corp_leads)
print_input_summary('texts', dict_texts, corp_texts)

------------------------------------------------------------------------------------------------------------------------
DOCS VERSION: titles
Number of unique tokens: 10074
Number of documents: 727748
------------------------------------------------------------------------------------------------------------------------
DOCS VERSION: leads
Number of unique tokens: 31988
Number of documents: 727748
------------------------------------------------------------------------------------------------------------------------
DOCS VERSION: texts
Number of unique tokens: 60511
Number of documents: 727748
