In [1]:
import os
import json

import pandas as pd
import gc
import multiprocessing as mp

In [2]:
%%capture

# LdaMallet
# Dictionary (gensim)
# build_docs
# transform_dt
# get_tw
# get_top_words
%run ./LDAModule.ipynb

# DocsManager
# build_docs
%run ../../DocsManager.ipynb
## Jupyter.notebook.save_checkpoint()

# get_corpus_path
# get_txt_clean_path
%run ../../path_manager.ipynb

# CorpusCleaner
%run ../../DataCleanerModule.ipynb

In [3]:
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.ldamodel import LdaModel
from gensim.models.wrappers.ldamallet import malletmodel2ldamodel, LdaMallet

In [4]:
# Sanity check: display the corpus directory that get_corpus_path resolves for the IMF corpus.
get_corpus_path('IMF')

'/home/wb536061/wb-nlp/CORPUS/IMF'

In [5]:
# --- Run configuration ---------------------------------------------------------------
# Output root for this experiment: the path returned by get_models_path('GENSIMLDA') with
# every occurrence of 'MODELS' rewritten to 'MODELS.gensim.staging' (str.replace is global).
MODELS_PATH = get_models_path('GENSIMLDA').replace('MODELS', 'MODELS.gensim.staging')
MALLET_BINARY_PATH = "../Mallet/bin/mallet"

NUM_WORKERS = get_workers()  # size of the multiprocessing pool used for doc2bow
NUM_ITERS = 196  # embedded in the log file names created by this notebook
MIN_TOKEN_COUNT = 100  # Exclude documents with less than 100 tokens
NGRAM_FILE = '../../whitelists/whitelist_ngrams_cleaned.csv'  # '../../whitelists/whitelist_ngrams_truncated_cleaned.csv'
# os.cpu_count() returns None when the CPU count cannot be determined; fall back to 1 so the
# arithmetic below cannot raise TypeError. The result is always at least 2.
DOC_PROCESSING_WORKERS = 2 * max(1, (os.cpu_count() or 1) - 4)

In [6]:
# Display the resolved staging output directory to confirm the path rewrite.
MODELS_PATH

'/home/wb536061/wb-nlp/MODELS.gensim.staging/GENSIMLDA'

In [7]:
# Create the staging output directory together with any missing parents.
# exist_ok=True makes the cell idempotent and removes the check-then-create race of a
# separate isdir() test.
os.makedirs(MODELS_PATH, exist_ok=True)

In [8]:
import logging
import gc

TRAINING_MODEL_ID = 'GENSIMLDA'

# Root logging configuration: INFO and above goes to a log file in the working directory
# whose name carries the lower-cased model id and NUM_ITERS.
logging.basicConfig(
    filename=f'./{TRAINING_MODEL_ID.lower()}-iters_{NUM_ITERS}.log',
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Named logger used by the cells below.
logger = logging.getLogger(f'{TRAINING_MODEL_ID.lower()}-logger')


In [9]:
%%time
# Train one gensim LdaModel per corpus / corpus partition / topic count and save each model
# under MODELS_PATH/<CORPUS_ID>-<CORPUS_PART>_<NUM_TOPICS>/data/.
# The dictionary and the bag-of-words corpus are built once per partition and reused for
# every topic count trained on that partition.
# for CORPUS_ID in ['IMF', 'WB']:
for CORPUS_ID in ['WB']:
    if CORPUS_ID == 'WB':
        # Both partition lists are currently empty, so only the 'ALL' partition is trained.
        region_partitions = []  # ['AFR', 'EAP', 'ECA', 'LAC', 'MENA', 'RoW', 'SAR', 'WLD']
        doctype_partitions = []  # ['BD', 'CF', 'ESW', 'PD', 'PR']
        doctype_partitions = doctype_partitions[::-1]
        corpus_partitions = ['ALL'] + doctype_partitions + region_partitions
    else:
        corpus_partitions = ['ALL']

    num_topics = [50, 100, 200]  # [25, 50, 100]

    for CORPUS_PART in corpus_partitions:
        docs = build_docs(
            metadata_filename=os.path.join(get_corpus_path(CORPUS_ID), f'{CORPUS_ID.lower()}_metadata_complete.csv'),
            cleaned_files_dir=get_txt_clean_path(CORPUS_ID),
            model_output_dir=MODELS_PATH  # Use flat directory as discussed...
        )

        logger.info(f'Creating partitioned docs and loading files for {CORPUS_ID}-{CORPUS_PART}...')

        docs.set_ngram_mapper(NGRAM_FILE, cleaner=None)
        docs.set_min_token_count(MIN_TOKEN_COUNT)
        docs_filtered, meta = docs.filter_doclist(CORPUS_PART, corpus_id=CORPUS_ID, save=True, return_meta=True, pool_workers=DOC_PROCESSING_WORKERS)

        print(docs_filtered.shape)
        logger.info(f'Building model for {docs_filtered.shape[0]} documents...')
        if docs_filtered.empty:
            # Nothing to train on for this partition.
            continue

        logger.info('Building dictionary...')
        g_dict = Dictionary(docs_filtered.text.str.split())
        g_dict.filter_extremes(no_below=10, no_above=0.99, keep_n=200000, keep_tokens=None)  # Exclude words appearing in less than 10 docs.
        # Explicit id -> token mapping; handed to LdaModel as id2word below.
        g_dict.id2token = {token_id: token for token, token_id in g_dict.token2id.items()}

        logger.info('Performing doc2bow...')
        # Convert every document to bag-of-words in parallel; the pool is shut down when the
        # with-block exits.
        with mp.Pool(NUM_WORKERS) as pool:
            logger.info('Performing parallel doc2bow...')
            corpus = pool.map(g_dict.doc2bow, docs_filtered.text.str.split())
            logger.info('Completed parallel doc2bow...')

        for NUM_TOPICS in num_topics:

            MODEL_ID = f"{CORPUS_PART}_{NUM_TOPICS}"
            logger.info(f'Starting process for {MODEL_ID}...')

            MODEL_FOLDER = os.path.join(MODELS_PATH, f'{CORPUS_ID}-{MODEL_ID}')

            MODEL_DATA_FOLDER = os.path.join(MODEL_FOLDER, 'data')
            MODEL_MALLET_FOLDER = os.path.join(MODEL_FOLDER, 'mallet')

            # exist_ok=True keeps re-runs idempotent without a separate isdir() check.
            os.makedirs(MODEL_DATA_FOLDER, exist_ok=True)
            os.makedirs(MODEL_MALLET_FOLDER, exist_ok=True)

            # Per-model log file, attached only while this model is being trained and saved.
            lfh = logging.FileHandler(f'./{CORPUS_ID.lower()}-iters_{NUM_ITERS}-{MODEL_ID}.log')
            lfh.setLevel(logging.INFO)
            formatter = logging.Formatter('%(asctime)s %(levelname)-8s %(message)s')
            lfh.setFormatter(formatter)
            logger.addHandler(lfh)

            try:
                logger.info('Training gensim LDA model...')

                # NOTE(review): iterations is fixed at 50 here although the log file names
                # embed NUM_ITERS — confirm which value is intended.
                # gensim's LdaMulticore is the parallel alternative if single-process
                # training becomes too slow.
                model = LdaModel(
                    corpus=corpus, num_topics=NUM_TOPICS, id2word=g_dict.id2token,
                    distributed=False,
                    chunksize=10000, passes=1, update_every=1,
                    alpha='symmetric', eta=None,
                    decay=0.5, offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001,
                    minimum_probability=0.01, random_state=1029, ns_conf=None,
                    minimum_phi_value=0.01,
                    per_word_topics=False, callbacks=None
                )

                logger.info('Completed training gensim LDA model...')

                # The former Mallet-specific exports (doc_topics / topic_words / top_words CSVs
                # and the dfr-browser JSON files) were disabled and have been removed from this
                # cell; they read model.fdoctopics() and model.word_topics, which the LdaModel
                # trained here is not known to provide. Re-implement them against LdaModel if
                # those artefacts are needed.

                logger.info('Saving lda model...')
                # The '.mallet.lda' suffix is kept so the file name matches what the later
                # LdaModel.load cell in this notebook expects.
                model.save(os.path.join(MODEL_DATA_FOLDER, f'{CORPUS_ID}_lda_model_{MODEL_ID}.mallet.lda'))

                logger.info(f'lda model for {CORPUS_ID} completed: {MODEL_ID}...')
            finally:
                # Always detach and close the per-model handler, even if training or saving
                # raised, so the file descriptor is released and later models do not write
                # into this model's log.
                logger.removeHandler(lfh)
                lfh.close()

            # Release the trained model before starting the next topic count.
            del model
            gc.collect()

        # Free the per-partition data before the next partition is loaded.
        del docs_filtered
        del docs.doclist
        del docs
        del meta
        del g_dict
        del corpus
        gc.collect()

In [22]:
%%time

# Rebuild the docs object outside the training loop for interactive inspection.
# CORPUS_ID and CORPUS_PART are not set in this cell: they hold whatever values the training
# loop left behind.
docs = build_docs(
    metadata_filename=os.path.join(
        get_corpus_path(CORPUS_ID),
        f'{CORPUS_ID.lower()}_metadata_complete.csv',
    ),
    cleaned_files_dir=get_txt_clean_path(CORPUS_ID),
    model_output_dir=MODELS_PATH,  # flat output directory
)

logger.info(f'Creating partitioned docs and loading files for {CORPUS_ID}-{CORPUS_PART}...')

# Same n-gram whitelist and minimum-token filter as the training loop uses.
docs.set_ngram_mapper(NGRAM_FILE, cleaner=None)
docs.set_min_token_count(MIN_TOKEN_COUNT)

CPU times: user 4.26 s, sys: 0 ns, total: 4.26 s
Wall time: 4.36 s


In [36]:
# %%time
# docs_filtered, meta = docs.filter_doclist(CORPUS_PART, corpus_id=CORPUS_ID, save=True, return_meta=True, pool_workers=DOC_PROCESSING_WORKERS)
# 'world_bank' in docs_filtered[docs_filtered['id'] == 'wb_10000660'].text.values[0]

CPU times: user 8min 50s, sys: 5min 1s, total: 13min 51s
Wall time: 15min 28s


In [43]:
%%time
# g_dict = Dictionary(docs_filtered.text.str.split())
# g_dict.filter_extremes(no_below=10, no_above=0.99, keep_n=200000, keep_tokens=None)  # Exclude words appearing in less than 10 docs.
# g_dict.id2token = {id: token for token, id in g_dict.token2id.items()}
# g_dict.id2token[152], g_dict.token2id['purchasing_power_parity']

CPU times: user 9min 37s, sys: 1min 33s, total: 11min 11s
Wall time: 11min 11s


In [30]:
# with open('/home/wb536061/wb-nlp/CORPUS/WB/TXT_CLEAN/wb_10000660.txt') as fl:
#     d = fl.read()
    
# dd = docs.load_text('/home/wb536061/wb-nlp/CORPUS/WB/TXT_CLEAN/wb_10000660.txt')
# 'world_bank' in dd

In [124]:
# Reload a saved gensim model from disk. MODEL_DATA_FOLDER, CORPUS_ID and MODEL_ID are not set
# in this cell; in the visible cells they are only assigned inside the training loop, so this
# loads the last (corpus, partition, topic count) combination that loop processed.
model = LdaModel.load(os.path.join(MODEL_DATA_FOLDER, f'{CORPUS_ID}_lda_model_{MODEL_ID}.mallet.lda'))

In [160]:
# Path of the saved model to inspect. The commented-out alternative points at the gensim
# staging tree; the active line points at the WB-ALL_50 model under MODELS/LDA.
# NOTE(review): hard-coded absolute, user-specific path — consider deriving it from
# get_models_path so the notebook runs on other machines.
# model_path = '/home/wb536061/wb-nlp/MODELS.gensim.staging/GENSIMLDA/WB-ALL_50/data/WB_lda_model_ALL_50.mallet.lda'
model_path = '/home/wb536061/wb-nlp/MODELS/LDA/WB-ALL_50/data/WB_lda_model_ALL_50.mallet.lda'

In [162]:
# Load the saved model at model_path through the LdaMallet wrapper class. This rebinds
# `model`, replacing whatever model was loaded earlier in the session.
model = LdaMallet.load(model_path)

In [163]:
# Vocabulary spot check: is the token 'stunting' among the model's id2word entries?
any(model.id2word[i] == 'stunting' for i in model.id2word)

True

In [202]:
%%time
# Infer the topic distribution of the document at position 23 with the currently loaded
# model, order the (topic_id, probability) pairs by descending probability, and sum both
# columns. In the recorded output column 0 sums to 1225 (= 0 + 1 + ... + 49) and column 1
# to 1.0, i.e. all 50 topics were returned and their probabilities add up to one.
# Requires g_dict and docs_filtered to still exist in the kernel.
# model[g_dict.doc2bow(docs_filtered.iloc[0].text.split())]
pd.DataFrame(sorted(model[g_dict.doc2bow(docs_filtered.iloc[23].text.split())], key=lambda x: x[1])[::-1]).sum()

CPU times: user 1.05 s, sys: 0 ns, total: 1.05 s
Wall time: 34.5 s


0    1225.0
1       1.0
dtype: float64

In [174]:
# Show the top-weighted words of topic 32 for the model currently bound to `model`.
model.print_topic(32)

'0.056*"city" + 0.041*"urban" + 0.019*"housing" + 0.018*"area" + 0.018*"project" + 0.017*"county" + 0.016*"construction" + 0.014*"taking_land" + 0.014*"local" + 0.014*"municipal"'

In [179]:
%%time
# Convert the loaded Mallet wrapper model into a gensim model object (`glda`) using a tight
# gamma_threshold and a high iteration cap for inference.
glda = malletmodel2ldamodel(model, gamma_threshold=0.00001, iterations=500)
# Lower the probability cut-off on the converted model.
# NOTE(review): presumably topics below this probability are dropped from inference
# results — confirm against the gensim documentation.
glda.minimum_probability = 0.001

CPU times: user 478 ms, sys: 0 ns, total: 478 ms
Wall time: 477 ms


In [194]:
# Show the top-weighted words of topic 40 in the converted gensim model.
glda.print_topic(40)

'0.029*"agreement" + 0.029*"project" + 0.023*"association" + 0.022*"borrower" + 0.022*"bank" + 0.021*"section" + 0.014*"date" + 0.011*"make" + 0.011*"nonperforming_loan" + 0.011*"recipient"'

In [205]:
%%time
# Same column-sum check as for the Mallet wrapper, but using the converted gensim model on
# the document at position 23. The recorded sums are identical (1225 and 1.0), while the
# recorded wall time is 37.8 ms versus 34.5 s for the wrapper cell.
pd.DataFrame(sorted(glda[g_dict.doc2bow(docs_filtered.iloc[23].text.split())], key=lambda x: x[1])[::-1]).sum()

CPU times: user 1.74 s, sys: 856 ms, total: 2.6 s
Wall time: 37.8 ms


0    1225.0
1       1.0
dtype: float64

In [192]:
# Display the cleaned text of the document at position 19 of docs_filtered.
docs_filtered.iloc[19].text

'weekly global economic brief number information org gem strong growth united state japan help offset impact weak growth euro area fiscal tightening financial turmoil weak confidence cloud growth prospect assume dramatic deterioration fiscal crisis forward look indicator suggest manufacture output remain weak month ahead output euro area likely contract external short_term debt developing_country increase trade finance relate increase china country high level short_term debt subject roll risk especially today febrile economic climate strong growth united state japan rebound growth japan compensate expect weakness euro area growth accelerate compensate euro area weakness united state household percentage point contribution growth business spend japan economy boom net trade estimate domestic demand net inventory export bounce relative strength economy reflect domestic final sale recovery earthquake contrast activity euro area remain weak inventory continue build amid anemic domestic dema

In [176]:
%%time
# Infer topics for the first filtered document with the converted gensim model; the result
# is displayed as a list of (topic_id, probability) pairs.
glda[g_dict.doc2bow(docs_filtered.iloc[0].text.split())]

CPU times: user 5.89 ms, sys: 0 ns, total: 5.89 ms
Wall time: 3.82 ms


[(2, 0.20384429903016774),
 (3, 0.1358098533911168),
 (4, 0.1122730150862996),
 (20, 0.028335651206527986),
 (26, 0.012637532333221502),
 (37, 0.012046880751905275),
 (40, 0.21122673420712978),
 (47, 0.04032272962006966),
 (49, 0.04689379618932216)]

In [164]:
# Show the top-weighted words of topic 32 for the model currently bound to `model`.
model.print_topic(32)

'0.056*"city" + 0.041*"urban" + 0.019*"housing" + 0.018*"area" + 0.018*"project" + 0.017*"county" + 0.016*"construction" + 0.014*"taking_land" + 0.014*"local" + 0.014*"municipal"'

In [159]:
# Show the top-weighted words of topic 32 for the model currently bound to `model`.
# NOTE(review): the recorded output differs from the other print_topic(32) cells; this cell's
# execution count (159) precedes the LdaMallet.load cell (162), so presumably a different
# model was bound to `model` when it ran — verify before relying on this output.
model.print_topic(32)

'0.029*"bank" + 0.025*"panel" + 0.025*"project" + 0.021*"request" + 0.020*"report" + 0.019*"management" + 0.013*"issue" + 0.010*"response" + 0.009*"review" + 0.009*"information"'

In [12]:
# Display a selection of the model's topics as (topic_id, top-words string) pairs.
model.print_topics()

[(124,
  '0.056*"migration" + 0.056*"remittance" + 0.049*"migrant" + 0.027*"country" + 0.016*"worker" + 0.011*"percent" + 0.008*"international" + 0.007*"flow" + 0.007*"send" + 0.007*"immigrant"'),
 (130,
  '0.379*"phase" + 0.039*"salaam" + 0.013*"dart" + 0.009*"soc" + 0.007*"study" + 0.006*"project" + 0.006*"annex" + 0.006*"report" + 0.005*"cost" + 0.005*"got"'),
 (90,
  '0.034*"food" + 0.030*"agricultural" + 0.024*"agriculture" + 0.021*"price" + 0.014*"production" + 0.012*"rice" + 0.011*"increase" + 0.011*"market" + 0.009*"policy" + 0.009*"farmer"'),
 (14,
  '0.017*"project" + 0.008*"water" + 0.007*"area" + 0.007*"cost" + 0.006*"use" + 0.005*"land" + 0.005*"total" + 0.004*"impact" + 0.004*"development" + 0.004*"include"'),
 (66,
  '0.083*"town" + 0.028*"wadi" + 0.026*"project" + 0.014*"lot" + 0.013*"royal" + 0.011*"initial" + 0.010*"text" + 0.009*"area" + 0.009*"supply" + 0.009*"consultant"'),
 (83,
  '0.048*"actual" + 0.046*"date" + 0.039*"project" + 0.031*"value" + 0.029*"target" + 

In [14]:
# Largest value obtained by iterating model.id2word (36222 in the recorded run).
# NOTE(review): presumably this is the highest token id in the vocabulary — confirm that
# iterating id2word yields ids.
max(model.id2word)

36222

In [None]:
# Display the (rows, columns) shape of docs_filtered. This cell has no execution count in the
# saved notebook, i.e. it was not run in the recorded session.
docs_filtered.shape

In [11]:
# !../Mallet/bin/mallet import-file --preserve-case --keep-sequence --remove-stopwords --token-regex "\S+" --input /R/NLP/MODELS/LDA/IMF-ALL_20/mallet/IMF-ALL_20_corpus.txt --output /R/NLP/MODELS/LDA/IMF-ALL_20/mallet/IMF-ALL_20_corpus.mallet

In [235]:
# http://microdatahub.com/topicsmodeling/dfr/topic_browser/browser.php?model=data50_SAR&type=SAR&topic_count=50#/doc/9622
