In [1]:
import os
import json

import pandas as pd
import gc
import multiprocessing as mp

In [2]:
%%capture

# This cell executes sibling notebooks with %run so that the names they define
# become available in this kernel; %%capture silences their output.
# The lists below record which names this notebook expects from each one.

# Expected from LDAModule.ipynb:
# LdaMallet
# Dictionary (gensim)
# build_docs
# transform_dt
# get_tw
# get_top_words
%run ./LDAModule.ipynb

# Expected from DocsManager.ipynb:
# DocsManager
# build_docs
# NOTE(review): build_docs is listed for both LDAModule and DocsManager; if both
# define it, the later %run presumably wins - confirm which one is intended.
%run ../../DocsManager.ipynb
## Jupyter.notebook.save_checkpoint()

# Expected from path_manager.ipynb:
# get_corpus_path
# get_txt_clean_path
# NOTE(review): get_models_path, get_workers and get_txt_clean_pos_path are used
# further down but are not listed anywhere; presumably they also come from
# path_manager - TODO confirm.
%run ../../path_manager.ipynb

# Expected from DataCleanerModule.ipynb:
# CorpusCleaner
%run ../../DataCleanerModule.ipynb

In [3]:
from gensim.models.wrappers.ldamallet import malletmodel2ldamodel

In [4]:
# Display the directory that get_corpus_path resolves for the 'IMF' corpus id
# (inspection only; the result is not stored).
get_corpus_path('IMF')

'/home/wb536061/wb-nlp/CORPUS/IMF'

In [5]:
# --- Training configuration -------------------------------------------------

# Output root for trained models: the standard LDA models path with 'MODELS'
# swapped for 'MODELS.pos'.
MODELS_PATH = get_models_path('LDA').replace('MODELS', 'MODELS.pos')
# Path to the Mallet launcher script, relative to this notebook.
MALLET_BINARY_PATH = "../Mallet/bin/mallet"

# Worker count used for the doc2bow pool and passed to LdaMallet as `workers`.
NUM_WORKERS = get_workers()
# Passed to LdaMallet as `iterations`; also embedded in the log file names.
NUM_ITERS = 196
# Passed to docs.set_min_token_count before documents are filtered.
MIN_TOKEN_COUNT = 50
# N-gram whitelist handed to docs.set_ngram_mapper.
NGRAM_FILE =  '../../whitelists/whitelist_ngrams_cleaned.csv'  # '../../whitelists/whitelist_ngrams_truncated_cleaned.csv'
# Pool size for document loading/filtering. os.cpu_count() may return None when
# the CPU count cannot be determined, so fall back to 1 instead of raising
# TypeError on `None - 4`.
DOC_PROCESSING_WORKERS = 2 * max(1, (os.cpu_count() or 1) - 4)

In [6]:
# Display the resolved models output directory (inspection only).
MODELS_PATH

'/home/wb536061/wb-nlp/MODELS.pos/LDA'

In [7]:
# Make sure the models output directory exists. exist_ok=True avoids the
# check-then-create race of a separate isdir() test and is a no-op when the
# directory is already there.
os.makedirs(MODELS_PATH, exist_ok=True)

In [8]:
import logging
import gc

# Identifier used to derive the run-level log file name and the logger name.
TRAINING_MODEL_ID = 'LDA'

# Root logging configuration: INFO and above are written to a log file in the
# notebook's directory, named after the model id and the iteration count.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename=f'./{TRAINING_MODEL_ID.lower()}-iters_{NUM_ITERS}.log',
)

# Named logger used by the training loop; per-model file handlers are attached
# to it while each model is being built.
logger = logging.getLogger(f'{TRAINING_MODEL_ID.lower()}-logger')


In [None]:
# Train one Mallet LDA model per (corpus partition, topic count) pair and write
# the model files, dfr-browser data and CSV exports under MODELS_PATH.
#
# The loops are intentionally kept at notebook scope: CORPUS_ID, CORPUS_PART,
# NUM_TOPICS, MODEL_ID, ... stay bound after the cell finishes or is interrupted
# so they can be inspected in later cells.
for CORPUS_ID in ['WB']:
    if CORPUS_ID == 'WB':
        region_partitions = []  # ['AFR', 'EAP', 'ECA', 'LAC', 'MENA', 'RoW', 'SAR', 'WLD']
        doctype_partitions = ['PD', 'PR']  # ['BD', 'CF', 'ESW', 'PD', 'PR']
        doctype_partitions = doctype_partitions[::-1]
        corpus_partitions = ['ALL'] + doctype_partitions + region_partitions
    else:
        corpus_partitions = ['ALL']

    # Topic counts to train for every partition.
    num_topics = [25, 50, 100, 200]

    for CORPUS_PART in corpus_partitions:
        # A fresh docs object is built per partition and deleted at the end of
        # the iteration to keep memory usage down.
        docs = build_docs(
            metadata_filename=os.path.join(get_corpus_path(CORPUS_ID), f'{CORPUS_ID.lower()}_pos_metadata_complete.csv'),
            cleaned_files_dir=get_txt_clean_pos_path(CORPUS_ID),
            model_output_dir=MODELS_PATH  # Use flat directory as discussed...
        )

        logger.info(f'Creating partitioned docs and loading files for {CORPUS_ID}-{CORPUS_PART}...')

        # docs.set_ngram_mapper('../../whitelists/whitelist_ngrams_cleaned.csv', cleaner=None)
        docs.set_ngram_mapper(NGRAM_FILE, cleaner=None)

        docs.set_min_token_count(MIN_TOKEN_COUNT)
        docs_filtered, meta = docs.filter_doclist(CORPUS_PART, corpus_id=CORPUS_ID, save=True, return_meta=True, pool_workers=DOC_PROCESSING_WORKERS)

        print(docs_filtered.shape)
        logger.info(f'Building model for {docs_filtered.shape[0]} documents...')
        if docs_filtered.empty:
            # Nothing to train on for this partition.
            continue

        logger.info('Building dictionary...')
        g_dict = Dictionary(docs_filtered.text.str.split())
        g_dict.filter_extremes(no_below=10, no_above=0.99, keep_n=200000, keep_tokens=None)  # Exclude words appearing in less than 10 docs.
        # Rebuild the reverse mapping after filtering; it is passed to LdaMallet as id2word.
        g_dict.id2token = {token_id: token for token, token_id in g_dict.token2id.items()}

        logger.info('Performing doc2bow...')
        # corpus = [g_dict.doc2bow(c.split()) for c in docs_filtered.text]

        with mp.Pool(NUM_WORKERS) as pool:
            # pool = mp.Pool(processes=10)
            logger.info('Performing parallel doc2bow...')
            corpus = pool.map(g_dict.doc2bow, docs_filtered.text.str.split())
            logger.info('Completed parallel doc2bow...')
            # pool.close()
            # pool.join()

        for NUM_TOPICS in num_topics:

            MODEL_ID = f"{CORPUS_PART}_{NUM_TOPICS}"
            logger.info(f'Starting process for {MODEL_ID}...')

            MODEL_FOLDER = os.path.join(MODELS_PATH, f'{CORPUS_ID}-{MODEL_ID}')

            MODEL_DATA_FOLDER = os.path.join(MODEL_FOLDER, 'data')
            MODEL_MALLET_FOLDER = os.path.join(MODEL_FOLDER, 'mallet')

            # exist_ok=True replaces the separate isdir() checks.
            os.makedirs(MODEL_DATA_FOLDER, exist_ok=True)
            os.makedirs(MODEL_MALLET_FOLDER, exist_ok=True)

            # Set logging: additionally mirror log records into a per-model file.
            lfh = logging.FileHandler(f'./{CORPUS_ID.lower()}-iters_{NUM_ITERS}-{MODEL_ID}.log')
            lfh.setLevel(logging.INFO)
            formatter = logging.Formatter('%(asctime)s %(levelname)-8s %(message)s')
            lfh.setFormatter(formatter)
            logger.addHandler(lfh)
            # End logging setup

            # try/finally guarantees the per-model handler is detached and its
            # file closed even when a step fails or the cell is interrupted;
            # otherwise the handler would stay attached and keep receiving the
            # records of every later model.
            try:
                logger.info('Training mallet LDA model...')
                model = LdaMallet(
                    MALLET_BINARY_PATH, corpus=corpus, num_topics=NUM_TOPICS, prefix=f'{MODEL_MALLET_FOLDER}/{CORPUS_ID}-{MODEL_ID}_',
                    id2word=g_dict.id2token, workers=NUM_WORKERS, iterations=NUM_ITERS,
                    random_seed=1029
                )
                logger.info('Completed training mallet LDA model...')

                # Read the doc-topics file, skipping its first two columns
                # (presumably the document index and name - TODO confirm).
                dt = pd.read_csv(
                    model.fdoctopics(), delimiter='\t', header=None,
                    names=list(range(model.num_topics)), index_col=None,
                    usecols=[i + 2 for i in range(model.num_topics)],
                )

                dt.index = docs_filtered['id']
                # Scale each row by its smallest value, truncate to int and subtract 1.
                # NOTE(review): presumably this recovers integer topic counts from the
                # smoothed proportions; astype(int) truncates rather than rounds, so
                # floating-point error could undercount by one - confirm intent.
                dt = dt.divide(dt.min(axis=1), axis=0).astype(int) - 1

                logger.info('Generating dfr-browser data...')
                # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
                # same ndarray and is available in old and new pandas alike.
                ddt = transform_dt(dt.values.T)
                ttw = get_tw(model)

                with open(os.path.join(MODEL_DATA_FOLDER, 'tw.json'), 'w') as fl:
                    json.dump(ttw, fl)

                with open(os.path.join(MODEL_DATA_FOLDER, 'dt.json'), 'w') as fl:
                    json.dump(ddt, fl)

                info_json = {
                    # The closing tag is written as a plain '</em>': the JSON-style
                    # '<\/em>' escape is not a Python escape, so the backslash used
                    # to end up in the written info.json and break the tag.
                    "title": f"Topics in <em>{CORPUS_ID} {MODEL_ID}</em>",
                    "meta_info": "This site is the working demo for <a href=\"/\">dfr-browser</a>, a browsing interface for topic models of journal articles or other text.",
                    "VIS": {
                        "condition": {
                            "type": "time",
                            "spec": {
                                "unit": "year",
                                "n": 1
                            }
                        },
                        "bib_sort": {
                            "major": "year",
                            "minor": "alpha"
                        },
                        "model_view": {
                            "plot": {
                                "words": 6,
                                "size_range": [6, 14]
                            }
                        }
                    }
                }

                with open(os.path.join(MODEL_DATA_FOLDER, 'info.json'), 'w') as fl:
                    json.dump(info_json, fl)

                # Generation of key LDA files
                # doc_topics
                logger.info('Storing doc_topics...')
                dt.to_csv(
                    os.path.join(MODEL_DATA_FOLDER, f'doc_topics_{MODEL_ID}.csv'),
                    header=False,  # Change to True if topic id should be present as the header
                    index=False  # Change to True if the uid should be present as the index
                )
                dt.to_csv(
                    os.path.join(MODEL_DATA_FOLDER, f'doc_topics_{MODEL_ID}_with_details.csv'),
                    header=True,  # Topic ids as the header
                    index=True  # Document ids as the index
                )

                # topic_words: one row per topic (1-based index), one column per word.
                word_topics = pd.DataFrame(model.word_topics, columns=range(model.word_topics.shape[1]), index=range(1, model.word_topics.shape[0] + 1))
                word_topics = word_topics.rename(columns=model.id2word)

                logger.info('Storing word_topics...')
                word_topics.astype(int).to_csv(
                    os.path.join(MODEL_DATA_FOLDER, f'topic_words_{MODEL_ID}.csv'),
                    header=False,  # Change to True if actual word should be present as the header
                    index=False  # Sorted order by topic id
                )
                word_topics.astype(int).to_csv(
                    os.path.join(MODEL_DATA_FOLDER, f'topic_words_{MODEL_ID}_with_details.csv'),
                    header=True,  # Actual words as the header
                    index=False  # Sorted order by topic id
                )

                logger.info('Storing top_words...')
                # NOTE(review): topn is tied to the number of topics here - confirm that
                # this is intended rather than a fixed number of words per topic.
                top_words = get_top_words(word_topics, topic=None, topn=NUM_TOPICS)
                top_words.to_csv(
                    os.path.join(MODEL_DATA_FOLDER, f'top_words_{MODEL_ID}.csv'),
                    index=False
                )

                logger.info('Saving mallet lda model...')
                model.save(os.path.join(MODEL_DATA_FOLDER, f'{CORPUS_ID}_lda_model_{MODEL_ID}.mallet.lda'))

                logger.info('Converting mallet lda to gensim lda model...')
                gensim_lda = malletmodel2ldamodel(model, gamma_threshold=0.000001, iterations=1000)
                gensim_lda.minimum_probability = 0.000001

                logger.info('Saving mallet.gensim lda model...')
                gensim_lda.save(os.path.join(MODEL_DATA_FOLDER, f'{CORPUS_ID}_lda_model_{MODEL_ID}.mallet.gensim.lda'))

                logger.info(f'lda model for {CORPUS_ID} completed: {MODEL_ID}...')
            finally:
                logger.removeHandler(lfh)
                lfh.close()  # release the per-model log file handle

            # Free the trained model before starting the next topic count.
            del model
            gc.collect()

        # Release per-partition data before loading the next partition.
        del docs_filtered
        del docs.doclist
        del docs
        del meta
        del g_dict
        del corpus
        gc.collect()

(203876, 3)




In [11]:
# Display the partition most recently bound by the training loop; the loop
# variable stays defined at notebook scope after the loop cell stops.
CORPUS_PART

'PD'

In [11]:
# !../Mallet/bin/mallet import-file --preserve-case --keep-sequence --remove-stopwords --token-regex "\S+" --input /R/NLP/MODELS/LDA/IMF-ALL_20/mallet/IMF-ALL_20_corpus.txt --output /R/NLP/MODELS/LDA/IMF-ALL_20/mallet/IMF-ALL_20_corpus.mallet

In [235]:
# http://microdatahub.com/topicsmodeling/dfr/topic_browser/browser.php?model=data50_SAR&type=SAR&topic_count=50#/doc/9622
