# Notebook for training word2vec models for IMF and WB corpora

In [1]:
%%capture

# Run the helper notebooks so the names they define become available in this
# notebook; %%capture suppresses whatever those notebooks print while running.

# Provides: Word2VecModel (used in the training cell)
%run ./Word2vecModule.ipynb

# Provides: DocsManager
# Provides: build_docs (used in the training cell)
%run ../../DocsManager.ipynb
## Jupyter.notebook.save_checkpoint()

# Provides: get_corpus_path
# Provides: get_txt_clean_path
# NOTE(review): get_models_path is called in the configuration cell and is
# presumably defined here as well - confirm.
%run ../../path_manager.ipynb

# Provides: CorpusCleaner (not referenced directly in the cells shown here)
%run ../../DataCleanerModule.ipynb

## Import modules

In [2]:
import os
import json

import pandas as pd
import numpy as np

In [3]:
# Root directory for model artifacts; passed to build_docs as model_output_dir,
# to Word2VecModel as model_path, and used to build each model's folder.
MODELS_PATH = get_models_path('WORD2VEC')
# Passed to Word2VecModel as `workers`.
# NOTE(review): hard-coded; presumably sized for the original training host - confirm.
NUM_WORKERS = 23
# Passed to Word2VecModel as `iter`; also embedded in the log file names.
NUM_ITERS = 10
# Passed to docs.set_min_token_count() before the document list is filtered.
MIN_TOKEN_COUNT = 50

In [4]:
import logging
import gc

# Model-family identifier; used here only to name the run log file and the logger.
TRAINING_MODEL_ID = 'WORD2VEC'

# Run-level log file attached to the root logger.
# NOTE: the 'LIDPL' segment of the file name is hard-coded and has to be kept
# in sync by hand with CORPUS_PART, which is set in the training cell.
# NOTE: basicConfig() is a no-op when the root logger already has handlers
# (for example when this cell is re-run, or when one of the %run helper
# notebooks configured logging first).
logging.basicConfig(
    filename=f'./{TRAINING_MODEL_ID.lower()}-LIDPL-iters_{NUM_ITERS}.log',
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO
)

logger = logging.getLogger(f'{TRAINING_MODEL_ID.lower()}-logger')
# Set the level on the logger itself so INFO records are still emitted to the
# handlers attached later even if basicConfig() above did nothing and the root
# logger is left at its default WARNING level.
logger.setLevel(logging.INFO)


In [5]:
# Train one word2vec model per vector size on the 'WB' corpus, restricted to
# documents whose lending instrument mentions 'Development Policy Lending'
# (corpus part 'LIDPL').
CORPUS_ID = 'WB'
num_topics = [50, 100]  # each value is passed to Word2VecModel as `dim`
CORPUS_PART = 'LIDPL'

docs = build_docs(
    metadata_filename=os.path.join(
        get_corpus_path(CORPUS_ID), f'{CORPUS_ID.lower()}_metadata_complete.csv'
    ),
    cleaned_files_dir=get_txt_clean_path(CORPUS_ID),
    model_output_dir=MODELS_PATH  # Use flat directory as discussed...
)

logger.info('Creating partitioned docs and loading files...')

docs.set_ngram_mapper('../../whitelists/whitelist_ngrams_cleaned.csv', cleaner=None)

docs.set_min_token_count(MIN_TOKEN_COUNT)

# Boolean mask of the documents to keep. `na=False` makes rows with a missing
# lending instrument evaluate to False directly, so no separate fillna() pass
# over the mask is needed.
is_dpl_doc = docs.doclist.wb_lending_instrument.str.contains(
    'Development Policy Lending', na=False
)

# .copy() hands filter_doclist an independent frame rather than a slice of
# docs.doclist.
# NOTE(review): pool_workers=22 is hard-coded separately from NUM_WORKERS - confirm
# whether the two are meant to differ.
docs_filtered, meta = docs.filter_doclist(
    corpus_part=CORPUS_PART, corpus_id=CORPUS_ID,
    docs_filtered=docs.doclist[is_dpl_doc].copy(),
    save=True, return_meta=True, pool_workers=22
)

print(docs_filtered.shape)

for NUM_TOPICS in num_topics:
    MODEL_ID = f"{CORPUS_PART}_{NUM_TOPICS}"
    MODEL_FOLDER = os.path.join(MODELS_PATH, f'{CORPUS_ID}-{MODEL_ID}')
    MODEL_DATA_FOLDER = os.path.join(MODEL_FOLDER, 'data')

    # Creates the model folder and its 'data' subfolder; does nothing if they
    # already exist.
    os.makedirs(MODEL_DATA_FOLDER, exist_ok=True)

    # Per-model log file, attached to the logger only while this model trains.
    lfh = logging.FileHandler(f'./{CORPUS_ID.lower()}-iters_{NUM_ITERS}-{MODEL_ID}.log')
    lfh.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s %(levelname)-8s %(message)s')
    lfh.setFormatter(formatter)
    logger.addHandler(lfh)

    try:
        logger.info('Creating word2vec model...')

        w2vec_model = Word2VecModel(
            doc_df=docs_filtered,
            corpus_id=CORPUS_ID,
            model_id=MODEL_ID,
            dim=NUM_TOPICS,
            workers=NUM_WORKERS,
            model_path=MODELS_PATH,
            optimize_interval=0,
            iter=NUM_ITERS
        )

        logger.info('Starting model training...')
        w2vec_model.train_model()

        logger.info('Starting document vectors creation...')
        w2vec_model.build_doc_vecs(pool_workers=None)

        logger.info('Saving model...')
        w2vec_model.save_model()

        logger.info('Saving model and vectors...')
        w2vec_model.save()

        logger.info(f'word2vec model for {CORPUS_ID} completed: {MODEL_ID}...')
    finally:
        # Always detach AND close the handler, even when training raises:
        # removeHandler() alone leaves the log file open, and a handler left
        # attached after a failure would duplicate log lines on the next run.
        logger.removeHandler(lfh)
        lfh.close()

    # Release the trained model before building the next one.
    w2vec_model.clear()
    del w2vec_model
    gc.collect()

# Optional cleanup once every model has been trained (left disabled):
# del(docs_filtered)
# del(docs.doclist)
# del(docs)
# del(meta)
# gc.collect()

(4075, 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->['id', 'wvecs']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)
