In [1]:
%load_ext autotime

In [None]:
import json
import importlib
import numpy as np
import pandas as pd
from pathlib import Path
from elasticsearch.helpers import scan

from wb_nlp.interfaces import elasticsearch
from wb_nlp.interfaces import mongodb
from wb_nlp.types import metadata as meta_type
from wb_nlp.types import metadata_enums
from wb_nlp import dir_manager

# Re-execute the two metadata modules so this kernel uses their current source
# rather than the versions cached by the first import.
importlib.reload(meta_type)
importlib.reload(metadata_enums)

# World Bank Metadata

Load the metadata from the API scraper notebook: `wb_nlp/notebooks/scrapers/wb_api_scraper_latest.ipynb`

In [None]:
# # fname = f"wb_metadata-{datetime.now()}.csv"
# fname = "wb_metadata-2021-03-27 19:17:44.764136.csv"
# wb_metadata = pd.read_csv(dir_manager.get_data_dir("corpus", "WB", fname))

# collection = mongodb.get_collection("test_nlp", "docs_metadata")
# errors_list = []
# no_url = []
# metadata_list = []
# for i, row in wb_metadata.iterrows():
#     if i % 10000 == 0:
#         print(i)

#     row["_id"] = row["id"]

#     if row["url_txt"] == "not to be displayed--":
#         print(row["id"])
#         no_url.append(row)
#         continue

#     try:
#         meta = json.loads(meta_type.make_metadata_model_from_nlp_schema(row.fillna("")).json())
#         meta["_id"] = meta["id"]
#         metadata_list.append(meta)
#     except:
#         errors_list.append(row)

# collection.insert_many(metadata_list)

In [None]:
# Load the World Bank metadata CSV, convert every row into a metadata document,
# and replace the corresponding documents in the `docs_metadata` collection.

# fname = "wb_metadata-2021-03-27 19:17:44.764136.csv"  # without abstract
fname = "wb_metadata-2021-04-06 16:49:38.570800.csv"  # with abstract
wb_metadata = pd.read_csv(dir_manager.get_data_dir("corpus", "WB", fname))
# Nullable integer dtype so rows with a missing year do not force the column to float.
wb_metadata["year"] = wb_metadata["year"].fillna(np.nan).astype("Int64")
collection = mongodb.get_collection("test_nlp", "docs_metadata")
errors_list = []   # rows that failed conversion (kept for inspection)
no_url = []        # rows whose text URL is flagged as not displayable
metadata_list = []
ids = set()        # ids queued for insertion
dup_ids = set()    # ids seen more than once in the CSV (first occurrence wins)

for i, row in wb_metadata.iterrows():
    if i % 10000 == 0:
        print(i, len(ids), len(errors_list))
    row["_id"] = row["id"]
    if row["url_txt"] == "not to be displayed--":
        print(row["id"])
        no_url.append(row)
        continue
    try:
        meta = json.loads(meta_type.make_metadata_model_from_nlp_schema(row.fillna("").astype(str)).json())
        # Point the original path at the TXT_ORIG sub-directory unless it is
        # missing or already refers to it.
        if meta["path_original"] is not None and "TXT_ORIG" not in meta["path_original"]:
            p = Path(meta["path_original"])
            meta["path_original"] = str(p.parent / "TXT_ORIG" / p.name)
        meta["_id"] = meta["id"]
        if meta["_id"] in ids:
            dup_ids.add(meta["_id"])
            continue
        metadata_list.append(meta)
        ids.add(meta["_id"])
    except Exception:
        # Catch `Exception` rather than using a bare `except:` so that
        # KeyboardInterrupt/SystemExit can still stop this long-running loop.
        errors_list.append(row)

print(len(ids))
# Remove any previously loaded copies first so the insert cannot hit duplicate keys.
filter_obj = {"id": {"$in": list(ids)}}
del_res = collection.delete_many(filter_obj)
# `insert_many` rejects an empty document list, so skip it when nothing was converted.
ins_res = collection.insert_many(metadata_list) if metadata_list else None

In [None]:
def make_nlp_docs_from_docs_metadata(docs_metadata, ignore_existing):
    """Save the text of each metadata record to Elasticsearch as an ``NLPDoc``.

    Args:
        docs_metadata: Iterable of metadata dicts. Each needs an ``_id`` and a
            ``path_original``; the latter is joined onto the path returned by
            ``dir_manager.get_path_from_root()`` to locate the text file.
        ignore_existing: When true, records whose ``_id`` is already present in
            the ``nlp-documents`` index are skipped.

    Records whose text file does not exist on disk are skipped.
    """
    es = elasticsearch.get_client()
    existing_ids = set()
    if ignore_existing:
        # Only the hit ids are needed, so ask Elasticsearch not to return the
        # document sources (same query shape as the id scan elsewhere in this notebook).
        id_query = {"query": {"match_all": {}}, "_source": False}
        for dobj in scan(es, query=id_query, size=5000, index="nlp-documents"):
            # Collect into the local set; adding to an unrelated outer `ids`
            # variable left `existing_ids` empty, so nothing was ever skipped.
            existing_ids.add(dobj["_id"])
    root_path = Path(dir_manager.get_path_from_root())
    for ix, data in enumerate(docs_metadata):
        if ix and ix % 10000 == 0:
            print(ix)
        if data["_id"] in existing_ids:
            continue
        doc_path = root_path / data["path_original"]
        if not doc_path.exists():
            continue
        # Create and save a document.
        nlp_doc = elasticsearch.NLPDoc(meta={'id': data["_id"]}, **data)
        with open(doc_path, 'rb') as open_file:
            # Decode leniently: undecodable bytes are dropped instead of raising.
            nlp_doc.body = open_file.read().decode('utf-8', errors='ignore')
        nlp_doc.save()

In [None]:
from wb_nlp.interfaces import elasticsearch

# `index_name` was never assigned in this notebook, so this cell raised NameError
# on a fresh kernel. Use the interface module's document index constant.
# NOTE(review): assumes DOC_INDEX is the index the documents were saved to
# (the loader function scans the literal "nlp-documents") - confirm.
index_name = elasticsearch.DOC_INDEX

es = elasticsearch.get_client()
# Refresh before counting so recently saved documents are included.
es.indices.refresh(index=index_name)
es.cat.count(index=index_name, params={"format": "json"})

# Notes and scratch

1. Copy raw data from sandbox server to gw1. Then copy to server7. Make sure the files are named in the proper format: `<corpus_id>_<doc_id>.*`
2. Extract covers of pdf.
3. Run filter by language script.
4. Set up config for the cleaning and the models. Then load the configs to db.
5. Define the scripts to run the models in `model_runs/<model_name>/<model_name>-run-<run_number>.sh` and also create the cleaning script (`run_<model_name>_cleaning.sh`) for the intended model configuration.
6. Execute cleaning and training: `bash run_<model_name>_cleaning.sh && bash model_runs/<model_name>/<model_name>-run-<run_number>.sh`
7. Update permission of the data and models directories (inside the container).
8. Tar the needed files and models.
9. rsync to gw1, then rsync to sandbox/dev server.
10. Copy files to local directories, i.e., `data/` and `models/`.
11. Load the metadata to db.
12. Make a softlink from EN_TXT_ORIG to TXT_ORIG
13. Build the document vectors.
14. Dump the documents to elasticsearch.
15. Update the default model ids in app/app_kcp/src/config.js

##############################

https://stackoverflow.com/a/44683248

```
FROM ubuntu:xenial-20170214
ARG UNAME=testuser
ARG UID=1000
ARG GID=1000
RUN groupadd -g $GID -o $UNAME
RUN useradd -m -u $UID -g $GID -o -s /bin/bash $UNAME
USER $UNAME
CMD /bin/bash


docker build --build-arg UID=$(id -u) --build-arg GID=$(id -g) \
  -f bb.dockerfile -t testimg .
```

```
##############################
sudo docker-compose -f docker-compose.yml -f docker-compose.w0lxsnlp01.yml up -d --no-deps --build tika
sudo docker-compose -f docker-compose.yml -f docker-compose.w0lxsnlp01.yml up -d --no-deps --build redis
sudo docker-compose -f docker-compose.yml -f docker-compose.w0lxsnlp01.yml up -d --no-deps --build mongodb
sudo docker-compose -f docker-compose.yml -f docker-compose.w0lxsnlp01.yml up -d --no-deps --build milvus

sudo docker-compose -f docker-compose.yml -f docker-compose.w0lxsnlp01.yml up -d --no-deps --build nlp_api
sudo docker-compose -f docker-compose.yml -f docker-compose.w0lxsnlp01.yml up -d --no-deps --build app_kcp

Step 1: Copy nlp-metadata-wbes2474-20201007.json to data/raw
Step 2: Place sample data in data/raw/sample_data/TXT_ORIG

sudo docker exec -it wb_nlp_nlp_api_1 /bin/bash

# Then run the scripts
```

### To copy data from sandbox to gw1
```
###################################

python ~/kcp-scripts/copy_raw_data.py

# Manually rsync data for ADB
rsync -avP w0lxsnlp01:/decfile2/Modeling/NLP/ihsn_scrapers/scrapers/adb/adb_files/full /Documentum/Aivin/corpus/ADB/
mv /Documentum/Aivin/corpus/ADB/full /Documentum/Aivin/corpus/ADB/PDF_ORIG

```

### Generate wikipedia vocab
Download data from: https://dumps.wikimedia.org/enwiki/latest/
enwiki-latest-pages-articles.xml.bz2: https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
```
###################################
cd /data/wb536061/wikipedia-word-frequency
./gather_wordfreq.py ../wb_nlp/data/external/wikipedia/enwiki-latest-pages-articles.xml.bz2 > wordfreq-enwiki-latest-pages-articles.xml.bz2.txt

sudo /usr/local/bin/docker-compose -f docker-compose.yml -f docker-compose.server7.yml up -d --no-deps --build tika
sudo /usr/local/bin/docker-compose -f docker-compose.yml -f docker-compose.server7.yml up -d --no-deps --build redis
sudo /usr/local/bin/docker-compose -f docker-compose.yml -f docker-compose.server7.yml up -d --no-deps --build mongodb
sudo /usr/local/bin/docker-compose -f docker-compose.yml -f docker-compose.server7.yml up -d --no-deps --build milvus

sudo /usr/local/bin/docker-compose -f docker-compose.yml -f docker-compose.server7.yml up -d --no-deps --build nlp_api
sudo /usr/local/bin/docker-compose -f docker-compose.yml -f docker-compose.server7.yml up -d --no-deps --build app_kcp

sudo docker exec -it wb_nlp_nlp_api_1 /bin/bash

python -u ./scripts/cleaning/convert_pdf2txt_corpus.py --input-dir ./data/corpus --recursive -vv |& tee ./logs/convert_pdf2txt_corpus.py.log

bash run_word2vec_cleaning.sh && bash model_runs/word2vec/word2vec-run-01.sh
```

### Build a tar of trained models and WB data
```
# tar -czf KCP_WB_DATA_MODELS.tar.gz data/corpus/WB/COVER data/corpus/WB/wb_metadata-2021-03-27\ 19\:17\:44.764136.csv data/corpus/WB/EN_TXT_ORIG data/corpus/cleaned/229abf370f281efa7c9f3c4ddc20159d/WB models/word2vec models/lda

tar -czvf KCP_WB_DATA_MODELS.tar.gz data/corpus/WB/COVER data/corpus/WB/wb_metadata-*.csv data/corpus/WB/EN_TXT_ORIG data/corpus/cleaned/*/WB models/word2vec models/lda models/mallet
```

### Build Word2vec vector index
```
###################### W2V
from wb_nlp.models import word2vec_base
import logging

wvec_model = word2vec_base.Word2VecModel(
    model_config_id="702984027cfedde344961b8b9461bfd3",
    cleaning_config_id="229abf370f281efa7c9f3c4ddc20159d",
    model_run_info_id="854ae5f9cdda093265212c435d1ddfd4",
    raise_empty_doc_status=False,
    log_level=logging.DEBUG)

wvec_model.build_doc_vecs(pool_workers=22)
```

### Build Gensim LDA vector index
```
###################### LDA
from wb_nlp.models import lda_base
import logging

lda_model = lda_base.LDAModel(
    model_config_id="43f9977dbee49d7d942f0f9988de4426",
    cleaning_config_id="e70ad4f61cf2053e4a15f570c5f82b67",
    model_run_info_id="749573cedb4b06aedcba2ec89bc46b46",
    raise_empty_doc_status=False,
    log_level=logging.DEBUG)

lda_model.build_doc_vecs(pool_workers=22)
```

### Build WDI vector index
```
###################### WDI
import pandas as pd
import logging
from wb_nlp import dir_manager
from wb_nlp.models import word2vec_base


wvec_model = word2vec_base.Word2VecModel(
    model_config_id="702984027cfedde344961b8b9461bfd3",
    cleaning_config_id="229abf370f281efa7c9f3c4ddc20159d",
    model_run_info_id="854ae5f9cdda093265212c435d1ddfd4",
    raise_empty_doc_status=False,
    log_level=logging.DEBUG)

wdi_df = pd.read_csv(dir_manager.get_data_dir(
    "preprocessed", "timeseries", "wdi_time_series_metadata.csv"))

wdi_df["text"] = wdi_df["txt_meta"].map(wvec_model.clean_text)

wdi_df["vector"] = wdi_df.apply(
    lambda x: wvec_model.process_doc(x)["doc_vec"], axis=1)

wdi_df.to_pickle(
    f"/workspace/models/wdi/wdi_time_series_metadata-{wvec_model.model_id}.pickle")
```


### w0lxsnlp01: model_run_info backup

```
from wb_nlp.interfaces import mongodb
mric = mongodb.get_model_runs_info_collection()

In [8]: str(list(mric.find({}))).replace("'", '"')
[
  {
    "_id": "b93af59ca0150576ebd97d3086ff0324",
    "model_run_info_id": "b93af59ca0150576ebd97d3086ff0324",
    "model_name": "lda",
    "model_config_id": "ef0ab0459e9c28de8657f3c4f5b2cd86",
    "cleaning_config_id": "23f78350192d924e4a8f75278aca0e1c",
    "processed_corpus_id": "f8cc0e4e17e45f3d247383b3aa9813a1_5a80eb483f11c3b899d8cba7237215f9",
    "description": "",
    "model_file_name": "models/lda/b93af59ca0150576ebd97d3086ff0324/lda_b93af59ca0150576ebd97d3086ff0324.bz2"
  },
  {
    "_id": "41129a1b7ac4187779cc4847b1c6a43d",
    "model_run_info_id": "41129a1b7ac4187779cc4847b1c6a43d",
    "model_name": "word2vec",
    "model_config_id": "702984027cfedde344961b8b9461bfd3",
    "cleaning_config_id": "23f78350192d924e4a8f75278aca0e1c",
    "processed_corpus_id": "f8cc0e4e17e45f3d247383b3aa9813a1",
    "description": "",
    "model_file_name": "models/word2vec/41129a1b7ac4187779cc4847b1c6a43d/word2vec_41129a1b7ac4187779cc4847b1c6a43d.bz2"
  },
  {
    "_id": "6694f3a38bc16dee91be5ccf4a64b6d8",
    "model_run_info_id": "6694f3a38bc16dee91be5ccf4a64b6d8",
    "model_name": "lda",
    "model_config_id": "ef0ab0459e9c28de8657f3c4f5b2cd86",
    "cleaning_config_id": "23f78350192d924e4a8f75278aca0e1c",
    "processed_corpus_id": "531c1e4f358efbc07b97a58815558c53_5a80eb483f11c3b899d8cba7237215f9",
    "description": "",
    "model_file_name": "models/lda/6694f3a38bc16dee91be5ccf4a64b6d8/lda_6694f3a38bc16dee91be5ccf4a64b6d8.bz2"
  },
  {
    "_id": "777a9cf47411f6c4932e8941f177f90a",
    "model_run_info_id": "777a9cf47411f6c4932e8941f177f90a",
    "model_name": "word2vec",
    "model_config_id": "702984027cfedde344961b8b9461bfd3",
    "cleaning_config_id": "23f78350192d924e4a8f75278aca0e1c",
    "processed_corpus_id": "531c1e4f358efbc07b97a58815558c53",
    "description": "",
    "model_file_name": "models/word2vec/777a9cf47411f6c4932e8941f177f90a/word2vec_777a9cf47411f6c4932e8941f177f90a.bz2"
  },
  {
    "_id": "854ae5f9cdda093265212c435d1ddfd4",
    "model_run_info_id": "854ae5f9cdda093265212c435d1ddfd4",
    "model_name": "word2vec",
    "model_config_id": "702984027cfedde344961b8b9461bfd3",
    "cleaning_config_id": "229abf370f281efa7c9f3c4ddc20159d",
    "processed_corpus_id": "d41d8cd98f00b204e9800998ecf8427e",
    "description": "Word2vec D100 W5 N5 SG Full Corpus",
    "model_file_name": "models/word2vec/854ae5f9cdda093265212c435d1ddfd4/word2vec_854ae5f9cdda093265212c435d1ddfd4.bz2"
  },
  {
    "_id": "749573cedb4b06aedcba2ec89bc46b46",
    "model_run_info_id": "749573cedb4b06aedcba2ec89bc46b46",
    "model_name": "lda",
    "model_config_id": "43f9977dbee49d7d942f0f9988de4426",
    "cleaning_config_id": "e70ad4f61cf2053e4a15f570c5f82b67",
    "processed_corpus_id": "d41d8cd98f00b204e9800998ecf8427e_5a80eb483f11c3b899d8cba7237215f9",
    "description": "LDA topic model with 75 topics - full corpus",
    "model_file_name": "models/lda/749573cedb4b06aedcba2ec89bc46b46/lda_749573cedb4b06aedcba2ec89bc46b46.bz2"
  }
]
```


### w1lxbdatad07: model_run_info backup

```
[
  {
    "_id": "854ae5f9cdda093265212c435d1ddfd4",
    "model_run_info_id": "854ae5f9cdda093265212c435d1ddfd4",
    "model_name": "word2vec",
    "model_config_id": "702984027cfedde344961b8b9461bfd3",
    "cleaning_config_id": "229abf370f281efa7c9f3c4ddc20159d",
    "processed_corpus_id": "d41d8cd98f00b204e9800998ecf8427e",
    "description": "Word2vec D100 W5 N5 SG Full Corpus",
    "model_file_name": "models/word2vec/854ae5f9cdda093265212c435d1ddfd4/word2vec_854ae5f9cdda093265212c435d1ddfd4.bz2"
  },
  {
    "_id": "749573cedb4b06aedcba2ec89bc46b46",
    "model_run_info_id": "749573cedb4b06aedcba2ec89bc46b46",
    "model_name": "lda",
    "model_config_id": "43f9977dbee49d7d942f0f9988de4426",
    "cleaning_config_id": "e70ad4f61cf2053e4a15f570c5f82b67",
    "processed_corpus_id": "d41d8cd98f00b204e9800998ecf8427e_5a80eb483f11c3b899d8cba7237215f9",
    "description": "LDA topic model with 75 topics - full corpus",
    "model_file_name": "models/lda/749573cedb4b06aedcba2ec89bc46b46/lda_749573cedb4b06aedcba2ec89bc46b46.bz2"
  },
  {
    "_id": "2617e5cf327e60cc8955189110e7f21d",
    "model_run_info_id": "2617e5cf327e60cc8955189110e7f21d",
    "model_name": "word2vec",
    "model_config_id": "68e384251f875204e1263dafa25fade6",
    "cleaning_config_id": "229abf370f281efa7c9f3c4ddc20159d",
    "processed_corpus_id": "d41d8cd98f00b204e9800998ecf8427e",
    "description": "Word2vec D100 W5 N15 I10 SG Full Corpus",
    "model_file_name": "models/word2vec/2617e5cf327e60cc8955189110e7f21d/word2vec_2617e5cf327e60cc8955189110e7f21d.bz2"
  },
  {
    "_id": "6fd8b418cbe4af7a1b3d24debfafa1ee",
    "model_run_info_id": "6fd8b418cbe4af7a1b3d24debfafa1ee",
    "model_name": "mallet",
    "model_config_id": "5e26b5090bdd91d7aee4e5e89753a33b",
    "cleaning_config_id": "e70ad4f61cf2053e4a15f570c5f82b67",
    "processed_corpus_id": "d41d8cd98f00b204e9800998ecf8427e_5a80eb483f11c3b899d8cba7237215f9",
    "description": "Mallet LDA topic model with 75 topics - full corpus",
    "model_file_name": "models/mallet/6fd8b418cbe4af7a1b3d24debfafa1ee/mallet_6fd8b418cbe4af7a1b3d24debfafa1ee.bz2"
  }
]
```

In [None]:
# Collect the `_id` of every hit in the document index. The query sets
# `_source` to False, so only hit metadata is requested.
eids = [
    hit["_id"]
    for hit in elasticsearch.scan(
        elasticsearch.get_client(),
        query={"query": {"match_all": {}}, "_source": False},
        size=5000,
        index=elasticsearch.DOC_INDEX,
    )
]