Process:

1. Download metadata
2. Download text documents
3. Perform cleaning
4. Apply LDA model

In [1]:
import os
import json

import pandas as pd

## Import the DocsManager notebook

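Let's import the DocsManager helper class, which manages the loading and filtering of documents from the API. The same cell also loads the LDA helpers (`LDAModule.ipynb`) and the path utilities (`path_manager.ipynb`) used below.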

In [2]:
%%capture

# LdaMallet
# Dictionary (gensim)
# build_docs
# transform_dt
# get_tw
# get_top_words
%run ./LDAModule.ipynb

# DocsManager
# build_docs
%run ../../DocsManager.ipynb
## Jupyter.notebook.save_checkpoint()

# get_corpus_path
# get_txt_clean_path
%run ../../path_manager.ipynb

In [3]:
# Sanity check: display the corpus directory that get_corpus_path resolves for 'WB'.
get_corpus_path('WB')

'/R/NLP/CORPUS/WB'

Let's create a DocsManager instance.

- `metadata_filename`: path to the metadata file generated after scraping the API
- `cleaned_files_dir`: path to the directory where the cleaned files are stored
- `model_output_dir`: path to where model related files will be saved


In [4]:
# --- Run configuration -------------------------------------------------------
# Which corpus, and which partition of it, this run models.
CORPUS_ID = 'WB'
CORPUS_PART = 'ALL'

# Topic-model settings (passed to LdaMallet further down).
NUM_TOPICS = 50
MALLET_BINARY_PATH = "../Mallet/bin/mallet"

# Root folder for LDA models and the degree of parallelism for training.
MODELS_PATH = get_models_path('LDA')
NUM_WORKERS = 22

# Identifier and output folder derived from the settings above,
# e.g. MODEL_ID == 'ALL_50' and a folder named 'WB-ALL_50'.
MODEL_ID = "{}_{}".format(CORPUS_PART, NUM_TOPICS)
MODEL_FOLDER = os.path.join(MODELS_PATH, "{}-{}".format(CORPUS_ID, MODEL_ID))

In [5]:
import logging

# Write INFO-and-above records to a per-model log file in the current
# working directory (named after the lower-cased corpus id and the model id).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename=f'./{CORPUS_ID.lower()}-{MODEL_ID}.log',
)


In [6]:
# Folder that receives the data files generated for this model.
MODEL_DATA_FOLDER = os.path.join(MODEL_FOLDER, 'data')

# exist_ok=True makes the cell idempotent and avoids the check-then-create
# race of a separate os.path.isdir() test.
os.makedirs(MODEL_DATA_FOLDER, exist_ok=True)

In [7]:
%%time
# Create the document manager for this corpus:
#   - metadata_filename: '<corpus id, lower-cased>_metadata_complete.csv' inside the corpus folder
#   - cleaned_files_dir: directory holding the cleaned text files
#   - model_output_dir:  folder where model-related files are saved
docs = build_docs(
    metadata_filename=os.path.join(
        get_corpus_path(CORPUS_ID),
        CORPUS_ID.lower() + '_metadata_complete.csv',
    ),
    cleaned_files_dir=get_txt_clean_path(CORPUS_ID),
    model_output_dir=MODEL_FOLDER,  # Use flat directory as discussed...
)

CPU times: user 6.02 s, sys: 859 ms, total: 6.87 s
Wall time: 6.91 s


Given a `CORPUS_PART`, let's extract the filtered documents. Please check the `DocsManager.ipynb` notebook for additional filter options.

In [8]:
# Set the minimum token count on the document manager to 100.
# NOTE(review): presumably shorter documents are excluded by the filtering step below —
# confirm in DocsManager.ipynb.
docs.set_min_token_count(100)

In [9]:
# Mark the start of the partition-filtering step in the run log.
logging.info('Filtering docs for partition...')

In [10]:
%%time
# Extract the documents (and their metadata) belonging to CORPUS_PART.
# The worker pool size comes from NUM_WORKERS so that the parallelism of the
# whole notebook is controlled from the configuration cell.
docs_filtered, meta = docs.filter_doclist(
    CORPUS_PART,
    save=True,
    return_meta=True,
    pool_workers=NUM_WORKERS,
)

CPU times: user 2min 2s, sys: 38.1 s, total: 2min 40s
Wall time: 2min 54s


In [11]:
# Preview the first two rows of the metadata returned by filter_doclist.
meta.head(2)

Unnamed: 0,id,title,author,digital_identifier,language_detected,year,pages
0,wb_10000577,"Crises, capital controls, and financial integr...","Van Horen, Neeltje",090224b0828ae539,en,2008.0,
1,wb_10000660,The typology of partial credit guarantee funds...,"Mendoza, Juan Carlos",090224b0828a486d,en,2008.0,


In [12]:
# Preview the first two filtered documents.
docs_filtered.head(2)

Unnamed: 0,id,filename,text
0,wb_10000577,/R/NLP/CORPUS/WB/TXT_CLEAN/wb_10000577.txt,policy research work paper crisis capital cont...
1,wb_10000660,/R/NLP/CORPUS/WB/TXT_CLEAN/wb_10000660.txt,policy research work paper typology partial cr...


# LDA model

### Generate gensim dictionary

In [13]:
%%time
logging.info('Generating dictionary...')
# Split every cleaned document on whitespace and build the gensim vocabulary.
g_dict = Dictionary(docs_filtered.text.str.split())
# Fill in the reverse (id -> token) mapping explicitly by inverting token2id;
# it is handed to LdaMallet as `id2word` later on.
g_dict.id2token = {
    token_id: token for token, token_id in g_dict.token2id.items()
}

CPU times: user 16min 8s, sys: 52.4 s, total: 17min 1s
Wall time: 16min 56s


### Train LDA model using Gensim's Mallet wrapper

In [14]:
%%time
logging.info('Generating corpus...')
# Convert each document into its bag-of-words representation using the
# dictionary built above (whitespace tokenisation, as for the dictionary).
corpus = list(
    map(lambda text: g_dict.doc2bow(text.split()), docs_filtered.text)
)

CPU times: user 13min 23s, sys: 8.29 s, total: 13min 31s
Wall time: 13min 28s


In [15]:
# Output folders for this model. Both live under MODEL_FOLDER, which already
# holds os.path.join(MODELS_PATH, f'{CORPUS_ID}-{MODEL_ID}'), so the path is
# not rebuilt by hand here.
MODEL_DATA_FOLDER = os.path.join(MODEL_FOLDER, 'data')
MODEL_MALLET_FOLDER = os.path.join(MODEL_FOLDER, 'mallet')

# exist_ok=True keeps the cell idempotent without a separate isdir() check.
os.makedirs(MODEL_DATA_FOLDER, exist_ok=True)
os.makedirs(MODEL_MALLET_FOLDER, exist_ok=True)

In [None]:
# Display the resolved data folder for this model.
MODEL_DATA_FOLDER

'/R/NLP/MODELS/LDA/WB-ALL_50/data'

# WARNING! Mallet files will be stored in the user home directory.

Ideally, these files should be written to the /tmp directory, but the space allocated there is not enough.

In [None]:
%%time
logging.info('Start training LDA model...')
# Train the topic model through the Mallet wrapper. All files the wrapper
# writes share the prefix '<MODEL_MALLET_FOLDER>/<CORPUS_ID>-<MODEL_ID>_'.
model = LdaMallet(
    MALLET_BINARY_PATH,
    corpus=corpus,
    id2word=g_dict.id2token,
    num_topics=NUM_TOPICS,
    iterations=200,
    workers=NUM_WORKERS,
    prefix=f'{MODEL_MALLET_FOLDER}/{CORPUS_ID}-{MODEL_ID}_',
)

/R/NLP/MODELS/LDA/WB-ALL_50/mallet/WB-ALL_50_corpus.txt


In [None]:
# Show the path of the model's doc-topics file together with the number of topics.
model.fdoctopics(), model.num_topics

### Load doc topics

In [None]:
logging.info('Building dt...')
# Read the model's doc-topics file: tab-separated with no header row.
# The first two columns are skipped (usecols starts at 2), leaving one column
# per topic, labelled 0 .. num_topics - 1.
dt = pd.read_csv(
    model.fdoctopics(),
    sep='\t',
    header=None,
    index_col=None,
    names=list(range(model.num_topics)),
    usecols=list(range(2, model.num_topics + 2)),
)

# Label the rows with the ids of the filtered documents.
dt.index = docs_filtered['id']

# Rescale every row by its smallest value, truncate to integers, then shift
# by one so that the smallest entry of each row becomes 0.
# NOTE(review): astype(int) truncates rather than rounds, so a ratio such as
# 17.9999 becomes 17 — confirm whether rounding is the intended behaviour.
dt = dt.div(dt.min(axis=1), axis=0).astype(int) - 1

In [None]:
# Preview the first two rows of the rescaled document-topic table.
dt.head(2)

In [None]:
# NOTE(review): this repeats the preview in the preceding cell; one of the two can be removed.
dt.head(2)

### Generate dfr data

In [None]:
logging.info('Building ddt...')
# transform_dt receives the table transposed, i.e. one row per topic.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# `.values` yields the same ndarray and works on old and new pandas alike.
ddt = transform_dt(dt.values.T)

In [None]:
# Build the topic-word data from the trained model with get_tw
# (loaded from LDAModule.ipynb by the %run cell at the top of the notebook).
logging.info('Building ttw...')
ttw = get_tw(model)

### Store data

In [None]:
logging.info('Storing ttw...')
# Serialise the topic-word data to '<MODEL_DATA_FOLDER>/tw.json'.
with open(
    os.path.join(MODEL_DATA_FOLDER, 'tw.json'),
    'w',
) as tw_file:
    json.dump(ttw, tw_file)

In [None]:
logging.info('Storing ddt...')
# Serialise the transformed document-topic data (ddt) to '<MODEL_DATA_FOLDER>/dt.json'.
with open(
    os.path.join(MODEL_DATA_FOLDER, 'dt.json'),
    'w',
) as dt_file:
    json.dump(ddt, dt_file)

In [None]:
logging.info('Storing info_json...')
# Settings file consumed by the dfr-browser front end.
# The title previously ended in "<\/em>": "\/" is a JSON escape, not a Python
# one, so Python kept the backslash (and warns about the invalid escape) and
# the serialised title contained a literal "\" that breaks the closing tag.
info_json = {
    "title": "Topics in <em>WB Documents and Reports API</em>",
    "meta_info": "This site is the working demo for <a href=\"/\">dfr-browser</a>, a browsing interface for topic models of journal articles or other text.",
    "VIS": {
        # Condition topics on time, in steps of one year.
        "condition": {
            "type": "time",
            "spec": {
                "unit": "year",
                "n": 1
            }
        },
        "bib_sort": {
            "major": "year",
            "minor": "alpha"
        },
        "model_view": {
            "plot": {
                "words": 6,
                "size_range": [6, 14]
            }
        }
    }
}

with open(os.path.join(MODEL_DATA_FOLDER, 'info.json'), 'w') as fl:
    json.dump(info_json, fl)

# Generation of key LDA files

### doc_topics

In [None]:
logging.info('Storing doc topics...')
# Write the document-topic table as a bare matrix: no header row, no id column.
doc_topics_path = os.path.join(MODEL_DATA_FOLDER, f'doc_topics_{MODEL_ID}.csv')
dt.to_csv(
    doc_topics_path,
    index=False,  # Change to True if the uid should be present as the index
    header=False,  # Change to True if topic id should be present as the header
)

### topic_words

In [None]:
# Wrap the model's word-topic matrix in a DataFrame: rows are numbered from 1,
# columns are first numbered from 0 and then relabelled through model.id2word.
word_topics = pd.DataFrame(
    model.word_topics,
    index=range(1, model.word_topics.shape[0] + 1),
    columns=range(model.word_topics.shape[1]),
).rename(columns=model.id2word)

In [None]:
# Preview the first rows of the word-topic table.
word_topics.head()

In [None]:
logging.info('Storing word topics...')
# Write the word-topic table, cast to integers, as a bare matrix.
topic_words_path = os.path.join(MODEL_DATA_FOLDER, f'topic_words_{MODEL_ID}.csv')
word_topics.astype(int).to_csv(
    topic_words_path,
    index=False,  # Change to True if the topic id should be present as the index
    header=False,  # Change to True if actual word should be present as the header
)

### top_words

In [None]:
# Compute the top words with get_top_words (loaded from LDAModule.ipynb),
# called with topic=None and topn=50.
# NOTE(review): topic=None presumably means "all topics" — confirm in LDAModule.ipynb.
logging.info('Getting top words...')
top_words = get_top_words(word_topics, topic=None, topn=50)

In [None]:
# Preview the first two rows of the top-words table.
top_words.head(2)

In [None]:
logging.info('Storing top words...')
# Write the top-words table (header kept, index dropped).
top_words_path = os.path.join(MODEL_DATA_FOLDER, f'top_words_{MODEL_ID}.csv')
top_words.to_csv(
    top_words_path,
    index=False,  # Change to True if the topic id should be present as the index
)

In [None]:
%%time
# Persist the trained model next to the other outputs, as
# '<MODEL_DATA_FOLDER>/<CORPUS_ID>_lda_model_<MODEL_ID>.lda'.
logging.info('Saving model...')
model.save(os.path.join(MODEL_DATA_FOLDER, f'{CORPUS_ID}_lda_model_{MODEL_ID}.lda'))

In [34]:
# ls -lh saved_lda_model.lda

# Find closest document by Euclidean distance

Use functions defined in `LDAModule.ipynb`: `close_docs`

In [84]:
# # We generate a function that will find and list the N documents closest to a selected one
# close_docs <- function(docid, numclose) {
#   indx <- which(s$uid == docid)
#   mxcol = 24 + as.numeric(model)
#   x1 <- s[indx, 25:mxcol]
#   neighbors <- s[, 25:mxcol]
#   dist <- pdist(neighbors, x1)
#   similar <- cbind(s, dist@dist)
#   similar <- similar[order(dist@dist),]
#   head(similar[, c(1,5,6,8,9,11,15)], numclose) # The first in the list is the document itself
# }

# close_docs(10575832, 21)
# close_docs(27761347, 21)

In [87]:
# Find the 10 documents closest to document 20140580 using the document-topic
# table `dt`; report=True prints the uid/title/url details shown below.
# NOTE(review): per the R reference above, the first hit is expected to be the
# query document itself — confirm against close_docs in LDAModule.ipynb.
doc_ids = close_docs(docs, doc_id=20140580, num_docs=10, report=True, dt=dt)

uid: 20140580 
title: SABER in Action: An Overview - -ln-             Strengthening Education Systems to Achieve Learning for All 
url: http://documents.worldbank.org/curated/en/866881468323335358/SABER-in-Action-An-Overview-Strengthening-Education-Systems-to-Achieve-Learning-for-All 
pdf_url: http://documents.worldbank.org/curated/en/866881468323335358/pdf/80059-REVISED-SABER-in-Action-An-Overview.pdf

uid: 29839318 
title: Statement by Mr. Johan Van -ln-             Overtveldt at the 97th meeting of the Development Committee -ln-             held on April 21, 2018 
url: http://documents.worldbank.org/curated/en/805751524690768770/Statement-by-Mr-Johan-Van-Overtveldt-at-the-97th-meeting-of-the-Development-Committee-held-on-April-21-2018 
pdf_url: http://documents.worldbank.org/curated/en/805751524690768770/pdf/DCS2018-0031-Belgium-04212018.pdf

uid: 29839299 
title: Statement by Rt. Hon. Penny -ln-             Mordaunt at the 97th meeting of the Development Committee -ln-             

# Scratch

In [178]:
# Peek at the first two entries of ddt['p'].
ddt['p'][0:2]

[0, 3533]

In [174]:
# Inspect a single entry of ddt['i'] (position 10000).
ddt['i'][10000]

5959

In [175]:
# Inspect the matching entry of ddt['x'] (position 10000).
ddt['x'][10000]

232

In [161]:
# Topic-by-document view of the table as a plain ndarray.
# DataFrame.as_matrix() was removed in pandas 1.0; `.values` gives the same array.
dt.values.T

array([[ 0,  0, 18, ...,  0,  0,  3],
       [ 0,  0, 10, ...,  0,  0,  2],
       [ 2,  0,  6, ...,  2,  4,  0],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  6, 30, ...,  0,  0,  0],
       [ 2,  6,  6, ...,  6,  4,  2]], dtype=int64)

In [179]:
import requests

# Fetch the doc-topic payload served by the hosted dfr-browser.
# The URL gets its own name so that `dfr_dt` only ever refers to the response.
dfr_dt_url = 'http://microdatahub.com/topicsmodeling/dfr/topic_browser/model.php?type=dt&model=data50_SAR'
# requests has no default timeout; bound the wait so an unreachable host
# cannot hang the kernel indefinitely.
dfr_dt = requests.get(dfr_dt_url, timeout=30)

In [234]:
# Use the locally built ddt instead of the downloaded payload
# (the alternative, dfr_dt.json(), is left in the trailing comment).
dfr_dt = ddt  # dfr_dt.json()

In [235]:
# http://microdatahub.com/topicsmodeling/dfr/topic_browser/browser.php?model=data50_SAR&type=SAR&topic_count=50#/doc/9622


In [248]:
# Print, for document index `did`, each topic in which it has an entry, the
# raw value stored for that topic and its share of the document's total.
did = 8130
# Positions (into the flat 'i' / 'x' arrays) of every entry belonging to `did`.
# NOTE: the "_9622" suffix is historical; these lists follow `did`.
ps_9622 = [p for p in range(0, dfr_dt['p'][-1]) if dfr_dt['i'][p] == did]
# The document total is the same for every topic, so compute it once.
total_9622 = sum(dfr_dt['x'][p] for p in ps_9622)

for t in range(50):
    # Entries of topic t occupy positions p0 (inclusive) to p1 (exclusive).
    p0 = dfr_dt['p'][t]
    p1 = dfr_dt['p'][t + 1]

    pt_9622 = [p for p in range(p0, p1) if dfr_dt['i'][p] == did]
    # Explicit guards replace the former bare `except`, which also swallowed
    # KeyboardInterrupt and any genuine error: skip topics with no entry for
    # this document, and never divide by a zero total.
    if not pt_9622 or not total_9622:
        continue

    raw = dfr_dt['x'][pt_9622[0]]
    w = raw / total_9622
    print(t, raw, w)

0 3 0.0008880994671403197
1 2 0.0005920663114268798
3 2 0.0005920663114268798
6 5 0.0014801657785671995
10 2 0.0005920663114268798
11 5 0.0014801657785671995
12 900 0.2664298401420959
16 18 0.0053285968028419185
20 240 0.07104795737122557
21 5 0.0014801657785671995
23 2 0.0005920663114268798
27 10 0.002960331557134399
28 3 0.0008880994671403197
29 289 0.08555358200118414
34 2 0.0005920663114268798
37 8 0.002368265245707519
40 61 0.018058022498519833
41 2 0.0005920663114268798
42 16 0.004736530491415038
45 609 0.1802841918294849
46 2 0.0005920663114268798
47 3 0.0008880994671403197
48 50 0.014801657785671996
49 64 0.018946121965660152


In [249]:
# Total over all topics for each document (one value per row of dt).
dt.sum(axis=1)

uid
20140580      1080
25715555     21191
25715556     16486
25715559     22112
25715564     22201
26063310     28187
26063311     86580
26527953     75725
26527971     76887
27100106    137258
27164047      4331
27279530     19752
27556198     18284
27556933    196106
27563729      4124
27666842     74554
27678201     30339
27873493      4271
27998406      8498
28022007     57133
28024406     21622
28078923     82664
28097873     80300
28097875     79557
28135063     53546
28138284     15789
28170915     33635
28397832     12005
28397833     34754
28645758    101015
             ...  
29934340     17483
29934571      1166
29934572     10604
29934576      3477
29934577      1058
29934810     23777
29934811      6585
29935000      1666
29935012     19639
29935017     10680
29935018     19224
29935028      3498
29935030     10039
29935031     29335
29935035     14365
29935179      4293
29935213      1246
29935218      4780
29935220      5114
29935221      1958
29935314       488
29935339

In [231]:
# List every position in the flat arrays whose 'i' entry equals 10490.
[p for p in range(0, dfr_dt['p'][-1]) if dfr_dt['i'][p] == 10490]

[32432, 58929, 92858, 134291]

In [227]:
# Look up the token count recorded for uid 27164047.
# NOTE(review): `wbdocs` is not defined anywhere in the visible notebook (the manager
# is called `docs`); this cell fails on a fresh kernel — rename or remove.
wbdocs.doclist[wbdocs.doclist.uid == 27164047].tokens

10    3262.0
Name: tokens, dtype: float64

In [251]:
# Look up the token count recorded for uid 29935714.
# NOTE(review): `wbdocs` is not defined anywhere in the visible notebook (the manager
# is called `docs`); this cell fails on a fresh kernel — rename or remove.
wbdocs.doclist[wbdocs.doclist.uid == 29935714].tokens

10499    1963.0
Name: tokens, dtype: float64

In [238]:
# NOTE(review): `WBdocs_filtered` is not defined in the visible notebook (the filtered
# frame is called `docs_filtered`); this cell fails on a fresh kernel — rename or remove.
WBdocs_filtered.shape

(7896, 3)

In [239]:
# Last entry of the 'p' array; it is used above as the upper bound when scanning all positions.
dfr_dt['p'][-1]

389113

In [246]:
# Dimensions of the document-topic table (rows, columns).
dt.shape

(8131, 100)

In [None]:
# Apply the trained model to the first bag-of-words document in the corpus.
topc = model[corpus[0]]

/R/NLP/MODELS/LDA/WB-ALL_50/mallet/WB-ALL_50_corpus.txt


In [36]:
# Show the bag-of-words pairs of the first document.
# NOTE(review): this prints the whole list; slice it (e.g. the first few pairs) to keep the output short.
corpus[0]

[(0, 1),
 (1, 4),
 (2, 3),
 (3, 12),
 (4, 1),
 (5, 1),
 (6, 2),
 (7, 1),
 (8, 2),
 (9, 1),
 (10, 1),
 (11, 9),
 (12, 2),
 (13, 4),
 (14, 3),
 (15, 5),
 (16, 2),
 (17, 1),
 (18, 1),
 (19, 1),
 (20, 2),
 (21, 12),
 (22, 1),
 (23, 1),
 (24, 1),
 (25, 1),
 (26, 2),
 (27, 1),
 (28, 7),
 (29, 1),
 (30, 1),
 (31, 2),
 (32, 5),
 (33, 10),
 (34, 1),
 (35, 1),
 (36, 1),
 (37, 1),
 (38, 1),
 (39, 1),
 (40, 4),
 (41, 2),
 (42, 2),
 (43, 3),
 (44, 1),
 (45, 1),
 (46, 31),
 (47, 1),
 (48, 1),
 (49, 3),
 (50, 1),
 (51, 1),
 (52, 1),
 (53, 1),
 (54, 1),
 (55, 6),
 (56, 1),
 (57, 5),
 (58, 3),
 (59, 2),
 (60, 1),
 (61, 1),
 (62, 4),
 (63, 2),
 (64, 2),
 (65, 6),
 (66, 38),
 (67, 2),
 (68, 4),
 (69, 5),
 (70, 1),
 (71, 1),
 (72, 2),
 (73, 1),
 (74, 38),
 (75, 3),
 (76, 2),
 (77, 11),
 (78, 1),
 (79, 3),
 (80, 4),
 (81, 1),
 (82, 1),
 (83, 12),
 (84, 1),
 (85, 1),
 (86, 1),
 (87, 4),
 (88, 1),
 (89, 6),
 (90, 3),
 (91, 1),
 (92, 3),
 (93, 2),
 (94, 1),
 (95, 7),
 (96, 1),
 (97, 1),
 (98, 2),
 (99, 2),
 (

In [1]:
# NOTE(review): this cell was executed on a fresh kernel (In [1]) before `model` was
# created, hence the recorded NameError; run the training cells first or remove this cell.
model

NameError: name 'model' is not defined