# World Bank Documents and Reports API Scraper

This notebook contains all the necessary scripts to connect and scrape document metadata from the World Bank Documents and Reports API.

In [1]:
%load_ext autotime

In [517]:
import requests
from datetime import datetime
import json
import os
import time
import glob
import pandas as pd
import numpy as np
import re
from joblib import Parallel, delayed

time: 43.5 ms


In [3]:
from wb_nlp import dir_manager

time: 169 ms


The following are the proper versions to be installed to prevent incompatibility issues.

In [4]:
for m in [requests, json, pd]:
    print(m.__name__, ':', m.__version__)

requests : 2.24.0
json : 2.0.9
pandas : 1.1.2
time: 553 µs


# Scraping methods for the World Bank Document and Reports API

The API allows customization of the kind of returned data. The `fl_param` can be adjusted to a specific subset of values if only certain fields are needed.

In [5]:
fl_params = [
    'guid', 'abstracts', 'admreg', 'alt_title', 'authr', 'available_in',
    'bdmdt', 'chronical_docm_id', 'closedt', 'colti', 'count', 'credit_no',
    'disclosure_date', 'disclosure_type', 'disclosure_type_date', 'disclstat',
    'display_title', 'docdt', 'docm_id', 'docna', 'docty', 'dois', 'entityid',
    'envcat', 'geo_reg', 'geo_reg_and_mdk', 'historic_topic', 'id',
    'isbn', 'issn', 'keywd', 'lang', 'listing_relative_url', 'lndinstr', 'loan_no',
    'majdocty', 'majtheme', 'ml_abstract', 'ml_display_title', 'new_url', 'owner',
    'pdfurl', 'prdln', 'projn', 'publishtoextweb_dt', 'repnb', 'repnme', 'seccl',
    'sectr', 'src_cit', 'subsc', 'subtopic', 'teratopic', 'theme', 'topic', 'topicv3',
    'totvolnb', 'trustfund', 'txturl', 'unregnbr', 'url_friendly_title', 'versiontyp',
    'versiontyp_key', 'virt_coll', 'vol_title', 'volnb', 'projectid',
]

time: 457 µs


In [6]:
for i in sorted(fl_params):
    print(i)

abstracts
admreg
alt_title
authr
available_in
bdmdt
chronical_docm_id
closedt
colti
count
credit_no
disclosure_date
disclosure_type
disclosure_type_date
disclstat
display_title
docdt
docm_id
docna
docty
dois
entityid
envcat
geo_reg
geo_reg_and_mdk
guid
historic_topic
id
isbn
issn
keywd
lang
listing_relative_url
lndinstr
loan_no
majdocty
majtheme
ml_abstract
ml_display_title
new_url
owner
pdfurl
prdln
projectid
projn
publishtoextweb_dt
repnb
repnme
seccl
sectr
src_cit
subsc
subtopic
teratopic
theme
topic
topicv3
totvolnb
trustfund
txturl
unregnbr
url_friendly_title
versiontyp
versiontyp_key
virt_coll
vol_title
volnb
time: 1.86 ms


In [7]:
dir_manager.get_data_dir('corpus', 'WB')

'/home/wb536061/wb_nlp/data/corpus/WB'

time: 5.86 ms


In [8]:
SCRAPER_DIR = dir_manager.get_data_dir('corpus', 'WB')
API_JSON_DIR = os.path.join(SCRAPER_DIR, 'tmp_api_json')
print(SCRAPER_DIR)
print(API_JSON_DIR)

/home/wb536061/wb_nlp/data/corpus/WB
/home/wb536061/wb_nlp/data/corpus/WB/tmp_api_json
time: 642 µs


In [9]:
def download_with_retry(url, params=None, max_retries=10):
    retry_count = 0

    while retry_count < max_retries:
        try:
            response = requests.get(url, params=params)
            if response.status_code != 200:
                retry_count += 1
                time.sleep(1)
                continue
            break
        except:
            retry_count += 1
            time.sleep(1)

    if retry_count >= max_retries:
        return

    return response

time: 557 µs


In [10]:
def request_worldbank_api(fl_params=None, offset=0, limit=1, max_retries=10):
    '''
    fl_params: list of values to return per row
    offset: parameter corresponding to the start page
    limit: maximum number of rows returned by the api call
    '''

    if fl_params is None:
        fl_params = ['guid']

    api_url = 'http://search.worldbank.org/api/v2/wds'
    api_params = dict(
        format='json',
        fl=','.join(fl_params),
        lang_exact='English',
        disclstat='Disclosed',
        srt='docdt',
        order='desc',  # Use asc such that pages already downloaded can still be used
        os=offset,
        rows=limit,
        # frmdisclosuredate='',  # '2018-09-12'
        # todisclosuredate='',  # '2018-09-13'
    )

    response = download_with_retry(url=api_url, params=api_params)

    if (response is None) or (response.status_code != 200):
        return {}

    json_content = response.json()

    return json_content


def get_total_documents():
    # This method solves the problem of determination of
    # the total pages in the database automatically.

    poll_request = request_worldbank_api()
    total_documents = poll_request['total']

    return int(total_documents)


def scrape_page(fl_params, page, limit=500, verbose=True):
    offset = page * limit
    page_content = request_worldbank_api(fl_params=fl_params, offset=offset, limit=limit)
    page_content = page_content['documents']
    func_params = {'page': page}

    # Remove extraneous key
    page_content.pop('facets')

    if not os.path.isdir(API_JSON_DIR):
        os.makedirs(API_JSON_DIR)

    page_file = os.path.join(API_JSON_DIR, 'data-{page}.json'.format(**func_params))

    with open(page_file, 'w') as fl:
        json.dump(page_content, fl)

    if verbose:
        print('Completed scraping of page {page}.'.format(**func_params), flush=True)

    time.sleep(1)


def scrape_worldbank_operational_docs_api(fl_params, limit=500, max_pages=5, n_jobs=1, verbose=True):
    '''
    Note:
        Parallelization of API access is discouraged for large limit size.
        It could result to throttling or failed return values.
    '''
    func_params = {}
    total_documents = get_total_documents()

    total_pages = (total_documents // limit) + 1
    func_params['total_pages'] = total_pages

    scrape_params = []

    for page in range(total_pages):
        func_params['page'] = page + 1

        if (max_pages is not None) and (page > max_pages):
            print('Terminating scraping for remaining pages...')
            break

        if not verbose:
            # Print this only if scrape_params verbosity is False...
            print('Scraping page {page} / {total_pages}'.format(**func_params))

        scrape_params.append(dict(fl_params=fl_params, page=page, limit=limit, verbose=verbose))

    Parallel(n_jobs=n_jobs)(delayed(scrape_page)(**sp) for sp in scrape_params)
    # scrape_page(fl_params, page, limit, verbose=False)

time: 1.53 ms


In [11]:
# # Check if ids are sorted by disclosure date

# sample_request_data = request_worldbank_api(fl_params, offset=100000, limit=4)
# sample_request_keys = sorted(sample_request_data['documents'].keys())
# sample_request_keys.pop(sample_request_keys.index('facets'))
# sample_request_disclosure_dates = sorted([sample_request_data['documents'][uid]['disclosure_date'] for uid in sample_request_keys])

# for ix, i in enumerate(sample_request_keys):
#     # Assuming that the document ids are not sequentially assigned by disclosure_date,
#     # then it is likely that if we sort the ids and the disclosure date and check the equality of the
#     # actual disclosure_date for the id vs the sorted disclosure_date that it will not match.
#     assert(sample_request_data['documents'][i]['disclosure_date'] == sample_request_disclosure_dates[ix])
    

time: 245 µs


In [12]:
get_total_documents() // 500

593

time: 55.2 ms


In [13]:
%%time
scrape_worldbank_operational_docs_api(fl_params=fl_params, limit=500, max_pages=None, n_jobs=5)

CPU times: user 878 ms, sys: 81.4 ms, total: 959 ms
Wall time: 14min 59s
time: 14min 59s


In [14]:
with open(os.path.join(API_JSON_DIR, "data-100.json")) as fl:
    nd = json.load(fl)
    nd = pd.DataFrame(nd).T
    nd.index.name = 'uid'

time: 53 ms


In [15]:
s = nd.dropna(subset=["authors"])[nd.dropna(subset=["authors"])["authors"].map(len) > 3].iloc[0].to_dict()
for k, v in s.items():
    if isinstance(v, dict):
        print(k, v)
        print()

authors {'0': {'author': 'Sjoberg,Fredrik Matias'}, '1': {'author': 'Mellon,Jonathan'}, '2': {'author': 'Peixoto,Tiago Carneiro'}, '3': {'author': 'Hemker,Johannes Zacharias'}, '4': {'author': 'Tsai,Lily Lee'}}

entityids {'entityid': '090224b086f47634_2_0'}

docna {'0': {'docna': 'Voice and Punishment : A Global Survey\n            Experiment on Tax Morale'}}

repnme {'repnme': 'Voice and Punishment : A Global Survey\n            Experiment on Tax Morale'}

keywd {'0': {'keywd': 'Tax Compliance; institute of development\n            studies; treatment effect; worst case scenario; public goods\n            provision; income tax evasion; list of countries; political\n            science association; relationship between citizens; support\n            for democracy; channels of communication; burden of\n            taxation; nominal tax rate; costs of taxation; women in\n            politics; public sector corruption; public sector worker;\n            public sector employment; national 

# Processing and normalization of scraped document metadata

In [520]:
# s = set(['Publications & Research', 'Publications'])
# s = set(["Country Focus", "Country Focus"])
# s = set("Publications,Publications & Research,Publications,Publications & Research".split(','))
import re

def standardize_authors_list(authors, delimiter=";"):
    # authors = "Runji,Justin;Jose Rizal;Bonifacio, Andres; Damaso, Maria Clara"
    if not isinstance(authors, str):
        return None
    return ",".join([re.sub(r"(.*),\s*(.*)", r"\2 \1", i.strip()) for i in authors.split(delimiter)])


def normalize_set(s):
    # s = set(['Publications & Research', 'Publications'])

    l = sorted(s)
    remove_index = set()
    
    for i in range(len(l) - 1):
        for j in range(1, len(l)):
            if l[i] in l[j]:
                remove_index.add(i)
    
    for k in sorted(remove_index, reverse=True):
        l.pop(k)

    return l


def make_unique_entry(series):
    # This will remove duplicate entries in fields: `majdocty` (`majdoctype` : normalized) and `admreg`
    series = series.fillna('')
    series = series.str.split(',').map(set).map(lambda vals: ', '.join(normalize_set(vals)))
    return series.replace('', None)


def collapse_array(data, connector=None):
    # Assume that array is of type list
    value = []
    
    if isinstance(data, list):        
        for d in data:
            if isinstance(d, dict):
                value.append(collapse_nested_dict(d, connector=connector))
            else:
                value.extend(collapse_array(d))
#     elif isinstance(data, dict):
#         data = collapse_nested_dict(data, connector=connector)
#         value.append(data)
    else:
        value.append(data)
    
    try:
        if connector:
            # `connector` is only used in the root function call so it is safe
            # to assume that in cases where the original value is not an array or nested array,
            # we can just retrieve and return the original value
            if len(value) > 1:
                # This means that the data is an array and possibly nested
                value = connector.join(value)
            else:
                value = value[0]
    except Exception as e:
        print(data)
        print(value)
        raise(e)
        
    return value


# line_break_pattern = re.compile('\r?\n|\r')
whitespace_pattern = re.compile('\s+')
hanging_dash_pattern = re.compile('\S+- ')


def extract_formatted_authors(authors, delimiter=";"):
    # {'0': {'author': 'Sjoberg,Fredrik Matias'},
    #  '1': {'author': 'Mellon,Jonathan'},
    #  '2': {'author': 'Peixoto,Tiago Carneiro'},
    #  '3': {'author': 'Hemker,Johannes Zacharias'},
    #  '4': {'author': 'Tsai,Lily Lee'}}

    authors_value = None
    if pd.notna(authors):
        authors_value = delimiter.join([author["author"] for author in authors.values()])

    return standardize_authors_list(authors_value, delimiter=delimiter)

def normalize_hanging_dash(t):
    for p in hanging_dash_pattern.findall(t):
        t = t.replace(p, p.replace('- ', ' - '))

    return t


def normalize_str_col(ser):
    return ser.map(lambda x: normalize_hanging_dash(whitespace_pattern.sub(' ', x)) if isinstance(x, str) else x)


def normalize_geo_regions(x, connector='|'):
    # geo_regions has this assumed format: {'0': {'geo_region': 'Europe'}, '1': {'geo_region': 'Europe'}}
    if isinstance(x, dict):
        x = connector.join(set(i['geo_region'] for i in x.values()))

    return x


def collapse_nested_dict(x, connector=None):
    value = []
    
    if isinstance(x, dict):
        for val in x.values():
            value.extend(collapse_nested_dict(val))
    elif isinstance(x, list):
        x = collapse_array(x, connector=connector)
        value.append(x)
    else:
        value.append(x)
    
    if connector:
        if len(value) > 1:
            value = connector.join(value)
        else:
            value = value[0]

    return value

def process_uid(uid):
    # Implement this function to easily process the creation or update how the ids are constructed.
    # Take note that this is important because the original API changed how the ids are rendered from <id> to D<id> format.

    ## No need to do this since all uids are already standardized
    # if uid.startswith('D'):
    #     uid = uid[1:]

    # if not uid.isdigit():
    #     raise ValueError(f'Unexpected document id format: {uid}...')

    return uid

def normalize_document_data(use_short_columns=True, fname=None, data_dir=None, save_data=True):
#     if use_short_columns:
#         columns = ['guid', 'docyear', 'majdoctype', 'doctype', 'authors', 'colti', 'display_title', 'docdt', 'docm_id', 'historic_topic', 'pdfurl', 'seccl', 'txturl', 'language', 'admreg', 'country', 'txtfilename']
#     else:
#         columns = ['authors', 'abstracts', 'admreg', 'alt_title', 'available_in', 'bdmdt', 'chronical_docm_id', 'closedt', 'colti', 'count', 'credit_no', 'disclosure_date', 'disclosure_type', 'disclosure_type_date', 'disclstat', 'display_title', 'docdt', 'doc_year', 'docm_id', 'docna', 'docty', 'dois', 'entityids', 'envcat', 'geo_regions', 'geo_region_mdks', 'historic_topic', 'id', 'isbn', 'issn', 'keywd', 'lang', 'listing_relative_url', 'lndinstr', 'loan_no', 'majdocty', 'majtheme', 'ml_abstract', 'ml_display_title', 'new_url', 'owner', 'pdfurl', 'prdln', 'projn', 'publishtoextweb_dt', 'repnb', 'repnme', 'seccl', 'sectr', 'src_cit', 'subsc', 'subtopic', 'teratopic', 'theme', 'topic', 'topicv3', 'totvolnb', 'trustfund', 'txturl', 'unregnbr', 'url_friendly_title', 'versiontyp', 'versiontyp_key', 'virt_coll', 'vol_title', 'volnb']


    normalized_df = pd.DataFrame()

    for json_file in glob.iglob(os.path.join(API_JSON_DIR, '*.json')):
        print(json_file)

        with open(json_file) as fl:
            normalized_data = json.load(fl)
            normalized_data = pd.DataFrame(normalized_data).T
            normalized_data.index.name = 'uid'
            
            normalized_data.index = normalized_data.index.map(process_uid)
            # normalized_data.index = normalized_data.index.astype(int)

        rename_cols = {
            'docty': 'doc_type',
            'lang': 'language',
            'majdocty': 'majdoctype',
            'count': 'country',
        }

        normalized_data = normalized_data.rename(columns=rename_cols)
        try:
            normalized_data['authors'] = normalized_data['authors'].map(extract_formatted_authors)
        except KeyError:
            # This means that the metadata doesn't have an author field
            normalized_data['authors'] = None

        # Assume that the `display_title` field follows a standard format: list -> dict
        # [{'display_title': 'Voice and Punishment : A Global\n            Survey Experiment on Tax Morale'}]
        normalized_data['display_title'] = normalized_data['display_title'].map(lambda dt: dt[0].get('display_title') if len(dt) else None)

        try:
            normalized_data["abstract"] = normalized_data["abstracts"].map(lambda x: x.get("cdata!") if isinstance(x, dict) else None)
        except KeyError:
            normalized_data["abstract"] = None

        for col in normalized_data.columns:
            try:
                # Normalize line breaks for string data
                normalized_data[col] = normalize_str_col(normalized_data[col])
                normalized_data[col] = normalized_data[col].map(lambda x: collapse_array(x, '|'))
                normalized_data[col] = normalized_data[col].map(lambda x: collapse_nested_dict(x, '|'))

            except AttributeError:
                # column is not a string type
                continue

        normalized_data['majdoctype'] = make_unique_entry(normalized_data['majdoctype'])
        normalized_data['admreg'] = make_unique_entry(normalized_data['admreg'])
        normalized_data['geo_regions'] = normalized_data['geo_regions'].map(normalize_geo_regions)

        normalized_data['docyear'] = pd.to_datetime(normalized_data['docdt']).dt.year

#         existing_cols = normalized_data.columns.intersection(columns)
#         new_cols = pd.Index(set(columns).difference(normalized_data.columns))

#         normalized_data = normalized_data[existing_cols]

#         for col in new_cols:
#             normalized_data[col] = None

        if normalized_df.empty:
            normalized_df = normalized_data
        else:
            normalized_df = pd.concat([normalized_df, normalized_data], axis=0)

#     if save_data:
#         if fname is None:
#             fname = f"WBCorpus_metadata_{pd.datetime.now().strftime('%m-%d-%Y')}.csv"
#         if data_dir:
#             fname = os.path.join(data_dir, fname)
            
#         normalized_df.to_csv(fname)

    return normalized_df

time: 2.74 ms


In [521]:
%%time
normalized_df = normalize_document_data(data_dir=SCRAPER_DIR)

print('\nMissing fields:')
for c in 'uid,guid,docyear,majdoctype,doctype,authors,abstract,colti,display_title,docdt,docm_id,historic_topic,pdfurl,seccl,txturl,language,admreg,country,txtfilename,txtfileid,txturl2,doctypeid,lang_detected,lang_score,tokens'.split(','):
    if c == 'uid':
        continue
    try:
        normalized_df[c]
    except:
        print(f'\t{c}')

/home/wb536061/wb_nlp/data/corpus/WB/tmp_api_json/data-3.json
/home/wb536061/wb_nlp/data/corpus/WB/tmp_api_json/data-1.json
/home/wb536061/wb_nlp/data/corpus/WB/tmp_api_json/data-0.json
/home/wb536061/wb_nlp/data/corpus/WB/tmp_api_json/data-4.json
/home/wb536061/wb_nlp/data/corpus/WB/tmp_api_json/data-5.json
/home/wb536061/wb_nlp/data/corpus/WB/tmp_api_json/data-2.json
/home/wb536061/wb_nlp/data/corpus/WB/tmp_api_json/data-6.json
/home/wb536061/wb_nlp/data/corpus/WB/tmp_api_json/data-7.json
/home/wb536061/wb_nlp/data/corpus/WB/tmp_api_json/data-9.json
/home/wb536061/wb_nlp/data/corpus/WB/tmp_api_json/data-8.json
/home/wb536061/wb_nlp/data/corpus/WB/tmp_api_json/data-11.json
/home/wb536061/wb_nlp/data/corpus/WB/tmp_api_json/data-12.json
/home/wb536061/wb_nlp/data/corpus/WB/tmp_api_json/data-10.json
/home/wb536061/wb_nlp/data/corpus/WB/tmp_api_json/data-13.json
/home/wb536061/wb_nlp/data/corpus/WB/tmp_api_json/data-14.json
/home/wb536061/wb_nlp/data/corpus/WB/tmp_api_json/data-16.json
/h

In [522]:
normalized_df.shape

(296978, 68)

time: 5.39 ms


In [22]:
normalized_df.shape

(296978, 67)

time: 1.07 ms


In [16]:
normalized_df.shape

(297347, 67)

In [17]:
normalized_df.shape

(297347, 67)

# Improvements

By using dataframes, we can easily exploit its slice and filter methods to get certain partitions of the dataset given an arbitrary filter set.

### Filtering by majdoctype

We list all the available **major document types**.

In [523]:
for majdoctype in normalized_df.majdoctype.unique():
    print(majdoctype)

Project Documents
Publications & Research
Board Documents
Economic & Sector Work
Country Focus

Publications &amp; Research
Economic &amp; Sector Work
time: 64.8 ms


In [524]:
for majdoctype in normalized_df.majdoctype.unique():
    print(majdoctype)

Project Documents
Publications & Research
Board Documents
Economic & Sector Work
Country Focus

Publications &amp; Research
Economic &amp; Sector Work
time: 64.2 ms


We can sample from all the dataset all the documents corresponding to the `Project Documents` document type as shown below.

In [525]:
project_documents = normalized_df[normalized_df.majdoctype == 'Project Documents']
project_documents.head(2)

Unnamed: 0_level_0,url,available_in,url_friendly_title,new_url,guid,disclosure_date,disclosure_type,disclosure_type_date,publishtoextweb_dt,disclstat,...,loan_no,dois,isbn,src_cit,virt_coll,issn,Environmental Category,credit_no,unregnbr,vol_title
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
D32659506,http://documents.worldbank.org/curated/en/2322...,English,http://documents.worldbank.org/curated/en/2322...,2020/12/32659506/Congo-Democratic-Republic-of-...,232211607643292798,2020-12-10T00:00:00Z,,2020-12-10T00:00:00Z,2020-12-10T00:00:00Z,Disclosed,...,,,,,,,,,,
D32659154,http://documents.worldbank.org/curated/en/2391...,English,http://documents.worldbank.org/curated/en/2391...,2020/12/32659154/Disclosable-Version-of-the-IS...,239181607635364212,2020-12-10T00:00:00Z,,2020-12-10T00:00:00Z,2020-12-10T00:00:00Z,Disclosed,...,,,,,,,,,,


time: 1.66 s


### Filtering by year

Again, we can perform a filtered view of the metadata based on the document year.

In [526]:
normalized_df[normalized_df.docyear == 2018].head(2)

Unnamed: 0_level_0,url,available_in,url_friendly_title,new_url,guid,disclosure_date,disclosure_type,disclosure_type_date,publishtoextweb_dt,disclstat,...,loan_no,dois,isbn,src_cit,virt_coll,issn,Environmental Category,credit_no,unregnbr,vol_title
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
D30732714,http://documents.worldbank.org/curated/en/2693...,English,http://documents.worldbank.org/curated/en/2693...,2018/12/30732714/Bolivia-LATIN-AMERICA-AND-CAR...,269351546221067558,2018-12-31T00:00:00Z,,2018-12-31T00:00:00Z,2018-12-31T00:00:00Z,Disclosed,...,,,,,,,,,,
D30732671,http://documents.worldbank.org/curated/en/3406...,English,http://documents.worldbank.org/curated/en/3406...,2018/12/30732671/Disclosable-Version-of-the-IS...,340671546216904449,2018-12-31T00:00:00Z,,2018-12-31T00:00:00Z,2018-12-31T00:00:00Z,Disclosed,...,,,,,,,,,,


time: 62.5 ms


### Filtering the dataset by combination of conditions

Shown below is the method of extracting snapshots of the entire metadata based on specific filters.

In [527]:
filters = (
    (normalized_df.docyear == 2018) &
    (normalized_df.majdoctype == 'Project Documents') &
    (normalized_df.country == 'Philippines')
)

normalized_df[filters].head(2)

Unnamed: 0_level_0,url,available_in,url_friendly_title,new_url,guid,disclosure_date,disclosure_type,disclosure_type_date,publishtoextweb_dt,disclstat,...,loan_no,dois,isbn,src_cit,virt_coll,issn,Environmental Category,credit_no,unregnbr,vol_title
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
D30749732,http://documents.worldbank.org/curated/en/4068...,English,http://documents.worldbank.org/curated/en/4068...,2018/12/30749732/LGUGC-Entity-AFS-CY2017-fd,406801547089645115,2019-01-11T00:00:00Z,,2019-01-11T00:00:00Z,2019-01-11T00:00:00Z,Disclosed,...,,,,,,,,,,
D30732578,http://documents.worldbank.org/curated/en/1839...,English,http://documents.worldbank.org/curated/en/1839...,2018/12/30732578/Disclosable-Version-of-the-IS...,183961546204484443,2018-12-30T00:00:00Z,,2018-12-30T00:00:00Z,2018-12-30T00:00:00Z,Disclosed,...,,,,,,,,,,


time: 104 ms


In [528]:
METADATA_COLS = [
    'corpus', 'id', 'path_original', 'path_clean', 'filename_original', 'year',
    'major_doc_type', 'doc_type', 'author', 'abstract', 'collection', 'title', 'journal', 'volume',
    'date_published', 'digital_identifier', 'topics_src', 'url_pdf', 'url_txt', 'language_src',
    'adm_region', 'geo_region', 'country',

    # Not yet available at this stage...,
    # 'language_detected', 'language_score', 'tokens'  

    # WB specific fields
    'wb_lending_instrument', 'wb_product_line', 'wb_major_theme', 'wb_theme', 'wb_sector',
    'wb_subtopic_src', 'wb_project_id',
    # 'wb_environmental_category', 
]

time: 408 µs


In [529]:
def build_wb_id(uid, max_len=9):
    # return f'wb_{"0"*(max_len - len(str(uid)))}{uid}'
    return f'wb_{uid}'


def standardize_metadata_fields(metadata_df):
    '''
    This method must be applied to the original metadata processed dataframe.
    This will assign the final field names.
    '''
    metadata_df = metadata_df.reset_index()
    metadata_df['uid'] = metadata_df.uid.map(build_wb_id)

    wb_core_field_map = {
        'uid': 'id',
        'docyear': 'year',
        'majdoctype': 'major_doc_type',
        'doctype': 'doc_type',
        'authors': 'author',
        'abstract': 'abstract',
        'colti': 'collection',
        'display_title': 'title',
        'docdt': 'date_published',
        'docm_id': 'digital_identifier',
        'historic_topic': 'topics_src',
        'pdfurl': 'url_pdf',
        'txturl': 'url_txt',
        'language': 'language_src',
        'admreg': 'adm_region',
        'country': 'country',
        'geo_regions': 'geo_region',
    }

    wb_specific_field_map = {
        'lndinstr': 'wb_lending_instrument',
        'prdln': 'wb_product_line',
        'majtheme': 'wb_major_theme',
        'theme': 'wb_theme',
        'sectr': 'wb_sector',
        # 'envcat': 'wb_environmental_category',
        'projectid': 'wb_project_id',
        'subtopic': 'wb_subtopic_src',
    }

    wb_new_fields = ['corpus', 'path_original', 'path_clean', 'filename_original', 'journal', 'volume']

    # path_original_dir = '/NLP/CORPUS/WB/TXT_ORIG'
    # path_clean_dir = '/NLP/CORPUS/WB/TXT_CLEAN'

    path_original_dir = 'data/corpus/WB/TXT_ORIG'
    path_clean_dir = ''

    # Perform post normalization preprocessing
    metadata_df['docdt'] = pd.to_datetime(metadata_df['docdt']).dt.date.map(str)

    # Apply final field names
    metadata_df = metadata_df.rename(columns=wb_core_field_map)
    metadata_df = metadata_df.rename(columns=wb_specific_field_map)

    for nf in wb_new_fields:
        if nf == 'corpus':
            metadata_df[nf] = 'WB'
        elif nf == 'filename_original':
            metadata_df[nf] = metadata_df.url_txt.map(lambda x: os.path.basename(x) if isinstance(x, str) else x)
        elif nf == 'path_original':
            metadata_df[nf] = metadata_df['id'].map(lambda x: f"{path_original_dir}/{x}.txt")
        elif nf == 'path_clean':
            metadata_df[nf] = metadata_df['id'].map(lambda x: f"{path_clean_dir}/{x}.txt" if path_clean_dir else None)
        elif nf in ['journal', 'volume']:
            metadata_df[nf] = None

    metadata_df = metadata_df[METADATA_COLS]
    return metadata_df.set_index('id')


time: 1.43 ms


In [530]:
%%time
normalized_final_df = standardize_metadata_fields(normalized_df)

CPU times: user 4.04 s, sys: 606 ms, total: 4.65 s
Wall time: 4.65 s
time: 4.65 s


In [536]:
normalized_final_df[~normalized_final_df["abstract"].isnull()].head(2)

Unnamed: 0_level_0,corpus,path_original,path_clean,filename_original,year,major_doc_type,doc_type,author,abstract,collection,...,adm_region,geo_region,country,wb_lending_instrument,wb_product_line,wb_major_theme,wb_theme,wb_sector,wb_subtopic_src,wb_project_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
wb_D32662137,WB,data/corpus/WB/TXT_ORIG/wb_D32662137.txt,,Two-For-One-How-Leveraging-Small-Claims-Proced...,2020.0,Publications & Research,Report,"Svetozara Petkova,Runyararo Gladys Senderayi",People often argue about little things in both...,,...,The World Region,,World,,,,,,"Judicial System Reform,Law and Justice Institu...",
wb_D32653799,WB,data/corpus/WB/TXT_ORIG/wb_D32653799.txt,,Re-thinking-the-Approach-to-Informal-Businesse...,2020.0,Publications & Research,Report,"William Iver Nielsen,Andreja Marusic,Tania Gho...",Interventions over the past decades to encoura...,,...,The World Region,,World,,,,,,"Labor Markets,Financial Sector Policy,Investme...",


time: 138 ms


In [537]:
from datetime import datetime

time: 319 µs


In [538]:
fname = f"wb_metadata-{datetime.now()}.csv"

normalized_final_df.reset_index()[METADATA_COLS].to_csv(
    os.path.join(SCRAPER_DIR, fname),
    index=False
)

time: 9.68 s


In [33]:
normalized_final_df.head(2)

Unnamed: 0_level_0,corpus,path_original,path_clean,filename_original,year,major_doc_type,doc_type,author,collection,title,...,adm_region,geo_region,country,wb_lending_instrument,wb_product_line,wb_major_theme,wb_theme,wb_sector,wb_subtopic_src,wb_project_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
wb_D32659506,WB,data/corpus/WB/wb_D32659506.txt,,Congo-Democratic-Republic-of-AFRICA-EAST-P1738...,2020.0,Project Documents,Procurement Plan,Dominique Baado,,"Congo, Democratic Republic of - AFRICA EAST - ...",...,Africa,,"Congo, Democratic Republic of",,,,,,"Regulatory Regimes,Legal Reform,Social Policy,...",P173825
wb_D32659154,WB,data/corpus/WB/wb_D32659154.txt,,Disclosable-Version-of-the-ISR-Kenya-Secondary...,2020.0,Project Documents,Implementation Status and Results Report,"Waheed,Huma Ali",,Disclosable Version of the ISR - Kenya Seconda...,...,Africa,,Kenya,,,,,,"Educational Sciences,Secondary Education,Educa...",P160083


time: 18.5 ms


In [34]:
normalized_final_df.shape

(296978, 28)

time: 1.41 ms


In [31]:
normalized_final_df.shape

(296994, 28)

In [29]:
normalized_final_df.shape

(297347, 28)

In [33]:
normalized_final_df.shape

(297490, 28)

# Downloading actual files

So far, we already have a collection of metadata for the documents in the database. The following scripts will enable us to download the actual text documents associate with each entry in the database.

In [35]:
def download_document_and_reports_file(data=None):
    download_links = {}

    if data is None:
        for json_file in glob.iglob(os.path.join(API_JSON_DIR, '*.json')):
            print(json_file)

            with open(json_file) as fl:
                data = json.load(fl)
                data = pd.DataFrame(data).T
                data.index = data.index.map(build_wb_id)
                data.index.name = 'id'
                download_data = data['txturl']  # txturl since this directly uses data from api

                download_links.update(download_data)
    else:
        download_links = data['url_txt'].to_dict()

    return download_links

time: 491 µs


In [36]:
WB_TXT_DIR = dir_manager.get_data_dir("corpus", "WB", "TXT_ORIG")

if not os.path.isdir(WB_TXT_DIR):
    os.makedirs(WB_TXT_DIR)

time: 7.65 ms


In [37]:
%%time
# links = download_document_and_reports_file(data=normalized_final_df)
links = download_document_and_reports_file(data=None)
error_downloads = []

/home/wb536061/wb_nlp/data/corpus/WB/tmp_api_json/data-3.json
/home/wb536061/wb_nlp/data/corpus/WB/tmp_api_json/data-1.json
/home/wb536061/wb_nlp/data/corpus/WB/tmp_api_json/data-0.json
/home/wb536061/wb_nlp/data/corpus/WB/tmp_api_json/data-4.json
/home/wb536061/wb_nlp/data/corpus/WB/tmp_api_json/data-5.json
/home/wb536061/wb_nlp/data/corpus/WB/tmp_api_json/data-2.json
/home/wb536061/wb_nlp/data/corpus/WB/tmp_api_json/data-6.json
/home/wb536061/wb_nlp/data/corpus/WB/tmp_api_json/data-7.json
/home/wb536061/wb_nlp/data/corpus/WB/tmp_api_json/data-9.json
/home/wb536061/wb_nlp/data/corpus/WB/tmp_api_json/data-8.json
/home/wb536061/wb_nlp/data/corpus/WB/tmp_api_json/data-11.json
/home/wb536061/wb_nlp/data/corpus/WB/tmp_api_json/data-12.json
/home/wb536061/wb_nlp/data/corpus/WB/tmp_api_json/data-10.json
/home/wb536061/wb_nlp/data/corpus/WB/tmp_api_json/data-13.json
/home/wb536061/wb_nlp/data/corpus/WB/tmp_api_json/data-14.json
/home/wb536061/wb_nlp/data/corpus/WB/tmp_api_json/data-16.json
/h

In [38]:
import numpy as np

time: 224 µs


In [41]:
def download_and_store_file(ix, fname, link):
    id = os.path.splitext(os.path.basename(fname))[0] # os.path.basename(fname).replace('.txt', '')
    if ix and ix%10000 == 0:
        print(ix, id, link)

    ret_val = 0

    if not isinstance(link, str):
        ret_val = (id, link, 'NaN link')
    else:    
        try:
            if not os.path.isfile(fname):
                # Download only files that are not yet available locally.
                response = download_with_retry(link)

                if response:
                    with open(fname, 'wb') as fl:
                        fl.write(response.content)
                else:
                    ret_val = (id, link, 'Empty response.')
        except Exception as e:
            ret_val = (id, link, e)

    return ret_val

def transform_link_to_pdf(link):
    if not isinstance(link, str):
        return None

    link = link.replace("/text/", "/pdf/")
    link = os.path.splitext(link)[0] + ".pdf"

    return link


time: 729 µs


# Download TXT

In [42]:
print(len(links))
WB_TXT_DIR = dir_manager.get_data_dir("corpus", "WB", "TXT_ORIG")

if not os.path.isdir(WB_TXT_DIR):
    os.makedirs(WB_TXT_DIR)

ret_codes = Parallel(n_jobs=20)(
    delayed(download_and_store_file)(ix, os.path.join(WB_TXT_DIR, f'{id}.txt'), link) for ix, (id, link) in enumerate(links.items())
)

296727
time: 1min 16s


# Download PDF

In [43]:
pdf_links = {id: transform_link_to_pdf(link) for id, link in links.items()}
print(len(pdf_links))

WB_PDF_DIR = dir_manager.get_data_dir("corpus", "WB", "PDF_ORIG")

if not os.path.isdir(WB_PDF_DIR):
    os.makedirs(WB_PDF_DIR)

pdf_ret_codes = Parallel(n_jobs=5)(
    delayed(download_and_store_file)(ix, os.path.join(WB_PDF_DIR, f'{id}.pdf'), link) for ix, (id, link) in enumerate(pdf_links.items()) if link)

296727
time: 3min 38s


In [73]:
normalized_final_df["author"]

id
wb_D32658475   NaN
wb_D32659506   NaN
wb_D32659154   NaN
wb_D32659153   NaN
wb_D32659160   NaN
                ..
wb_D29851457   NaN
wb_D31825733   NaN
wb_D16919195   NaN
wb_D32092090   NaN
wb_D14551809   NaN
Name: author, Length: 296994, dtype: float64

In [27]:
# # Cleanup temporary json files from API
# import shutil

# shutil.rmtree(API_JSON_DIR)

In [None]:
# import pandas as pd
# meta = pd.read_csv('/home/wb536061/wbes2474/NLP/CORPUS/WB/wb_metadata.csv')
# meta.shape

In [3]:
# ssh -A wb536061@w1lxbdatad07 rsync -vuarP "/home/wb536061/wbes2474/NLP/CORPUS/WB/wb_metadata-2021-02-17\ 18:06:18.367302.csv" wb536061@w0lxsnlp01:/decfile2/Modeling/NLP/CORPUS/WB/



(249805, 29)

# Get words in corpus

In [295]:
from pathlib import Path
from collections import Counter
import re
import pickle

corpus_dir = Path(dir_manager.get_data_dir("corpus"))

time: 486 µs


In [296]:
word_use = Counter()
word_doc = Counter()

newline_dash = re.compile(r"(\S+)-\s+(\S+)")
for ix, fpath in enumerate(corpus_dir.glob("*/EN_TXT_ORIG/*.txt")):

    with open(fpath, "rb") as open_file:
        text = open_file.read().decode("utf-8", errors="ignore")
        text = newline_dash.sub(r"\1\2", text).lower()
        doc_use = Counter(re.findall(r"[a-z]+", text))
        for word in doc_use:
            word_doc[word] += 1

    word_use.update(doc_use)

with open(corpus_dir / "en_corpus_word_use.dict.pickle", "wb") as open_file:
    pickle.dump(word_use, open_file)

with open(corpus_dir / "en_corpus_word_doc.dict.pickle", "wb") as open_file:
    pickle.dump(word_doc, open_file)

time: 1h 4min 50s


In [297]:
import enchant
import re
VALID_TOKEN_PAT = re.compile("^[a-z]+$")
en_dict = enchant.Dict("en_US")

def check(word):
    is_en = en_dict.check(word)
    if not is_en:
        s = en_dict.suggest(word)
        is_en = word.lower() in {i.lower() for i in s}
    return is_en

time: 1.26 ms


In [298]:
word_doc_df = pd.Series(word_doc).sort_values(ascending=False)
word_doc_df.name = "freq"

word_use_df = pd.Series(word_use).sort_values(ascending=False)
word_use_df.name = "freq"

word_usage_rate = word_use_df / word_doc_df

time: 1min 8s


In [299]:
wiki_count = pd.read_csv("../../data/whitelists/whitelists/wordfreq-enwiki-latest-pages-articles.xml.bz2.txt", sep=" ", names=["word", "freq"])

time: 1.01 s


In [300]:
word_doc_frame = word_doc_df.to_frame().head(100000)
word_doc_frame["en"] = word_doc_frame.index.map(check)
word_doc_frame.head()

Unnamed: 0,freq,en
the,334398,True
to,332722,True
of,332443,True
and,332289,True
for,329768,True


time: 13min 37s


In [301]:
w = word_doc_frame[word_doc_frame["en"] == False].index.intersection(wiki_count.head(500000)["word"])

time: 595 ms


In [302]:
with open(dir_manager.get_data_dir("whitelists", "whitelists", "doc-freq-wiki-wordlist.txt"), "w") as open_file:
    for i in w:
        if VALID_TOKEN_PAT.match(i):
            open_file.write(i.strip() + "\n")

time: 21.6 ms


In [304]:
w

Index(['pre', 'ibrd', 'multi', 'capita', 'usd', 'sectoral', 'socio', 'adb',
       'ngos', 'tf',
       ...
       'manabu', 'raghabpur', 'xiaoxi', 'pcrm', 'hamra', 'krishan', 'advanta',
       'barone', 'skikda', 'mtw'],
      dtype='object', length=29876)

time: 6.9 ms


In [291]:
word_doc_df.head(70000).tail(10)

maintainedby    159
copesul         159
benefitsof      159
valua           159
fallacies       159
graficos        159
hated           159
maintai         159
tanahu          159
lettering       159
Name: freq, dtype: int64

time: 2.2 ms


In [280]:
word_doc_frame[word_doc_frame["en"] == False].shape

(62035, 2)

time: 12.6 ms


In [208]:
wiki_count.head()

Unnamed: 0,word,freq
0,the,168874258
1,of,79603285
2,in,69200815
3,and,69025766
4,to,48269837


time: 4.1 ms


In [242]:
w = word_use_df.head(25000).index.intersection(wiki_count.head(500000)["word"])

time: 303 ms


In [243]:
w

Index(['the', 'of', 'and', 'to', 'in', 'for', 'is', 'be', 'on', 'project',
       ...
       'uti', 'fram', 'stifling', 'lyceum', 'domes', 'agartala', 'misiones',
       'ello', 'gta', 'generales'],
      dtype='object', length=22769)

time: 2.31 ms


In [209]:
oov_words = word_doc_df.index.difference(wiki_count.word)

time: 24 s


In [210]:
len(oov_words)

7268438

time: 1.79 ms


In [227]:
word_use_df.loc[oov_words].sort_values(ascending=False).head(100).tail(50)

dpmu                32436
alisation           31490
minist              31304
gorfq               30297
laboration          30037
subloans            29348
curit               29176
achev               29037
ohsr                28965
esmps               27656
csindv              26771
mediumand           25484
nagement            25377
lowand              25199
adquisici           24785
qcvn                23983
selecci             23852
btnmt               22861
poblaci             22781
ciency              22364
doingbusiness       22326
remdp               21685
phrd                21484
buildco             21350
unwithdrawn         21292
paration            21167
capacit             21014
praps               20927
informaci           20588
evaluaci            20481
costunder           20470
clientconnection    20312
subloan             20245
qualityand          20231
quipements          20012
financi             19825
fournitures         19755
ppiaf               19642
finalizado  

time: 5.09 s


In [226]:
en_dict.suggest("cancelado")

['cancel ado', 'cancel-ado', 'cancellation']

time: 32.7 ms


In [185]:
word_use_df["python"], word_doc_df["python"]

(1345, 663)

time: 1.77 ms


In [162]:
word_usage_rate[word_usage_rate > 500].sort_values().head(50)

hmbb                          506.000000
daclatasvir                   528.578947
consultoraambientaljambato    559.000000
ucgpne                        582.000000
the                           619.747449
mbhg                          701.000000
mustroll                      729.833333
jicapr                        754.000000
chakondi                      779.000000
ibeem                         875.000000
olutionsu                     921.000000
Name: freq, dtype: float64

time: 9.21 ms


In [167]:
l = np.log(word_usage_rate)

time: 182 ms


In [179]:
l["pandas"]

0.9829821063728273

time: 1.51 ms


In [193]:
l

a                                 4.940984
aa                                0.923036
aaa                               1.654194
aaaa                              1.106852
aaaaa                             0.490206
                                    ...   
zzzzzzzzzzzzzzzz                  0.000000
zzzzzzzzzzzzzzzzzzzzzz            0.000000
zzzzzzzzzzzzzzzzzzzzzzzz          0.000000
zzzzzzzzzzzzzzzzzzzzzzzzzz        0.000000
zzzzzzzzzzzzzzzzzzzzzzzzzzzzzz    0.000000
Name: freq, Length: 7895772, dtype: float64

time: 2.96 ms


In [120]:
top_100k = word_doc_df[word_doc_df >= 100].to_frame()
top_100k["en"] = top_100k.index.map(en_dict.check)
top_100k.head()

Unnamed: 0,freq,en
the,340842,True
of,339530,True
and,339113,True
to,338220,True
for,336543,True


time: 766 ms


In [138]:
top_100k[top_100k["en"] != True].head(1000).tail(50)

Unnamed: 0,freq,en
bs,5200,False
cient,5195,False
dt,5189,False
stateowned,5188,False
ay,5183,False
ger,5181,False
aaa,5169,False
gbv,5167,False
eq,5162,False
nes,5161,False


time: 17.5 ms


In [136]:
top_100k[top_100k["freq"] >= 30].shape #.loc["covid"]

(95684, 2)

time: 18.7 ms


In [141]:
word_doc_df[word_doc_df >= 30].shape

(226443,)

time: 18.9 ms


In [144]:
word_doc_df[word_doc_df > 2].shape

(1516004,)

time: 78.3 ms


In [148]:
en_dict.suggest("lsms")

['lams', 'isms', 'ls ms', 'ls-ms', 'smalls']

time: 18.7 ms


In [150]:
word_use["lsms"]

14205

time: 1.29 ms


In [79]:
len(word_use)

13827671

time: 1.2 ms


In [78]:
word_use.most_common(100)

[('the', 214318097),
 ('of', 141354242),
 ('and', 134904503),
 ('to', 83512337),
 ('in', 77278965),
 ('a', 54967841),
 ('for', 47338471),
 ('is', 28564126),
 ('be', 25006460),
 ('on', 23802749),
 ('project', 22230687),
 ('by', 21528059),
 ('as', 21064770),
 ('with', 20864970),
 ('s', 20611112),
 ('are', 18874613),
 ('that', 18400201),
 ('will', 14462952),
 ('or', 14321585),
 ('from', 14145334),
 ('i', 13081068),
 ('this', 12773708),
 ('n', 12518222),
 ('at', 12347332),
 ('e', 11851721),
 ('development', 10865081),
 ('bank', 10281509),
 ('o', 9972697),
 ('not', 9735684),
 ('an', 9325677),
 ('have', 9053618),
 ('has', 8687585),
 ('de', 8582783),
 ('no', 8558967),
 ('it', 8553165),
 ('d', 8463582),
 ('c', 8454718),
 ('which', 8157941),
 ('was', 8117387),
 ('management', 8068529),
 ('t', 7856929),
 ('m', 7810134),
 ('water', 7579247),
 ('other', 7314613),
 ('government', 6764486),
 ('b', 6726532),
 ('all', 6710519),
 ('land', 6612628),
 ('implementation', 6605461),
 ('r', 6586767),
 ('l', 

time: 2.11 s


In [73]:
word_doc.most_common(100)

[('the', 340851),
 ('of', 339535),
 ('and', 339124),
 ('to', 338407),
 ('for', 336933),
 ('in', 333275),
 ('a', 333229),
 ('s', 332604),
 ('as', 325425),
 ('by', 323373),
 ('bank', 321721),
 ('on', 320768),
 ('with', 316205),
 ('development', 307286),
 ('no', 306676),
 ('be', 306656),
 ('is', 306336),
 ('this', 305819),
 ('are', 300533),
 ('project', 298650),
 ('other', 297444),
 ('from', 293265),
 ('not', 293183),
 ('at', 289016),
 ('or', 288932),
 ('that', 288811),
 ('report', 284905),
 ('all', 284368),
 ('under', 281997),
 ('i', 281148),
 ('an', 280641),
 ('may', 277375),
 ('will', 274865),
 ('date', 274260),
 ('information', 273005),
 ('management', 271729),
 ('international', 268449),
 ('which', 267013),
 ('services', 266930),
 ('c', 266592),
 ('has', 266069),
 ('it', 265923),
 ('country', 264572),
 ('one', 264229),
 ('have', 261668),
 ('e', 261268),
 ('b', 258924),
 ('its', 257530),
 ('including', 256922),
 ('implementation', 256285),
 ('d', 254964),
 ('national', 254690),
 ('wor

time: 1.99 s


In [320]:
!pwd

/home/wb536061/wb_nlp/notebooks/scrapers
time: 1.23 s


# Process WB metadata

In [337]:
import re

def standardize_authors_list(authors):
    # authors = "Runji,Justin;Jose Rizal;Bonifacio, Andres; Damaso, Maria Clara"
    if not isinstance(authors, str):
        return None
    return ",".join([re.sub(r"(.*),\s*(.*)", r"\2 \1", i.strip()) for i in authors.split(";")])

time: 759 µs


In [342]:
copy_normalized_final_df = normalized_final_df.copy()
copy_normalized_final_df["author"] = normalized_final_df["author"].map(standardize_authors_list)

time: 2.05 s


In [344]:
copy_normalized_final_df["_id"] = copy_normalized_final_df.index
copy_normalized_final_df.head()

Unnamed: 0_level_0,corpus,path_original,path_clean,filename_original,year,major_doc_type,doc_type,author,collection,title,...,geo_region,country,wb_lending_instrument,wb_product_line,wb_major_theme,wb_theme,wb_sector,wb_subtopic_src,wb_project_id,_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
wb_D32659506,WB,data/corpus/WB/wb_D32659506.txt,,Congo-Democratic-Republic-of-AFRICA-EAST-P1738...,2020.0,Project Documents,Procurement Plan,Dominique Baado,,"Congo, Democratic Republic of - AFRICA EAST - ...",...,,"Congo, Democratic Republic of",,,,,,"Regulatory Regimes,Legal Reform,Social Policy,...",P173825,wb_D32659506
wb_D32659154,WB,data/corpus/WB/wb_D32659154.txt,,Disclosable-Version-of-the-ISR-Kenya-Secondary...,2020.0,Project Documents,Implementation Status and Results Report,Huma Ali Waheed,,Disclosable Version of the ISR - Kenya Seconda...,...,,Kenya,,,,,,"Educational Sciences,Secondary Education,Educa...",P160083,wb_D32659154
wb_D32659153,WB,data/corpus/WB/wb_D32659153.txt,,China-EAST-ASIA-AND-PACIFIC-P126832-CH-GEF-Mun...,2020.0,Project Documents,Procurement Plan,Jiangping Li,,China - EAST ASIA AND PACIFIC - P126832 - CH G...,...,,China,,,,,,"Pollution Management & Control,Urban Solid Was...",P126832,wb_D32659153
wb_D32659160,WB,data/corpus/WB/wb_D32659160.txt,,Disclosable-Version-of-the-ISR-Roads-and-Emplo...,2020.0,Project Documents,Implementation Status and Results Report,Mira Morad,,Disclosable Version of the ISR - Roads and Emp...,...,,Lebanon,,,,,,"Transport Services,Labor Markets,Financial Sec...",P160223,wb_D32659160
wb_D32658894,WB,data/corpus/WB/wb_D32658894.txt,,Kenya-AFRICA-EAST-P152394-Transforming-Health-...,2020.0,Project Documents,Procurement Plan,Zachariah Kimwetich,,Kenya - AFRICA EAST - P152394 - Transforming H...,...,,Kenya,,,,,,"Health Systems Development & Reform,Health Car...",P152394,wb_D32658894


time: 37.5 ms


In [515]:
import json
from wb_nlp.interfaces import mongodb
from wb_nlp.types import metadata as meta_type
from wb_nlp.types import metadata_enums
import importlib

importlib.reload(meta_type)
importlib.reload(metadata_enums)

<module 'wb_nlp.types.metadata_enums' from '/home/wb536061/wb_nlp/src/wb_nlp/types/metadata_enums.py'>

time: 20.8 ms


In [365]:
# fname = f"wb_metadata-{datetime.now()}.csv"
fname = "wb_metadata-2021-03-27 19:17:44.764136.csv"
wb_metadata = pd.read_csv(dir_manager.get_data_dir("corpus", "WB", fname))

collection = mongodb.get_collection("test_nlp", "docs_metadata")
errors_list = []
no_url = []
metadata_list = []
for i, row in wb_metadata.iterrows():
    if i % 10000 == 0:
        print(i)

    row["_id"] = row["id"]

    if row["url_txt"] == "not to be displayed--":
        print(row["id"])
        no_url.append(row)
        continue

    try:
        meta = json.loads(meta_type.make_metadata_model_from_nlp_schema(row.fillna("")).json())
        meta["_id"] = meta["id"]
        metadata_list.append(meta)
    except:
        errors_list.append(row)
    
collection.insert_many(metadata_list)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


time: 4.5 s


In [469]:
# errors_list = []
# no_url = []
# metadata_list = []
# for i, row in wb_metadata.iterrows():
#     if i % 10000 == 0:
#         print(i)

#     row["_id"] = row["id"]

#     if row["url_txt"] == "not to be displayed--":
#         print(row["id"])
#         no_url.append(row)
#         continue

#     try:
#         meta = json.loads(meta_type.make_metadata_model_from_nlp_schema(row.fillna("")).json())
#         meta["_id"] = meta["id"]
#         metadata_list.append(meta)
#     except:
#         errors_list.append(row)

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
wb_D12739952
wb_D12619066
210000
220000
230000
wb_D19660178
wb_D19660149
wb_D5086350
wb_D5087542
wb_D5085295
wb_D5087543
wb_D19585040
240000
wb_D5083514
wb_D5078290
wb_D5081641
wb_D19551988
wb_D19544006
wb_D5081541
wb_D5083310
wb_D19533870
wb_D5081300
wb_D5081296
wb_D5080101
wb_D5081237
wb_D5081123
wb_D5080972
250000
wb_D5084124
wb_D5082488
wb_D5080888
wb_D5080900
wb_D5080819
wb_D5079752
wb_D5082533
wb_D5079622
wb_D5079389
wb_D5082313
wb_D5082076
wb_D5079216
wb_D5080601
wb_D5077916
wb_D5079132
260000
wb_D5076462
wb_D5076508
wb_D5076280
wb_D5080398
wb_D5076193
wb_D5079207
wb_D5075954
wb_D5075858
270000
280000
290000
time: 11min 9s


In [477]:
len(no_url), len(errors_list), len(metadata_list)

(46, 351, 296581)

time: 2.71 ms


In [516]:
# Review failing items
# # eee = []
# # for ix, row in enumerate(errors_list):
# #     if row["doc_type"] in ["Dataset", "Policy", "Presentation", "Proceedings", "Supervision Report", "PAS Research Paper"]:
# #         continue
# #     try:
# #         eee.append(meta_type.make_metadata_model_from_nlp_schema(row.fillna("")))
# #     except Exception as e:
# #         print(row["doc_type"])
# #         raise(e)

time: 382 ms


In [482]:
# Review failing items
# # dts = [
# #     "Environmental and Social Management Framework",
# #     "IEG Evaluation",
# #      "News Story",
# #     "Technical Assessment",
# #     "Procurement Assessment",
# #     "Reference Material",
# #     "Flash Report",
# #     "Sector Report",
# # ]

# # eee = [i for i in errors_list if not any([v in i["doc_type"] for v in dts])]
# # eee = [i for i in eee if (isinstance(i["geo_region"], str) and "africa west" not in i["geo_region"].lower())]
# # print(len(eee))
# # row = eee[1]
# # print(row["doc_type"], row["geo_region"], row["url_txt"])
# # # row["geo_region"] = row["geo_region"].replace("Africa West", "West Africa") if isinstance(row["geo_region"], str) else None
# # meta_type.make_metadata_model_from_nlp_schema(row.fillna(""))

time: 383 µs


In [470]:
# Review failing items
# # dts = [
# #     "Environmental and Social Management Framework",
# #     "IEG Evaluation",
# #      "News Story",
# #     "Technical Assessment",
# #     "Procurement Assessment",
# #     "Reference Material",
# #     "Flash Report",
# #     "Sector Report",
# # ]

# # eee = [i for i in errors_list if not any([v in i["doc_type"] for v in dts])]
# # eee = [i for i in eee if (isinstance(i["geo_region"], str) and "africa west" not in i["geo_region"].lower())]
# # print(len(eee))
# # row = eee[6]
# # print(row["doc_type"], row["geo_region"], row["url_txt"])
# # # row["geo_region"] = row["geo_region"].replace("Africa West", "West Africa") if isinstance(row["geo_region"], str) else None
# # meta_type.make_metadata_model_from_nlp_schema(row.fillna(""))

time: 333 µs


In [None]:
# doc_type
## Environmental and Social Management Framework

In [353]:
# fname = f"wb_metadata-{datetime.now()}.csv"
fname = "wb_metadata-2021-03-27 19:17:44.764136.csv"
wb_metadata = dir_manager.get_data_dir("corpus", "WB", fname)

meta_type.make_metadata_model_from_nlp_schema(copy_normalized_final_df.iloc[0].fillna(""))

MetadataModel(id='wb_D32659506', hex_id='eceab51b6d7caa2', int_id=1066978529181682338, adm_region=[<WBAdminRegions.africa: 'Africa'>], author=['Dominique Baado'], cleaning_config_id=None, collection=None, corpus=<Corpus.WB: 'WB'>, country=['Congo', 'Democratic Republic of'], date_published=datetime.date(2020, 12, 10), der_acronyms=None, der_countries=None, der_language_detected=None, der_language_score=None, der_raw_token_count=None, der_clean_token_count=None, digital_identifier='090224b0880b3ebf', doc_type=[<WBDocTypes.procurement_plan: 'Procurement Plan'>], filename_original='Congo-Democratic-Republic-of-AFRICA-EAST-P173825-DRC-COVID-19-Strategic-Preparedness-and-Response-Project-SPRP-Procurement-Plan.txt', geo_region=None, journal=None, language_src='English', last_update_date=datetime.datetime(2021, 3, 31, 23, 57, 17, 779753), major_doc_type=[<WBMajorDocTypes.project_documents: 'Project Documents'>], path_clean=None, path_original='data/corpus/WB/wb_D32659506.txt', title='Congo, D

time: 9.36 ms
