# World Bank Documents and Reports API Scraper

This notebook contains the scripts needed to connect to the World Bank Documents and Reports API and scrape document metadata from it.

In [1]:
%load_ext autotime

In [2]:
import requests
from datetime import datetime
import json
import os
import time
import glob
import pandas as pd
import numpy as np
import re
from joblib import Parallel, delayed

time: 258 ms


In [3]:
from wb_nlp import dir_manager

time: 169 ms


The versions printed below are the ones this notebook was developed against; install matching versions to avoid incompatibility issues.

In [4]:
# Print the name and version of each key dependency so the environment used
# for this run is recorded next to the notebook output.
for m in [requests, json, pd]:
    print(m.__name__, ':', m.__version__)

requests : 2.24.0
json : 2.0.9
pandas : 1.1.2
time: 553 µs


# Scraping methods for the World Bank Documents and Reports API

The API lets the caller choose which fields are returned for each document. The `fl_params` list below can be reduced to a specific subset of values if only certain fields are needed.

In [5]:
# Field names requested from the API. `request_worldbank_api` joins this list
# with commas and sends it as the `fl` query parameter.
fl_params = [
    'guid', 'abstracts', 'admreg', 'alt_title', 'authr', 'available_in',
    'bdmdt', 'chronical_docm_id', 'closedt', 'colti', 'count', 'credit_no',
    'disclosure_date', 'disclosure_type', 'disclosure_type_date', 'disclstat',
    'display_title', 'docdt', 'docm_id', 'docna', 'docty', 'dois', 'entityid',
    'envcat', 'geo_reg', 'geo_reg_and_mdk', 'historic_topic', 'id',
    'isbn', 'issn', 'keywd', 'lang', 'listing_relative_url', 'lndinstr', 'loan_no',
    'majdocty', 'majtheme', 'ml_abstract', 'ml_display_title', 'new_url', 'owner',
    'pdfurl', 'prdln', 'projn', 'publishtoextweb_dt', 'repnb', 'repnme', 'seccl',
    'sectr', 'src_cit', 'subsc', 'subtopic', 'teratopic', 'theme', 'topic', 'topicv3',
    'totvolnb', 'trustfund', 'txturl', 'unregnbr', 'url_friendly_title', 'versiontyp',
    'versiontyp_key', 'virt_coll', 'vol_title', 'volnb', 'projectid',
]

time: 457 µs


In [6]:
for i in sorted(fl_params):
    print(i)

abstracts
admreg
alt_title
authr
available_in
bdmdt
chronical_docm_id
closedt
colti
count
credit_no
disclosure_date
disclosure_type
disclosure_type_date
disclstat
display_title
docdt
docm_id
docna
docty
dois
entityid
envcat
geo_reg
geo_reg_and_mdk
guid
historic_topic
id
isbn
issn
keywd
lang
listing_relative_url
lndinstr
loan_no
majdocty
majtheme
ml_abstract
ml_display_title
new_url
owner
pdfurl
prdln
projectid
projn
publishtoextweb_dt
repnb
repnme
seccl
sectr
src_cit
subsc
subtopic
teratopic
theme
topic
topicv3
totvolnb
trustfund
txturl
unregnbr
url_friendly_title
versiontyp
versiontyp_key
virt_coll
vol_title
volnb
time: 1.86 ms


In [7]:
dir_manager.get_data_dir('corpus', 'WB')

'/home/wb536061/wb_nlp/data/corpus/WB'

time: 5.86 ms


In [8]:
# Root directory of the WB corpus, resolved through the project's `dir_manager`.
SCRAPER_DIR = dir_manager.get_data_dir('corpus', 'WB')
# Directory where `scrape_page` stores one JSON file per scraped API page.
API_JSON_DIR = os.path.join(SCRAPER_DIR, 'tmp_api_json')
print(SCRAPER_DIR)
print(API_JSON_DIR)

/home/wb536061/wb_nlp/data/corpus/WB
/home/wb536061/wb_nlp/data/corpus/WB/tmp_api_json
time: 642 µs


In [9]:
def download_with_retry(url, params=None, max_retries=10):
    '''
    GET `url`, retrying on request errors and on non-200 responses.

    url: address to request
    params: optional query parameters forwarded to `requests.get`
    max_retries: maximum number of attempts before giving up

    Returns the first response whose status code is 200, or None when every
    attempt failed (or when `max_retries` is not positive).
    '''
    for _ in range(max_retries):
        try:
            response = requests.get(url, params=params)
        except requests.RequestException:
            # Only request-level failures are retried. KeyboardInterrupt and
            # programming errors propagate instead of being silently swallowed.
            response = None

        if response is not None and response.status_code == 200:
            return response

        # Short pause before the next attempt.
        time.sleep(1)

    return None

time: 557 µs


In [10]:
def request_worldbank_api(fl_params=None, offset=0, limit=1, max_retries=10):
    '''
    Query the World Bank Documents and Reports search API.

    fl_params: list of values to return per row (defaults to ['guid'])
    offset: parameter corresponding to the start page
    limit: maximum number of rows returned by the api call
    max_retries: maximum number of download attempts for this request

    Returns the decoded JSON payload, or an empty dict when the request
    could not be completed successfully.
    '''

    if fl_params is None:
        fl_params = ['guid']

    api_url = 'http://search.worldbank.org/api/v2/wds'
    api_params = dict(
        format='json',
        fl=','.join(fl_params),
        lang_exact='English',
        disclstat='Disclosed',
        srt='docdt',
        # NOTE: an earlier comment suggested 'asc' so that pages already
        # downloaded stay valid; the value actually in use is 'desc'.
        order='desc',
        os=offset,
        rows=limit,
        # frmdisclosuredate='',  # '2018-09-12'
        # todisclosuredate='',  # '2018-09-13'
    )

    # Forward `max_retries`; it was previously accepted but ignored.
    response = download_with_retry(url=api_url, params=api_params, max_retries=max_retries)

    if (response is None) or (response.status_code != 200):
        return {}

    return response.json()


def get_total_documents():
    '''
    Return the number of documents currently available in the database.

    A minimal API request (the defaults of `request_worldbank_api`) is issued
    and its `total` field is converted to an integer, so the number of pages
    does not have to be determined by hand.
    '''
    return int(request_worldbank_api()['total'])


def scrape_page(fl_params, page, limit=500, verbose=True):
    '''
    Download one page of API results and store it as JSON.

    fl_params: list of values to return per row
    page: zero-based page number; the API offset is `page * limit`
    limit: number of rows per page
    verbose: print a message once the page has been written

    The `documents` mapping of the response is written to
    `API_JSON_DIR/data-<page>.json`. Raises KeyError when the response has no
    `documents` entry (e.g. when the request failed).
    '''
    offset = page * limit
    api_response = request_worldbank_api(fl_params=fl_params, offset=offset, limit=limit)
    page_content = api_response['documents']

    # Remove extraneous key; tolerate responses that do not carry it.
    page_content.pop('facets', None)

    # `exist_ok=True` avoids a race when several parallel workers try to
    # create the directory at the same time.
    os.makedirs(API_JSON_DIR, exist_ok=True)

    page_file = os.path.join(API_JSON_DIR, 'data-{page}.json'.format(page=page))

    with open(page_file, 'w') as fl:
        json.dump(page_content, fl)

    if verbose:
        print('Completed scraping of page {page}.'.format(page=page), flush=True)

    # Be gentle with the API between consecutive page requests.
    time.sleep(1)


def scrape_worldbank_operational_docs_api(fl_params, limit=500, max_pages=5, n_jobs=1, verbose=True):
    '''
    Scrape the API page by page and store every page through `scrape_page`.

    fl_params: list of values to return per row
    limit: number of rows per page
    max_pages: maximum number of pages to scrape; None scrapes every page
    n_jobs: number of joblib workers
    verbose: forwarded to `scrape_page`; when False, a per-page progress line
        is printed here instead

    Note:
        Parallelization of API access is discouraged for large limit sizes.
        It could result in throttling or failed return values.
    '''
    total_documents = get_total_documents()

    # Ceiling division: no trailing empty page when the total is an exact
    # multiple of `limit`.
    total_pages = (total_documents + limit - 1) // limit

    scrape_params = []

    for page in range(total_pages):
        # `>=` so that exactly `max_pages` pages (0 .. max_pages - 1) are scraped.
        if (max_pages is not None) and (page >= max_pages):
            print('Terminating scraping for remaining pages...')
            break

        if not verbose:
            # Print this only if scrape_params verbosity is False...
            print('Scraping page {page} / {total_pages}'.format(page=page + 1, total_pages=total_pages))

        scrape_params.append(dict(fl_params=fl_params, page=page, limit=limit, verbose=verbose))

    Parallel(n_jobs=n_jobs)(delayed(scrape_page)(**sp) for sp in scrape_params)

time: 1.53 ms


In [11]:
# # Check if ids are sorted by disclosure date

# sample_request_data = request_worldbank_api(fl_params, offset=100000, limit=4)
# sample_request_keys = sorted(sample_request_data['documents'].keys())
# sample_request_keys.pop(sample_request_keys.index('facets'))
# sample_request_disclosure_dates = sorted([sample_request_data['documents'][uid]['disclosure_date'] for uid in sample_request_keys])

# for ix, i in enumerate(sample_request_keys):
#     # Assuming that the document ids are not sequentially assigned by disclosure_date,
#     # then it is likely that if we sort the ids and the disclosure date and check the equality of the
#     # actual disclosure_date for the id vs the sorted disclosure_date that it will not match.
#     assert(sample_request_data['documents'][i]['disclosure_date'] == sample_request_disclosure_dates[ix])
    

time: 245 µs


In [12]:
get_total_documents() // 500

593

time: 55.2 ms


In [None]:
%%time
scrape_worldbank_operational_docs_api(fl_params=fl_params, limit=500, max_pages=None, n_jobs=5)

In [None]:
# Load one scraped page as a frame with one row per document uid.
sample_page_path = os.path.join(API_JSON_DIR, "data-100.json")

with open(sample_page_path) as fl:
    sample_page = json.load(fl)

nd = pd.DataFrame(sample_page).T
nd.index.name = 'uid'

In [None]:
# Take the first document with more than three authors and show its dict-valued fields.
with_authors = nd.dropna(subset=["authors"])
s = with_authors[with_authors["authors"].map(len) > 3].iloc[0].to_dict()

for k, v in s.items():
    if not isinstance(v, dict):
        continue

    print(k, v)
    print()

# Processing and normalization of scraped document metadata

In [None]:
# s = set(['Publications & Research', 'Publications'])
# s = set(["Country Focus", "Country Focus"])
# s = set("Publications,Publications & Research,Publications,Publications & Research".split(','))

def normalize_set(s):
    '''
    Return the sorted entries of `s` without the entries that are contained
    (as a substring) in another entry.

    s: iterable of strings,
        e.g. set(['Publications & Research', 'Publications'])

    Example:
        {'Publications & Research', 'Publications'} -> ['Publications & Research']
    '''
    entries = sorted(set(s))

    # An entry is only compared against *other* entries. Comparing it with
    # itself always matches and used to drop unrelated values.
    return [
        entry for entry in entries
        if not any(entry in other for other in entries if other != entry)
    ]


def make_unique_entry(series):
    '''
    Remove duplicate entries from comma-separated fields such as
    `majdocty` (`majdoctype` : normalized) and `admreg`.

    series: pandas Series of comma-separated strings (missing values allowed)

    Returns a Series where every cell holds the unique entries (as selected by
    `normalize_set`) joined with ', ', and None where the cell was empty.
    '''
    # Blank cells are mapped to None right here. `series.replace('', None)`
    # forward-fills on pandas 1.x, which copied the previous row's value into
    # rows that had no value of their own.
    return (
        series
        .fillna('')
        .str.split(',')
        .map(lambda vals: ', '.join(normalize_set(set(vals))) or None)
    )


def collapse_array(data, connector=None):
    '''
    Flatten a (possibly nested) list and optionally join the items.

    data: value to collapse; anything that is not a list counts as one item
    connector: when given, the collapsed items are joined with this string

    Returns
        - without `connector`: the flat list of collapsed items
        - with `connector`: the item itself when there is exactly one, the
          joined string when there are several, None for an empty list
    '''
    value = []

    if isinstance(data, list):
        for d in data:
            if isinstance(d, dict):
                value.append(collapse_nested_dict(d, connector=connector))
            else:
                value.extend(collapse_array(d))
    else:
        value.append(data)

    if not connector:
        return value

    # `connector` is only used in the root function call so it is safe
    # to assume that in cases where the original value is not an array or nested array,
    # we can just retrieve and return the original value
    if not value:
        # Empty list: there is nothing to return or join.
        return None

    if len(value) == 1:
        return value[0]

    try:
        # This means that the data is an array and possibly nested
        return connector.join(value)
    except Exception:
        # Show the offending input before re-raising with the original traceback.
        print(data)
        print(value)
        raise


# line_break_pattern = re.compile('\r?\n|\r')
# Raw strings keep the regex escapes intact; '\s' and '\S' in a normal string
# literal are invalid escape sequences and trigger warnings on recent Pythons.
whitespace_pattern = re.compile(r'\s+')
hanging_dash_pattern = re.compile(r'\S+- ')


def extract_formatted_authors(authors, delimiter=";"):
    '''
    Join the author names of one document into a single string.

    authors: mapping in the API's format, e.g.
        {'0': {'author': 'Sjoberg,Fredrik Matias'},
         '1': {'author': 'Mellon,Jonathan'},
         '2': {'author': 'Peixoto,Tiago Carneiro'}}
    delimiter: string placed between consecutive author names

    Returns the joined names, or None when `authors` is missing.
    '''
    if not pd.notna(authors):
        return None

    names = (entry["author"] for entry in authors.values())
    return delimiter.join(names)

def normalize_hanging_dash(t):
    '''
    Put a space in front of every dash that hangs at the end of a token.

    Each match of `hanging_dash_pattern` (non-space characters followed by
    '- ') is rewritten so that '- ' becomes ' - ', e.g. 'self- employed'
    turns into 'self - employed'.
    '''
    normalized = t

    for dangling in hanging_dash_pattern.findall(t):
        spaced = dangling.replace('- ', ' - ')
        normalized = normalized.replace(dangling, spaced)

    return normalized


def normalize_str_col(ser):
    '''
    Clean every string cell of `ser`; all other cells are left untouched.

    For strings, runs of whitespace are replaced by a single space and the
    result is passed through `normalize_hanging_dash`.
    '''
    def _clean(cell):
        if not isinstance(cell, str):
            return cell

        return normalize_hanging_dash(whitespace_pattern.sub(' ', cell))

    return ser.map(_clean)


def normalize_geo_regions(x, connector='|'):
    '''
    Collapse the API's geo-region mapping into one string of unique regions.

    x: value with this assumed format:
        {'0': {'geo_region': 'Europe'}, '1': {'geo_region': 'Europe'}}
       Anything that is not a dict is returned unchanged.
    connector: string placed between consecutive region names

    Returns the unique region names in sorted order joined by `connector`.
    '''
    if isinstance(x, dict):
        # Sorted so the output does not depend on set iteration order,
        # which can differ from one interpreter run to the next.
        x = connector.join(sorted(set(i['geo_region'] for i in x.values())))

    return x


def collapse_nested_dict(x, connector=None):
    '''
    Collect the leaf values of a (possibly nested) dict and optionally join them.

    x: value to collapse; dicts are traversed recursively, lists are handed to
       `collapse_array`, anything else counts as one item
    connector: when given, the collected items are joined with this string

    Returns
        - without `connector`: the list of collected items
        - with `connector`: the item itself when there is exactly one, the
          joined string when there are several, None for an empty dict
    '''
    value = []

    if isinstance(x, dict):
        for val in x.values():
            value.extend(collapse_nested_dict(val))
    elif isinstance(x, list):
        value.append(collapse_array(x, connector=connector))
    else:
        value.append(x)

    if not connector:
        return value

    if not value:
        # Empty dict: there is nothing to return or join.
        return None

    if len(value) == 1:
        return value[0]

    return connector.join(value)

def process_uid(uid):
    '''
    Single place where document ids are (re)built; currently returns `uid` as is.

    Keeping this hook makes it easy to change how ids are constructed. This
    matters because the original API changed the id rendering from <id> to
    D<id>.
    '''
    # Stripping the leading 'D' and validating that the rest is numeric is
    # intentionally disabled: all uids are already standardized.
    return uid

def _extract_display_title(raw_title):
    # Unwrap the API's `[{'display_title': '...'}]` structure. Plain strings
    # pass through; missing values and empty lists yield None.
    if isinstance(raw_title, str):
        return raw_title

    if isinstance(raw_title, list) and raw_title and isinstance(raw_title[0], dict):
        return raw_title[0].get('display_title')

    return None


def normalize_document_data(use_short_columns=True, fname=None, data_dir=None, save_data=True):
    '''
    Load every scraped JSON page from `API_JSON_DIR` and normalize it into one frame.

    use_short_columns, fname, data_dir, save_data: kept for backward
        compatibility; they are currently unused because the column
        subsetting and CSV export that read them are disabled.

    Per page, the function renames a few raw fields, extracts authors and the
    display title, collapses list/dict cells into '|'-joined strings,
    de-duplicates `majdoctype` / `admreg` and derives `docyear` from `docdt`.

    Returns a DataFrame indexed by `uid` (empty when no page file exists).
    '''
    rename_cols = {
        'docty': 'doc_type',
        'lang': 'language',
        'majdocty': 'majdoctype',
        'count': 'country'
    }

    page_frames = []

    # Sorted so the row order of the result is reproducible between runs.
    for json_file in sorted(glob.iglob(os.path.join(API_JSON_DIR, '*.json'))):
        print(json_file)

        with open(json_file) as fl:
            raw_page = json.load(fl)

        normalized_data = pd.DataFrame(raw_page).T
        normalized_data.index.name = 'uid'
        normalized_data.index = normalized_data.index.map(process_uid)

        normalized_data = normalized_data.rename(columns=rename_cols)

        if 'authors' in normalized_data.columns:
            normalized_data['authors'] = normalized_data['authors'].map(extract_formatted_authors)
        else:
            # This means that the metadata doesn't have an author field
            normalized_data['authors'] = None

        # Assume that the `display_title` field follows a standard format: list -> dict
        # [{'display_title': 'Voice and Punishment : A Global\n            Survey Experiment on Tax Morale'}]
        normalized_data['display_title'] = normalized_data['display_title'].map(_extract_display_title)

        for col in normalized_data.columns:
            try:
                # Normalize line breaks for string data
                normalized_data[col] = normalize_str_col(normalized_data[col])
                normalized_data[col] = normalized_data[col].map(lambda x: collapse_array(x, '|'))
                normalized_data[col] = normalized_data[col].map(lambda x: collapse_nested_dict(x, '|'))

            except AttributeError:
                # column is not a string type
                continue

        normalized_data['majdoctype'] = make_unique_entry(normalized_data['majdoctype'])
        normalized_data['admreg'] = make_unique_entry(normalized_data['admreg'])
        # NOTE(review): the generic collapse loop above appears to have already
        # turned dict cells into '|'-joined strings, so this call looks like a
        # no-op - confirm whether it should run before the loop instead.
        normalized_data['geo_regions'] = normalized_data['geo_regions'].map(normalize_geo_regions)

        normalized_data['docyear'] = pd.to_datetime(normalized_data['docdt']).dt.year

        page_frames.append(normalized_data)

    if not page_frames:
        return pd.DataFrame()

    # Concatenate once; growing the frame inside the loop copies all previous
    # rows on every iteration.
    return pd.concat(page_frames, axis=0)

In [None]:
%%time
normalized_df = normalize_document_data(data_dir=SCRAPER_DIR)

# Report which of the expected metadata fields are absent from the normalized frame.
print('\nMissing fields:')
for c in 'uid,guid,docyear,majdoctype,doctype,authors,colti,display_title,docdt,docm_id,historic_topic,pdfurl,seccl,txturl,language,admreg,country,txtfilename,txtfileid,txturl2,doctypeid,lang_detected,lang_score,tokens'.split(','):
    if c == 'uid':
        # `uid` is the index of the frame, not a column.
        continue

    # Explicit membership test instead of a bare `except:` around a lookup.
    if c not in normalized_df.columns:
        print(f'\t{c}')

In [None]:
normalized_df.shape

In [19]:
normalized_df.shape

(296994, 67)

In [16]:
normalized_df.shape

(297347, 67)

In [17]:
normalized_df.shape

(297347, 67)

# Improvements

By using dataframes, we can easily exploit their slicing and filtering methods to extract partitions of the dataset for an arbitrary set of filters.

### Filtering by majdoctype

We list all the available **major document types**.

In [None]:
for majdoctype in normalized_df.majdoctype.unique():
    print(majdoctype)

In [None]:
for majdoctype in normalized_df.majdoctype.unique():
    print(majdoctype)

From the full dataset, we can select all the documents whose major document type is `Project Documents`, as shown below.

In [None]:
project_documents = normalized_df[normalized_df.majdoctype == 'Project Documents']
project_documents.head(2)

### Filtering by year

Similarly, we can filter the metadata by document year.

In [None]:
normalized_df[normalized_df.docyear == 2018].head(2)

### Filtering the dataset by combination of conditions

Shown below is the method of extracting snapshots of the entire metadata based on specific filters.

In [None]:
filters = (
    (normalized_df.docyear == 2018) &
    (normalized_df.majdoctype == 'Project Documents') &
    (normalized_df.country == 'Philippines')
)

normalized_df[filters].head(2)

In [None]:
# Final column set (and order) of the standardized metadata;
# `standardize_metadata_fields` selects exactly these columns.
METADATA_COLS = [
    'corpus', 'id', 'path_original', 'path_clean', 'filename_original', 'year',
    'major_doc_type', 'doc_type', 'author', 'collection', 'title', 'journal', 'volume',
    'date_published', 'digital_identifier', 'topics_src', 'url_pdf', 'url_txt', 'language_src',
    'adm_region', 'geo_region', 'country',

    # Not yet available at this stage...,
    # 'language_detected', 'language_score', 'tokens'  

    # WB specific fields
    'wb_lending_instrument', 'wb_product_line', 'wb_major_theme', 'wb_theme', 'wb_sector',
    'wb_subtopic_src', 'wb_project_id',
    # 'wb_environmental_category', 
]

In [None]:
def build_wb_id(uid, max_len=9):
    '''
    Build the corpus-wide document id by prefixing `uid` with 'wb_'.

    uid: document id as delivered by the API
    max_len: unused; zero-padding of the id to this width is disabled
    '''
    return 'wb_{}'.format(uid)


def standardize_metadata_fields(metadata_df):
    '''
    This method must be applied to the original metadata processed dataframe.
    This will assign the final field names.

    metadata_df: frame indexed by `uid`, as returned by `normalize_document_data`

    Returns a new frame indexed by `id` whose columns are the remaining
    entries of `METADATA_COLS`. The input frame is not modified. Raises
    KeyError when a column listed in `METADATA_COLS` cannot be produced.
    '''
    metadata_df = metadata_df.reset_index()
    metadata_df['uid'] = metadata_df.uid.map(build_wb_id)

    wb_core_field_map = {
        'uid': 'id',
        'docyear': 'year',
        'majdoctype': 'major_doc_type',
        'doctype': 'doc_type',
        'authors': 'author',
        'colti': 'collection',
        'display_title': 'title',
        'docdt': 'date_published',
        'docm_id': 'digital_identifier',
        'historic_topic': 'topics_src',
        'pdfurl': 'url_pdf',
        'txturl': 'url_txt',
        'language': 'language_src',
        'admreg': 'adm_region',
        'country': 'country',
        'geo_regions': 'geo_region',
    }

    wb_specific_field_map = {
        'lndinstr': 'wb_lending_instrument',
        'prdln': 'wb_product_line',
        'majtheme': 'wb_major_theme',
        'theme': 'wb_theme',
        'sectr': 'wb_sector',
        # 'envcat': 'wb_environmental_category',
        'projectid': 'wb_project_id',
        'subtopic': 'wb_subtopic_src',
    }

    # path_original_dir = '/NLP/CORPUS/WB/TXT_ORIG'
    # path_clean_dir = '/NLP/CORPUS/WB/TXT_CLEAN'

    path_original_dir = 'data/corpus/WB'
    path_clean_dir = ''

    # Perform post normalization preprocessing.
    # Missing dates stay missing (None) instead of becoming the string 'NaT'.
    parsed_dates = pd.to_datetime(metadata_df['docdt'])
    metadata_df['docdt'] = parsed_dates.dt.date.map(lambda d: str(d) if pd.notna(d) else None)

    # Apply final field names
    metadata_df = metadata_df.rename(columns=wb_core_field_map)
    metadata_df = metadata_df.rename(columns=wb_specific_field_map)

    # Fields that do not come from the API.
    metadata_df['corpus'] = 'WB'
    metadata_df['path_original'] = metadata_df['id'].map(lambda x: f"{path_original_dir}/{x}.txt")
    # No clean-text directory is configured, so `path_clean` stays empty.
    metadata_df['path_clean'] = metadata_df['id'].map(
        lambda x: f"{path_clean_dir}/{x}.txt" if path_clean_dir else None
    )
    metadata_df['filename_original'] = metadata_df.url_txt.map(
        lambda x: os.path.basename(x) if isinstance(x, str) else x
    )
    metadata_df['journal'] = None
    metadata_df['volume'] = None

    metadata_df = metadata_df[METADATA_COLS]
    return metadata_df.set_index('id')


In [None]:
%%time
normalized_final_df = standardize_metadata_fields(normalized_df)

In [None]:
from datetime import datetime

In [None]:
# Timestamp without spaces or colons, so the file name is portable across
# file systems and needs no escaping in shell commands (rsync, scp, ...).
fname = f"wb_metadata-{datetime.now().strftime('%Y-%m-%dT%H-%M-%S')}.csv"

normalized_final_df.reset_index()[METADATA_COLS].to_csv(
    os.path.join(SCRAPER_DIR, fname),
    index=False
)

In [None]:
normalized_final_df.head(2)

In [None]:
normalized_final_df.shape

In [31]:
normalized_final_df.shape

(296994, 28)

In [29]:
normalized_final_df.shape

(297347, 28)

In [33]:
normalized_final_df.shape

(297490, 28)

# Downloading actual files

So far, we have collected the metadata for the documents in the database. The following scripts download the actual text documents associated with each entry in the database.

In [None]:
def download_document_and_reports_file(data=None):
    '''
    Build the mapping of document id -> text download link.

    data: optional standardized metadata frame; when given, its `url_txt`
        column is used. When omitted, the links are read from the `txturl`
        field of every JSON page in `API_JSON_DIR`, with ids built through
        `build_wb_id`.

    Returns a dict keyed by document id.
    '''
    if data is not None:
        return data['url_txt'].to_dict()

    download_links = {}

    for json_file in glob.iglob(os.path.join(API_JSON_DIR, '*.json')):
        print(json_file)

        with open(json_file) as fl:
            page = pd.DataFrame(json.load(fl)).T
            page.index = page.index.map(build_wb_id)
            page.index.name = 'id'

            # txturl since this directly uses data from api
            download_links.update(page['txturl'])

    return download_links

In [None]:
# Target directory for the downloaded text files; created when missing.
WB_TXT_DIR = dir_manager.get_data_dir("corpus", "WB", "TXT_ORIG")
os.makedirs(WB_TXT_DIR, exist_ok=True)

In [None]:
%%time
# links = download_document_and_reports_file(data=normalized_final_df)
links = download_document_and_reports_file(data=None)
error_downloads = []

In [None]:
import numpy as np

In [None]:
def download_and_store_file(ix, fname, link):
    '''
    Download `link` into `fname` unless the file already exists locally.

    ix: position of the item in the download queue; progress is printed for
        every 10000th item
    fname: target path; its base name without extension is the document id
    link: download URL; a non-string value (e.g. NaN) is reported, not downloaded

    Returns 0 on success (or when the file already exists), otherwise a tuple
    `(document id, link, reason)` describing the failure.
    '''
    # Resolve the document id first: the progress message below needs it.
    # (Kept out of the builtin name `id`, which used to be read before assignment.)
    doc_id = os.path.splitext(os.path.basename(fname))[0]

    if ix and ix % 10000 == 0:
        print(ix, doc_id, link)

    if not isinstance(link, str):
        return (doc_id, link, 'NaN link')

    try:
        if not os.path.isfile(fname):
            # Download only files that are not yet available locally.
            response = download_with_retry(link)

            if not response:
                return (doc_id, link, 'Empty response.')

            with open(fname, 'wb') as fl:
                fl.write(response.content)
    except Exception as e:
        # Failures are reported to the caller so one bad item does not stop a batch.
        return (doc_id, link, e)

    return 0

def transform_link_to_pdf(link):
    '''
    Derive the PDF download link from a text download link.

    The '/text/' path segment is replaced by '/pdf/' and the file extension
    (if any) by '.pdf'. Returns None when `link` is not a string.
    '''
    if isinstance(link, str):
        stem, _ext = os.path.splitext(link.replace("/text/", "/pdf/"))
        return stem + ".pdf"

    return None


# Download TXT

In [None]:
print(len(links))

# Make sure the target directory for the text files exists.
WB_TXT_DIR = dir_manager.get_data_dir("corpus", "WB", "TXT_ORIG")
os.makedirs(WB_TXT_DIR, exist_ok=True)

# One download task per document; each result is 0 or a failure tuple.
ret_codes = Parallel(n_jobs=20)(
    delayed(download_and_store_file)(ix, os.path.join(WB_TXT_DIR, f'{doc_id}.txt'), link)
    for ix, (doc_id, link) in enumerate(links.items())
)

# Download PDF

In [None]:
# Derive the PDF link of every document from its text link.
pdf_links = {doc_id: transform_link_to_pdf(link) for doc_id, link in links.items()}
print(len(pdf_links))

# Make sure the target directory for the PDF files exists.
WB_PDF_DIR = dir_manager.get_data_dir("corpus", "WB", "PDF_ORIG")
os.makedirs(WB_PDF_DIR, exist_ok=True)

# Documents without a link are filtered out after enumeration, so `ix`
# still reflects the position in `pdf_links`.
pdf_ret_codes = Parallel(n_jobs=5)(
    delayed(download_and_store_file)(ix, os.path.join(WB_PDF_DIR, f'{doc_id}.pdf'), link)
    for ix, (doc_id, link) in enumerate(pdf_links.items())
    if link
)

In [82]:
# Sequential variant of the text download that also collects the ids of
# documents without a usable link.
nan_ids = []
for ix, (doc_id, link) in enumerate(links.items()):
    if not isinstance(link, str):
        nan_ids.append(doc_id)
        continue

    fname = os.path.join(WB_TXT_DIR, f'{doc_id}.txt')
    # `download_and_store_file` takes (ix, fname, link). Passing 0 as `ix`
    # turns off its own progress message, which is printed below instead.
    download_and_store_file(0, fname, link)

    if ix and ix % 10000 == 0:
        print(ix, doc_id, link)

10000 wb_D32412661 http://documents.worldbank.org/curated/en/296571600333530125/text/Audited-FS-2018-USD-account-pdf.txt
20000 wb_D32044719 http://documents.worldbank.org/curated/en/333461589809644880/text/West-Bank-and-Gaza-MIDDLE-EAST-AND-NORTH-AFRICA-P150481-Health-System-Resiliency-Strengthening-Procurement-Plan.txt
30000 wb_D31754788 http://documents.worldbank.org/curated/en/589771580823230896/text/Romania-EUROPE-AND-CENTRAL-ASIA-P148585-Romania-Secondary-Education-Project-Procurement-Plan.txt
40000 wb_D31405141 http://documents.worldbank.org/curated/en/891021568371703236/text/Nigeria-AFRICA-P124905-Nigeria-Erosion-and-Watershed-Management-Project-Procurement-Plan.txt
50000 wb_D31189642 http://documents.worldbank.org/curated/en/649271563588060501/text/Angola-Growth-and-Inclusion-Development-Policy-Financing-Project.txt
60000 wb_D30758447 http://documents.worldbank.org/curated/en/570071547484250986/text/Rwanda-AFRICA-P131464-Landscape-Approach-to-Forest-Restoration-and-Conservation

In [67]:
def download_and_store_pdf(fname, link):
    '''
    Download `link` into `fname` unless the file already exists locally.

    fname: target path; its base name without extension is the document id
    link: download URL

    Returns 0 on success (or when the file already exists), otherwise a tuple
    `(document id, link, reason)` describing the failure.
    '''
    doc_id = os.path.splitext(os.path.basename(fname))[0]

    try:
        if os.path.isfile(fname):
            # Already available locally, nothing to download.
            return 0

        response = download_with_retry(link)

        if not response:
            return (doc_id, link, 'Empty response.')

        with open(fname, 'wb') as fl:
            fl.write(response.content)
    except Exception as e:
        return (doc_id, link, e)

    return 0


In [90]:
len(nan_ids) + 295436

296724

CPU times: user 584 ms, sys: 16.8 ms, total: 600 ms
Wall time: 599 ms


296724

In [84]:
WB_PDF_DIR = dir_manager.get_data_dir("corpus", "WB", "PDF_ORIG")

# `download_and_store_file` takes (ix, fname, link). 0 is passed as `ix`, so
# the per-item progress message is skipped in this run.
pdf_ret_codes = Parallel(n_jobs=5)(
    delayed(download_and_store_file)(0, os.path.join(WB_PDF_DIR, f'{doc_id}.pdf'), link)
    for doc_id, link in pdf_links.items()
    if link
)

In [88]:
len(list(filter(lambda x: x, pdf_links.values())))

295436

In [86]:
len(pdf_ret_codes)

295436

In [74]:
# Sequential variant of the PDF download that also collects the ids of
# documents without a usable link.
pdf_nan_ids = []
for ix, (id, link) in enumerate(links.items()):
    if not isinstance(link, str):
        pdf_nan_ids.append(id)
        continue

    # Same '/text/' -> '/pdf/' and '.pdf' rewrite, done by the shared helper.
    link = transform_link_to_pdf(link)

    fname = os.path.join(WB_PDF_DIR, f'{id}.pdf')
    download_and_store_pdf(fname, link)

    if ix and ix % 10000 == 0:
        print(ix, id, link)

KeyboardInterrupt: 

In [69]:
nan_ids[:10]

['wb_D32374016',
 'wb_D32214123',
 'wb_D31352427',
 'wb_D31303351',
 'wb_D31298833',
 'wb_D31298222',
 'wb_D31293382',
 'wb_D31270193',
 'wb_D31254585',
 'wb_D31231960']

In [63]:
os.path.basename(link).replace('.txt', '.pdf')

'Madagascar-AFRICA-EAST-P149323-Social-Safety-Net-Project-Procurement-Plan.pdf'

In [72]:
normalized_final_df.loc["wb_D32374016"]

corpus                                                                  WB
path_original                              data/corpus/WB/wb_D32374016.txt
path_clean                                               /wb_D32374016.txt
filename_original                                                      NaN
year                                                                  2020
major_doc_type                                           Project Documents
doc_type                                                  Procurement Plan
author                                                                 NaN
collection                                                             NaN
title                    Kenya - AFRICA EAST - P153349 - National Agric...
journal                                                               None
volume                                                                None
date_published                                                  2020-09-02
digital_identifier       

In [73]:
normalized_final_df["author"]

id
wb_D32658475   NaN
wb_D32659506   NaN
wb_D32659154   NaN
wb_D32659153   NaN
wb_D32659160   NaN
                ..
wb_D29851457   NaN
wb_D31825733   NaN
wb_D16919195   NaN
wb_D32092090   NaN
wb_D14551809   NaN
Name: author, Length: 296994, dtype: float64

In [52]:
np.isnan(link)

True

In [48]:
r

<Response [200]>

In [39]:
list(links)[:10]

['wb_D32658475',
 'wb_D32659506',
 'wb_D32659154',
 'wb_D32659153',
 'wb_D32659160',
 'wb_D32658894',
 'wb_D32658893',
 'wb_D32659159',
 'wb_D32654909',
 'wb_D32654907']

In [40]:
links['wb_D32659160']

'http://documents.worldbank.org/curated/en/326541607635914974/text/Disclosable-Version-of-the-ISR-Roads-and-Employment-Project-P160223-Sequence-No-08.txt'

In [22]:
%%time
# `download_and_store_file` takes (ix, fname, link). 0 is passed as `ix`, so
# the per-item progress message is skipped in this run.
ret_codes = Parallel(n_jobs=20)(
    delayed(download_and_store_file)(0, os.path.join(WB_TXT_DIR, f'{doc_id}.txt'), link)
    for doc_id, link in links.items()
)

CPU times: user 9min 42s, sys: 27 s, total: 10min 9s
Wall time: 1h 10min 38s


In [29]:
normalized_final_df.head(2)

Unnamed: 0_level_0,corpus,path_original,path_clean,filename_original,year,major_doc_type,doc_type,author,collection,title,...,adm_region,geo_region,country,wb_lending_instrument,wb_product_line,wb_major_theme,wb_theme,wb_sector,wb_subtopic_src,wb_project_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
wb_19431275,wb,/NLP/CORPUS/WB/TXT_ORIG/wb_19431275.txt,/NLP/CORPUS/WB/TXT_CLEAN/wb_19431275.txt,RAD2066245936.txt,2014.0,Project Documents,Loan Agreement,,,Official Documents - Loan Agreement for Loan 8...,...,Europe and Central Asia,,Moldova,Development Policy Lending,IBRD/IDA,,"Public sector governance,Rural development,Fin...",,"Climate Change and Agriculture,Consumption,Fis...",P143283
wb_19431317,wb,/NLP/CORPUS/WB/TXT_ORIG/wb_19431317.txt,/NLP/CORPUS/WB/TXT_CLEAN/wb_19431317.txt,RAD1222573634.txt,2014.0,Project Documents,Financing Agreement,,,Official Documents - Financing Agreement for C...,...,Europe and Central Asia,,Moldova,Development Policy Lending,IBRD/IDA,,"Public sector governance,Rural development,Fin...",,"Climate Change and Agriculture,Consumption,Fis...",P143283


In [30]:
SCRAPER_DIR

'/home/wb536061/wbes2474/NLP/CORPUS/WB'

In [31]:
# Store the standardized metadata under a fixed file name in the corpus directory.
fname = "wb_metadata.csv"
metadata_csv_path = os.path.join(SCRAPER_DIR, fname)

normalized_final_df.reset_index()[METADATA_COLS].to_csv(metadata_csv_path, index=False)

In [32]:
normalized_final_df.shape

(249805, 28)

In [78]:
normalized_final_df.shape

(249539, 28)

In [27]:
# # Cleanup temporary json files from API
# import shutil

# shutil.rmtree(API_JSON_DIR)

In [None]:
# import pandas as pd
# meta = pd.read_csv('/home/wb536061/wbes2474/NLP/CORPUS/WB/wb_metadata.csv')
# meta.shape

In [3]:
# ssh -A wb536061@w1lxbdatad07 rsync -vuarP "/home/wb536061/wbes2474/NLP/CORPUS/WB/wb_metadata-2021-02-17\ 18:06:18.367302.csv" wb536061@w0lxsnlp01:/decfile2/Modeling/NLP/CORPUS/WB/



(249805, 29)