# World Bank Publication and Research Cover Page Snapshot

In [1]:
%load_ext autotime

In [2]:
%%capture

# Run the shared path-manager notebook inside this kernel so that the names it
# defines become available here. %%capture suppresses whatever that notebook
# prints or displays while it runs.
#
# Names this notebook expects it to provide (per the original note; only
# get_corpus_path is actually used below — TODO confirm both are defined there):
#   get_corpus_path
#   get_txt_orig_path

%run ../path_manager.ipynb

time: 224 ms


In [3]:
import requests
import json
import os
import time
import glob
import pandas as pd
import re
from joblib import Parallel, delayed
import pdf2image
import PIL

time: 552 ms


In [4]:
class DocumentCover:
    """Fetch a document's PDF and store its first page as a fixed-width PNG.

    The image is written to ``<cover_dir>/<doc_id>.png``. Work is done lazily:
    ``save()`` triggers ``standardize_size()``, which in turn triggers
    ``get_content()``, but only when the target file does not exist yet.

    Args:
        doc_id: Identifier used as the output file's base name.
        pdf_url: URL the PDF is downloaded from.
        cover_dir: Existing directory that receives the PNG.
        fixed_width: Width, in pixels, of the saved cover; the height is
            scaled to keep the page's aspect ratio.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``, so a
            stalled server cannot block a worker indefinitely.
    """

    def __init__(self, doc_id, pdf_url, cover_dir, fixed_width=200, timeout=60):
        self.doc_id = doc_id
        self.pdf_url = pdf_url
        self.cover_dir = cover_dir
        self.fixed_width = fixed_width
        self.timeout = timeout

        self.fname = os.path.join(self.cover_dir, f'{self.doc_id}.png')

        # Full-resolution first page, and its resized version; both are
        # populated on demand.
        self.orig_cover = None
        self.cover = None

    def get_content(self):
        """Download the PDF and render its first page into ``orig_cover``.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            ValueError: If no page could be rendered from the response.
        """
        res = requests.get(self.pdf_url, timeout=self.timeout)
        # Fail here on 4xx/5xx instead of handing an error page to the PDF
        # renderer.
        res.raise_for_status()

        # Only the cover is needed, so do not rasterise the whole document.
        pages = pdf2image.convert_from_bytes(res.content, first_page=1, last_page=1)
        if not pages:
            raise ValueError(f'No pages could be rendered from {self.pdf_url}')

        self.orig_cover = pages[0]

    def standardize_size(self):
        """Resize ``orig_cover`` to ``fixed_width`` and store it in ``cover``.

        Downloads the content first if it has not been fetched yet.

        Raises:
            RuntimeError: If no cover image is available after fetching.
        """
        if self.orig_cover is None:
            self.get_content()

        source = self.orig_cover
        if source is None:
            # Explicit check rather than ``assert``, which is stripped under -O.
            raise RuntimeError(f'No cover image available for {self.doc_id}')

        width = self.fixed_width
        w0, h0 = source.size
        # Keep the aspect ratio; clamp to 1 so extremely wide pages do not
        # produce a zero-height image.
        height = max(1, int(h0 * width / w0))
        self.cover = source.resize((width, height), resample=PIL.Image.BICUBIC)

    def save(self):
        """Write the cover PNG unless it already exists; return ``doc_id``.

        The image is first written to a temporary sibling file and then moved
        into place, so an interrupted write never leaves a truncated file
        under the final name (which the existence check would treat as done).
        """
        if not os.path.isfile(self.fname):
            if self.cover is None:
                self.standardize_size()

            tmp_path = f'{self.fname}.{os.getpid()}.part'
            try:
                # The temporary name does not end in .png, so the format must
                # be given explicitly.
                self.cover.save(tmp_path, format='PNG')
                os.replace(tmp_path, self.fname)
            finally:
                # Only present if the save or the rename failed.
                if os.path.exists(tmp_path):
                    os.remove(tmp_path)

        return self.doc_id

    def cleanup(self):
        """Drop the references to both images so their memory can be reclaimed."""
        self.orig_cover = None
        self.cover = None

time: 7.24 ms


In [5]:
# Directory that receives one <doc_id>.png per document.
COVER_DIR = os.path.join(get_corpus_path('WB'), 'cover')
# exist_ok avoids the check-then-create race and makes the cell safe to re-run.
os.makedirs(COVER_DIR, exist_ok=True)

time: 6.74 ms


In [6]:
COVER_DIR

'/R/NLP/CORPUS/WB/cover'

time: 13.4 ms


In [7]:
!ls -l /R/NLP/CORPUS/WB/cover | wc -l

49959
time: 3.24 s


In [8]:
# df = pd.read_csv('../../Subtopic Prediction/data/wb_metadata.csv')
df = pd.read_csv(os.path.join(get_corpus_path('WB'), 'wb_metadata.csv'))

# Keep only 'Publications & Research' documents that have a PDF URL. Built as
# a single chain so the cell is idempotent and never mutates a frame derived
# from `df` in place.
pr = (
    df.loc[
        df['major_doc_type'] == 'Publications & Research',
        ['id', 'major_doc_type', 'url_pdf'],
    ]
    .dropna(subset=['url_pdf'])
)

time: 7.85 s


In [9]:
pr.head()

Unnamed: 0,id,major_doc_type,url_pdf
0,wb_30572113,Publications & Research,http://documents.worldbank.org/curated/en/2803...
65,wb_30556234,Publications & Research,http://documents.worldbank.org/curated/en/1498...
105,wb_30573851,Publications & Research,http://documents.worldbank.org/curated/en/4910...
110,wb_30574049,Publications & Research,http://documents.worldbank.org/curated/en/3374...
111,wb_30574056,Publications & Research,http://documents.worldbank.org/curated/en/3643...


time: 16.3 ms


In [10]:
pr.shape

(50046, 3)

time: 3.19 ms


In [11]:
def parallel_process(row):
    """Create and save the cover for one metadata row, never raising.

    Args:
        row: Mapping-like record with an ``'id'`` and a ``'url_pdf'`` entry.

    Returns:
        dict: ``{'status': ..., 'doc_id': row['id']}`` where ``status`` is
        ``'ok'`` on success and a description of the exception otherwise.
    """
    ret = dict(status='ok', doc_id=row['id'])
    try:
        dc = DocumentCover(row['id'], row['url_pdf'], COVER_DIR)
        try:
            dc.save()
        finally:
            # Release the images even when saving fails.
            dc.cleanup()
    except Exception as e:
        # Some exceptions carry no message (e.g. a bare AssertionError); fall
        # back to repr so a failure is never recorded as an empty status.
        ret['status'] = str(e) or repr(e)

    return ret

time: 4.06 ms


In [None]:
# Number of joblib workers used to download and render covers concurrently.
NUM_JOBS = 24

# One task per metadata row; `pres` collects the status dict returned by
# parallel_process for every document.
pres = Parallel(n_jobs=NUM_JOBS)(
    delayed(parallel_process)(record)
    for _, record in pr.iterrows()
)