# <div style="text-align:center">World Bank Documents and Reports Manager</div>

This notebook implements the manager class for the data from the **Documents and Reports API**. This document manager provides filtering options for the documents in the index.

In [1]:
from ngrams.ngrams import NGramMapper

In [4]:
import os

import pandas as pd
import numpy as np
import multiprocessing as mp

In [None]:
class DocsManager:
    '''
    Manager for the document metadata scraped from the Documents and Reports API.

    Loads the metadata csv, derives short `region` and `mdtype` codes for every
    document, and exposes `filter_doclist` to select a corpus part and load the
    cleaned text of the selected documents.
    '''

    # Corpus by region: maps the `adm_region` metadata value to a short code.
    adm_region_region_map = {
        'Africa': 'AFR',
        'East Asia and Pacific':'EAP',
        'Europe and Central Asia': 'ECA',
        'Latin America & Caribbean': 'LAC',
        'Middle East and North Africa': 'MENA',
        'Rest Of The World': 'RoW',
        'South Asia': 'SAR',
        'The world Region': 'WLD',
        np.nan: 'M_U'
    }

    # Corpus by document type: maps the `major_doc_type` metadata value to a short code.
    major_doc_type_mdtype_map = {
        '': 'N.D',
        'Board Documents': 'BD',
        'Country Focus': 'CF',
        'Economic & Sector Work': 'ESW',
        'Project Documents': 'PD',
        'Publications & Research': 'PR',
        np.nan: 'N.D'
    }

    def __init__(
        self,
        metadata_filename,
        cleaned_files_dir,
        model_output_dir,
        from_year=1950,
        to_year=None,  # None -> the current year, resolved when the manager is created
        min_token_count=100,
        ngram_whitelist_file=None
    ):
        '''
        metadata_filename: full path to the metadata file generated from scraping the docs api
        cleaned_files_dir: full path to the output directory of the cleaning pipeline
        model_output_dir: full path to the output directory where models will be stored
        from_year: default minimum year filter on the publication date
        to_year: default maximum year filter on the publication date (None means the current year)
        min_token_count: threshold for the minimum number of useful words in the document
        ngram_whitelist_file: optional whitelist file passed to NGramMapper; when None no
            ngram replacement is applied to the loaded text
        '''
        if to_year is None:
            # `pd.datetime` was removed from pandas; `pd.Timestamp.now()` is the supported
            # equivalent. Resolving here (not in the signature) avoids freezing the year
            # at import time.
            to_year = pd.Timestamp.now().year

        self.metadata_filename = metadata_filename
        self.cleaned_files_dir = cleaned_files_dir
        self.model_output_dir = model_output_dir
        self.set_ngram_mapper(ngram_whitelist_file)

        # The first csv column is the document id; expose it as a regular `id` column.
        self.doclist = pd.read_csv(self.metadata_filename, low_memory=False, index_col=0)
        self.doclist.index.name = 'id'
        self.doclist = self.doclist.reset_index()

        # Values that are not keys of the maps fall back to the "unknown" codes.
        self.doclist['region'] = self.doclist.adm_region.map(self.adm_region_region_map).fillna('M_U')
        self.doclist['mdtype'] = self.doclist.major_doc_type.map(self.major_doc_type_mdtype_map).fillna('N.D')

        # NOTE: 'N.D' appears twice because two keys of the doc type map share that code.
        self.corpus_parts = (
            ['ALL'] +
            list(self.adm_region_region_map.values()) +
            list(self.major_doc_type_mdtype_map.values())
        )

        # Selected parameters to apply to all models
        self.set_from_year(from_year)
        self.set_to_year(to_year)
        self.set_min_token_count(min_token_count)

    def set_from_year(self, from_year):
        '''Set the minimum publication year used by `filter_doclist`.'''
        self.from_year = from_year

    def set_to_year(self, to_year):
        '''Set the maximum publication year used by `filter_doclist`.'''
        self.to_year = to_year

    def set_ngram_mapper(self, ngram_whitelist_file, cleaner=None):
        '''Create the NGramMapper from the whitelist file, or disable it when the file is None.'''
        if ngram_whitelist_file is None:
            self.ngram_mapper = None
        else:
            self.ngram_mapper = NGramMapper(whitelist_file=ngram_whitelist_file, cleaner=cleaner)

    def set_min_token_count(self, min_token_count):
        '''Set the minimum value of the `tokens` column used by `filter_doclist`.'''
        self.min_token_count = min_token_count

    def load_text(self, path):
        '''
        Read the text file at `path` and apply the ngram mapper when one is set.

        Returns None when `path` is not an existing file, so that callers can drop
        the document. If the default text-mode read fails, the file is read as bytes
        and decoded as utf-8 ignoring invalid sequences; if that also fails, the
        original error is raised.
        '''
        if not os.path.isfile(path):
            # Do not hand None to the ngram mapper: a missing file simply has no text.
            return None

        try:
            with open(path) as fl:
                data = fl.read()
        except Exception as first_error:
            try:
                with open(path, 'rb') as fl:
                    data = fl.read().decode('utf-8', errors='ignore')
            except Exception:
                # The fallback did not help; surface the error of the first attempt.
                raise first_error

        if self.ngram_mapper is not None:
            data = self.ngram_mapper.replace_ngrams(data)

        return data

    def filter_doclist(self, corpus_part, corpus_id, docs_filtered=None, verbose=False, save=False, return_meta=False, pool_workers=None):
        '''
        Select the documents of a corpus part and load their cleaned text.

        corpus_part: region code or document type code to keep; any other value
            (e.g. 'ALL') applies no region/type filter
        corpus_id: identifier used (lower-cased) in the name of the saved metadata file
        docs_filtered: optional dataframe to filter instead of the full `self.doclist`
        verbose: print the number of documents that pass the metadata filters
        save: write the filtered metadata (no header, no index) into `model_output_dir`
        return_meta: also return the filtered metadata dataframe
        pool_workers: number of processes used to load the text files; None loads serially

        Returns a dataframe with columns `id`, `filename` and `text` (documents whose
        text file is missing are dropped), plus the metadata dataframe when
        `return_meta` is True.
        '''
        source = self.doclist if docs_filtered is None else docs_filtered
        docs_filtered = source.copy().reset_index()

        if corpus_part in self.major_doc_type_mdtype_map.values():
            docs_filtered = docs_filtered[docs_filtered.mdtype == corpus_part]
        if corpus_part in self.adm_region_region_map.values():
            docs_filtered = docs_filtered[docs_filtered.region == corpus_part]

        dfr_params = ['id', 'title', 'author', 'digital_identifier', 'language_detected', 'year']  # Note: `guid` removed

        keep = (
            docs_filtered.year.between(self.from_year, self.to_year) &
            (docs_filtered.language_src == 'English') &
            (docs_filtered.language_detected == 'en') &
            (docs_filtered.language_score >= 0.98) &
            (docs_filtered.tokens >= self.min_token_count)
        )
        # Explicit copy: columns are assigned below, which must not target a slice.
        docs_filtered = docs_filtered.loc[keep, dfr_params].copy()  # Add pages here

        docs_filtered['author'] = docs_filtered['author'].fillna('[Anon.]')
        docs_filtered['pages'] = ""

        if verbose:
            print(f'Number of documents in selected corpus: {docs_filtered.shape[0]}')

        if save:
            # The file name is only needed (and `corpus_id` only used) when saving.
            file_name = os.path.join(self.model_output_dir, f'{corpus_id.lower()}-meta_{corpus_part}.csv')
            docs_filtered.to_csv(
                file_name,
                index=False, header=False
            )

        if return_meta:
            meta = docs_filtered.copy()

        docs_filtered = docs_filtered[['id']].copy()
        docs_filtered['filename'] = docs_filtered['id'].map(
            lambda x: os.path.join(self.cleaned_files_dir, f'{x}.txt')
        )

        # Load the content of every text file into a new `text` column.
        if pool_workers is None:
            docs_filtered['text'] = docs_filtered.filename.map(self.load_text)
        else:
            pool = mp.Pool(processes=pool_workers)
            try:
                docs_filtered['text'] = pool.map(self.load_text, docs_filtered.filename)
            finally:
                # Release the worker processes even when loading fails.
                pool.close()
                pool.join()

        # `load_text` returns None for missing files; those documents are dropped.
        docs_filtered = docs_filtered.dropna(subset=['text'])

        if return_meta:
            return docs_filtered, meta
        else:
            return docs_filtered

In [None]:
def build_docs(metadata_filename, cleaned_files_dir, model_output_dir):
    '''
    Create a DocsManager for the given metadata file and directories.

    All other DocsManager settings are left at their defaults.
    '''
    manager_options = {
        'metadata_filename': metadata_filename,
        'cleaned_files_dir': cleaned_files_dir,
        'model_output_dir': model_output_dir,
    }
    return DocsManager(**manager_options)