In [None]:
# IMPORTANT: The parameters below are set only for running this notebook independently.
# When executing the full Ploomber pipeline, these values will be overridden by the settings in `pipeline.yaml`.
# Any modifications made here will not persist when running the pipeline.

upstream = None
COUNTRY =  'ARM' # Country code; its lowercase form is the file-name prefix used to select documents
product = {'data': f'../data/interim/{COUNTRY}/unicef-ecaro-cpe-corpus.jsonl'}  # Output location of the corpus file (stored under the 'data' key)
data_source = 'data/corpora'  # Source document directory, resolved relative to PROJ_ROOT

This notebook extracts text from COARs, CPDs, RD Letters and SitAns and writes it into a JSONL file.

In [None]:
import re
from pathlib import Path
import importlib
import pandas as pd
from tqdm import tqdm
import unicef_cpe as cpe
from pathlib import Path
# other settings
from unicef_cpe.config import PROJ_ROOT

In [None]:
# Build the document corpus: one record per PDF/DOCX file whose name starts
# with the lowercase country code, found anywhere below the source directory.
source_path = PROJ_ROOT / Path(data_source)

# Explicit column order so the frame keeps its schema (including 'text')
# even when no document yields a record.
corpus_columns = ['country', 'file_name', 'file_type', 'year', 'text']

# Match on the real extension ('.pdf' / '.docx'); without the dot the pattern
# would also pick up any file whose name merely ends in those letters.
file_prefix = COUNTRY.lower()
pdfs = sorted(source_path.glob(f'**/{file_prefix}*.pdf'))
docx = sorted(source_path.glob(f'**/{file_prefix}*.docx'))

# extract text from documents
records = []
for file_path in tqdm(pdfs + docx):
    # The year is taken from the first run of digits in the file name.
    # Fail with a message naming the file instead of an AttributeError on None.
    year_match = re.search(r'\d+', file_path.name)
    if year_match is None:
        raise ValueError(f'Cannot determine the year: no digits in file name {file_path}')

    record = {
        'country': COUNTRY,
        'file_name': file_path.name,
        'file_type': file_path.parent.name,  # name of the directory holding the file
        'year': int(year_match.group()),
        'text': cpe.extraction.extract_text_from_file(file_path),
    }
    # Documents without any extractable text are reported and left out.
    if not record['text'].strip():
        print(f'{file_path} is empty.')
        continue
    records.append(record)

df_corpus = pd.DataFrame(records, columns=corpus_columns)
print('Shape:', df_corpus.shape)
df_corpus.head()

In [None]:
# Summary statistics of the extracted text length (in characters) per document.
text_lengths = df_corpus['text'].str.len()
text_lengths.describe().round(1)

In [None]:
# Persist the corpus as JSON Lines (one record per line) at the product path,
# creating the destination directory first if it does not exist yet.
output_path = Path(product['data'])
output_dir = output_path.parent
output_dir.mkdir(parents=True, exist_ok=True)

df_corpus.to_json(output_path, orient='records', lines=True)

In [None]:
######################################################################################################################################################################################################