In [1]:
import pandas as pd
from ast import literal_eval
import os
from collections import Counter
import numpy as np
from tqdm import tqdm

In [2]:
def flatten(t):
    return [item for sublist in t for item in sublist]

DATA_PATH = os.path.join(
    '..', '..', '..', '..', "data", "frameworks_data", 'data_v0.7.1'
)

full_df = pd.read_csv(os.path.join(DATA_PATH, 'full_dataset_with_translations.csv'))

for col in ['subpillars_2d', 'subpillars_1d']:
    full_df[col] = full_df[col].apply(literal_eval)

full_df['subpillars'] = full_df.apply(
    lambda x: x.subpillars_2d + x.subpillars_1d, axis=1
)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
numbers_df = full_df[full_df.subpillars.apply(
    lambda x: any(['number' in item.lower() for item in x])
)][['entry_id', 'excerpt', 'subpillars', 'project_id']]

PROJECTS_PATH = os.path.join(
    '..', '..', '..', '..', "data", "frameworks_data", 'data_v0.7', 'projects.csv'
)

projects = pd.read_csv(PROJECTS_PATH)[['id', 'title']].rename(
    columns={'id': 'project_id', 'title': 'project_title'}
)

numbers_df = numbers_df.merge(projects, on='project_id', how='left')
numbers_df

Unnamed: 0,entry_id,excerpt,subpillars,project_id,project_title
0,186155,Impacto en las personas a. Inseguridad aliment...,"[Impact->Number Of People Affected, At Risk->R...",1898.0,UNHCR El Salvador
1,187037,"• According to the World Bank, most Venezuelan...","[Displacement->Type/Numbers/Movements, Humanit...",1184.0,UNHCR Ecuador
2,346234,"De otro lado, resaltamos que las niñas y adole...",[Humanitarian Conditions->Physical And Mental ...,2311.0,IMMAP/DFS Colombia
3,241161,"En cuanto a las comunidades rurales, periurban...",[Humanitarian Conditions->Number Of People In ...,2311.0,IMMAP/DFS Colombia
4,186874,La dieta promedio nacional cumple con los requ...,"[At Risk->Risk And Vulnerabilities, At Risk->N...",1184.0,UNHCR Ecuador
...,...,...,...,...,...
15965,218958,Although food distribution amounts per househo...,[Capacities & Response->International Response...,2335.0,GIMAC South Sudan
15966,218960,"In July, WFP reached .95 million in the first ...",[Capacities & Response->International Response...,2335.0,GIMAC South Sudan
15967,318213,An upsurge in violence severely impacted human...,[Humanitarian Access->Number Of People Facing ...,2335.0,GIMAC South Sudan
15968,307062,Conflict between Somaliland and Puntland over ...,"[NOT_MAPPED, Capacities & Response->Number Of ...",2331.0,GIMAC Somalia


In [4]:
numbers_df.shape 

(15970, 5)

In [4]:
numbers_df.to_csv('number_related_tags.csv')

In [5]:
tags = flatten(numbers_df['subpillars'])
all_numbers_tags = [item for item in tags if 'number' in item.lower()]
Counter(all_numbers_tags).most_common(10)

[('Displacement->Type/Numbers/Movements', 9051),
 ('Capacities & Response->Number Of People Reached/Response Gaps', 3419),
 ('Impact->Number Of People Affected', 2364),
 ('Humanitarian Conditions->Number Of People In Need', 1339),
 ('Humanitarian Access->Number Of People Facing Humanitarian Access Constraints/Humanitarian Access Gaps',
  726),
 ('At Risk->Number Of People At Risk', 277)]

In [6]:
questions = {
   'Displacement->Type/Numbers/Movements': 'How many people have been displaced?', 

   'Capacities & Response->Number Of People Reached/Response Gaps': 'How many people have been reached?',

   'Impact->Number Of People Affected': 'How many people are affected?',

   'Humanitarian Conditions->Number Of People In Need': 'How many people are in need?',

   'Humanitarian Access->Number Of People Facing Humanitarian Access Constraints/Humanitarian Access Gaps': 'How many people are facing humanitarian acceess gaps or constrainsts?',

   'At Risk->Number Of People At Risk': 'How many people are at risk?'
}

In [7]:
from transformers import pipeline

qa_model = pipeline("question-answering")


In [8]:
number_related_tags = list(set(all_numbers_tags))
number_related_tags

['Impact->Number Of People Affected',
 'Displacement->Type/Numbers/Movements',
 'Humanitarian Access->Number Of People Facing Humanitarian Access Constraints/Humanitarian Access Gaps',
 'Humanitarian Conditions->Number Of People In Need',
 'Capacities & Response->Number Of People Reached/Response Gaps',
 'At Risk->Number Of People At Risk']

In [13]:
project_names = numbers_df.project_title.unique()
project_names

array(['UNHCR El Salvador', 'UNHCR Ecuador', 'IMMAP/DFS Colombia',
       'IMMAP/DFS Nigeria', 'UNHCR Honduras', 'UNHCR Trinidad and Tobago',
       'IMMAP/DFS RDC', 'UNHCR Uruguay', 'UNHCR Guatemala',
       'UNHCR Colombia', 'IMMAP/DFS Burkina Faso', 'IMMAP/DFS Syria',
       'UNHCR Costa Rica', 'UNHCR Dominican Republic', 'UNHCR Argentina',
       'Lebanon Situation Analysis', 'UNHCR Chile',
       'IMMAP/DFS Bangladesh', 'UNHCR Panama', 'UNHCR Peru',
       'UNHCR Guyana',
       'Bosnia and Herzegovina_Population Movement Report',
       'Central America: Hurricanes Eta and Iota', 'UNHCR Bolivia',
       'UNHCR Venezuela', 'Sudan Floods - September 2020',
       'UNHCR Paraguay', 'UNHCR Aruba', 'UNHCR Curacao',
       'IFRC Philippines', '2020 DFS Libya', 'IFRC Nigeria',
       'Nigeria Situation Analysis (OA)', 'IFRC Kenya', 'IFRC Uganda',
       'Libya Situation Analysis (OA)', 'IFRC Niger', '2020 DFS Nigeria',
       'Situation Analysis Generic Yemen',
       'Situation Analysi

In [14]:
extracted_numbers_dict = {
    project_name: {tag_name: 'None'} for project_name in project_names for tag_name in number_related_tags
}

for id in tqdm(numbers_df.project_id.unique()[:5]):
    df_one_project = numbers_df[
        numbers_df.project_id==id
    ]
    project_name = df_one_project.iloc[0].project_title
    for tag in number_related_tags:
        tag_specific_excerpts = ' '.join(df_one_project[
            df_one_project.subpillars.apply(
                    lambda x: tag in x
                )
        ].excerpt)

        if (tag_specific_excerpts==''):
            continue
        else:
            question_one_project = questions[tag]
            response = qa_model(question = question_one_project, context = tag_specific_excerpts)['answer']
            extracted_numbers_dict[project_name][tag] = response


 20%|██        | 1/5 [00:07<00:29,  7.25s/it]

In [56]:
number_hum_gaps_df.iloc[1].excerpt

'Se ha observado una disminución de personas con vocación de retorno, pero incrementándose la población venezolana ingresando al país por pasos informales, lo que aumenta el riesgo de esta población a multas migratorias, de deportación y otros temas de protección. Según monitoreos de frontera, diariamente ingresan al Ecuador entre 80 a 120 personas en su mayoría en tránsito a terceros países.'