In [1]:
import os
import re
import json
import random
from pprint import pprint
from ast import literal_eval
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import seaborn as sns
import matplotlib.pyplot as plt

tqdm.pandas()

In [2]:
random.seed(2021)
np.random.seed(2021)

In [3]:
tqdm.pandas()

In [4]:
def flatten(l):
    """Recursively flatten nested lists into a single flat list.

    Only ``list`` instances are descended into; any other value (tuples and
    strings included) is a leaf and comes back wrapped in a one-element list.
    """
    if not isinstance(l, list):
        return [l]
    flat = []
    for item in l:
        flat.extend(flatten(item))
    return flat

In [5]:
# Raw CSV exports, read from the current working directory.
exportdata = pd.read_csv("exportdata.csv")
entries = pd.read_csv("entries.csv")
af_widgets = pd.read_csv("af_widgets.csv")
projects = pd.read_csv("projects.csv")
exportables = pd.read_csv("af_exportables.csv")
# `projects` above and the two tables below are loaded but not referenced in the cells visible here.
afs = pd.read_csv("analysis_frameworks.csv")
users = pd.read_csv("user_names.csv")

In [6]:
# Make sure the 'generated_dataset' directory exists (presumably the output
# location for later cells -- not visible here). makedirs(exist_ok=True)
# replaces the exists()/mkdir() pair, which could still fail if the directory
# appeared between the check and the creation. Unlike the old check, this
# raises if the path exists but is not a directory.
dirName = 'generated_dataset'
os.makedirs(dirName, exist_ok=True)

In [7]:
af_widgets.head()

Unnamed: 0,id,widget_id,title,properties,analysis_framework_id,key
0,483,multiselectWidget,SPECIFIC NEEDS GROUPS,"{""data"": {""options"": [{""key"": ""option-1"", ""lab...",137,element7
1,484,dateWidget,DATE OF INFORMATION,"{""added_from"": ""list"", ""list_grid_layout"": {""t...",137,element8
2,475,excerptWidget,EXCERPT,"{""added_from"": ""overview"", ""list_grid_layout"":...",137,page-two-excerpt
3,482,multiselectWidget,DEMOGRAPHIC GROUPS,"{""data"": {""options"": [{""key"": ""option-1"", ""lab...",137,element6
4,479,scaleWidget,RELIABILITY,"{""data"": {""scale_units"": [{""key"": ""scale-1"", ""...",137,element3


In [8]:
# Decode the JSON-encoded `properties` column. Values that are already parsed
# (dict/list) are left untouched so the cell can be re-run safely; anything
# else still goes through json.loads and fails loudly if it is not valid JSON.
af_widgets["properties"] = af_widgets["properties"].apply(
    lambda props: props if isinstance(props, (dict, list)) else json.loads(props))

In [9]:
af_widgets["title"].str.upper().unique()

array(['SPECIFIC NEEDS GROUPS', 'DATE OF INFORMATION', 'EXCERPT',
       'DEMOGRAPHIC GROUPS', 'RELIABILITY', 'SEVERITY', 'AFFECTED GROUPS',
       'MATRIX 1D', 'SECTORAL INFORMATION', 'LOCATION',
       'OPERATIONAL ENVIRONMENT', 'INFORMATION DATE', 'SECTORS', 'FLAG',
       'CROSS SECTOR', 'GEO LOCATION', 'CROSS SECTORS', 'MATRIX 2D',
       'PRE-CRISIS', 'IN-CRISIS', 'CRISIS TYPE', 'SHOCK/EVENT',
       'ADDITIONAL CONTEXT', 'DISPLACED POP TYPE', 'CLEANING TAGS',
       'POPULATION GROUPS', 'CLEANING COMMENTS', 'HIGH LEVEL TAGS',
       'CONTEXT ADDITIONAL TAGS', 'COMMENT'], dtype=object)

In [10]:
# Upper-cased widget titles to select; the variable names suggest these are the
# titles under which frameworks store their 2D matrix widget.
mat2d_titles = list(
    map(str.upper, (
        "Pre-Crisis",
        "Shock/Event",
        "In-Crisis",
        "Sectors",
        "Sectoral Information",
        "Matrix 2D",
    )))

# Case-insensitive title match, keeping only the two columns needed below.
is_mat2d_widget = af_widgets["title"].str.upper().isin(mat2d_titles)
mat2d_properties_ids = af_widgets.loc[
    is_mat2d_widget, ["properties", "analysis_framework_id"]]

mat2d_properties = mat2d_properties_ids["properties"].tolist()
mat2d_ids = mat2d_properties_ids["analysis_framework_id"].tolist()

In [11]:
# Build {analysis_framework_id: {pillar title: [sub-pillar titles]}} from the
# 2D-matrix widget properties.
#
# A framework can own several matrix widgets, so the pillars of each widget are
# merged into the framework's entry instead of replacing it (previously only
# the last widget of a framework survived). Widgets lacking one of the expected
# keys are collected in `errors` and contribute nothing, rather than leaving a
# half-filled entry behind.
afids_pillars_subpillars = dict()
errors = []
for mat, af_id in zip(mat2d_properties, mat2d_ids):
    try:
        pillars = {
            dim["title"]: [sub["title"] for sub in dim["subdimensions"]]
            for dim in mat["data"]["dimensions"]
        }
    except KeyError:
        errors.append(mat)
        continue
    afids_pillars_subpillars.setdefault(af_id, {}).update(pillars)

In [12]:
# Decode the JSON-encoded `data` column. Values that are already parsed
# (dict/list) are left untouched so the cell can be re-run safely; anything
# else still goes through json.loads and fails loudly if it is not valid JSON.
exportables["data"] = exportables["data"].apply(
    lambda raw: raw if isinstance(raw, (dict, list)) else json.loads(raw))

In [13]:
def extract_title(x):
    """Return the title (or list of titles) stored in an exportable's data.

    ``x`` is a parsed exportable ``data`` dict with an ``"excel"`` section.
    A non-empty ``"title"`` wins; otherwise, for ``type == "multiple"``, the
    ``"titles"`` value is returned. Returns None when neither applies.
    """
    excel = x["excel"]
    title = excel.get("title")
    if title:
        return title
    if excel.get("type") == "multiple":
        return excel["titles"]
    return None
    
def title_case(tag):
    """Title-case a tag title.

    A single string is returned title-cased; a list or tuple of strings is
    returned as a tuple of title-cased strings.
    """
    if not isinstance(tag, (list, tuple)):
        return tag.title()
    return tuple(part.title() for part in tag)

# Title (or list of titles) of every exportable as returned by extract_title;
# None where the exportable has neither a title nor type "multiple".
af_titles = exportables["data"].apply(extract_title)

In [14]:
entries.shape, exportdata.shape

((203464, 21), (1156529, 4))

In [15]:
entries[~entries["excerpt"].isna()]["excerpt"].unique().shape

(155472,)

In [16]:
# Lookup table: exportable id -> parsed exportable data.
exid_to_exdata = dict(zip(exportables["id"], exportables["data"]))

In [17]:
#exportdata[exportdata["tag_title"].eq('Crisis Type')]["tag_value"].tolist()
# exportdata[exportdata["tag_title"].eq('Crisis Type')]["tag_value"].shape

In [18]:
af_widgets["title"].unique()

array(['SPECIFIC NEEDS GROUPS', 'DATE OF INFORMATION', 'EXCERPT',
       'DEMOGRAPHIC GROUPS', 'RELIABILITY', 'SEVERITY', 'AFFECTED GROUPS',
       'Matrix 1D', 'Sectoral Information', 'LOCATION',
       'Operational Environment', 'Information Date', 'Sectors', 'Flag',
       'Severity', 'Cross sector', 'Affected groups',
       'Specific Needs Groups', 'Demographic Groups', 'Reliability',
       'Geo Location', 'Excerpt', 'Information date', 'Cross Sectors',
       'Matrix 2D', 'PRE-CRISIS', 'IN-CRISIS', 'Crisis type',
       'Crisis Type', 'SHOCK/EVENT', 'Additional Context',
       'DISPLACED POP TYPE', 'CLEANING tags', 'POPULATION GROUPS',
       'Cleaning comments', 'HIGH LEVEL TAGS', 'Context additional tags',
       'Comment'], dtype=object)

In [19]:
# Lookup table: (widget key, widget id) -> widget title. When the same pair
# occurs more than once, the last row wins.
widget_key_id_to_title = dict(
    zip(zip(af_widgets["key"], af_widgets["widget_id"]), af_widgets["title"]))

In [20]:
def exportdata_to_tag_title(row):
    """Resolve the tag title for one ``exportdata`` row.

    ``row`` is indexed by position: item 1 is the JSON payload and item 3 the
    exportable id. Sources are tried in this order and the first truthy title
    wins:

    1. the widget named in the payload's ``"common"`` section,
    2. the title of a single-element ``report.other`` list,
    3. the widget named in a single-element ``"excel"`` list,
    4. the title / titles of the row's exportable.

    Raises:
        RuntimeError: if none of the sources yields a title. The exception
            type is the one the former bare ``raise`` produced; it now
            carries a message.
    """
    # Integer keys on a label-indexed Series are deprecated as positional
    # lookups, so go through .iloc when it exists; plain sequences still work.
    positional = row.iloc if hasattr(row, "iloc") else row
    data = json.loads(positional[1])

    # 1. Widget referenced from the "common" section.
    common = data.get("common")
    if common:
        wkey, wid = common.get("widget_key"), common.get("widget_id")
        if wkey and wid:
            title = widget_key_id_to_title.get((wkey, wid))
            if title:
                return title

    # 2. Single "other" report element carrying its own title.
    other = (data.get("report") or {}).get("other")
    if other and len(other) == 1 and other[0].get("title"):
        return other[0]["title"]

    # 3. Widget referenced from a single-element "excel" list.
    excel = data.get("excel")
    if isinstance(excel, list) and len(excel) == 1:
        wkey, wid = excel[0].get("widget_key"), excel[0].get("widget_id")
        if wkey and wid:
            title = widget_key_id_to_title.get((wkey, wid))
            if title:
                return title

    # 4. Fall back to the exportable definition.
    exportable_id = positional[3]
    exportable_excel = exid_to_exdata[exportable_id]["excel"]
    if exportable_excel.get("title"):
        return exportable_excel["title"]
    if exportable_excel.get("type") == "multiple":
        return exportable_excel["titles"]
    raise RuntimeError(
        "No tag title could be resolved for exportable {!r}".format(
            exportable_id))

In [21]:
# Resolve a tag title for every exportdata row (row-wise apply; each call
# parses that row's JSON payload).
exportdata["tag_title"] = exportdata.apply(exportdata_to_tag_title, axis=1)

In [22]:
def exportdata_to_tag_value(row):
    """Extract the tag value(s) from one ``exportdata`` row.

    ``row`` is indexed by position: item 1 is the JSON payload. Lookup order:

    1. a single-element ``"excel"`` list -> its ``"value"``,
    2. an ``"excel"`` dict -> ``"values"`` for ``type == "lists"``, otherwise
       ``"value"`` and then ``"values"``,
    3. the ``"common"`` section -> ``"values"`` and then ``"value"``.

    A missing ``"excel"`` / ``"common"`` section, or an ``"excel"`` list that
    does not hold exactly one element, now falls through to the next source
    instead of failing with KeyError / AttributeError.

    Raises:
        RuntimeError: if no source yields a value. The exception type is the
            one the former bare ``raise`` produced; it now carries a message.
    """
    # Integer keys on a label-indexed Series are deprecated as positional
    # lookups, so go through .iloc when it exists; plain sequences still work.
    positional = row.iloc if hasattr(row, "iloc") else row
    data = json.loads(positional[1])
    excel = data.get("excel")
    common = data.get("common") or {}

    if isinstance(excel, list):
        if len(excel) == 1:
            return excel[0]["value"]
    elif isinstance(excel, dict):
        if excel.get("type") == "lists":
            return excel["values"]
        if "value" in excel:
            return excel["value"]
        if "values" in excel:
            return excel["values"]

    if "values" in common:
        return common["values"]
    if "value" in common:
        return common["value"]
    raise RuntimeError("No tag value found in exportdata payload")

In [23]:
# Extract the tag value(s) for every exportdata row (row-wise apply; each call
# parses that row's JSON payload).
exportdata["tag_value"] = exportdata.apply(exportdata_to_tag_value, axis=1)

In [24]:
def title_case(tag):
    """Return ``tag`` title-cased.

    Strings are title-cased directly; a list or tuple of strings yields a
    tuple holding the title-cased members.
    """
    is_multi_part = isinstance(tag, (list, tuple))
    return tuple(t.title() for t in tag) if is_multi_part else tag.title()

In [25]:
# Title-case the tag titles so differently-cased spellings of the same title
# compare equal; multi-part titles (lists/tuples) become tuples.
exportdata["tag_title"] = exportdata["tag_title"].apply(title_case)

In [26]:
#exportdata["tag_title"].unique()
# 1. GIMAC --> Pre-Crisis, Shock/Event, In-Crisis
# 2. 2020 Okular --> Sectoral Information
# 3. Okular Analytics Generic --> Sectoral Information
# 4. Rohingya Framework --> Sectors
# 5. IFRC 2018 --> Sectors
# 6. Colombia AF --> Sectoral Information
# 7. Nigeria Situation Analysis (OA) --> Sectoral Information
# 8. Situation Analysis Generic Yemen --> Matrix 2D
# 9. Situation Analysis Generic Libya --> Sectors

In [27]:
exportdata['tag_title'].unique()

array(['Severity', 'Reliability', 'Affected Groups', 'Information Date',
       'Location', 'Sectoral Information', 'Demographic Groups', 'Flag',
       'Operational Environment', 'Shock/Event',
       'Context Additional Tags', 'In-Crisis', 'Specific Needs Groups',
       'Crisis Type', 'Comment', 'Additional Context', 'Sectors',
       'Cross Sector', 'Date Of Information', 'Geo Location',
       'Displaced Pop Type', 'Cleaning Tags',
       'Displacement Profile Details',
       ('Affected Groups - Level 0', 'Affected Groups - Level 1', 'Affected Groups - Level 2', 'Affected Groups - Level 3'),
       'Cleaning Comments',
       ('Sectoral Information - Dimension', 'Sectoral Information - Subdimension', 'Sectoral Information - Sector', 'Sectoral Information - Subsectors'),
       ('Information Date (From)', 'Information Date (To)'),
       ('Operational Environment - Dimension', 'Operational Environment - Subdimension'),
       ('Matrix 1D - Dimension', 'Matrix 1D - Subdimension'),


In [28]:
# Tag titles that are folded into "Information Date".
date = [
    'Information Date',
    ('Information Date (From)', 'Information Date (To)'),
    'Date Of Information',
]

# Multi-column "Affected Groups" titles: levels 0-3 and levels 0-5.
_affected_levels_0_to_3 = (
    'Affected Groups - Level 0',
    'Affected Groups - Level 1',
    'Affected Groups - Level 2',
    'Affected Groups - Level 3',
)
_affected_levels_0_to_5 = _affected_levels_0_to_3 + (
    'Affected Groups - Level 4',
    'Affected Groups - Level 5',
)

# Tag titles that are folded into "Affected Groups".
affected_groups = [
    'Population Groups',
    'Affected Groups',
    _affected_levels_0_to_3,
    _affected_levels_0_to_5,
    'Displacement Profile Details',
    'Displaced Pop Type',
]

# Tag titles that are folded into "Location".
location = ['Location', 'Geo Location']

# Every tag title treated as a secondary tag.
secondary_tags_titles = [
    "Severity",
    "Reliability",
    "Demographic Groups",
    "Specific Needs Groups",
    *date,
    *location,
    *affected_groups,
]

In [29]:
# Keep only the exportdata rows whose tag title is one of the secondary tags.
is_secondary_tag = exportdata["tag_title"].isin(secondary_tags_titles)
exportdata_secondary_tags = exportdata[is_secondary_tag]

In [30]:
def agg_group(group):
    """Collapse all secondary-tag rows of one entry into a single record.

    ``group`` holds the rows of one ``entry_id`` and provides the columns
    ``entry_id``, ``tag_title`` and ``tag_value``. Each (title, value) pair is
    routed into the matching output field:

    * date titles              -> "Information Date" (last one wins)
    * location titles          -> "Location" (values appended)
    * affected-group titles    -> "Affected Groups" (flattened, then deduplicated)
    * "Demographic Groups" / "Specific Needs Groups" -> appended to that field
    * "Severity"               -> "Severity" (last one wins)
    * "Reliability"            -> appended to "Reliability"

    Rows whose title is None are skipped.

    Returns:
        dict with the keys entry_id, Severity, Reliability,
        Demographic Groups, Specific Needs Groups, Location,
        Information Date and Affected Groups.

    Raises:
        RuntimeError: on a title that matches none of the groups above. The
            exception type is the one the former bare ``raise`` produced; it
            now names the offending title.
    """
    new_row = {
        "entry_id": group['entry_id'].iloc[0],
        "Severity": None,
        "Reliability": [],
        "Demographic Groups": [],
        "Specific Needs Groups": [],
        "Location": [],
        "Information Date": None,
        "Affected Groups": [],
    }
    for title, value in zip(group["tag_title"], group["tag_value"]):
        if title is None:
            continue
        if title in date:
            new_row["Information Date"] = value
        elif title in location:
            new_row['Location'].extend(value)
        elif title in affected_groups:
            new_row['Affected Groups'].extend(flatten(value))
        elif title in ["Demographic Groups", "Specific Needs Groups"]:
            new_row[title].extend(value)
        elif title == "Severity":
            new_row[title] = value
        elif title == "Reliability":
            new_row[title].append(value)
        else:
            raise RuntimeError(
                "agg_group: unexpected tag title {!r}".format(title))
    # Drop empty values and duplicates while keeping first-seen order.
    # list(set(...)) returned the same labels, but in an order that depends on
    # string hashing and therefore changes between interpreter runs.
    new_row['Affected Groups'] = list(
        dict.fromkeys(x for x in new_row['Affected Groups'] if x))
    return new_row

In [31]:
# One record per entry: groupby("entry_id").apply(agg_group) yields one dict
# per entry, and from_records expands those dicts into columns.
# NOTE: this rebinds exportdata_secondary_tags from the filtered tag rows to the
# aggregated frame, so re-running this cell alone fails -- re-run the filtering
# cell first.
exportdata_secondary_tags = pd.DataFrame.from_records(
    exportdata_secondary_tags.groupby("entry_id").apply(agg_group).values)

In [32]:
exportdata_secondary_tags

Unnamed: 0,entry_id,Severity,Reliability,Demographic Groups,Specific Needs Groups,Location,Information Date,Affected Groups
0,556,No problem,[Usually],[],[],[28391],,"[Displaced, Affected]"
1,558,Severe conditions,[Usually],[],[],[28391],,"[Displaced, Affected, Returnees]"
2,559,Severe conditions,[Usually],[],[],[28389],,[]
3,562,Severe conditions,[Usually],[],[],[28389],,[]
4,563,No problem,[Usually],[],[],[28389],,[]
...,...,...,...,...,...,...,...,...
202837,501310,,[Usually Reliable],[],[],[82862],"[07-05-2020, None]","[Out of Venezuela, Migrants, In transit, Affec..."
202838,501311,,[Usually Reliable],[],[],[82862],"[07-05-2020, None]","[Out of Venezuela, Migrants, In transit, Affec..."
202839,501312,,[Usually Reliable],[],[],[82862],"[07-05-2020, None]","[Out of Venezuela, Migrants, In transit, Affec..."
202840,501313,,[Usually Reliable],[],[],[82862],"[07-05-2020, None]","[Out of Venezuela, Migrants, In transit, Affec..."


In [33]:
entries.columns

Index(['id', 'created_at', 'modified_at', 'excerpt', 'image_raw',
       'analysis_framework_id', 'created_by_id', 'lead_id', 'modified_by_id',
       'entry_type', 'information_date', 'order', 'client_id', 'project_id',
       'tabular_field_id', 'dropped_excerpt', 'highlight_hidden', 'verified',
       'verification_last_changed_by_id', 'image_id', 'title'],
      dtype='object')

In [34]:
# Attach the aggregated secondary tags to their entries. The inner join keeps
# only entries that have at least one secondary tag.
entries_labeled_secondary_tags = entries.merge(
    exportdata_secondary_tags,
    how="inner",
    left_on="id",
    right_on="entry_id",
    suffixes=('_entry', '_exportdata'),
)

In [35]:
entries.shape, entries[~entries["excerpt"].isna()]["excerpt"].unique(
).shape, exportdata_secondary_tags.shape

((203464, 21), (155472,), (202842, 8))

In [36]:
entries.head()

Unnamed: 0,id,created_at,modified_at,excerpt,image_raw,analysis_framework_id,created_by_id,lead_id,modified_by_id,entry_type,...,order,client_id,project_id,tabular_field_id,dropped_excerpt,highlight_hidden,verified,verification_last_changed_by_id,image_id,title
0,16851,2018-12-22 13:19:52.922273+00,2018-12-24 05:24:33.116063+00,"During the reporting week, IOM provided medica...",,137,354,6334,354,excerpt,...,2,tasiymx9,322,,,False,False,,,Situation Analysis Generic Yemen
1,25639,2019-03-24 11:25:28.055219+00,2019-03-25 04:27:17.715687+00,"On March 19, search-andrescue personnel airlif...",,273,26,10607,26,excerpt,...,13,odi4z54i,878,,,False,False,,,"IFRC - Cyclone Idai, March 2019"
2,54325,2019-08-13 08:51:01.120935+00,2021-05-06 07:14:14.773704+00,,https://api.thedeep.io/file/19293/,829,1395,16973,1395,image,...,17,ptmusm3b,1186,,,False,False,,19293.0,UNHCR Chile
3,65692,2019-10-09 11:40:11.416419+00,2021-05-06 07:14:31.449556+00,,https://api.thedeep.io/file/23416/,829,1396,18714,1396,image,...,21,a7dfeews,1183,,,False,False,,23416.0,UNHCR Colombia
4,21660,2019-02-08 11:27:52.756824+00,2019-02-08 11:27:53.091176+00,"According to The Guardian, more than 9,000 ref...",,552,657,8644,657,excerpt,...,2,x8t1upes,788,,"According to The Guardian, more than 9,000 ref...",False,False,,,Nigeria Situation Analysis (OA)


In [37]:
exportdata_secondary_tags.head()

Unnamed: 0,entry_id,Severity,Reliability,Demographic Groups,Specific Needs Groups,Location,Information Date,Affected Groups
0,556,No problem,[Usually],[],[],[28391],,"[Displaced, Affected]"
1,558,Severe conditions,[Usually],[],[],[28391],,"[Displaced, Affected, Returnees]"
2,559,Severe conditions,[Usually],[],[],[28389],,[]
3,562,Severe conditions,[Usually],[],[],[28389],,[]
4,563,No problem,[Usually],[],[],[28389],,[]


In [38]:
# Keep only entries that have an excerpt. The explicit .copy() makes the result
# an independent frame: later cells assign columns to it, which on a filtered
# slice triggers SettingWithCopyWarning.
entries_labeled_secondary_tags_cleaned = entries_labeled_secondary_tags[
    entries_labeled_secondary_tags["excerpt"].notna()].copy()

In [39]:
# Entry columns dropped from the labelled frame.
unused_entry_columns = [
    'tabular_field_id', 'image_id', 'image_raw', 'created_at',
    'modified_at', 'created_by_id', 'modified_by_id', 'entry_type',
    'dropped_excerpt', 'highlight_hidden', 'verified',
    'verification_last_changed_by_id', 'client_id', 'information_date',
    'order',
]
entries_labeled_secondary_tags_cleaned = (
    entries_labeled_secondary_tags_cleaned.drop(columns=unused_entry_columns))

In [40]:
entries_labeled_secondary_tags_cleaned.shape, entries_labeled_secondary_tags_cleaned.columns

((157469, 14),
 Index(['id', 'excerpt', 'analysis_framework_id', 'lead_id', 'project_id',
        'title', 'entry_id', 'Severity', 'Reliability', 'Demographic Groups',
        'Specific Needs Groups', 'Location', 'Information Date',
        'Affected Groups'],
       dtype='object'))

### Reliability

In [41]:
# Canonical reliability label -> raw spellings found in the data.
_reliability_variants = {
    'Completely Reliable': ['Completely', 'Completely Reliable'],
    'Usually reliable': ['Usually Reliable', 'Usually'],
    'Fairly Reliable': ['Fairly Reliable', 'Fairly'],
    'Not Usually Reliable': ['Not Usually', 'Not Usually Reliable'],
    'Unreliable': ['Unreliable', 'Cannot be Used', 'Unreliable '],
}

# Inverted view used for lookups: raw spelling -> canonical label.
reliability_raw_to_processed_dict = {
    raw_label: canonical
    for canonical, raw_labels in _reliability_variants.items()
    for raw_label in raw_labels
}

In [42]:
def _canonical_reliability(raw_labels):
    """Map raw reliability labels to canonical ones, skipping empty values."""
    return [
        reliability_raw_to_processed_dict[label] for label in raw_labels
        if label
    ]


entries_labeled_secondary_tags_cleaned['Reliability'] = (
    entries_labeled_secondary_tags_cleaned['Reliability'].apply(
        _canonical_reliability))

## Dates

In [43]:
entries_labeled_secondary_tags_cleaned["Information Date"]

0                 None
1           22-03-2019
4         [None, None]
5                 None
7                 None
              ...     
202837    [None, None]
202838    [None, None]
202839    [None, None]
202840    [None, None]
202841    [None, None]
Name: Information Date, Length: 157469, dtype: object

In [44]:
# Treat an empty date range ([None, None]) as a missing date; every other value
# is kept unchanged.
entries_labeled_secondary_tags_cleaned[
    "Information Date"] = entries_labeled_secondary_tags_cleaned[
        "Information Date"].apply(lambda x: x if x != [None, None] else None)

In [45]:
entries_labeled_secondary_tags_cleaned.head()

Unnamed: 0,id,excerpt,analysis_framework_id,lead_id,project_id,title,entry_id,Severity,Reliability,Demographic Groups,Specific Needs Groups,Location,Information Date,Affected Groups
0,16851,"During the reporting week, IOM provided medica...",137,6334,322,Situation Analysis Generic Yemen,16851,No problem,[Usually reliable],[],[Pregnant or Lactating Women],[27956],,[]
1,25639,"On March 19, search-andrescue personnel airlif...",273,10607,878,"IFRC - Cyclone Idai, March 2019",25639,No Problem,[Usually reliable],[],[],"[135799, 135800]",22-03-2019,[]
4,21660,"According to The Guardian, more than 9,000 ref...",552,8644,788,Nigeria Situation Analysis (OA),21660,,[Usually reliable],[],[],[29687],,"[Displaced, Refugees, Affected]"
5,16852,Distributed Rapid Response Mechanism (RMM) kit...,137,6334,322,Situation Analysis Generic Yemen,16852,No problem,[Usually reliable],[],[],[27956],,"[Displaced, Affected]"
7,17917,Water represents the main nerve-racking proble...,137,6357,322,Situation Analysis Generic Yemen,17917,No problem,[Usually reliable],[],[],[27956],,[]


## Location

In [46]:
# Geo-area table; its `id` and `title` columns are used below to turn location
# ids into names.
geoarea_df = pd.read_csv('geo_geoarea.csv')

In [47]:
def _clean_geo_title(title):
    """Remove the '(the)' / '(la)' markers and surrounding whitespace."""
    return title.replace('(the)', '').replace('(la)', '').strip()


# Only id/title pairs without missing values are usable for the lookup.
geoarea_cleaned = geoarea_df[['id', 'title']].dropna()
geoarea_cleaned['title'] = geoarea_cleaned['title'].apply(_clean_geo_title)

# Lookup table: geo-area id -> cleaned title.
list_dicts = geoarea_cleaned.to_dict(orient='split')['data']
id_to_location = {geo_id: geo_title for geo_id, geo_title in list_dicts}

In [48]:
list(id_to_location.items())[:10]

[(41917, 'Ethiopia'),
 (41918, 'Addis Ababa'),
 (41919, 'Tigray'),
 (41920, 'Somali'),
 (41921, 'Hareri'),
 (41922, 'Gambela'),
 (41923, 'Beneshangul Gumu'),
 (41924, 'Amhara'),
 (41925, 'Afar'),
 (41926, 'Oromia')]

In [49]:
def _location_names(loc_ids):
    """Translate geo-area ids into names, dropping ids with no known title."""
    names = []
    for loc_id in loc_ids:
        geo_id = int(loc_id)
        if geo_id in id_to_location:
            names.append(id_to_location[geo_id])
    return names


entries_labeled_secondary_tags_cleaned["Location"] = (
    entries_labeled_secondary_tags_cleaned["Location"].apply(_location_names))

In [50]:
entries_labeled_secondary_tags_cleaned.head(3)

Unnamed: 0,id,excerpt,analysis_framework_id,lead_id,project_id,title,entry_id,Severity,Reliability,Demographic Groups,Specific Needs Groups,Location,Information Date,Affected Groups
0,16851,"During the reporting week, IOM provided medica...",137,6334,322,Situation Analysis Generic Yemen,16851,No problem,[Usually reliable],[],[Pregnant or Lactating Women],[Yemen],,[]
1,25639,"On March 19, search-andrescue personnel airlif...",273,10607,878,"IFRC - Cyclone Idai, March 2019",25639,No Problem,[Usually reliable],[],[],"[Cidade Da Beira, Buzi]",22-03-2019,[]
4,21660,"According to The Guardian, more than 9,000 ref...",552,8644,788,Nigeria Situation Analysis (OA),21660,,[Usually reliable],[],[],[Nigeria],,"[Displaced, Refugees, Affected]"


## Severity

In [51]:
# (canonical severity label, raw spellings found in the data). The combined
# "No problem/minor problem" spellings map to the 'INVALID CONVERSION' marker;
# an empty string or None maps to None.
_severity_variants = [
    ('Critical', [
        'Critical',
        'Severe',
        'Critical Situation',
        'Critical situation',
        'Critical problem. Urgent intervention required',
        'Severe Conditions',
        'Severe conditions',
    ]),
    ('Major', [
        'Major',
        'Severe Problem. Intervention required',
        'Important problem. Middle to short term intervention required',
        'Situation of Major Concern',
        'Situation of major concern',
    ]),
    ('Of Concern', [
        'Of Concern. Monitoring required',
        'Of Concern',
        'Situation of Concern',
    ]),
    ('No problem', ['No Problem', 'No problem']),
    ('Minor Problem', ['Minor Problem']),
    ('INVALID CONVERSION', [
        'No problem/minor problem. No intervention required',
        'No problem/Minor Problem',
    ]),
    (None, ['', None]),
]

# Inverted view used for lookups: raw spelling -> canonical label.
severity_raw_to_processed_dict = {
    raw_label: canonical
    for canonical, raw_labels in _severity_variants
    for raw_label in raw_labels
}

In [52]:
def _canonical_severity(raw_label):
    """Look up the canonical severity label (KeyError on unknown spellings)."""
    return severity_raw_to_processed_dict[raw_label]


entries_labeled_secondary_tags_cleaned["Severity"] = (
    entries_labeled_secondary_tags_cleaned["Severity"].apply(
        _canonical_severity))

In [53]:
entries_labeled_secondary_tags_cleaned.head(3)

Unnamed: 0,id,excerpt,analysis_framework_id,lead_id,project_id,title,entry_id,Severity,Reliability,Demographic Groups,Specific Needs Groups,Location,Information Date,Affected Groups
0,16851,"During the reporting week, IOM provided medica...",137,6334,322,Situation Analysis Generic Yemen,16851,No problem,[Usually reliable],[],[Pregnant or Lactating Women],[Yemen],,[]
1,25639,"On March 19, search-andrescue personnel airlif...",273,10607,878,"IFRC - Cyclone Idai, March 2019",25639,No problem,[Usually reliable],[],[],"[Cidade Da Beira, Buzi]",22-03-2019,[]
4,21660,"According to The Guardian, more than 9,000 ref...",552,8644,788,Nigeria Situation Analysis (OA),21660,,[Usually reliable],[],[],[Nigeria],,"[Displaced, Refugees, Affected]"


### Move some tags from Specific Needs Groups to Affected Groups

In [54]:
# "Specific Needs Groups" tags that are moved to "Affected Groups".
to_be_moved = set([
    'Unregistered Refugee',
    'No legal documentation',
    'People with irregular status',
    'Stateless',
])

In [55]:
entries_labeled_secondary_tags_cleaned.columns

Index(['id', 'excerpt', 'analysis_framework_id', 'lead_id', 'project_id',
       'title', 'entry_id', 'Severity', 'Reliability', 'Demographic Groups',
       'Specific Needs Groups', 'Location', 'Information Date',
       'Affected Groups'],
      dtype='object')

In [56]:
def unique_values(df, col):
    """Return the set of distinct items across all iterables in ``df[col]``.

    Every cell of the column must itself be iterable (e.g. a list of tags).
    """
    return {item for cell in df[col] for item in cell}

In [57]:
unique_values(entries_labeled_secondary_tags_cleaned, "Specific Needs Groups")

{' Separated Children',
 'Child Head of Household',
 'Chronically Ill',
 'Chronically ill',
 'Elderly Head of Household',
 'Ethnic minority',
 'Female Head of Household',
 'GBV survivors',
 'Indigenous people',
 'LGBTQI',
 'LGBTQI community member',
 'LGBTQI+',
 'Minorities',
 'No legal documentation',
 'People with irregular status',
 'Person with Disabilities',
 'Persons with Disability',
 'Pregnant or Lactating Women',
 'Religious minority',
 'Separated Children',
 'Single Women (including Widows)',
 'Stateless',
 'Unaccompanied Children (without Caregiver)',
 'Unaccompanied Children (without caregiver)',
 'Unaccompanied or Separated Children (UASC)',
 'Unregistered Refugee'}

In [58]:
def move(row):
    """Return the row's affected groups plus the tags moved from specific needs.

    ``row`` is one row of the labelled entries frame. Columns are read by
    label ("Affected Groups", "Specific Needs Groups"), not by position
    (formerly 13 and 10): position-based access breaks silently when columns
    are added, dropped or reordered, and integer keys on a label-indexed
    Series are deprecated as positional lookups.

    Returns:
        list: the affected groups followed by any specific-needs tags listed
        in ``to_be_moved``, deduplicated in first-seen order so the result is
        the same on every run (``list(set(...))`` was not).
    """
    combined = list(row["Affected Groups"])
    combined.extend(
        tag for tag in row["Specific Needs Groups"] if tag in to_be_moved)
    return list(dict.fromkeys(combined))


# Row-wise apply: extend every entry's "Affected Groups" with the tags moved
# over from its "Specific Needs Groups".
entries_labeled_secondary_tags_cleaned[
    'Affected Groups'] = entries_labeled_secondary_tags_cleaned.apply(move,
                                                                      axis=1)

In [59]:
entries_labeled_secondary_tags_cleaned['Affected Groups']

0                                                        []
1                                                        []
4                           [Displaced, Refugees, Affected]
5                                     [Displaced, Affected]
7                                                        []
                                ...                        
202837    [Migrants, Refugees, Affected, Displaced, Perm...
202838    [Migrants, Refugees, Affected, Displaced, Perm...
202839    [Migrants, Refugees, Affected, Displaced, Perm...
202840    [Migrants, Refugees, Affected, Displaced, Perm...
202841    [Migrants, Refugees, Affected, Displaced, Perm...
Name: Affected Groups, Length: 157469, dtype: object

### Specific Needs Groups

In [60]:
# (canonical specific-needs label, raw spellings found in the data). The last
# group maps to None: those four tags are the ones listed in `to_be_moved`.
_specific_needs_variants = [
    ('Unaccompanied or Separated Children', [
        ' Separated Children',
        'Separated Children',
        'Unaccompanied Children (without caregiver)',
        'Unaccompanied Children (without Caregiver)',
        'Unaccompanied or Separated Children (UASC)',
    ]),
    ('Child Head of Household', ['Child Head of Household']),
    ('Elderly Head of Household', ['Elderly Head of Household']),
    ('Female Head of Household', ['Female Head of Household']),
    ('GBV survivors', ['GBV survivors']),
    ('Pregnant or Lactating Women', ['Pregnant or Lactating Women']),
    ('Single Women (including Widows)', ['Single Women (including Widows)']),
    ('Minorities', ['Ethnic minority', 'Minorities', 'Religious minority']),
    ('Indigenous people', ['Indigenous people']),
    ('LGBTQI+', ['LGBTQI', 'LGBTQI community member', 'LGBTQI+']),
    ('Persons with Disability',
     ['Person with Disabilities', 'Persons with Disability']),
    ('Chronically Ill', ['Chronically Ill', 'Chronically ill']),
    (None, [
        'Unregistered Refugee',
        'No legal documentation',
        'People with irregular status',
        'Stateless',
    ]),
]

# Inverted view used for lookups: raw spelling -> canonical label.
specific_needs_groups_raw_to_processed_dict = {
    raw_label: canonical
    for canonical, raw_labels in _specific_needs_variants
    for raw_label in raw_labels
}

In [61]:
# Normalise the specific-needs labels of every entry, skipping the tags that
# were moved to "Affected Groups". dict.fromkeys removes duplicates while
# keeping first-seen order; list(set(...)) produced the same labels but in an
# order that depends on string hashing and so differs between interpreter runs.
entries_labeled_secondary_tags_cleaned[
    "Specific Needs Groups"] = entries_labeled_secondary_tags_cleaned[
        "Specific Needs Groups"].apply(lambda tags: list(
            dict.fromkeys(specific_needs_groups_raw_to_processed_dict[tag]
                          for tag in tags if tag not in to_be_moved)))

### Demographic Groups

In [62]:
# Canonical demographic labels.
_ADULT_F = 'Adult Female (18 to 59 years old)'
_ADULT_M = 'Adult Male (18 to 59 years old)'
_OLDER_F = 'Older Persons Female (60+ years old)'
_OLDER_M = 'Older Persons Male (60+ years old)'
_INFANT = 'Infants/Toddlers (<5 years old)'
_CHILD_M = 'Children/Youth Male (5 to 17 years old)'
_CHILD_F = 'Children/Youth Female (5 to 17 years old)'

# Raw demographic label -> list of canonical labels. Labels without a gender
# expand to both the female and the male canonical label. 'Female' and 'Male'
# map to bare strings, not lists.
demographic_groups_raw_to_processed_dict = {
    # Adults (18-59), including the narrower 18-24 and 25-59 bands.
    'Adult (18 to 59 years old)': [_ADULT_F, _ADULT_M],
    'Adult (18-59 years old)': [_ADULT_F, _ADULT_M],
    'Adult Female (18 to 59 years old)': [_ADULT_F],
    'Adult female (18 to 59 years old)': [_ADULT_F],
    'Adult female (18-59 years old)': [_ADULT_F],
    'Adult Male (18 to 59 years old)': [_ADULT_M],
    'Adult (25 to 59 years old)': [_ADULT_F, _ADULT_M],
    'Youth (18 to 24 years old)': [_ADULT_F, _ADULT_M],
    'Adult Female (25 to 59 years old)': [_ADULT_F],
    'Adult Male (25 to 59 years old)': [_ADULT_M],
    'Youth Female (18 to 24 years old)': [_ADULT_F],
    'Youth Male (18 to 24 years old)': [_ADULT_M],
    'Adult male (18-59 years old)': [_ADULT_M],
    # Older persons (60+).
    'Female Older Persons (60+ years old)': [_OLDER_F],
    'Male Older Persons (60+ years old)': [_OLDER_M],
    'Older Persons (60+ years old)': [_OLDER_F, _OLDER_M],
    'Older Persons Female (60+ years old)': [_OLDER_F],
    'Older Persons Male (60+ years old)': [_OLDER_M],
    'Older persons (60+ years old)': [_OLDER_F, _OLDER_M],
    'Older persons female (60+ years old)': [_OLDER_F],
    'Older persons male (60+ years old)': [_OLDER_M],
    # Infants / toddlers (<5).
    'Infants/Toddlers (<5 years old)': [_INFANT],
    'Infants/toddlers (<5)': [_INFANT],
    'Infants (0-2 years old)': [_INFANT],
    'Children (0-4 years old)': [_INFANT],
    # Children and youth; every band below 18 maps onto the 5-17 labels.
    'Children (5 to 11 years old)': [_CHILD_M, _CHILD_F],
    'Children (5 to 17 years old)': [_CHILD_M, _CHILD_F],
    'Children (5-11 years old)': [_CHILD_M, _CHILD_F],
    'Children (< 18 years)': [_CHILD_M, _CHILD_F],
    'Children Female  <18 years old': [_CHILD_F],
    'Children Female (5 to 11 years old)': [_CHILD_F],
    'Children Female (5 to 17 years old)': [_CHILD_F],
    'Children Female (< 18 years)': [_CHILD_F],
    'Children Male (5 to 11 years old)': [_CHILD_M],
    'Children Male (< 18 years)': [_CHILD_M],
    'Children <18 years old': [_CHILD_M, _CHILD_F],
    'Children Male < 18 years old': [_CHILD_M],
    'Children female (5-11 years old)': [_CHILD_F],
    'Children male (5-11 years old)': [_CHILD_M],
    'Youth (12 to 17 years old)': [_CHILD_M, _CHILD_F],
    'Youth (12-17 years old)': [_CHILD_M, _CHILD_F],
    'Youth Female (12 to 17 years old)': [_CHILD_F],
    'Youth Male (12 to 17 years old)': [_CHILD_M],
    'Youth female (12-17 years old)': [_CHILD_F],
    'Youth male (12-17 years old)': [_CHILD_M],
    # Spelling as it appears in the raw data ("5 to 1").
    'Children Male (5 to 1 years old)': [_CHILD_M],
    # Gender-only labels.
    'Female': 'Female',
    'Male': 'Male',
}

In [63]:
def _canonical_demographics(raw_labels):
    """Look up the canonical value of every raw demographic label."""
    return [
        demographic_groups_raw_to_processed_dict[label]
        for label in raw_labels
    ]


entries_labeled_secondary_tags_cleaned["Demographic Groups"] = (
    entries_labeled_secondary_tags_cleaned["Demographic Groups"].apply(
        _canonical_demographics))

In [64]:
# The lookup above yields a list of lists (plus bare strings for 'Female' and
# 'Male'); flatten each cell into a single list of labels. Duplicates are kept.
entries_labeled_secondary_tags_cleaned[
    "Demographic Groups"] = entries_labeled_secondary_tags_cleaned[
        "Demographic Groups"].apply(flatten)

### Affected Groups

In [65]:
# def clean_names(x):
#     return x.replace('Others of Concern',
#                      'Others').replace('affected', 'Affected')


# def preprocess_lists_affected_groups(ag):
#     return list(set([clean_names(x) for x in ag]))


# def preprocess_lists_affected_groups(t):
#     return list(
#         np.unique([
#             clean_names(item) for sublist in t for item in sublist
#             if item not in [
#                 '',
#                 'Out of Venezuela',
#                 'Back to Venezuela',
#                 'All',
#                 'Stateless',
#                 'Non Host',
#                 'Not Affected',
#                 'Not affected',
#                 'Pendular',
#             ]
#         ]))

In [66]:
# entries_labeled_secondary_tags_cleaned[
#     'Affected Groups'] = entries_labeled_secondary_tags_cleaned[
#         'Affected Groups'].apply(preprocess_lists_affected_groups)

In [67]:
# entries_labeled_secondary_tags_cleaned.head(5)

### Final Processing of the Dataframe

In [68]:
# Columns of the raw `entries` table that are left out of the trimmed copy.
unused_entry_columns = [
    'tabular_field_id', 'image_id', 'image_raw', 'created_at', 'modified_at',
    'created_by_id', 'modified_by_id', 'dropped_excerpt', 'highlight_hidden',
    'verified', 'verification_last_changed_by_id', 'client_id', 'title',
    'entry_type', 'information_date', 'order'
]

# Trimmed copy of `entries` with its primary key exposed as `entry_id`.
entries_cleaned_final = (
    entries
    .drop(columns=unused_entry_columns)
    .rename(columns={'id': 'entry_id'})
)

In [69]:
# New names for the key column and the secondary-tag columns.
secondary_tag_column_names = {
    'id': 'entry_id',
    "Location": 'geo_location',
    "Specific Needs Groups": 'specific_needs_groups',
    "Severity": 'severity',
    "Demographic Groups": 'demographic_groups',
    'Reliability': 'reliability',
    "Affected Groups": 'affected_groups',
    "Information Date": "info_date"
}

entries_labeled_secondary_tags_cleaned = (
    entries_labeled_secondary_tags_cleaned
    # The existing `entry_id` column must go first: `id` takes over that name.
    .drop(columns=['title', 'entry_id'])
    .rename(columns=secondary_tag_column_names)
)

In [70]:
# Keep only the rows in which all four key fields are present.
required_columns = ['analysis_framework_id', 'entry_id', 'lead_id', 'excerpt']
entries_labeled_secondary_tags_cleaned = entries_labeled_secondary_tags_cleaned[
    entries_labeled_secondary_tags_cleaned[required_columns].notna().all(axis=1)]

In [71]:
# Entries without an information date get the explicit "UNKNOWN" marker
# instead of a missing value.
entries_labeled_secondary_tags_cleaned['info_date'] = (
    entries_labeled_secondary_tags_cleaned['info_date'].fillna(value="UNKNOWN"))

In [72]:
# Preview the first rows of the cleaned secondary-tag table.
entries_labeled_secondary_tags_cleaned.head()

Unnamed: 0,entry_id,excerpt,analysis_framework_id,lead_id,project_id,severity,reliability,demographic_groups,specific_needs_groups,geo_location,info_date,affected_groups
0,16851,"During the reporting week, IOM provided medica...",137,6334,322,No problem,[Usually reliable],[],[Pregnant or Lactating Women],[Yemen],UNKNOWN,[]
1,25639,"On March 19, search-andrescue personnel airlif...",273,10607,878,No problem,[Usually reliable],[],[],"[Cidade Da Beira, Buzi]",22-03-2019,[]
4,21660,"According to The Guardian, more than 9,000 ref...",552,8644,788,,[Usually reliable],[],[],[Nigeria],UNKNOWN,"[Displaced, Refugees, Affected]"
5,16852,Distributed Rapid Response Mechanism (RMM) kit...,137,6334,322,No problem,[Usually reliable],[],[],[Yemen],UNKNOWN,"[Displaced, Affected]"
7,17917,Water represents the main nerve-racking proble...,137,6357,322,No problem,[Usually reliable],[],[],[Yemen],UNKNOWN,[]


In [73]:
# (rows, columns) of the cleaned secondary-tag table after the null filter.
entries_labeled_secondary_tags_cleaned.shape

(157469, 12)

## Merging with primary tags

In [74]:
# Load the two primary-tag tables from the generated dataset folder.
primary_tags, primary_tags_gimmac = (
    pd.read_csv(csv_path) for csv_path in (
        'generated_dataset/primary_tags.csv',
        'generated_dataset/primary_tags_gimmac_data.csv',
    )
)

In [75]:
# Stack both primary-tag tables and discard rows that are exact duplicates.
all_dataset = pd.concat([primary_tags, primary_tags_gimmac]).drop_duplicates()

In [76]:
# Discard rows that miss any of the four key fields, then report the size.
all_dataset = all_dataset.dropna(
    subset=['analysis_framework_id', 'entry_id', 'lead_id', 'excerpt'])
all_dataset.shape

(154455, 13)

In [77]:
# Outer join on `entry_id`: entries present in only one of the two tables are
# kept. Columns shared by both sides come back with `_x` / `_y` suffixes.
final_merged = all_dataset.merge(
    entries_labeled_secondary_tags_cleaned,
    how='outer',
    on='entry_id',
)
final_merged.columns

Index(['entry_id', 'excerpt_x', 'entry_type', 'analysis_framework_id_x',
       'lead_id_x', 'title', 'project_id_x', 'verified', 'sectors',
       'pillars_2d', 'pillars_1d', 'subpillars_2d', 'subpillars_1d',
       'excerpt_y', 'analysis_framework_id_y', 'lead_id_y', 'project_id_y',
       'severity', 'reliability', 'demographic_groups',
       'specific_needs_groups', 'geo_location', 'info_date',
       'affected_groups'],
      dtype='object')

In [78]:
# After the outer merge, every column shared by both tables exists twice:
# `<name>_x` (from `all_dataset`, the primary tags) and `<name>_y` (from the
# secondary-tag table). Collapse each pair into a single `<name>` column that
# prefers `_x` and falls back to `_y`, drop the rows where neither side has a
# value, and remove the suffixed columns.
#
# The coalescing is vectorised with `Series.fillna` instead of a row-wise
# `DataFrame.apply(axis=1)`, which builds one Series per row and is very slow
# on a frame of this size.
for shared_col in ['excerpt', 'analysis_framework_id', 'lead_id', 'project_id']:
    left_col, right_col = f'{shared_col}_x', f'{shared_col}_y'
    final_merged[shared_col] = final_merged[left_col].fillna(
        final_merged[right_col])
    final_merged = final_merged.dropna(subset=[shared_col]).drop(
        columns=[left_col, right_col])


In [79]:
tags = ['sectors', 'subpillars_2d', 'subpillars_1d']

# Missing tag values become empty lists: NaN is first turned into "" and every
# "" is then swapped for a fresh [].
for tag in tags:
    blank_filled = final_merged[tag].fillna("")
    final_merged[tag] = blank_filled.apply(
        lambda cell: [] if cell == "" else cell)

In [80]:
# Check the column set once the `_x` / `_y` pairs have been collapsed.
final_merged.columns

Index(['entry_id', 'entry_type', 'title', 'verified', 'sectors', 'pillars_2d',
       'pillars_1d', 'subpillars_2d', 'subpillars_1d', 'severity',
       'reliability', 'demographic_groups', 'specific_needs_groups',
       'geo_location', 'info_date', 'affected_groups', 'excerpt',
       'analysis_framework_id', 'lead_id', 'project_id'],
      dtype='object')

In [81]:
# Columns kept in the final dataset, in output order; every other column of
# `final_merged` is left out.
final_columns = [
    'entry_id', 'excerpt', 'analysis_framework_id', 'lead_id', 'project_id',
    'verified', 'sectors', 'subpillars_2d', 'subpillars_1d', 'geo_location',
    'specific_needs_groups', 'severity', 'info_date', 'demographic_groups',
    'reliability', 'affected_groups'
]
final_merged = final_merged[final_columns]

## Adding Lead Details

In [82]:
# Lead metadata: only the join key and the three source-description columns.
lead_columns = ['lead_id', 'source_type', 'url', 'website']
leads = (
    pd.read_csv('leads.csv')
    .rename(columns={'id': 'lead_id'})
    [lead_columns]
)
leads.columns

Index(['lead_id', 'source_type', 'url', 'website'], dtype='object')

In [83]:
# Attach the lead details to every entry. A left join keeps entries whose
# lead has no row in `leads`.
final_merged = final_merged.merge(leads, how='left', on='lead_id')

In [84]:
# Persist the merged dataset without writing the DataFrame index.
final_merged.to_csv('generated_dataset/full_dataset.csv', index=False)

### Quick Stats

In [85]:
df = pd.read_csv('generated_dataset/full_dataset.csv')
##
# Columns whose cells are Python list literals stored as text in the CSV.
list_valued_columns = [
    'sectors', 'subpillars_2d', 'subpillars_1d', 'geo_location',
    'specific_needs_groups', 'demographic_groups',
    'reliability', 'affected_groups'
]
for col in list_valued_columns:
    # NaN is the only value that differs from itself: empty cells become [],
    # everything else is parsed back into a list.
    df[col] = df[col].map(
        lambda cell: [] if cell != cell else literal_eval(cell))
##
def count_vals(df, col):
    """Count how often each label occurs in column `col` of `df`.

    Each cell is expected to hold a collection of labels (e.g. a list); every
    item of the collection is counted once per occurrence. Two scalar cases
    are handled explicitly so the helper also works on single-label columns:

    * a bare string counts as one label (`Counter.update` would otherwise
      count its individual characters);
    * a missing cell (`None` or a NaN float) is skipped instead of raising
      `TypeError`.

    Args:
        df: DataFrame holding the column.
        col: name of the column to summarise.

    Returns:
        A list of `(label, count)` tuples sorted by descending count, as
        produced by `Counter.most_common()`.
    """
    vals = Counter()
    for e in df[col]:
        # NaN is the only float that is not equal to itself.
        if e is None or (isinstance(e, float) and e != e):
            continue
        if isinstance(e, str):
            vals[e] += 1
        else:
            vals.update(e)
    return vals.most_common()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [86]:
# Label frequencies for `affected_groups`, most common first.
# NOTE(review): the output lists near-duplicate spellings as separate labels
# ('Not affected' / 'Not Affected', 'Other' / 'Others') — presumably because the
# clean-up cells under "Affected Groups" are commented out; confirm.
count_vals(df, 'affected_groups')

[('Affected', 79438),
 ('Displaced', 49874),
 ('Migrants', 16516),
 ('Refugees', 15348),
 ('IDP', 14617),
 ('All', 7905),
 ('Non Displaced', 7154),
 ('Host', 6202),
 ('Others of Concern', 5299),
 ('Returnees', 4575),
 ('Permanent', 3346),
 ('Camp', 2332),
 ('Asylum Seekers', 2080),
 ('Others', 1587),
 ('In transit', 1287),
 ('People with irregular status', 501),
 ('Not affected', 421),
 ('Non-camp', 396),
 ('Pendular', 275),
 ('No legal documentation', 224),
 ('Unregistered Refugee', 221),
 ('Out of Venezuela', 130),
 ('Non Host', 126),
 ('Back to Venezuela', 108),
 ('Stateless', 88),
 ('Venezuelans with intention to remain in Colombia', 83),
 ('Venezuelans in transit (caminantes)', 25),
 ('Venezuelans engaged in pendular/circular movement ', 16),
 ('Other', 11),
 ('Not Affected', 10)]

In [87]:
# Label frequencies for `reliability`, most common first.
count_vals(df, 'reliability')

[('Usually reliable', 152865),
 ('Fairly Reliable', 2596),
 ('Completely Reliable', 1205),
 ('Not Usually Reliable', 121),
 ('Unreliable', 46)]

In [88]:
# Label frequencies for the harmonised `demographic_groups`, most common first.
count_vals(df, 'demographic_groups')

[('Children/Youth Female (5 to 17 years old)', 18909),
 ('Children/Youth Male (5 to 17 years old)', 15477),
 ('Adult Female (18 to 59 years old)', 12182),
 ('Adult Male (18 to 59 years old)', 5523),
 ('Older Persons Female (60+ years old)', 4978),
 ('Infants/Toddlers (<5 years old)', 3723),
 ('Older Persons Male (60+ years old)', 3023),
 ('Female', 1486),
 ('Male', 572)]

In [89]:
# Label frequencies for `specific_needs_groups`, most common first.
count_vals(df, 'specific_needs_groups')

[('Pregnant or Lactating Women', 2071),
 ('Indigenous people', 1789),
 ('Persons with Disability', 1659),
 ('Minorities', 1001),
 ('GBV survivors', 855),
 ('Unaccompanied or Separated Children', 814),
 ('Chronically Ill', 770),
 ('Female Head of Household', 722),
 ('LGBTQI+', 517),
 ('Single Women (including Widows)', 202),
 ('Child Head of Household', 192),
 ('Elderly Head of Household', 116)]

In [90]:
# Location frequencies, most common first.
count_vals(df, 'geo_location')

[('Colombia', 13459),
 ('Bangladesh', 8926),
 ('Nigeria', 7132),
 ('Afghanistan', 6647),
 ('Syrian Arab Republic', 5771),
 ("Cox's Bazar", 5084),
 ('Borno', 4824),
 ('South Sudan', 4704),
 ('Libya', 3760),
 ('Somalia', 3380),
 ('Tchad', 2962),
 ('Adamawa', 2922),
 ('Burkina Faso', 2810),
 ('Yobe', 2570),
 ('Venezuela', 2365),
 ('Sudan', 2207),
 ('Sud-Ouest', 2189),
 ('Peru', 2105),
 ('Chile', 2063),
 ('Nord-Ouest', 1970),
 ('Norte de Santander', 1858),
 ('Lac', 1823),
 ('Est', 1769),
 ('République démocratique du Congo', 1731),
 ('Sahel', 1729),
 ('Cameroon', 1675),
 ('Northeast', 1668),
 ('La Guajira', 1609),
 ('Arauca', 1607),
 ('Tripoli', 1546),
 ('Nord-Kivu', 1512),
 ('Nord', 1495),
 ('Nariño', 1476),
 ('Northwest', 1378),
 ('Antioquia', 1313),
 ('Bogotá, D.C.', 1310),
 ('Centre-Nord', 1286),
 ('Argentina', 1265),
 ('Extrême-Nord', 1218),
 ('Ecuador', 1197),
 ('NIGER', 1154),
 ('Yemen', 1115),
 ('Government-Controlled Areas', 1081),
 ('Sud-Kivu', 1047),
 ('Kinshasa', 964),
 ('Boucl

In [91]:
# Label frequencies for `subpillars_1d`, most common first.
count_vals(df, 'subpillars_1d')

[('NOT_MAPPED', 39526),
 ('Displacement->Type/Numbers/Movements', 9051),
 ('Context->Security & Stability', 7926),
 ('Covid-19->Cases', 5507),
 ('Covid-19->Restriction Measures', 4879),
 ('Context->Economy', 4756),
 ('Shock/Event->Hazard & Threats', 4587),
 ('Context->Demography', 3378),
 ('Casualties->Dead', 3194),
 ('Covid-19->Deaths', 3098),
 ('Context->Legal & Policy', 2804),
 ('Covid-19->Vaccination', 2776),
 ('Displacement->Local Integration', 2617),
 ('Context->Politics', 2095),
 ('Displacement->Push Factors', 2051),
 ('Shock/Event->Underlying/Aggravating Factors', 1863),
 ('Context->Socio Cultural', 1754),
 ('Humanitarian Access->Physical Constraints', 1743),
 ('Covid-19->Testing', 1622),
 ('Shock/Event->Type And Characteristics', 1336),
 ('Context->Environment', 1207),
 ('Humanitarian Access->Relief To Population', 948),
 ('Capacities & Response->Number Of People Reached/Response Gaps', 746),
 ('Humanitarian Access->Number Of People Facing Humanitarian Access Constraints/Human

In [92]:
# Label frequencies for `subpillars_2d`, most common first.
count_vals(df, 'subpillars_2d')

[('NOT_MAPPED', 35577),
 ('Humanitarian Conditions->Living Standards', 27948),
 ('Humanitarian Conditions->Physical And Mental Well Being', 17246),
 ('Capacities & Response->International Response', 16305),
 ('Impact->Impact On Systems, Services And Networks', 15235),
 ('Impact->Impact On People', 12875),
 ('Impact->Driver/Aggravating Factors', 12737),
 ('At Risk->Risk And Vulnerabilities', 12460),
 ('Capacities & Response->National Response', 7277),
 ('Priority Interventions->Expressed By Humanitarian Staff', 5673),
 ('Humanitarian Conditions->Coping Mechanisms', 5577),
 ('Capacities & Response->Number Of People Reached/Response Gaps', 3419),
 ('Impact->Number Of People Affected', 2364),
 ('Priority Needs->Expressed By Population', 2222),
 ('Priority Needs->Expressed By Humanitarian Staff', 1686),
 ('Humanitarian Conditions->Number Of People In Need', 1339),
 ('Covid-19->Restriction Measures', 1050),
 ('Context->Economy', 427),
 ('Priority Interventions->Expressed By Population', 290)

In [93]:
# Label frequencies for `sectors`, most common first.
count_vals(df, 'sectors')

[('Health', 34290),
 ('Protection', 28020),
 ('Cross', 26670),
 ('Livelihoods', 16421),
 ('Food Security', 15651),
 ('WASH', 11468),
 ('Education', 10141),
 ('Shelter', 10004),
 ('Nutrition', 5222),
 ('Agriculture', 3114),
 ('Logistics', 2798),
 ('NOT_MAPPED', 1262)]