### Matching GIMAC Generic

In [1]:
import re
import json
import random
from pprint import pprint
from ast import literal_eval
from collections import Counter, defaultdict
import os

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [2]:
# Seed both the stdlib and the NumPy global random generators with the same fixed value.
random.seed(2021)
np.random.seed(2021)

In [3]:
# Each line in this table corresponds to a single tagging of an entry.
# An entry may have more than one tagging (e.g. a tag in a 2D matrix, another in a 1D matrix,
# secondary tags, etc.).
exportdata = pd.read_csv("exportdata.csv")
# Entries related to the projects chosen by Patrice.
entries = pd.read_csv("entries.csv")
# The widgets of the AFs, i.e. 2D matrices, 1D matrices, secondary tagging widgets, etc.
af_widgets = pd.read_csv("af_widgets.csv")
# Projects chosen by Patrice.
projects = pd.read_csv("projects.csv")
# Exportables of the AFs; `exportdata.exportable_id` is matched against their `id`.
exportables = pd.read_csv("af_exportables.csv")
# AF details
afs = pd.read_csv("analysis_frameworks.csv")
# User ids, first names and last names.
users = pd.read_csv("user_names.csv")
# The matching of pillar/sub-pillar names from different AFs into unified pillar/sub-pillar names.
matching_gimac = pd.read_csv("gimac_to_vaf.csv")

In [4]:
# Directory that receives the generated CSV files.
dirName = 'generated_dataset'
# exist_ok=True keeps the cell idempotent and avoids the check-then-create race of
# `if not os.path.exists(...): os.mkdir(...)`.
os.makedirs(dirName, exist_ok=True)

In [5]:
# Lookup tables between analysis-framework ids and titles.
# id -> title (one title per id).
af_id_to_title = dict(zip(afs["id"], afs["title"]))
# title -> list of ids sharing that title.
af_title_to_ids = {
    af_title: list(af_ids) for af_title, af_ids in afs.groupby("title")["id"]
}
af_title_to_ids["GIMAC Generic"]

[1465]

In [6]:
exportdata.columns

Index(['id', 'data', 'entry_id', 'exportable_id'], dtype='object')

In [7]:
# Keep only the data that belongs to the "GIMAC Generic" analysis framework.
GIMAC_GENERIC_AF_ID = 1465
# .copy() detaches each filtered frame from the frame it was sliced from, so the column
# assignments made on these frames in later cells do not act on a slice
# (which triggers pandas' SettingWithCopyWarning).
af_widgets = af_widgets[af_widgets["analysis_framework_id"].eq(GIMAC_GENERIC_AF_ID)].copy()
exportables = exportables[exportables["analysis_framework_id"].eq(GIMAC_GENERIC_AF_ID)].copy()
# Only taggings whose exportable belongs to the selected framework are kept.
exportdata = exportdata[exportdata["exportable_id"].isin(exportables.id)].copy()

In [8]:
exportdata[exportdata["exportable_id"].isin(exportables.id)]["entry_id"].unique().shape

(42220,)

In [9]:
# Canonical tag name -> every raw widget title that is folded into it.
_canonical_to_raw_names = {
    "Severity": [
        "Severity",
        "Severity (Needs assessment entries only)",
        "SEVERITY",
    ],
    "Reliability": ["Reliability", "RELIABILITY"],
    "Demographic Groups": ["Demographic Groups", "DEMOGRAPHIC GROUPS"],
    "Information Date": [
        "Information date",
        "Information Date",
        "Date",
        "DATE OF INFORMATION",
        "Date range",
    ],
    "Geo Location": [
        "Geo Location",
        "LOCATION",
        "Geo location",
        "Country",
        "GEOLOCATIONS",
    ],
    "Affected Groups": ["Affected groups", "AFFECTED GROUPS"],
    "Specific Needs Groups": ["Specific Needs Groups", "SPECIFIC NEEDS GROUPS"],
}
# Inverted view used for lookups: raw widget title -> canonical tag name.
raw_name_to_tag_dict = {
    raw_name: canonical_name
    for canonical_name, raw_names in _canonical_to_raw_names.items()
    for raw_name in raw_names
}


In [10]:
def clean_titles(x):
    """Normalize a raw widget title.

    Titles listed in ``raw_name_to_tag_dict`` are replaced by their canonical
    tag name; every other title is returned title-cased.

    Args:
        x: the raw widget title (a string).

    Returns:
        The canonical or title-cased title.
    """
    # Membership on the dict itself is a hash lookup; the previous
    # `x in list(raw_name_to_tag_dict.keys())` rebuilt and scanned a list on every call.
    if x in raw_name_to_tag_dict:
        return raw_name_to_tag_dict[x]
    return x.title()


# Replace every widget title by its normalized form (see clean_titles).
af_widgets['title'] = af_widgets['title'].apply(clean_titles)

In [11]:
af_widgets['title']

42     Specific Needs Groups
95                Pre-Crisis
96                 In-Crisis
100             Geo Location
123                  Excerpt
124              Shock/Event
125              Reliability
126       Additional Context
127       Displaced Pop Type
129            Cleaning Tags
130        Population Groups
131        Cleaning Comments
132       Demographic Groups
133          High Level Tags
Name: title, dtype: object

In [12]:
# Widget titles of the 2D matrices in the chosen AF.
mat2d_titles = [
    "Pre-Crisis",
    "Shock/Event",
    "In-Crisis",
]
# Pandas reads JSON objects as strings,
# so they are converted into Python dict objects here.
af_widgets["properties"] = af_widgets["properties"].apply(json.loads)
## 2D matrices: keep the properties and the AF id of every matching widget.
mat2d_properties_ids = af_widgets[(
    af_widgets["title"].str.title()).isin(mat2d_titles)][[
        "properties", "analysis_framework_id"
    ]]
mat2d_properties = mat2d_properties_ids["properties"].tolist()
mat2d_ids = mat2d_properties_ids["analysis_framework_id"].tolist()
## 1D Matrices
# Widget titles of the 1D matrices in the chosen AF.
mat1d_titles = ["High Level Tags"]

# The reference titles above are title-cased, so the widget titles must be title-cased as
# well before the membership test. Upper-casing them (as was done before) yields
# "HIGH LEVEL TAGS", which never matches and left the 1D selection empty.
mat1d_properties_ids = af_widgets[(
    af_widgets["title"].str.title()).isin(mat1d_titles)][[
        "properties", "analysis_framework_id"
    ]]
mat1d_properties = mat1d_properties_ids["properties"].tolist()
mat1d_ids = mat1d_properties_ids["analysis_framework_id"].tolist()

In [13]:
def _titles_by_group(properties, groups_key, children_key, child_label_key):
    """Return {group title: [child labels]} for one matrix widget's properties.

    Raises KeyError when the properties lack any of the expected keys.
    """
    return {
        group["title"]: [child[child_label_key] for child in group[children_key]]
        for group in properties["data"][groups_key]
    }


# AF id -> {pillar title: [sub-pillar titles]} collected from the 2D matrices.
afids_pillars_subpillars = dict()
# Widgets whose properties do not have the expected layout, as [properties, af_id].
errors = []
for mat, af_id in zip(mat2d_properties, mat2d_ids):
    try:
        pillars = _titles_by_group(mat, "dimensions", "subdimensions", "title")
    except KeyError:
        errors.append([mat, af_id])
        continue
    # Several matrix widgets can belong to the same AF. Merge their pillars instead of
    # resetting the AF's dict for every widget, which kept only the last widget's pillars.
    # A pillar title repeated across widgets keeps the sub-pillars of the later widget.
    afids_pillars_subpillars.setdefault(af_id, {}).update(pillars)
##
# AF id -> {row title: [cell values]} collected from the 1D matrices.
afids_rows_cells = dict()
for mat, af_id in zip(mat1d_properties, mat1d_ids):
    try:
        rows = _titles_by_group(mat, "rows", "cells", "value")
    except KeyError:
        errors.append([mat, af_id])
        continue
    afids_rows_cells.setdefault(af_id, {}).update(rows)

In [14]:
def extract_title(x):
    """Return the title(s) stored in the ``"excel"`` section of an exportable.

    A non-empty ``"title"`` wins; otherwise, for exportables of type
    ``"multiple"``, the ``"titles"`` value is returned. ``None`` is returned
    when neither applies.
    """
    excel = x["excel"]
    single_title = excel.get("title")
    if single_title:
        return single_title
    if excel.get("type") == "multiple":
        return excel["titles"]
    return None
## Parse the `data` strings as Python literals, then extract one title (or list of titles) per exportable.
exportables["data"] = exportables["data"].apply(literal_eval)
exportable_titles = exportables["data"].apply(extract_title).tolist()

In [15]:
# Keep only entries of type "excerpt" whose excerpt text is present;
# the shapes are printed before and after the filter.
print(entries.shape, exportdata.shape)
is_excerpt_entry = entries["entry_type"].eq("excerpt")
has_excerpt_text = entries["excerpt"].notna()
entries = entries[is_excerpt_entry & has_excerpt_text]
print(entries.shape, exportdata.shape)

(203464, 21) (186929, 4)
(157903, 21) (186929, 4)


In [16]:
# Exportable id -> parsed exportable data.
exid_to_exdata = dict(zip(exportables["id"], exportables["data"]))

In [17]:
# (widget key, widget id) -> normalized widget title.
widget_key_id_to_title = {
    (widget_key, widget_id): widget_title
    for widget_key, widget_id, widget_title in zip(
        af_widgets["key"], af_widgets["widget_id"], af_widgets["title"])
}

In [18]:
def exportdata_to_tag_title(row):
    """Resolve the tag title of one exportdata row.

    The sources are tried in this order, and the first non-empty title wins:

    1. the widget referenced by ``common.widget_key`` / ``common.widget_id``;
    2. the title of the single element of ``report.other``;
    3. the widget referenced by the single element of an ``excel`` list;
    4. the exportable's own ``excel`` title (or titles, for type "multiple").

    Args:
        row: an exportdata row exposing the ``"data"`` (JSON string) and
            ``"exportable_id"`` fields by label.

    Returns:
        A title string, or a list of titles for "multiple" exportables.

    Raises:
        RuntimeError: if no source yields a title.
    """
    # Fields are read by label: integer keys on a label-indexed row only work through a
    # deprecated positional fallback and silently depend on the column order.
    data = json.loads(row["data"])

    # 1) widget referenced in the "common" section
    common = data.get("common")
    if common:
        wkey, wid = common.get("widget_key"), common.get("widget_id")
        if wkey and wid:
            title = widget_key_id_to_title.get((wkey, wid))
            if title:
                return title

    # 2) single titled element in report.other
    report = data.get("report")
    if report and report.get("other"):
        other = report["other"]
        if len(other) == 1 and other[0].get("title"):
            return other[0]["title"]

    # 3) widget referenced by a single-element excel list
    excel = data.get("excel")
    if isinstance(excel, list) and len(excel) == 1:
        wkey, wid = excel[0].get("widget_key"), excel[0].get("widget_id")
        if wkey and wid:
            title = widget_key_id_to_title.get((wkey, wid))
            if title:
                return title

    # 4) fall back to the exportable definition itself
    exportable_id = row["exportable_id"]
    exportable_excel = exid_to_exdata[exportable_id]["excel"]
    if exportable_excel.get("title"):
        return exportable_excel["title"]
    if exportable_excel.get("type") == "multiple":
        return exportable_excel["titles"]

    # The previous bare `raise` had no active exception and therefore surfaced as an
    # uninformative RuntimeError; the type is kept, the message now names the row.
    raise RuntimeError(
        f"could not resolve a tag title for exportable_id={exportable_id!r}")

In [19]:
# Resolve the tag title of every tagging (row-wise apply).
exportdata["tag_title"] = exportdata.apply(exportdata_to_tag_title, axis=1)

In [20]:
def exportdata_to_tag_value(row):
    """Extract the tag value(s) of one exportdata row.

    The ``excel`` section is consulted first, then the ``common`` section:

    * ``excel`` is a single-element list -> that element's ``"value"``;
    * ``excel`` is a dict of type ``"lists"`` -> its ``"values"``;
    * ``excel`` is a dict with ``"value"`` / ``"values"`` -> that entry;
    * otherwise ``common["values"]``, then ``common["value"]``.

    Args:
        row: an exportdata row exposing the ``"data"`` (JSON string) field by label.

    Returns:
        The stored value or list of values.

    Raises:
        RuntimeError: if neither section holds a value.
    """
    # Read by label; integer keys on a label-indexed row are a deprecated fallback.
    data = json.loads(row["data"])
    excel = data.get("excel")

    if isinstance(excel, list):
        if len(excel) == 1:
            return excel[0]["value"]
        # Longer (or empty) lists fall through to "common"; calling .get on them,
        # as was done before, raised AttributeError.
    elif isinstance(excel, dict):
        if excel.get("type") == "lists":
            return excel["values"]
        if "value" in excel:
            return excel["value"]
        if "values" in excel:
            return excel["values"]

    common = data.get("common") or {}
    if "values" in common:
        return common["values"]
    if "value" in common:
        return common["value"]

    # Explicit error instead of a bare `raise` without an active exception
    # (which also surfaced as RuntimeError, but without any message).
    raise RuntimeError("could not extract a tag value from the exportdata row")

In [21]:
# Extract the tag value(s) of every tagging (row-wise apply).
exportdata["tag_value"] = exportdata.apply(exportdata_to_tag_value, axis=1)

In [22]:
def title_case(tag):
    """Title-case a tag title.

    A list or tuple of titles becomes a tuple of title-cased strings;
    any other value is title-cased directly via its ``title()`` method.
    """
    if not isinstance(tag, (list, tuple)):
        return tag.title()
    return tuple(part.title() for part in tag)

In [23]:
# Title-case the resolved tag titles; lists of titles become tuples (see title_case).
exportdata["tag_title"] = exportdata["tag_title"].apply(title_case)

In [24]:
# Tag titles that agg_group collects under the "Mat1D" key.
mat_1d_and_flag_titles = ["High Level Tags"]#, ('High Level Tags - Dimension', 'High Level Tags - Subdimension')]

In [25]:
exportdata["tag_title"].unique()

array(['Pre-Crisis', 'Population Groups', 'Geo Location', 'Reliability',
       'Demographic Groups', 'Shock/Event', 'High Level Tags',
       'In-Crisis', 'Specific Needs Groups', 'Additional Context',
       'Displaced Pop Type', 'Cleaning Tags', 'Cleaning Comments',
       ('In-Crisis - Dimension', 'In-Crisis - Subdimension', 'In-Crisis - Sector', 'In-Crisis - Subsectors'),
       ('Pre-Crisis - Dimension', 'Pre-Crisis - Subdimension', 'Pre-Crisis - Sector', 'Pre-Crisis - Subsectors'),
       ('High Level Tags - Dimension', 'High Level Tags - Subdimension')],
      dtype=object)

In [26]:
af_widgets["title"].unique()

array(['Specific Needs Groups', 'Pre-Crisis', 'In-Crisis', 'Geo Location',
       'Excerpt', 'Shock/Event', 'Reliability', 'Additional Context',
       'Displaced Pop Type', 'Cleaning Tags', 'Population Groups',
       'Cleaning Comments', 'Demographic Groups', 'High Level Tags'],
      dtype=object)

In [27]:
#exportdata[exportdata["tag_title"].isin([('High Level Tags - Dimension', 'High Level Tags - Subdimension')])]["tag_value"].tolist()
# exportdata[exportdata["tag_title"].isin([('In-Crisis - Dimension', 'In-Crisis - Subdimension', 'In-Crisis - Sector', 'In-Crisis - Subsectors'),
#        ('Pre-Crisis - Dimension', 'Pre-Crisis - Subdimension', 'Pre-Crisis - Sector', 'Pre-Crisis - Subsectors')])]["tag_value"].tolist()
#exportdata[exportdata["tag_title"].eq('High Level Tags')]["tag_value"].tolist()
exportdata[exportdata["tag_title"].eq('High Level Tags')]["tag_value"].shape, \
exportdata[exportdata["tag_title"].eq('Pre-Crisis')]["tag_value"].shape, \
exportdata[exportdata["tag_title"].eq('In-Crisis')]["tag_value"].shape, \
exportdata[exportdata["tag_title"].eq('Shock/Event')]["tag_value"].shape

((8024,), (8644,), (34198,), (4647,))

In [28]:
# Keep only the taggings whose title is one of the 2D-matrix or 1D-matrix titles.
exportdata_of_interest = exportdata[exportdata["tag_title"].isin(
    mat2d_titles + mat_1d_and_flag_titles)]

In [29]:
exportdata_of_interest.shape, entries.shape

((55513, 6), (157903, 21))

In [30]:
# Inner-join the entries with their taggings of interest (entries.id == exportdata.entry_id).
# Column names present in both frames get the `_entry` / `_exportdata` suffix.
entries_labeled = pd.merge(entries,
                           exportdata_of_interest,
                           how="inner",
                           left_on="id",
                           right_on="entry_id",
                           suffixes=('_entry', '_exportdata'))

In [31]:
entries_labeled["id_entry"].shape, entries_labeled["id_entry"].unique().shape

((49789,), (38148,))

In [32]:
def agg_group(group):
    """Collect the tag values of one group of taggings by matrix kind.

    Values whose title is in ``mat_1d_and_flag_titles`` are appended under
    ``"Mat1D"``, values whose title is in ``mat2d_titles`` under ``"Mat2D"``;
    taggings with any other title are ignored.

    Returns:
        A ``defaultdict(list)`` holding the collected values.
    """
    buckets = defaultdict(list)
    for tag_title, tag_value in zip(group["tag_title"], group["tag_value"]):
        if tag_title in mat_1d_and_flag_titles:
            bucket_name = "Mat1D"
        elif tag_title in mat2d_titles:
            bucket_name = "Mat2D"
        else:
            # not a matrix tagging
            continue
        buckets[bucket_name].extend(tag_value)
    return buckets

In [33]:
entries_labeled.columns

Index(['id_entry', 'created_at', 'modified_at', 'excerpt', 'image_raw',
       'analysis_framework_id', 'created_by_id', 'lead_id', 'modified_by_id',
       'entry_type', 'information_date', 'order', 'client_id', 'project_id',
       'tabular_field_id', 'dropped_excerpt', 'highlight_hidden', 'verified',
       'verification_last_changed_by_id', 'image_id', 'title', 'id_exportdata',
       'data', 'entry_id', 'exportable_id', 'tag_title', 'tag_value'],
      dtype='object')

In [34]:
# Aggregate the taggings of each entry into one mapping of matrix kind -> tag values.
tags = entries_labeled.groupby("id_entry").apply(agg_group)

In [35]:
# Two-column frame (entry_id, tags) built from the grouped result's index and values.
entryids_tags = pd.DataFrame({"entry_id": tags.index, "tags": tags.values})

In [36]:
entryids_tags.shape

(38148, 2)

In [37]:
# Columns of the entries table that are carried into the labeled dataset.
_entry_columns = [
    'entry_id',
    'created_at',
    'modified_at',
    'excerpt',
    'entry_type',
    'analysis_framework_id',
    'created_by_id',
    'lead_id',
    'modified_by_id',
    'information_date',
    'order',
    'title',
    'project_id',
    'verified',
]
print(entries.shape)
# Rename `id` to `entry_id`, keep the selected columns and drop exact duplicate rows.
entries_ = (
    entries
    .rename(columns={"id": "entry_id"})[_entry_columns]
    .drop_duplicates()
)
print(entries_.shape)

(157903, 21)
(157903, 14)


In [38]:
# print(entries_labeled.shape)
# entries_labeled = entries_labeled[[
#     'entry_id',
#     'created_at',
#     'modified_at',
#     'excerpt',
#     'entry_type',
#     'analysis_framework_id',
#     'created_by_id',
#     'lead_id',
#     'modified_by_id',
#     'information_date',
#     'order',
#     'project_id',
#     #'dropped_excerpt',
#     'verified',
# ]].drop_duplicates()
# print(entries_labeled.shape)

In [39]:
# Attach the aggregated tags to the entry rows; the inner join keeps only entries that have tags.
entries_labeled = pd.merge(entries_, entryids_tags, on="entry_id", how="inner")
entries_labeled.shape, entries_labeled.columns

((38148, 15),
 Index(['entry_id', 'created_at', 'modified_at', 'excerpt', 'entry_type',
        'analysis_framework_id', 'created_by_id', 'lead_id', 'modified_by_id',
        'information_date', 'order', 'title', 'project_id', 'verified', 'tags'],
       dtype='object'))

In [40]:
# String form of the tags, used as a comparable key to count duplicated (entry_id, tags) pairs.
entries_labeled["tags_str"] = entries_labeled["tags"].apply(str)
entries_labeled.duplicated(subset=["entry_id", "tags_str"]).sum()

0

In [41]:
entries_labeled["entry_id"].duplicated().sum()

0

In [42]:
entries_labeled.shape

(38148, 16)

In [43]:
entries_labeled["tags"].isna().sum()

0

In [44]:
entries.shape, entries[~entries["excerpt"].isna()]["excerpt"].unique().shape

((157903, 21), (155459,))

In [45]:
entries_labeled.shape, entries_labeled.columns

((38148, 16),
 Index(['entry_id', 'created_at', 'modified_at', 'excerpt', 'entry_type',
        'analysis_framework_id', 'created_by_id', 'lead_id', 'modified_by_id',
        'information_date', 'order', 'title', 'project_id', 'verified', 'tags',
        'tags_str'],
       dtype='object'))

In [46]:
entries_labeled

Unnamed: 0,entry_id,created_at,modified_at,excerpt,entry_type,analysis_framework_id,created_by_id,lead_id,modified_by_id,information_date,order,title,project_id,verified,tags,tags_str
0,233460,2021-01-05 08:16:23.956916+00,2021-04-02 10:43:12.065945+00,"Between January and March 2020, more than 300,...",excerpt,1465,2744,44708,2742,,47,GIMAC South Sudan,2335,True,"{'Mat2D': [['HUMANITARIAN SITUATION', 'Humanit...","defaultdict(<class 'list'>, {'Mat2D': [['HUMAN..."
1,233466,2021-01-05 08:27:12.191481+00,2021-04-02 10:43:35.079851+00,The first quarter saw humanitarian reach vary ...,excerpt,1465,2744,44708,2742,,48,GIMAC South Sudan,2335,True,"{'Mat2D': [['HUMANITARIAN SITUATION', 'Humanit...","defaultdict(<class 'list'>, {'Mat2D': [['HUMAN..."
2,217146,2020-12-16 06:21:17.1031+00,2021-01-15 05:33:36.204388+00,"As a result of these efforts, 19 isolation cen...",excerpt,1465,2741,44961,2746,,14,GIMAC Somalia,2331,True,"{'Mat2D': [['OPERATIONAL ENVIRONMENT', 'Nation...","defaultdict(<class 'list'>, {'Mat2D': [['OPERA..."
3,282536,2021-03-18 05:55:43.974559+00,2021-03-26 08:38:49.014899+00,"In late August 2020, IOM CCCM coordinated a do...",excerpt,1465,2744,51243,2744,,10,GIMAC South Sudan,2335,True,"{'Mat2D': [['OPERATIONAL ENVIRONMENT', 'Intern...","defaultdict(<class 'list'>, {'Mat2D': [['OPERA..."
4,291038,2021-03-26 06:14:22.172504+00,2021-03-29 06:02:28.855473+00,The ministry of Education has provided online ...,excerpt,1465,2742,51255,2742,,10,GIMAC South Sudan,2335,True,"{'Mat2D': [['OPERATIONAL ENVIRONMENT', 'Nation...","defaultdict(<class 'list'>, {'Mat2D': [['OPERA..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38143,192392,2020-11-19 07:26:14.198449+00,2021-05-07 11:49:02.789558+00,Over 40 per cent of the population in Nangarha...,excerpt,1465,2745,43233,3009,,373,GIMAC Afghanistan,2336,True,"{'Mat2D': [['SHOCK INFORMATON', 'Aggravating f...","defaultdict(<class 'list'>, {'Mat2D': [['SHOCK..."
38144,192389,2020-11-19 07:24:58.082405+00,2021-05-07 11:47:31.63679+00,The need for shelter was reported to be the hi...,excerpt,1465,2745,43233,3009,,372,GIMAC Afghanistan,2336,True,"{'Mat2D': [['HUMANITARIAN SITUATION', 'Humanit...","defaultdict(<class 'list'>, {'Mat2D': [['HUMAN..."
38145,192386,2020-11-19 07:24:22.137399+00,2021-05-07 11:46:41.665937+00,"According to the 2019 WOA Assessment, needs of...",excerpt,1465,2745,43233,3009,,371,GIMAC Afghanistan,2336,True,"{'Mat2D': [['HUMANITARIAN SITUATION', 'Humanit...","defaultdict(<class 'list'>, {'Mat2D': [['HUMAN..."
38146,322876,2021-04-28 08:56:50.107051+00,2021-07-29 09:01:57.637742+00,Education: 32% of assessed settlements in whic...,excerpt,1465,2746,54751,3009,,5,GIMAC Somalia,2331,True,"{'Mat2D': [['EFFECTS SYSTEMS AND NETWORKS', 'B...","defaultdict(<class 'list'>, {'Mat2D': [['EFFEC..."


In [47]:
def tags_to_pillars(x):
    """Build the distinct "type->pillar->sub-pillar" strings of one entry.

    Args:
        x: a row (or tuple/list) whose first element is the tags mapping
            (e.g. ``{"Mat1D": [...], "Mat2D": [...]}``); any further elements
            are ignored.

    Returns:
        A de-duplicated list of strings. Tags collected under a key ending in
        "1D" get the type "High Level Tags", all others the type "2D". Tags
        with an empty first element are skipped.
    """
    # Positional access that works for a pandas row as well as for a plain sequence:
    # integer keys on a label-indexed Series only work through a deprecated fallback.
    # The previously read (and unused) second element is no longer accessed.
    tags = x.iloc[0] if hasattr(x, "iloc") else x[0]
    pillars = []
    for tag_key, tag_value in tags.items():
        # The type depends only on the key, so it is computed once per key.
        tag_type = "High Level Tags" if tag_key[-2:] == "1D" else "2D"
        for t in tag_value:
            if t[0]:
                pillars.append(
                    tag_type + "->" + t[0].title().strip() + "->" + t[1].title().strip())
    return list(set(pillars))

def tags_to_sectors(x):
    """Collect the distinct, title-cased sector names of one entry.

    Args:
        x: a row (or tuple/list) whose first element is the tags mapping;
            any further elements are ignored.

    Returns:
        A de-duplicated list with the third element of every "Mat2D" tag
        that has more than two elements and a non-empty third element.
    """
    # Positional access that works for a pandas row as well as for a plain sequence:
    # integer keys on a label-indexed Series only work through a deprecated fallback.
    # The previously read (and unused) second element is no longer accessed.
    tags = x.iloc[0] if hasattr(x, "iloc") else x[0]
    sectors = []
    # Only 2D-matrix tags are inspected for a sector; .get does not insert a missing key.
    for t in tags.get("Mat2D", []):
        if len(t) > 2 and t[2]:
            sectors.append(t[2].title())
    return list(set(sectors))

In [48]:
entries_labeled.columns

Index(['entry_id', 'created_at', 'modified_at', 'excerpt', 'entry_type',
       'analysis_framework_id', 'created_by_id', 'lead_id', 'modified_by_id',
       'information_date', 'order', 'title', 'project_id', 'verified', 'tags',
       'tags_str'],
      dtype='object')

In [49]:
# Derive, per entry, the list of "type->pillar->sub-pillar" strings and the list of sector names.
entries_labeled["pillars"] = entries_labeled[["tags", 'analysis_framework_id'
                                              ]].apply(tags_to_pillars, axis=1)
entries_labeled["sectors"] = entries_labeled[["tags", 'analysis_framework_id'
                                              ]].apply(tags_to_sectors, axis=1)

In [50]:
entries_labeled[~entries_labeled["excerpt"].isna()]["excerpt"].unique().shape

(37729,)

In [51]:
# Frequencies of the raw "type->pillar->sub-pillar" strings over all labeled entries.
pills_occurances_1d = [
    pill for entry_pills in entries_labeled["pillars"] for pill in entry_pills
]
pills_unique_1d = set(pills_occurances_1d)
len(pills_unique_1d), Counter(pills_occurances_1d).most_common()

(49,
 [('2D->Operational Environment->International Humanitarian Response And Capacities',
   7480),
  ('2D->Humanitarian Conditions->Humanitarian Condition', 7199),
  ('2D->Humanitarian Conditions->Risk', 3499),
  ('2D->Stated Priorities And Recommendations->Recommendations From Aid/Research Actors',
   3442),
  ('2D->Operational Environment->National/Sub-National Response And Capacities',
   3290),
  ('2D->Humanitarian Situation->Humanitarian Profile', 2914),
  ('High Level Tags->General->Risk', 2339),
  ('2D->Effects On Population->Post-Shock Displacement', 1996),
  ('2D->Effects Systems And Networks->Basic Infrastructure And Social Services',
   1843),
  ('2D->Effects On Population->Disruption Of Social Behaviors And Networks',
   1749),
  ('2D->Effects On Population->Damage To Physical Integrity', 1724),
  ('2D->Effects On Population->Disruption Of Resources And Assets', 1303),
  ('2D->Operational Environment->Humanitarian Access', 1252),
  ('High Level Tags->General->Response Gap

In [52]:
# Frequencies of the raw sector names over all labeled entries.
secs_occurances = [
    sec for entry_secs in entries_labeled["sectors"] for sec in entry_secs
]
secs_unique = set(secs_occurances)
Counter(secs_occurances).most_common()

[('Cross-Sector', 12411),
 ('Health', 8676),
 ('Protection', 7239),
 ('Food Security', 4568),
 ('Livelihoods', 3974),
 ('Wash', 3345),
 ('Education', 2923),
 ('Shelter And Nfis', 2002),
 ('Nutrition', 2000),
 ('Cccm', 678),
 ('Shelter', 311)]

In [53]:
entries_labeled.columns

Index(['entry_id', 'created_at', 'modified_at', 'excerpt', 'entry_type',
       'analysis_framework_id', 'created_by_id', 'lead_id', 'modified_by_id',
       'information_date', 'order', 'title', 'project_id', 'verified', 'tags',
       'tags_str', 'pillars', 'sectors'],
      dtype='object')

In [54]:
# Keep excerpt-type entries with a non-missing excerpt.
# .copy() gives final_df its own data, so the `.loc` assignment performed on it in the
# next cell does not write into a slice of entries_labeled (SettingWithCopyWarning).
final_df = entries_labeled[entries_labeled["entry_type"].eq("excerpt")
                           & (~entries_labeled["excerpt"].isna())].copy()
final_df.shape

(38148, 18)

In [55]:
def remove_newlines(excerpt):
    """Collapse every run of whitespace (newlines, tabs, spaces) into one space.

    Args:
        excerpt: the text to clean; non-string values (e.g. NaN) are returned unchanged.

    Returns:
        The cleaned string, or the input itself when it is not a string.
    """
    if not isinstance(excerpt, str):
        return excerpt
    # Raw string: "\s" in a normal string literal is an invalid escape sequence
    # (DeprecationWarning, SyntaxWarning on recent Python versions).
    return re.sub(r"\s+", " ", excerpt)

# Collapse every whitespace run in the excerpts into a single space.
final_df.loc[:, 'excerpt'] = final_df['excerpt'].apply(remove_newlines)
#final_df.loc[:, 'dropped_excerpt'] = final_df['dropped_excerpt'].apply(remove_newlines)
## De-duplicate on the excerpt text. Rows are sorted by `verified` (ascending) and the last
## duplicate is kept, so the row with the highest `verified` value survives.
final_df = final_df.sort_values("verified").reset_index(drop=True).drop_duplicates(subset='excerpt', keep="last")
final_df.shape, final_df.duplicated(subset="excerpt").sum()

((37711, 18), 0)

In [56]:
# Title-cased sector name found in the tags -> unified sector name.
# "NOT_MAPPED" marks sectors that have no unified counterpart.
sector_name_mapper = {
    "Agriculture": "Agriculture",
    "Cccm": "NOT_MAPPED",
    "Cross": "Cross",
    "Cross Cutting": "Cross",
    "Cross-Sector": "Cross",
    "Education": "Education",
    "Food": "Food Security",
    "Food Security": "Food Security",
    "Nutrition": "Nutrition",
    "Health": "Health",
    "Livelihood": "Livelihoods",
    "Livelihoods": "Livelihoods",
    "Logistic": "Logistics",
    "Logistics": "Logistics",
    "Protection": "Protection",
    "Shelter": "Shelter",
    "Shelter And Nfis": "Shelter",
    "Nfi": "NOT_MAPPED",
    "Wash": "WASH",
}

In [57]:
# AF title -> AF id. When several AFs share a title, the last id wins.
# The key must be the loop variable `title`; the string literal "title" used before
# collapsed the whole mapping into a single {"title": <last id>} entry.
af_title_to_id = dict()
for afid, title in zip(afs["id"], afs["title"]):
    af_title_to_id[title] = afid

In [58]:
# Title-case the 2nd to 7th columns of the matching sheet.
# astype(str) also turns missing values into strings, which is why unmapped cells
# show up as "Nan" in the mapping built from this sheet.
for col in matching_gimac.columns[1:7]:
    matching_gimac[col] = matching_gimac[col].astype(str).str.title()

In [59]:
## fix typo problems
# These names were corrected in the matching sheet but should not have been;
# they are changed back here: (column, corrected name, original name).
_reverted_names = [
    ('Pillar', 'Shock Information', 'Shock Informaton'),
    ('Pillar', 'Effects On Systems And Networks', 'Effects Systems And Networks'),
    ('Sub-pillar', 'Humanitarian Access Gaps', 'Humanitarian Access'),
    ('Sub-pillar', 'Supply Chain', 'Supply Chains'),
]
for column, corrected, orig in _reverted_names:
    matching_gimac.loc[matching_gimac[column].eq(corrected), column] = orig

## missing from GIMAC Framework not in GIMAC Generic
# Each missing sub-pillar is added as a copy of the first row of an existing sub-pillar,
# so that it receives the same mapping: (existing sub-pillar, missing sub-pillar).
_aliased_subpillars = [
    ('Disruption of resources and assets'.title(),
     'Damage To Resources And Assets'.title()),
    ('Primary Shock (COVID Event Characteristics)'.title(),
     'Primary Shock (Covid)'.title()),
]
for existing, missing in _aliased_subpillars:
    # .copy() so that the edit below never writes into the frame the row was taken from.
    row = matching_gimac.loc[matching_gimac['Sub-pillar'].eq(existing)].iloc[0].copy()
    row['Sub-pillar'] = missing
    # Append under the next integer label (the current number of rows).
    matching_gimac.loc[matching_gimac.shape[0]] = row

In [60]:
def _join_tag(*parts):
    """Join title-cased, stripped parts into a "type->pillar->sub-pillar" string."""
    return "->".join(part.title().strip() for part in parts)


# Original "type->pillar->sub-pillar" -> virtual "type->pillar->sub-pillar";
# for a repeated original tag the last row of the sheet wins.
original_to_virtual = {
    _join_tag(orig_type, orig_pillar, orig_subpillar):
        _join_tag(virt_type, virt_pillar, virt_subpillar)
    for orig_type, orig_pillar, orig_subpillar,
        virt_type, virt_pillar, virt_subpillar in zip(
            matching_gimac["Type"],
            matching_gimac["Pillar"],
            matching_gimac["Sub-pillar"],
            matching_gimac["Virtual Type"],
            matching_gimac["Virtual Pillar"],
            matching_gimac["Virtual Sub-pillar"],
        )
}

In [61]:
original_to_virtual

{'2D->Humanitarian Situation->Humanitarian Access': '1D->Humanitarian Access->Number Of People Facing Humanitarian Access Constraints/Humanitarian Access Gaps',
 '2D->Humanitarian Situation->Humanitarian Profile': 'Nan->Nan->Nan',
 '2D->Humanitarian Situation->Pre-Covid Humanitarian Operations': 'Nan->Nan->Nan',
 '2D->Humanitarian Situation->Pre-Covid National Response Mechanisms': 'Nan->Nan->Nan',
 '2D->Context->Socio-Economic': '1D->Context->Economy',
 '2D->Context->Socio-Cultural': '1D->Context->Socio Cultural',
 '2D->Context->Security': '1D->Context->Security & Stability',
 '2D->Context->Political': '1D->Context->Politics',
 '2D->Context->Policy/Normative Frameworks': '1D->Context->Legal & Policy',
 '2D->Context->Demographic': '1D->Context->Demography',
 '2D->Context->Environmental': '1D->Context->Environment',
 '2D->Context->Basic Infrastructure And Social Services': 'Nan->Nan->Nan',
 '2D->Context->Information And Communication': 'Nan->Nan->Nan',
 '2D->Shock Informaton->Primary Sh

In [62]:
def sector_mapper(sec):
    """Map the raw sector names of one entry to the unified sector names.

    Args:
        sec: list of raw sector names, or NaN for a missing value.

    Returns:
        The mapped names (via ``sector_name_mapper``), or ``[]`` for NaN.

    Raises:
        KeyError: if a name is not a key of ``sector_name_mapper``.
    """
    # NaN is the only value that is not equal to itself.
    if sec == sec:
        return [sector_name_mapper[s] for s in sec]
    return []


def _map_virtual_tags(tags, prefix, pillar_only):
    """Shared implementation of the four pillar / sub-pillar mappers.

    Every original tag is translated through ``original_to_virtual``. Virtual
    tags starting with ``prefix`` ("1D" or "2D") are kept, either as their
    pillar (``pillar_only=True``) or as "pillar->sub-pillar"; the
    'Nan->Nan->Nan' placeholder becomes 'NOT_MAPPED'; virtual tags of the
    other matrix kind are dropped. NaN input yields ``[]``.
    """
    if not (tags == tags):  # NaN
        return []
    virtual_tags = [original_to_virtual[tag] for tag in tags]

    processed = []
    for virtual in virtual_tags:
        if virtual.startswith(prefix):
            if pillar_only:
                processed.append(virtual.split("->")[1])
            else:
                # drop the leading "1D->" / "2D->"
                processed.append(virtual[len(prefix) + len("->"):])
        elif virtual == 'Nan->Nan->Nan':
            processed.append('NOT_MAPPED')
    return processed


##
def pillar_mapper_2d(dim):
    """Return the virtual 2D pillars of the original tags in ``dim`` ([] for NaN)."""
    return _map_virtual_tags(dim, '2D', pillar_only=True)


##
def subpillar_mapper_2d(subdim):
    """Return the virtual 2D "pillar->sub-pillar" strings of ``subdim`` ([] for NaN)."""
    return _map_virtual_tags(subdim, '2D', pillar_only=False)


##
def pillar_mapper_1d(dim):
    """Return the virtual 1D pillars of the original tags in ``dim`` ([] for NaN)."""
    return _map_virtual_tags(dim, '1D', pillar_only=True)


##
def subpillar_mapper_1d(subdim):
    """Return the virtual 1D "pillar->sub-pillar" strings of ``subdim`` ([] for NaN)."""
    return _map_virtual_tags(subdim, '1D', pillar_only=False)

In [63]:
# Map the raw sectors to the unified sector names.
final_df["sectors"] = final_df["sectors"].apply(sector_mapper)
# Derive the four virtual pillar / sub-pillar columns from the raw pillar strings.
for _column, _mapper in [
        ("subpillars_2d", subpillar_mapper_2d),
        ("subpillars_1d", subpillar_mapper_1d),
        ("pillars_1d", pillar_mapper_1d),
        ("pillars_2d", pillar_mapper_2d),
]:
    final_df[_column] = final_df["pillars"].apply(_mapper)
# The raw pillar strings are not needed any more.
final_df = final_df.drop(columns="pillars")

In [64]:
final_df.shape

(37711, 21)

In [65]:
# Marker appended by the pillar / sub-pillar mappers for tags without a virtual counterpart.
to_be_deleted = 'NOT_MAPPED'

# One boolean mask per tag column: True where the entry carries the marker.
_unmapped_2d_pillar = final_df["pillars_2d"].apply(lambda tags: to_be_deleted in tags)
_unmapped_1d_pillar = final_df["pillars_1d"].apply(lambda tags: to_be_deleted in tags)
_unmapped_2d_subpillar = final_df["subpillars_2d"].apply(lambda tags: to_be_deleted in tags)
_unmapped_1d_subpillar = final_df["subpillars_1d"].apply(lambda tags: to_be_deleted in tags)
_has_unmapped = (_unmapped_2d_pillar | _unmapped_1d_pillar
                 | _unmapped_2d_subpillar | _unmapped_1d_subpillar)

# Keep only the entries in which none of the four columns carries the marker.
valid_covn_df = final_df[~_has_unmapped]

In [66]:
final_df.shape, valid_covn_df.shape

((37711, 21), (17667, 21))

In [67]:
# Frequencies of the mapped sector names in final_df.
secs_occurances = [sec for entry_secs in final_df["sectors"] for sec in entry_secs]
secs_unique = set(secs_occurances)
Counter(secs_occurances).most_common()

[('Cross', 12281),
 ('Health', 8557),
 ('Protection', 7193),
 ('Food Security', 4513),
 ('Livelihoods', 3925),
 ('WASH', 3318),
 ('Education', 2887),
 ('Shelter', 2293),
 ('Nutrition', 1979),
 ('NOT_MAPPED', 675)]

In [68]:
# Frequencies of the virtual 2D pillars in final_df.
pills_occurances = [pill for entry_pills in final_df["pillars_2d"] for pill in entry_pills]
pills_unique = set(pills_occurances)
Counter(pills_occurances).most_common()

[('NOT_MAPPED', 24094),
 ('Capacities & Response', 11734),
 ('Impact', 5675),
 ('Priority Interventions', 3384),
 ('Humanitarian Conditions', 2919),
 ('Covid-19', 1050),
 ('Priority Needs', 562),
 ('Context', 427)]

In [69]:
# Frequencies of the virtual 2D sub-pillars in final_df.
subpills_occurances = [
    subpill for entry_subpills in final_df["subpillars_2d"] for subpill in entry_subpills
]
subpills_unique = set(subpills_occurances)
Counter(subpills_occurances).most_common()

[('NOT_MAPPED', 24094),
 ('Capacities & Response->International Response', 7385),
 ('Priority Interventions->Expressed By Humanitarian Staff', 3384),
 ('Capacities & Response->National Response', 3250),
 ('Impact->Impact On People', 3191),
 ('Impact->Impact On Systems, Services And Networks', 2484),
 ('Humanitarian Conditions->Physical And Mental Well Being', 1711),
 ('Humanitarian Conditions->Coping Mechanisms', 1208),
 ('Capacities & Response->Number Of People Reached/Response Gaps', 1099),
 ('Covid-19->Restriction Measures', 1050),
 ('Priority Needs->Expressed By Population', 562),
 ('Context->Economy', 427)]

In [70]:
# Frequencies of the virtual 1D pillars in final_df.
pills_occurances_1d = [
    pill for entry_pills in final_df["pillars_1d"] for pill in entry_pills
]
pills_unique_1d = set(pills_occurances_1d)
Counter(pills_occurances_1d).most_common()

[('NOT_MAPPED', 24094),
 ('Context', 3407),
 ('Shock/Event', 2767),
 ('Humanitarian Access', 268),
 ('Information And Communication', 156)]

In [71]:
# Frequencies of the virtual 1D sub-pillars in final_df.
subpills_occurances_1d = [
    subpill for entry_subpills in final_df["subpillars_1d"] for subpill in entry_subpills
]
subpills_unique_1d = set(subpills_occurances_1d)
Counter(subpills_occurances_1d).most_common()

[('NOT_MAPPED', 24094),
 ('Shock/Event->Hazard & Threats', 2316),
 ('Context->Security & Stability', 1024),
 ('Context->Economy', 857),
 ('Context->Socio Cultural', 576),
 ('Shock/Event->Underlying/Aggravating Factors', 451),
 ('Context->Environment', 326),
 ('Humanitarian Access->Number Of People Facing Humanitarian Access Constraints/Humanitarian Access Gaps',
  268),
 ('Context->Legal & Policy', 261),
 ('Context->Demography', 193),
 ('Context->Politics', 170),
 ('Information And Communication->Knowledge And Info Gaps (Hum)', 156)]

In [72]:
final_df["subpillars_1d"].apply(lambda x:x!=[]).sum(),\
final_df["subpillars_2d"].apply(lambda x:x!=[]).sum(),\
final_df["sectors"].apply(lambda x:x!=[]).sum()

(22759, 35194, 37389)

In [73]:
# Columns (and their order) written to the generated datasets.
_output_columns = [
    'entry_id', 'excerpt', 'entry_type',
    'analysis_framework_id', 'lead_id', 'title',
    'project_id', 'verified',
    'sectors', 'subpillars_2d', 'subpillars_1d', 'pillars_1d',
    'pillars_2d',
]

final_df = final_df[_output_columns]
valid_covn_df = valid_covn_df[_output_columns]

In [74]:
# Drop every row in which one of the identifying columns or the excerpt is missing.
_required_columns = ["analysis_framework_id", "entry_id", "lead_id", "excerpt"]

final_df = final_df.dropna(subset=_required_columns)
valid_covn_df = valid_covn_df.dropna(subset=_required_columns)
final_df.shape, valid_covn_df.shape

((37711, 13), (17667, 13))

In [75]:
# Write both datasets into the output directory created at the top of the notebook.
# The path is built from dirName instead of repeating the directory name, and
# index=False (rather than the merely falsy None) states that the row index is not written.
final_df.to_csv(os.path.join(dirName, 'primary_tags_gimmac_data.csv'), index=False)
valid_covn_df.to_csv(os.path.join(dirName, 'primary_tags_gimmac_data_vc.csv'), index=False)