In [1]:
import re
import os
import json
import random
from pprint import pprint
from ast import literal_eval
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [2]:
random.seed(2021)
np.random.seed(2021)

In [3]:
# each line in this table correspond to single tagging of an entry.
# an entry may have more than one tagging (e.g. a tag in 2D matrix, another in 1D matrix,
# secondary tags, etc. )
exportdata = pd.read_csv("exportdata.csv")
# I pulled entries related to projects chosen by Patrice
entries = pd.read_csv("entries.csv")
# The widgets of AF's, i.e. 2D matrices, 1D matrices, secondary tagging widgets, etc.
af_widgets = pd.read_csv("af_widgets.csv")
# Projects chosen by Patrice
projects = pd.read_csv("projects.csv")
#
exportables = pd.read_csv("af_exportables.csv")
# AF details
afs = pd.read_csv("analysis_frameworks.csv")
# user ids, first names and last names
users = pd.read_csv("user_names.csv")
# The matchin of pillar/sub-pillar names from different AFs into unified pillar/sub-pillar names
# matching_2d_mat = pd.read_csv("mat_2d_matching.csv")
# matching_1d_mat = pd.read_csv("mat_1d_matching.csv")
mapping = pd.read_csv("mapping.csv")

In [4]:
# remove raws related to GIMAC framework
mapping = mapping[mapping['Framework Title'].ne('GIMAC Generic')]
# split into two mapping dfs: 1D and 2D mappings
matching_2d_mat = mapping[mapping["Type"].eq("2D")].copy()
matching_1d_mat = mapping[mapping["Type"].isin(["1D", "Flag"])].copy()

In [5]:
gen_data_dirname = 'generated_dataset'
if not os.path.exists(gen_data_dirname):
    os.mkdir(gen_data_dirname)

In [6]:
# remove "GIMAC Generic"'s data
entries = entries[entries["analysis_framework_id"].ne(1465)]

In [7]:
raw_name_to_tag_dict = {
    "Severity":"Severity",
    "Severity (Needs assessment entries only)":"Severity",
    "SEVERITY":"Severity",
    
    "Reliability":"Reliability",
    "RELIABILITY":"Reliability",
    
    "Demographic Groups":"Demographic Groups",
    "DEMOGRAPHIC GROUPS":"Demographic Groups",
    
    "Information date":"Information Date",
    "Information Date":"Information Date",
    "Date":"Information Date",
    "DATE OF INFORMATION":"Information Date",
    "Date range":"Information Date",
    
    "Geo Location":"Geo Location",
    "LOCATION":"Geo Location",
    "Geo location":"Geo Location",
    "Country":"Geo Location",
    "GEOLOCATIONS":"Geo Location",
    
    "Affected groups":"Affected Groups",
    "AFFECTED GROUPS":"Affected Groups",
    
    "Specific Needs Groups":"Specific Needs Groups",
    "SPECIFIC NEEDS GROUPS":"Specific Needs Groups",
}


In [8]:
entries.shape

(161244, 21)

In [9]:
def clean_titles(x):
    return raw_name_to_tag_dict.get(x, x)


af_widgets['title'] = af_widgets['title'].apply(clean_titles)

In [10]:
# widget names of 2D matrices in the chosen AFs's
mat2d_titles = [
    s.upper() for s in [
        "Sectors",
        "Sectoral Information",
        "Matrix 2D",
    ]
]
# Pandas will read json objects as strings,
# here I am converting them into Python dict objects
af_widgets["properties"] = af_widgets["properties"].apply(json.loads)
##
mat2d_properties_ids = af_widgets[(
    af_widgets["title"].str.upper()).isin(mat2d_titles)][[
        "properties", "analysis_framework_id"
    ]]
mat2d_properties = mat2d_properties_ids["properties"].tolist()
mat2d_ids = mat2d_properties_ids["analysis_framework_id"].tolist()
## 1D Matrices
# widget names of 1D matrices in the chosen AFs's
mat1d_titles = [
    s.upper() for s in [
        "Operational Environment",  # iMMAP, 2020 Okular, Okular Generic, Colombia-AF, Nigeria Situation Analysis (OA), IFRC Master Framework 2019
        "Cross Sector",  #Rohingya Framework, IFRC Analytical Framework 2018, 
        "Matrix 1D",  # Situation Analysis Generic Yemen, 
        "Cross Sectors",  # Situation Analysis Generic Libya
    ]
]
mat1d_titles = mat1d_titles + [
    sub_s.title() for sub_s in ('Operational Environment - Dimension',
                                'Operational Environment - Subdimension')
]
mat1d_properties_ids = af_widgets[(
    af_widgets["title"].str.upper()).isin(mat1d_titles)][[
        "properties", "analysis_framework_id"
    ]]
mat1d_properties = mat1d_properties_ids["properties"].tolist()
mat1d_ids = mat1d_properties_ids["analysis_framework_id"].tolist()

In [11]:
afids_pillars_subpillars = dict()
errors = []
for mat, af_id in zip(mat2d_properties, mat2d_ids):
    try:
        dims = mat["data"]['dimensions']
        afids_pillars_subpillars[af_id] = {}
        for dim in dims:
            pillar = dim["title"]
            sub_pillars = []
            for sub_pillar_dict in dim["subdimensions"]:
                sub_pillars.append(sub_pillar_dict["title"])
            afids_pillars_subpillars[af_id][pillar] = sub_pillars
    except KeyError:
        errors.append([mat, af_id])
##
afids_rows_cells = dict()
for mat, af_id in zip(mat1d_properties, mat1d_ids):
    try:
        rows = mat["data"]['rows']
        afids_rows_cells[af_id] = {}
        for row in rows:
            pillar = row["title"]
            sub_pillars = []
            for sub_pillar_dict in row["cells"]:
                sub_pillars.append(sub_pillar_dict["value"])
            afids_rows_cells[af_id][pillar] = sub_pillars
    except KeyError:
        errors.append([mat, af_id])

In [12]:
def extract_title(x):
    if x["excel"].get("title"):
        return x["excel"]["title"]
    elif x["excel"].get("type") == "multiple":
        return x["excel"]["titles"]


##
exportables["data"] = exportables["data"].apply(literal_eval)
af_titles = exportables["data"].apply(extract_title).tolist()

In [13]:
print(entries.shape, exportdata.shape)
entries = entries[entries["entry_type"].eq("excerpt")
                  & (~entries["excerpt"].isna())]
print(entries.shape, exportdata.shape)

(161244, 21) (1156529, 4)
(119755, 21) (1156529, 4)


In [14]:
exid_to_exdata = dict()
for ex_id, ex_data in zip(exportables["id"], exportables["data"]):
    exid_to_exdata[ex_id] = ex_data

In [15]:
widget_key_id_to_title = dict()
for w_key, w_id, title in zip(af_widgets["key"], af_widgets["widget_id"],
                              af_widgets["title"]):
    widget_key_id_to_title[(w_key, w_id)] = title

In [16]:
def exportdata_to_tag_title(row):
    data = json.loads(row[1])
    wkey, wid = None, None
    if data.get("common"):
        wkey = data.get("common").get("widget_key")
        wid = data["common"].get("widget_id")
    if wkey and wid:
        if widget_key_id_to_title.get((wkey, wid)):
            return widget_key_id_to_title.get((wkey, wid))
    if data.get("report") and data["report"].get("other"):
        if len(data["report"]
               ["other"]) == 1 and data["report"]["other"][0].get("title"):
            if data["report"]["other"][0]["title"]:
                return data["report"]["other"][0]["title"]
    if isinstance(data.get("excel"), list) and len(data["excel"])==1 and \
     data["excel"][0].get("widget_key") and data["excel"][0].get("widget_id"):
        wkey = data["excel"][0]["widget_key"]
        wid = data["excel"][0]["widget_id"]
        if widget_key_id_to_title.get((wkey, wid)):
            return widget_key_id_to_title.get((wkey, wid))
    exportable_id = row[3]
    if exid_to_exdata[exportable_id]["excel"].get("title"):
        return exid_to_exdata[exportable_id]["excel"]["title"]
    elif exid_to_exdata[exportable_id]["excel"].get("type") == "multiple":
        return exid_to_exdata[exportable_id]["excel"]["titles"]
    raise

In [17]:
exportdata["tag_title"] = exportdata.apply(exportdata_to_tag_title, axis=1)

In [18]:
def exportdata_to_tag_value(row):
    data = json.loads(row[1])
    if isinstance(data["excel"], list) and len(data["excel"]) == 1:
        return data["excel"][0]["value"]
    if data["excel"].get("type") == "lists":
        return data["excel"]["values"]
    if isinstance(data["excel"], dict) and "value" in data["excel"]:
        return data["excel"].get("value")
    if isinstance(data["excel"], dict) and "values" in data["excel"]:
        return data["excel"].get("values")
    elif "values" in data["common"]:
        return data["common"]["values"]
    elif "value" in data["common"]:
        return data["common"]["value"]
    raise

In [19]:
exportdata["tag_value"] = exportdata.apply(exportdata_to_tag_value, axis=1)

In [20]:
def title_case(tag):
    if isinstance(tag, (list, tuple)):
        return tuple([x.title() for x in tag])
    return tag.title()

In [21]:
exportdata["tag_title"] = exportdata["tag_title"].apply(title_case)

In [22]:
mat_1d_and_flag_titles = [
    'Flag', 'Operational Environment', 'Cross Sector', 'Temática',
    ('Operational Environment - Dimension',
     'Operational Environment - Subdimension'),
    ('Flag - Dimension', 'Flag - Subdimension'),
    ('High Level Tags - Dimension', 'High Level Tags - Subdimension')
]

In [23]:
#exportdata.tag_title.unique()

In [24]:
#exportdata[exportdata["tag_title"].isin([('Operational Environment - Dimension', 'Operational Environment - Subdimension')])]["tag_value"].tolist()
#exportdata[exportdata["tag_title"].eq('High Level Tags')]["tag_value"].tolist()

In [25]:
mat2d_titles = [s.title() for s in mat2d_titles]
mat1d_titles = [
    s.title() if isinstance(s, str) else (sub_s.title() for sub_s in s)
    for s in mat1d_titles
]
mat_1d_and_flag_titles = [
    s.title() if isinstance(s, str) else (sub_s.title() for sub_s in s)
    for s in mat_1d_and_flag_titles
]

In [26]:
exportdata_of_interest = exportdata[exportdata["tag_title"].isin(
    mat2d_titles + mat_1d_and_flag_titles)]

In [27]:
exportdata_of_interest.shape, entries.shape

((211986, 6), (119755, 21))

In [28]:
entries_labeled = pd.merge(entries,
                           exportdata_of_interest,
                           how="inner",
                           left_on="id",
                           right_on="entry_id",
                           suffixes=('_entry', '_exportdata'))

In [29]:
entries_labeled["id_entry"].shape, entries_labeled["id_entry"].unique().shape

((152251,), (118816,))

In [30]:
def agg_group(group):
    tag_titles = group["tag_title"]
    tag_values = group["tag_value"]
    #
    grouped = defaultdict(list)
    for title, value in zip(tag_titles, tag_values):
        if title in mat_1d_and_flag_titles:
            grouped["Mat1D"].extend(value)
        elif title in mat2d_titles:
            grouped["Mat2D"].extend(value)
    return grouped

In [31]:
entries_labeled.columns

Index(['id_entry', 'created_at', 'modified_at', 'excerpt', 'image_raw',
       'analysis_framework_id', 'created_by_id', 'lead_id', 'modified_by_id',
       'entry_type', 'information_date', 'order', 'client_id', 'project_id',
       'tabular_field_id', 'dropped_excerpt', 'highlight_hidden', 'verified',
       'verification_last_changed_by_id', 'image_id', 'title', 'id_exportdata',
       'data', 'entry_id', 'exportable_id', 'tag_title', 'tag_value'],
      dtype='object')

In [32]:
tags = entries_labeled.groupby("id_entry").apply(agg_group)

In [33]:
entryids_tags = pd.DataFrame({"entry_id": tags.index, "tags": tags.values})

In [34]:
entryids_tags.shape

(118816, 2)

In [35]:
entries_ = entries.rename({"id": "entry_id"}, axis=1, inplace=False)
#
print(entries.shape)
#
entries_ = entries_[[
    'entry_id',
    'created_at',
    'modified_at',
    'excerpt',
    'entry_type',
    'analysis_framework_id',
    'created_by_id',
    'lead_id',
    'modified_by_id',
    'information_date',
    'order',
    'project_id',
    'title',
    'verified',
]].drop_duplicates()
#
print(entries_.shape)

(119755, 21)
(119755, 14)


In [36]:
entries_labeled = pd.merge(entries_, entryids_tags, on="entry_id", how="inner")
entries_labeled.shape, entries_labeled.columns

((118816, 15),
 Index(['entry_id', 'created_at', 'modified_at', 'excerpt', 'entry_type',
        'analysis_framework_id', 'created_by_id', 'lead_id', 'modified_by_id',
        'information_date', 'order', 'project_id', 'title', 'verified', 'tags'],
       dtype='object'))

In [37]:
entries_labeled["tags_str"] = entries_labeled["tags"].apply(str)
entries_labeled.duplicated(subset=["entry_id", "tags_str"]).sum()

0

In [38]:
entries_labeled["entry_id"].duplicated().sum()

0

In [39]:
entries_labeled.shape

(118816, 16)

In [40]:
entries_labeled["tags"].isna().sum()

0

In [41]:
entries.shape, entries[~entries["excerpt"].isna()]["excerpt"].unique().shape

((119755, 21), (117743,))

In [42]:
entries_labeled.shape, entries_labeled.columns

((118816, 16),
 Index(['entry_id', 'created_at', 'modified_at', 'excerpt', 'entry_type',
        'analysis_framework_id', 'created_by_id', 'lead_id', 'modified_by_id',
        'information_date', 'order', 'project_id', 'title', 'verified', 'tags',
        'tags_str'],
       dtype='object'))

In [43]:
def tags_to_pillars(x):
    tags = x[0]
    af_id = x[1]
    pillars = []
    for tag_key, tag_value in tags.items():
        if tag_key != "Mat2D":
            continue
        if len(tag_value):
            for t in tag_value:
                if t[0] and t[1]:
                    pillars.append(t[0].title().strip() + "->" +
                                   t[1].title().strip())
    return list(set(pillars))


def tags_to_sectors(x):
    tags = x[0]
    af_id = x[1]
    sectors = []
    for tag_key, tag_value in tags.items():
        if tag_key != "Mat2D":
            continue
        if len(tag_value):
            for t in tag_value:
                if len(t) > 2 and t[2]:
                    sectors.append(t[2].title().strip())
    return list(set(sectors))


def tags_to_subsectors(x):
    tags = x[0]
    af_id = x[1]
    subsectors = []
    for tag_key, tag_value in tags.items():
        if tag_key != "Mat2D":
            continue
        if len(tag_value):
            subsectors = []
            for t in tag_value:
                if len(t) > 3 and t[3]:
                    if isinstance(t[3], list):
                        subsectors.extend([x.title().strip() for x in t[3]])
                    else:
                        subsectors.append(t[3].title().strip())
    return list(set(subsectors))


##
def tags_to_label2d(x):
    tags = x[0]
    af_id = x[1]
    label2d = []
    for tag_key, tag_value in tags.items():
        if tag_key != "Mat2D":
            continue
        if len(tag_value):
            for t in tag_value:
                if t[0] and t[1]:
                    label2d.append("AF-ID:" + str(af_id) + "->Sector:" +
                                   t[2].title().strip() + "->Pillar:" +
                                   t[0].title().strip() + "->Sub-pillar:" +
                                   t[1].title().strip())
    return list(set(label2d))


##
def tags_to_pillars_1d(x):
    tags = x[0]
    af_id = x[1]
    pillars_1d = []
    for tag_key, tag_value in tags.items():
        if tag_key != "Mat1D":
            continue
        if len(tag_value):
            for t in tag_value:
                if t[0] and t[1]:
                    pillars_1d.append(t[0].title().strip() + "->" +
                                      t[1].title().strip())
    return list(set(pillars_1d))

In [44]:
entries_labeled["pillars"] = entries_labeled[["tags", 'analysis_framework_id'
                                              ]].apply(tags_to_pillars, axis=1)
entries_labeled["sectors"] = entries_labeled[["tags", 'analysis_framework_id'
                                              ]].apply(tags_to_sectors, axis=1)
entries_labeled["subsectors"] = entries_labeled[[
    "tags", 'analysis_framework_id'
]].apply(tags_to_subsectors, axis=1)
##
entries_labeled["label_2d"] = entries_labeled[[
    "tags", 'analysis_framework_id'
]].apply(tags_to_label2d, axis=1)
##
entries_labeled["pillars_1d"] = entries_labeled[["tags", 'analysis_framework_id'
                                              ]].apply(tags_to_pillars_1d, axis=1)

In [45]:
entries_labeled[~entries_labeled["excerpt"].isna()]["excerpt"].unique().shape

(116857,)

In [46]:
pills_unique_1d = set()
pills_occurances_1d = list()
for pills_1d in entries_labeled["pillars_1d"]:
    pills_unique_1d.update(pills_1d)
    pills_occurances_1d.extend(pills_1d)
len(pills_unique_1d), Counter(pills_occurances_1d).most_common()

(111,
 [('Covid-19 Overview->Cases', 5549),
  ('Humanitarian Profile->Affected Groups', 4194),
  ('Displacement->Type/Numbers/Movements', 3960),
  ('Context->Security & Stability', 3846),
  ('Context->Economy', 3340),
  ('Casualties->Dead', 3216),
  ('Context->Security', 3169),
  ('Covid-19 Overview->Deaths', 3122),
  ('Covid-19 Overview->Vaccination', 2791),
  ('Displacement->Displacement', 2778),
  ('Covid-19 Containment Measures->Public Health Measures', 2256),
  ('Humanitarian Profile->Casualties', 2020),
  ('Context->Demography', 1877),
  ('Displacement->Local Integration', 1863),
  ('Context->Legal Or Normative Framework', 1828),
  ('Flag->Hazard & Threats', 1824),
  ('Humanitarian Access->Physical Constraints', 1797),
  ('Displacement->Push Factors', 1736),
  ('Covid-19 Overview->Testing', 1629),
  ('Context->Politics', 1466),
  ('Covid-19 Containment Measures->Movement Restrictions', 1256),
  ('Humanitarian Profile->Population Movement', 1192),
  ('Humanitarian Profile->Push/Pu

In [47]:
pills_unique = set()
pills_occurances = list()
for pills in entries_labeled["pillars"]:
    pills_unique.update(pills)
    pills_occurances.extend(pills)
Counter(pills_occurances).most_common()

[('Humanitarian Conditions->Living Standards', 26611),
 ('Humanitarian Conditions->Physical & Mental Wellbeing', 12287),
 ('Impact->Impact On People', 9752),
 ('Impact->Drivers/Aggravating Factors', 9583),
 ('Impact->Impact On System & Services', 9084),
 ('At Risk->People At Risk / Vulnerable', 5183),
 ('Capacities & Response->International', 4629),
 ('Humanitarian Conditions->Coping Mechanisms', 4295),
 ('Response And Capacities->National And Local', 3646),
 ('Humanitarian Conditions->Risks & Vulnerabilities', 2723),
 ('Humanitarian Conditions->Physical And Mental Well-Being', 2521),
 ('Impact->Number Of People Affected', 2277),
 ('Priorities->Priority Interventions (Staff)', 2269),
 ('Scope & Scale->Drivers/Aggravating Factors', 2239),
 ('Capacities And Response->International Response', 2171),
 ('Scope & Scale->System Disruption', 2126),
 ('Capacities & Response->Number Of People Reached', 2101),
 ('Response And Capacities->International', 2066),
 ('Risks->People At Risk Or Vulnerab

In [48]:
secs_unique = set()
secs_occurances = list()
for secs in entries_labeled["sectors"]:
    secs_unique.update(secs)
    secs_occurances.extend(secs)
#secs_unique
Counter(secs_occurances).most_common()

[('Health', 26283),
 ('Protection', 21298),
 ('Livelihoods', 12582),
 ('Cross', 11531),
 ('Food Security', 8593),
 ('Wash', 8328),
 ('Shelter', 7902),
 ('Education', 7364),
 ('Nutrition', 3325),
 ('Agriculture', 3159),
 ('Food', 2757),
 ('Cross-Sector', 2629),
 ('Logistics', 2614),
 ('Cross Cutting', 678),
 ('Nfi', 611),
 ('Logistic', 248),
 ('Livelihood', 132)]

In [49]:
entries_labeled.columns

Index(['entry_id', 'created_at', 'modified_at', 'excerpt', 'entry_type',
       'analysis_framework_id', 'created_by_id', 'lead_id', 'modified_by_id',
       'information_date', 'order', 'project_id', 'title', 'verified', 'tags',
       'tags_str', 'pillars', 'sectors', 'subsectors', 'label_2d',
       'pillars_1d'],
      dtype='object')

In [50]:
final_df = entries_labeled[entries_labeled["entry_type"].eq("excerpt")
                           & (~entries_labeled["excerpt"].isna())]
final_df.shape

(118816, 21)

In [51]:
def remove_newlines(excerpt):
    if not isinstance(excerpt, str):
        return excerpt
    return re.sub("\s+", " ", excerpt)

final_df.loc[:, 'excerpt'] = final_df['excerpt'].apply(remove_newlines)
#final_df.loc[:, 'dropped_excerpt'] = final_df['dropped_excerpt'].apply(remove_newlines)
##
final_df = final_df.sort_values("verified").reset_index(drop=True).drop_duplicates(subset='excerpt', keep="last")
final_df.shape, final_df.duplicated(subset="excerpt").sum()

((116744, 21), 0)

In [52]:
sector_name_mapper = {
    "Agriculture": "Agriculture",
    "Cccm": "Nan",
    "Cross": "Cross",
    "Cross Cutting": "Cross",
    "Cross-Sector": "Cross",
    "Education": "Education",
    "Food": "Food Security",
    "Food Security": "Food Security",
    "Nutrition": "Nutrition",
    "Health": "Health",
    "Livelihood": "Livelihoods",
    "Livelihoods": "Livelihoods",
    "Logistic": "Logistics",
    "Logistics": "Logistics",
    "Protection": "Protection",
    "Shelter": "Shelter",
    "Shelter And Nfis": "Shelter",
    "Nfi": "Nan",
    "Wash": "WASH",
}

In [53]:
af_title_to_id = dict()
for afid, title in zip(afs["id"], afs["title"]):
    af_title_to_id["title"] = afid

In [54]:
af_id_title = afs[["id", "title"]]
af_id_title["title"] = af_id_title["title"].str.title()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  af_id_title["title"] = af_id_title["title"].str.title()


In [55]:
matching_2d_mat["Framework Title"] = matching_2d_mat["Framework Title"].str.title()

In [56]:
af_id_title.columns = ["analysis_framework_id", "Framework Name"]

In [57]:
matching_2d_mat = pd.merge(matching_2d_mat,
                    af_id_title,
                    how="left",
                    left_on="Framework Title",
                    right_on="Framework Name")

In [58]:
matching_2d_mat.columns

Index(['Framework Title', 'Type', 'Pillar', 'Sub-pillar', 'Virtual Type',
       'Virtual Pillar', 'Virtual Sub-pillar', 'First-pass', 'Verified',
       'Reversible', 'Cover', 'analysis_framework_id', 'Framework Name'],
      dtype='object')

In [59]:
matching_2d_mat.drop(["Framework Name"], axis=1, inplace=True)
for col in matching_2d_mat.columns[1:7]:
    matching_2d_mat[col] = matching_2d_mat[col].astype(str).str.title()

In [60]:
matching_2d_mat["Pillar"] = matching_2d_mat["Pillar"].apply(
    lambda x: x.strip().title())
matching_2d_mat["Sub-pillar"] = matching_2d_mat["Sub-pillar"].apply(
    lambda x: x.strip().title())
matching_2d_mat["Final Pillar Name"] = matching_2d_mat["Virtual Pillar"].apply(
    lambda x: x.strip().title())
matching_2d_mat["Final Sub-pillar Name"] = matching_2d_mat[
    "Virtual Sub-pillar"].apply(lambda x: x.strip().title())

In [61]:
matching_2d_mat["original_pillar"] = matching_2d_mat[
    "Pillar"] + "->" + matching_2d_mat["Sub-pillar"]
matching_2d_mat["original_subpillar"] = matching_2d_mat[
    "Pillar"] + "->" + matching_2d_mat["Sub-pillar"]
##
matching_2d_mat["target_pillar"] = matching_2d_mat["Final Pillar Name"]
matching_2d_mat["target_subpillar"] = matching_2d_mat[
    "Final Pillar Name"] + "->" + matching_2d_mat["Final Sub-pillar Name"]

In [62]:
pillar_name_mapper = dict()
subpillar_name_mapper = dict()
for pillar, fpillar, subpillar, fsubpillar, af_id in zip(
        matching_2d_mat["original_pillar"], matching_2d_mat["target_pillar"],
        matching_2d_mat["original_subpillar"],
        matching_2d_mat["target_subpillar"],
        matching_2d_mat["analysis_framework_id"]):
    pillar_name_mapper[pillar] = fpillar
    subpillar_name_mapper[subpillar] = fsubpillar

In [63]:
matching_1d_mat["Framework Title"] = matching_1d_mat[
    "Framework Title"].str.title()
##
matching_1d_mat = pd.merge(matching_1d_mat,
                           af_id_title,
                           how="left",
                           left_on="Framework Title",
                           right_on="Framework Name")

In [64]:
matching_1d_mat.drop(["Framework Name"], axis=1, inplace=True)
for col in matching_1d_mat.columns[1:7]:
    matching_1d_mat[col] = matching_1d_mat[col].astype(str).str.title()

In [65]:
#matching_1d_mat = matching_1d_mat[~matching_1d_mat["analysis_framework_id"].isna()]
matching_1d_mat["analysis_framework_id"] = matching_1d_mat[
    "analysis_framework_id"].apply(int)

In [66]:
matching_1d_mat.columns

Index(['Framework Title', 'Type', 'Pillar', 'Sub-pillar', 'Virtual Type',
       'Virtual Pillar', 'Virtual Sub-pillar', 'First-pass', 'Verified',
       'Reversible', 'Cover', 'analysis_framework_id'],
      dtype='object')

In [67]:
pillar_name_mapper_1d = dict()
subpillar_name_mapper_1d = dict()
for pillar, fpillar, subpillar, fsubpillar, af_id in zip(
        matching_1d_mat["Pillar"], matching_1d_mat["Virtual Pillar"],
        matching_1d_mat["Sub-pillar"], matching_1d_mat["Virtual Sub-pillar"],
        matching_1d_mat["analysis_framework_id"]):
    pillar = pillar.strip() + "->" + subpillar.strip()
    pillar_name_mapper_1d[pillar] = fpillar.strip()
    subpillar_name_mapper_1d[pillar] = fpillar.strip() + "->" + fsubpillar.strip()

In [68]:
def sector_mapper(sec):
    if sec == sec:
        sectors = [sector_name_mapper[s] for s in sec]
        return sectors  #[sec for sec in sectors if sec]
    return []


##
def pillar_mapper(dim):
    if dim == dim:
        return [pillar_name_mapper[d.strip()] for d in dim]
    return []


##
def subpillar_mapper(subdim):
    if subdim == subdim:
        return [subpillar_name_mapper[d.strip()] for d in subdim]
    return []


##
def pillar_mapper_1d(pill):
    if pill == pill:
        return [pillar_name_mapper_1d[p] for p in pill]
    return []


##
def subpillar_mapper_1d(subpill):
    if subpill == subpill:
        return [subpillar_name_mapper_1d[p] for p in subpill]
    return []

In [69]:
subpillar_name_mapper['Impact->'
                      'Status Of Essential Infrastructure'
                      ', Systems, Markets And Networks'] = 'Nan'

subpillar_name_mapper['Impact->Drivers'] = 'Nan'
##
corrected = 'Humanitarian Conditions->Risks & new or aggravated vulnerabilities (DO NOT TAG)'.title(
)
orig = 'Humanitarian Conditions->Risks &  New Or Aggravated Vulnerabilities (Do Not Tag)'.title(
)
subpillar_name_mapper[orig] = subpillar_name_mapper[corrected]
##
missing = 'Humanitarian Conditions->Risks &  New Or Aggravated Vulnerabilities'.title(
)
existing = 'Humanitarian Conditions->Risks &  New Or Aggravated Vulnerabilities (Do Not Tag)'.title(
)
subpillar_name_mapper[missing] = subpillar_name_mapper[existing]
##
subpillar_name_mapper[
    'Humanitarian Conditions->Persons/Groups With Specific Needs'] = 'Nan'
subpillar_name_mapper['Impact->Damages & Losses'] = 'Nan'

In [70]:
for k, v in subpillar_name_mapper.items():
    if v == "Nan->Nan":
        subpillar_name_mapper[k] = 'Nan'

In [71]:
for k, v in subpillar_name_mapper.items():
    pillar_name_mapper[k] = v.split("->")[0]

In [72]:
corrected = 'Covid-19 Communication & Information->Prevention Campaings'.title(
)
orig = 'Covid-19 Communication & Information->Prevention Campaigns'.title()
subpillar_name_mapper_1d[orig] = subpillar_name_mapper_1d[corrected]
##
missing = 'Event/Shock->Pre-Existing Vulnerabilities'.title()
existing = 'Event/Shock->Underlying Factors/Pre-existing vulnerabilities'.title(
)
subpillar_name_mapper_1d[missing] = subpillar_name_mapper_1d[existing]
##
corrected = 'Shock /Event/Ongoing Conditions->Mitigating factors'.title()
orig = 'Shock /Event/Ongoing Conditions->Mtitigating Factors'.title()
subpillar_name_mapper_1d[orig] = subpillar_name_mapper_1d[corrected]
##
corrected = 'Information And Communication->Knowledge and info needs'.title()
orig = 'Information And Communication->Knoweldge And Info Needs'.title()
subpillar_name_mapper_1d[orig] = subpillar_name_mapper_1d[corrected]
##
missing = 'Context->Hazard Development'.title()
existing = 'Context->Hazard Developments'.title()
subpillar_name_mapper_1d[missing] = subpillar_name_mapper_1d[existing]

In [73]:
for k, v in subpillar_name_mapper_1d.items():
    if v == "Nan->Nan":
        subpillar_name_mapper_1d[k] = 'Nan'

In [74]:
for k, v in subpillar_name_mapper_1d.items():
    pillar_name_mapper_1d[k] = v.split("->")[0]

In [75]:
final_df["sectors"] = final_df["sectors"].apply(sector_mapper)
##
final_df["subpillars_2d"] = final_df["pillars"].apply(subpillar_mapper)
final_df["pillars_2d"] = final_df["pillars"].apply(pillar_mapper)
##
final_df["subpillars_1d"] = final_df["pillars_1d"].apply(subpillar_mapper_1d)
final_df["pillars_1d"] = final_df["pillars_1d"].apply(pillar_mapper_1d)

In [76]:
final_df.columns

Index(['entry_id', 'created_at', 'modified_at', 'excerpt', 'entry_type',
       'analysis_framework_id', 'created_by_id', 'lead_id', 'modified_by_id',
       'information_date', 'order', 'project_id', 'title', 'verified', 'tags',
       'tags_str', 'pillars', 'sectors', 'subsectors', 'label_2d',
       'pillars_1d', 'subpillars_2d', 'pillars_2d', 'subpillars_1d'],
      dtype='object')

In [77]:
final_df = final_df[~final_df.excerpt.isna()]


In [78]:
secs_unique = set()
secs_occurances = list()
for secs in final_df["sectors"]:
    secs_unique.update(secs)
    secs_occurances.extend(secs)
#secs_unique
Counter(secs_occurances).most_common()

[('Health', 25733),
 ('Protection', 20827),
 ('Cross', 14389),
 ('Livelihoods', 12496),
 ('Food Security', 11138),
 ('WASH', 8150),
 ('Shelter', 7711),
 ('Education', 7254),
 ('Nutrition', 3243),
 ('Agriculture', 3114),
 ('Logistics', 2798),
 ('Nan', 587)]

In [79]:
pills_unique = set()
pills_occurances = list()
for pills in final_df["pillars_2d"]:
    pills_unique.update(pills)
    pills_occurances.extend(pills)
#pills_unique
Counter(pills_occurances).most_common()

[('Humanitarian Conditions', 49191),
 ('Impact', 37536),
 ('Capacities & Response', 15526),
 ('At Risk', 12737),
 ('Nan', 11483),
 ('Priority Needs', 3346),
 ('Priority Interventions', 2579)]

In [80]:
subpills_unique = set()
subpills_occurances = list()
for subpills in final_df["subpillars_2d"]:
    subpills_unique.update(subpills)
    subpills_occurances.extend(subpills)
Counter(subpills_occurances).most_common()

[('Humanitarian Conditions->Living Standards', 27948),
 ('Humanitarian Conditions->Physical And Mental Well Being', 15535),
 ('Impact->Impact On Systems, Services And Networks', 12751),
 ('Impact->Driver/Aggravating Factors', 12737),
 ('At Risk->Risk And Vulnerabilities', 12460),
 ('Nan', 11483),
 ('Impact->Impact On People', 9684),
 ('Capacities & Response->International Response', 8920),
 ('Humanitarian Conditions->Coping Mechanisms', 4369),
 ('Capacities & Response->National Response', 4027),
 ('Impact->Number Of People Affected', 2364),
 ('Capacities & Response->Number Of People Reached/Response Gaps', 2320),
 ('Priority Interventions->Expressed By Humanitarian Staff', 2289),
 ('Priority Needs->Expressed By Humanitarian Staff', 1686),
 ('Priority Needs->Expressed By Population', 1660),
 ('Humanitarian Conditions->Number Of People In Need', 1339),
 ('Priority Interventions->Expressed By Population', 290),
 ('At Risk->Number Of People At Risk', 277),
 ('Capacities & Response->Local R

In [81]:
pills_unique_1d = set()
pills_occurances_1d = list()
for pills_1d in final_df["pillars_1d"]:
    pills_unique_1d.update(pills_1d)
    pills_occurances_1d.extend(pills_1d)
#pills_unique_1d
Counter(pills_occurances_1d).most_common()

[('Context', 20513),
 ('Covid-19', 19021),
 ('Nan', 15432),
 ('Displacement', 14572),
 ('Shock/Event', 5019),
 ('Casualties', 4129),
 ('Humanitarian Access', 3377),
 ('Information And Communication', 1196),
 ('Capacities & Response', 746)]

In [82]:
subpills_unique_1d = set()
subpills_occurances_1d = list()
for subpills_1d in final_df["subpillars_1d"]:
    subpills_unique_1d.update(subpills_1d)
    subpills_occurances_1d.extend(subpills_1d)
#subpills_unique_1d
Counter(subpills_occurances_1d).most_common()

[('Nan', 15432),
 ('Displacement->Type/Numbers/Movements', 9051),
 ('Context->Security & Stability', 6902),
 ('Covid-19->Cases', 5507),
 ('Covid-19->Restriction Measures', 4879),
 ('Context->Economy', 3899),
 ('Casualties->Dead', 3194),
 ('Context->Demography', 3185),
 ('Covid-19->Deaths', 3098),
 ('Covid-19->Vaccination', 2776),
 ('Displacement->Local Integration', 2617),
 ('Context->Legal & Policy', 2543),
 ('Shock/Event->Hazard & Threats', 2271),
 ('Displacement->Push Factors', 2051),
 ('Context->Politics', 1925),
 ('Humanitarian Access->Physical Constraints', 1743),
 ('Covid-19->Testing', 1622),
 ('Shock/Event->Underlying/Aggravating Factors', 1412),
 ('Shock/Event->Type And Characteristics', 1336),
 ('Context->Socio Cultural', 1178),
 ('Humanitarian Access->Relief To Population', 948),
 ('Context->Environment', 881),
 ('Capacities & Response->Number Of People Reached/Response Gaps', 746),
 ('Casualties->Injured', 663),
 ('Covid-19->Contact Tracing', 627),
 ('Information And Commun

In [83]:
final_df["subpillars_1d"].apply(lambda x:x!=[]).sum(),\
final_df["subpillars_2d"].apply(lambda x:x!=[]).sum(),\
final_df["sectors"].apply(lambda x:x!=[]).sum()

(57933, 85174, 85174)

In [84]:
final_df = final_df[[
    'entry_id', 'excerpt', 'entry_type', 'analysis_framework_id', 'lead_id',
    'title', 'project_id', 'verified', 'sectors', 'pillars_2d', 'pillars_1d',
    'subpillars_2d', 'subpillars_1d'
]]

In [85]:
for col in [
        'sectors', 'pillars_2d', 'pillars_1d', 'subpillars_2d', 'subpillars_1d'
]:
    final_df[col] = final_df[col].apply(
        lambda lbls: [x if x != 'Nan' else "NOT_MAPPED" for x in lbls])

In [86]:
all_valid_df = final_df[
    final_df['sectors'].apply(lambda lbls: "NOT_MAPPED" not in lbls)
    & final_df['pillars_2d'].apply(
        lambda lbls: "NOT_MAPPED" not in lbls) &
    final_df['pillars_1d'].apply(lambda lbls: "NOT_MAPPED" not in lbls)
    & final_df['subpillars_2d'].apply(
        lambda lbls: "NOT_MAPPED" not in lbls)
    & final_df['subpillars_1d'].apply(
        lambda lbls: "NOT_MAPPED" not in lbls)]
print(all_valid_df.shape, final_df.shape)

(95004, 13) (116744, 13)


In [87]:
final_df.to_csv('generated_dataset/primary_tags.csv', index=None)
all_valid_df.to_csv('generated_dataset/primary_tags_vc.csv', index=None)