In [1]:
import re
import json
import random
from pprint import pprint
from ast import literal_eval
from collections import Counter

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

Random seeding for reproducibility

In [2]:
random.seed(2021)
np.random.seed(2021)

In [3]:
# each line in this table correspond to single tagging of an entry.
# an entry may have more than one tagging (e.g. a tag in 2D matrix, another in 1D matrix, 
# secondary tags, etc. )
exportdata = pd.read_csv("exportdata.csv")
# I pulled entries related to projects chosen by Patrice
entries = pd.read_csv("entries_of_projects_chosen_by_patrice.csv")
# The widgets of AF's, i.e. 2D matrices, 1D matrices, secondary tagging widgets, etc.
af_widgets = pd.read_csv("af_widgets_of_interest.csv")
# Projects chosen by Patrice
projects = pd.read_csv("projects_chosen_by_patrice.csv")
# 
exportables = pd.read_csv("af_exportables.csv")
# AF details 
afs = pd.read_csv("analysis_frameworks.csv")
# user ids, first names and last names
users = pd.read_csv("user_names.csv")
# The matchin of pillar/sub-pillar names from different AFs into unified pillar/sub-pillar names
matching_2d_mat = pd.read_csv("pillars_subpillars_matching.csv")
matching_1d_mat = pd.read_csv("mat_1d_matching.csv")

In [4]:
# Pandas will read json objects as strings,
# here I am converting them into Python dict objects
af_widgets["properties"] = af_widgets["properties"].apply(json.loads)

In [5]:
# widget names of 2D matrices in the chosen AFs's
mat2d_titles = [
    s.upper() for s in [
        "Pre-Crisis",
        "Shock/Event",
        "In-Crisis",
        "Sectors",
        "Sectoral Information",
        "Matrix 2D",
    ]
]

mat2d_properties_ids = af_widgets[(
    af_widgets["title"].str.upper()).isin(mat2d_titles)][[
        "properties", "analysis_framework_id"
    ]]
mat2d_properties = mat2d_properties_ids["properties"].tolist()
mat2d_ids = mat2d_properties_ids["analysis_framework_id"].tolist()
## 1D Matrices
# widget names of 1D matrices in the chosen AFs's
mat1d_titles = [
    s.upper() for s in [
        "Operational Environment",  # iMMAP, 2020 Okular, Okular Generic, Colombia-AF, Nigeria Situation Analysis (OA), IFRC Master Framework 2019
        "Cross Sector",  #Rohingya Framework, IFRC Analytical Framework 2018, 
        "Matrix 1D",  # Situation Analysis Generic Yemen, 
        "Cross Sectors",  # Situation Analysis Generic Libya
    ]
]
mat1d_titles = mat1d_titles + [
    sub_s.title() for sub_s in ('Operational Environment - Dimension',
                                'Operational Environment - Subdimension')
]
mat1d_properties_ids = af_widgets[(
    af_widgets["title"].str.upper()).isin(mat1d_titles)][[
        "properties", "analysis_framework_id"
    ]]
mat1d_properties = mat1d_properties_ids["properties"].tolist()
mat1d_ids = mat1d_properties_ids["analysis_framework_id"].tolist()

In [6]:
afids_pillars_subpillars = dict()
for mat, af_id in zip(mat2d_properties, mat2d_ids):
    dims = mat["data"]['dimensions']
    afids_pillars_subpillars[af_id] = {}
    for dim in dims:
        pillar = dim["title"]
        sub_pillars = []
        for sub_pillar_dict in dim["subdimensions"]:
            sub_pillars.append(sub_pillar_dict["title"])
        afids_pillars_subpillars[af_id][pillar] = sub_pillars
##
afids_rows_cells = dict()
for mat, af_id in zip(mat1d_properties, mat1d_ids):
    rows = mat["data"]['rows']
    afids_rows_cells[af_id] = {}
    for row in rows:
        pillar = row["title"]
        sub_pillars = []
        for sub_pillar_dict in row["cells"]:
            sub_pillars.append(sub_pillar_dict["value"])
        afids_rows_cells[af_id][pillar] = sub_pillars


In [7]:
def extract_title(x):
    if x["excel"].get("title"):
        return x["excel"]["title"]
    elif x["excel"].get("type") == "multiple":
        return x["excel"]["titles"]
##
exportables["data"] = exportables["data"].apply(literal_eval)
af_titles = exportables["data"].apply(extract_title).tolist()

In [8]:
entries.shape, exportdata.shape

((181142, 21), (1018086, 4))

In [9]:
entries[~entries["excerpt"].isna()]["excerpt"].unique().shape

(138924,)

In [10]:
exid_to_exdata = dict()
for ex_id, ex_data in zip(exportables["id"], exportables["data"]):
    exid_to_exdata[ex_id] = ex_data

In [11]:
af_widgets["title"].unique()

array(['Sectoral Information', 'LOCATION', 'EXCERPT',
       'Operational Environment', 'Information Date', 'RELIABILITY',
       'Flag', 'DEMOGRAPHIC GROUPS', 'SPECIFIC NEEDS GROUPS',
       'AFFECTED GROUPS', 'Severity', 'Sectors', 'Cross sector',
       'Affected groups', 'Specific Needs Groups', 'Demographic Groups',
       'Reliability', 'Geo Location', 'Excerpt', 'Information date',
       'PRE-CRISIS', 'IN-CRISIS', 'Crisis type',
       'Context additional tags', 'Crisis Type', 'SHOCK/EVENT',
       'Additional Context', 'DISPLACED POP TYPE', 'CLEANING tags',
       'POPULATION GROUPS', 'Cleaning comments', 'HIGH LEVEL TAGS',
       'Comment'], dtype=object)

In [12]:
widget_key_id_to_title = dict()
for w_key, w_id, title in zip(af_widgets["key"], af_widgets["widget_id"], af_widgets["title"]):
    widget_key_id_to_title[(w_key, w_id)] = title

In [13]:
def exportdata_to_tag_title(row):
    data = json.loads(row[1])
    wkey, wid = None, None
    if data.get("common"):
        wkey = data.get("common").get("widget_key")
        wid = data["common"].get("widget_id")
    if wkey and wid:
        if widget_key_id_to_title.get((wkey, wid)):
            return widget_key_id_to_title.get((wkey, wid))
    if data.get("report") and data["report"].get("other"):
            if len(data["report"]["other"]) == 1 and data["report"]["other"][0].get("title"):
                if data["report"]["other"][0]["title"]:
                    return data["report"]["other"][0]["title"]
    if isinstance(data.get("excel"), list) and len(data["excel"])==1 and \
     data["excel"][0].get("widget_key") and data["excel"][0].get("widget_id"):
        wkey = data["excel"][0]["widget_key"]
        wid = data["excel"][0]["widget_id"]
        if widget_key_id_to_title.get((wkey, wid)):
            return widget_key_id_to_title.get((wkey, wid))
    exportable_id = row[3]
    if exid_to_exdata[exportable_id]["excel"].get("title"):
        return exid_to_exdata[exportable_id]["excel"]["title"]
    elif exid_to_exdata[exportable_id]["excel"].get("type") == "multiple":
        return exid_to_exdata[exportable_id]["excel"]["titles"]
    raise

In [14]:
exportdata["tag_title"] = exportdata.apply(exportdata_to_tag_title, axis=1)

In [15]:
def exportdata_to_tag_value(row):
    data = json.loads(row[1])
    if isinstance(data["excel"], list) and len(data["excel"])==1:
        return data["excel"][0]["value"]
    if data["excel"].get("type") == "lists":
        return data["excel"]["values"]
    if isinstance(data["excel"], dict) and "value" in data["excel"]:
        return data["excel"].get("value")
    if isinstance(data["excel"], dict) and "values" in data["excel"]:
        return data["excel"].get("values")
    elif "values" in data["common"]:
        return data["common"]["values"]
    elif "value" in data["common"]:
        return data["common"]["value"]
    raise

In [16]:
exportdata["tag_value"] = exportdata.apply(exportdata_to_tag_value, axis=1)

In [17]:
def title_case(tag):
    if isinstance(tag, (list, tuple)):
        return tuple([x.title() for x in tag])
    return tag.title()

In [18]:
exportdata["tag_title"] = exportdata["tag_title"].apply(title_case)

In [19]:
mat_1d_and_flag_titles = [
    'Flag', 'Operational Environment', 'Cross Sector',
    ('Operational Environment - Dimension',
     'Operational Environment - Subdimension'),
    ('Flag - Dimension', 'Flag - Subdimension')
]

In [20]:
#exportdata[exportdata["tag_title"].isin([('Operational Environment - Dimension', 'Operational Environment - Subdimension')])]["tag_value"].tolist()
#exportdata[exportdata["tag_title"].eq('High Level Tags')]["tag_value"].tolist()

In [21]:
mat2d_titles = [s.title() for s in mat2d_titles]
mat1d_titles = [
    s.title() if isinstance(s, str) else (sub_s.title() for sub_s in s)
    for s in mat1d_titles
]
mat_1d_and_flag_titles = [
    s.title() if isinstance(s, str) else (sub_s.title() for sub_s in s)
    for s in mat_1d_and_flag_titles
]

In [22]:
exportdata_of_interest = exportdata[exportdata["tag_title"].isin(
    mat2d_titles + mat_1d_and_flag_titles)]

In [23]:
exportdata_of_interest.shape, entries.shape

((225045, 6), (181142, 21))

In [24]:
entries_labeled = pd.merge(entries,
                           exportdata_of_interest,
                           how="inner",
                           left_on="id",
                           right_on="entry_id",
                           suffixes=('_entry', '_exportdata'))

In [25]:
entries_labeled["id_entry"].shape, entries_labeled["id_entry"].unique().shape

((225044,), (179473,))

In [26]:
def agg_group(group):
    tag_titles = group["tag_title"]
    tag_titles = ["Mat1D" if title in mat_1d_and_flag_titles else "Mat2D" for title in tag_titles]
    tag_values = group["tag_value"]
    return {f"{title}_{i}":value for i, (title, value) in enumerate(zip(tag_titles, tag_values))}

In [27]:
tags = entries_labeled.groupby("id_entry").apply(agg_group)

In [28]:
entryids_tags = pd.DataFrame({"entry_id": tags.index, "tags": tags.values})

In [29]:
entryids_tags.shape

(179473, 2)

In [30]:
entries_labeled.columns

Index(['id_entry', 'created_at', 'modified_at', 'excerpt', 'image_raw',
       'analysis_framework_id', 'created_by_id', 'lead_id', 'modified_by_id',
       'entry_type', 'information_date', 'order', 'client_id', 'project_id',
       'tabular_field_id', 'dropped_excerpt', 'highlight_hidden', 'verified',
       'verification_last_changed_by_id', 'image_id', 'title', 'id_exportdata',
       'data', 'entry_id', 'exportable_id', 'tag_title', 'tag_value'],
      dtype='object')

In [31]:
print(entries_labeled.shape)
entries_labeled = entries_labeled[[
    'entry_id',
    'created_at',
    'modified_at',
    'excerpt',
    'entry_type',
    'analysis_framework_id',
    'created_by_id',
    'lead_id',
    'modified_by_id',
    'information_date',
    'order',
    'project_id',
    'dropped_excerpt',
    'verified',
]].drop_duplicates()
print(entries_labeled.shape)

(225044, 27)
(179473, 14)


In [32]:
entries_labeled = pd.merge(entries_labeled, entryids_tags, on="entry_id")

In [33]:
entries_labeled.shape, entries_labeled.columns

((179473, 15),
 Index(['entry_id', 'created_at', 'modified_at', 'excerpt', 'entry_type',
        'analysis_framework_id', 'created_by_id', 'lead_id', 'modified_by_id',
        'information_date', 'order', 'project_id', 'dropped_excerpt',
        'verified', 'tags'],
       dtype='object'))

In [34]:
entries_labeled["tags_str"] = entries_labeled["tags"].apply(str)
entries_labeled.duplicated(subset=["entry_id", "tags_str"]).sum()

0

In [35]:
entries_labeled["entry_id"].duplicated().sum()

0

In [36]:
entries_labeled.shape

(179473, 16)

In [37]:
entries_labeled["tags"].isna().sum()

0

In [38]:
entries.shape, entries[~entries["excerpt"].isna()]["excerpt"].unique().shape

((181142, 21), (138924,))

In [39]:
entries_labeled.shape, entries_labeled.columns

((179473, 16),
 Index(['entry_id', 'created_at', 'modified_at', 'excerpt', 'entry_type',
        'analysis_framework_id', 'created_by_id', 'lead_id', 'modified_by_id',
        'information_date', 'order', 'project_id', 'dropped_excerpt',
        'verified', 'tags', 'tags_str'],
       dtype='object'))

In [40]:
dimensions = []
for k, v in entries_labeled["tags"].tolist()[3].items():
    if k.startswith("Mat2D"):
        tag_value = v
        if len(tag_value):
            for t in tag_value:
                if t[0]:
                    dimensions.append(str(af_id)+"->"+t[0].title() +"->"+t[1].title())

list(set(dimensions))

['1306->Humanitarian Conditions->Living Standards',
 '1306->Impact->Status Of Essential Infrastructure, Systems, Markets And Networks']

In [41]:
def tags_to_pillars(x):
    tags = x[0]
    af_id = x[1]
    pillars = []
    for tag_key, tag_value in tags.items():
        if not tag_key.startswith("Mat2D"):
            continue
        if len(tag_value):
            for t in tag_value:
                if t[0]:
                    pillars.append(str(af_id)+"->"+t[0].title() +"->"+t[1].title())
    return list(set(pillars))

def tags_to_sectors(x):
    tags = x[0]
    af_id = x[1]
    sectors = []
    for tag_key, tag_value in tags.items():
        if not tag_key.startswith("Mat2D"):
            continue
        if len(tag_value):
            for t in tag_value:
                if len(t) > 2 and t[2]:
                    sectors.append(t[2].title())
    return list(set(sectors))

def tags_to_subsectors(x):
    tags = x[0]
    af_id = x[1]
    subsectors = []
    for tag_key, tag_value in tags.items():
        if not tag_key.startswith("Mat2D"):
            continue
        if len(tag_value):
            subsectors = []
            for t in tag_value:
                if len(t) >3 and t[3]:
                    if isinstance(t[3], list):
                        subsectors.extend([x.title() for x in t[3]])
                    else:
                        subsectors.append(t[3].title())
    return list(set(subsectors))
##
def tags_to_pillars_1d(x):
    tags = x[0]
    af_id = x[1]
    pillars = []
    for tag_key, tag_value in tags.items():
        if not tag_key.startswith("Mat1D"):
            continue
        if len(tag_value):
            for t in tag_value:
                if t[0]:
                    pillars.append(str(af_id)+"->"+t[0].title() +"->"+t[1].title())
    return list(set(pillars))

In [42]:
entries_labeled["pillars"] = entries_labeled[["tags", 'analysis_framework_id'
                                              ]].apply(tags_to_pillars, axis=1)
entries_labeled["sectors"] = entries_labeled[["tags", 'analysis_framework_id'
                                              ]].apply(tags_to_sectors, axis=1)
entries_labeled["subsectors"] = entries_labeled[[
    "tags", 'analysis_framework_id'
]].apply(tags_to_subsectors, axis=1)
##
entries_labeled["pillars_1d"] = entries_labeled[["tags", 'analysis_framework_id'
                                              ]].apply(tags_to_pillars_1d, axis=1)

In [43]:
entries_labeled[~entries_labeled["excerpt"].isna()]["excerpt"].unique().shape

(137921,)

In [44]:
pills_unique_1d = set()
pills_occurances_1d = list()
for pills_1d in entries_labeled["pillars_1d"]:
    pills_unique_1d.update(pills_1d)
    pills_occurances_1d.extend(pills_1d)
len(pills_unique_1d), Counter(pills_occurances_1d).most_common()

(233,
 [('1306->Covid-19 Overview->Cases', 5870),
  ('1306->Displacement->Type/Numbers/Movements', 4407),
  ('699->Humanitarian Profile->Affected Groups', 4188),
  ('829->Displacement ->Displacement', 3801),
  ('1306->Context->Security & Stability', 3462),
  ('1306->Covid-19 Overview->Deaths', 3160),
  ('699->Context->Demography', 2542),
  ('1306->Covid-19 Containment Measures->Public Health Measures', 2524),
  ('1306->Context->Economy', 2480),
  ('1306->Covid-19 Overview->Vaccination', 2148),
  ('1306->Casualties->Dead', 2091),
  ('829->Context->Demographics', 1852),
  ('829->Context->Legal Or Normative Framework', 1817),
  ('537->Humanitarian Profile->Affected Groups', 1768),
  ('1306->Covid-19 Overview->Testing', 1617),
  ('552->Humanitarian Profile->Affected Groups', 1607),
  ('1004->Displacement Profile->Type/Numbers', 1575),
  ('1306->Covid-19 Containment Measures->Movement Restrictions', 1528),
  ('1306->Displacement->Push Factors', 1522),
  ('699->Humanitarian Profile->Populati

In [45]:
pills_unique = set()
pills_occurances = list()
for pills in entries_labeled["pillars"]:
    pills_unique.update(pills)
    pills_occurances.extend(pills)
Counter(pills_occurances).most_common()

[('1306->Humanitarian Conditions->Living Standards', 15401),
 ('1306->Humanitarian Conditions->Physical & Mental Wellbeing', 9070),
 ('829->Humanitarian Conditions->Living Standards', 8956),
 ('1465->Humanitarian Conditions->Humanitarian Condition', 8690),
 ('1306->Impact->Impact On System & Services', 8482),
 ('1306->Impact->Drivers/Aggravating Factors', 8463),
 ('1306->Impact->Impact On People', 8248),
 ('1465->Operational Environment->International Humanitarian Response And Capacities',
  7816),
 ('1306->At Risk->People At Risk / Vulnerable', 4897),
 ('829->Response And Capacities->National And Local', 4358),
 ('1306->Capacities & Response->International', 4311),
 ('699->Capacities & Response->International Actors', 4190),
 ('1465->Humanitarian Conditions->Risk', 3646),
 ('1465->Humanitarian Situation->Humanitarian Profile', 3642),
 ('1465->Stated Priorities And Recommendations->Recommendations From Aid/Research Actors',
  3531),
 ('1465->Operational Environment->National/Sub-Nation

In [46]:
secs_unique = set()
secs_occurances = list()
for secs in entries_labeled["sectors"]:
    secs_unique.update(secs)
    secs_occurances.extend(secs)
#secs_unique
Counter(secs_occurances).most_common()

[('Health', 41717),
 ('Protection', 30626),
 ('Livelihoods', 19135),
 ('Cross-Sector', 17853),
 ('Food Security', 14649),
 ('Cross', 14014),
 ('Wash', 13423),
 ('Education', 12336),
 ('Shelter', 9263),
 ('Nutrition', 6175),
 ('Food', 3854),
 ('Agriculture', 3183),
 ('Logistics', 2763),
 ('Shelter And Nfis', 2130),
 ('Cross Cutting', 770),
 ('Cccm', 750),
 ('Logistic', 277),
 ('Nfi', 137),
 ('Livelihood', 134)]

In [47]:
entries_labeled.columns

Index(['entry_id', 'created_at', 'modified_at', 'excerpt', 'entry_type',
       'analysis_framework_id', 'created_by_id', 'lead_id', 'modified_by_id',
       'information_date', 'order', 'project_id', 'dropped_excerpt',
       'verified', 'tags', 'tags_str', 'pillars', 'sectors', 'subsectors',
       'pillars_1d'],
      dtype='object')

In [48]:
final_df = entries_labeled[entries_labeled["entry_type"].eq("excerpt")
                           & (~entries_labeled["excerpt"].isna())]

In [49]:
def remove_newlines(excerpt):
    if not isinstance(excerpt, str):
        return excerpt
    return re.sub("\s+", " ", excerpt)

final_df.loc[:, 'excerpt'] = final_df['excerpt'].apply(remove_newlines)
final_df.loc[:, 'dropped_excerpt'] = final_df['dropped_excerpt'].apply(remove_newlines)
##
final_df = final_df.sort_values("verified").reset_index(drop=True).drop_duplicates(subset='excerpt', keep="last")
final_df.shape, final_df.duplicated(subset="excerpt").sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


((137801, 20), 0)

In [50]:
sector_name_mapper = {
    "Agriculture": "Agriculture",
    "Cccm": "",
    "Cross": "Cross",
    "Cross Cutting": "Cross",
    "Cross-Sector": "Cross",
    "Education": "Education",
    "Food": "Food Security",
    "Food Security": "Food Security",
    "Nutrition": "Nutrition",
    "Health": "Health",
    "Livelihood": "Livelihoods",
    "Livelihoods": "Livelihoods",
    "Logistic": "Logistics",
    "Logistics": "Logistics",
    "Protection": "Protection",
    "Shelter": "Shelter",
    "Shelter And Nfis": "Shelter",
    "Nfi": "",
    "Wash": "WASH",
}

In [51]:
af_title_to_id = dict()
for afid, title in zip(afs["id"], afs["title"]):
    af_title_to_id["title"] = afid

In [52]:
af_id_title = afs[["id", "title"]]
af_id_title.columns = ["analysis_framework_id", "Framework Name"]
matching_2d_mat = pd.merge(matching_2d_mat,
                    af_id_title,
                    how="left",
                    left_on="Framework Name",
                    right_on="Framework Name")
matching_2d_mat.columns

Index(['Framework Name', 'Pillar', 'Sub-pillar', 'Final Pillar Name',
       'Final Sub-pillar Name', 'analysis_framework_id'],
      dtype='object')

In [53]:
matching_2d_mat["Pillar"] = matching_2d_mat["Pillar"].apply(
    lambda x: x.strip().title())
matching_2d_mat["Sub-pillar"] = matching_2d_mat["Sub-pillar"].apply(
    lambda x: x.strip().title())
matching_2d_mat["Final Pillar Name"] = matching_2d_mat[
    "Final Pillar Name"].apply(lambda x: x.strip().title())
matching_2d_mat["Final Sub-pillar Name"] = matching_2d_mat[
    "Final Sub-pillar Name"].apply(lambda x: x.strip().title())

In [54]:
matching_2d_mat["Final Sub-pillar Name"].unique()

array(['Living Standards', 'Physical And Mental Well Being',
       'Coping Mechanisms', 'Impact On People Or Impact On Services',
       'Driver/Aggravating Factors', 'Impact On People',
       'Impact On Systems And Services', 'International Response',
       'National Response', 'Number Of People Affected',
       'Number Of People At Risk', 'Number Of People In Need',
       'Number Of People Reached', 'Expressed By Population',
       'Expressed By Humanitarian Staff', 'Response Gaps',
       'Risk And Vulnerabilities', 'Impact On Services'], dtype=object)

In [55]:
matching_2d_mat["Final Pillar Name"].unique()

array(['Humanitatian Conditions', 'Impact', 'Capacities & Response',
       'People At Risk', 'Humanitarian Conditions', 'Priority Needs',
       'Priority Interventions'], dtype=object)

In [56]:
matching_2d_mat.loc[
    matching_2d_mat["Final Pillar Name"].eq('Humanitatian Conditions'),
    "Final Pillar Name"] = 'Humanitarian Conditions'

In [57]:
matching_2d_mat = matching_2d_mat.astype({
    'Framework Name': str,
    'Pillar': str,
    'Sub-pillar': str,
    'Final Pillar Name': str,
    'Final Sub-pillar Name': str,
    'analysis_framework_id': str
})

In [58]:
matching_2d_mat["original_pillar"] = matching_2d_mat[
    "analysis_framework_id"] + "->" + matching_2d_mat[
        "Pillar"] + "->" + matching_2d_mat["Sub-pillar"]
matching_2d_mat["original_subpillar"] = matching_2d_mat[
    "analysis_framework_id"] + "->" + matching_2d_mat[
        "Pillar"] + "->" + matching_2d_mat["Sub-pillar"]
##
matching_2d_mat["target_pillar"] = matching_2d_mat["Final Pillar Name"]
matching_2d_mat["target_subpillar"] = matching_2d_mat[
    "Final Pillar Name"] + "->" + matching_2d_mat["Final Sub-pillar Name"]

In [59]:
pillar_name_mapper = dict()
subpillar_name_mapper = dict()
for pillar, fpillar, subpillar, fsubpillar, af_id in zip(
        matching_2d_mat["original_pillar"], matching_2d_mat["target_pillar"],
        matching_2d_mat["original_subpillar"],
        matching_2d_mat["target_subpillar"],
        matching_2d_mat["analysis_framework_id"]):
    pillar_name_mapper[pillar] = fpillar
    subpillar_name_mapper[subpillar] = fsubpillar

In [60]:
matching_1d_mat = pd.merge(matching_1d_mat,
                           afs[["id", "title"]],
                           how="left",
                           left_on="Framework Title",
                           right_on="title")
matching_1d_mat["id"] = matching_1d_mat["id"].astype(np.int)

In [61]:
pillar_name_mapper_1d = dict()
subpillar_name_mapper_1d = dict()
for pillar, fpillar, subpillar, fsubpillar, af_id in zip(
        matching_1d_mat["1D Pillar"], matching_1d_mat["Final 1D Pillar"],
        matching_1d_mat["1D Sub-pillar"],
        matching_1d_mat["Final 1D Sub-pillar"], matching_1d_mat["id"]):
    pillar = str(af_id) + "->" + pillar + "->" + subpillar
    pillar_name_mapper_1d[pillar] = fpillar
    subpillar_name_mapper_1d[pillar] = fpillar + "->" + fsubpillar

In [62]:
def sector_mapper(sec):
    if sec == sec:
        sectors =  [sector_name_mapper.get(s, "") for s in sec]
        return [sec for sec in sectors if sec]
    return []
##
def pillar_mapper(dim):
    if dim == dim:
        dim = [pillar_name_mapper.get(d, "") for d in dim]
        return [d for d in dim if d]
    return []
##
def subpillar_mapper(subdim):
    if subdim == subdim:
        subdim = [pillar_name_mapper.get(d, "") for d in subdim]
        return [d for d in subdim if d]
    return []
##
def pillar_mapper_1d(pill):
    if pill == pill:
        pill = [pillar_name_mapper_1d.get(p, "") for p in pill]
        return [p for p in pill if p]
    return []
##
def subpillar_mapper_1d(subpill):
    if subpill == subpill:
        subpill = [subpillar_name_mapper_1d.get(p, "") for p in subpill]
        return [p for p in subpill if p]
    return []

In [63]:
final_df["sectors"] = final_df["sectors"].apply(sector_mapper)
final_df["subpillars"] = final_df["pillars"].apply(subpillar_mapper)
final_df["pillars"] = final_df["pillars"].apply(pillar_mapper)
##
final_df["subpillars_1d"] = final_df["pillars_1d"].apply(subpillar_mapper_1d)
final_df["pillars_1d"] = final_df["pillars_1d"].apply(pillar_mapper_1d)

In [64]:
final_df.columns

Index(['entry_id', 'created_at', 'modified_at', 'excerpt', 'entry_type',
       'analysis_framework_id', 'created_by_id', 'lead_id', 'modified_by_id',
       'information_date', 'order', 'project_id', 'dropped_excerpt',
       'verified', 'tags', 'tags_str', 'pillars', 'sectors', 'subsectors',
       'pillars_1d', 'subpillars', 'subpillars_1d'],
      dtype='object')

In [65]:
secs_unique = set()
secs_occurances = list()
for secs in final_df["sectors"]:
    secs_unique.update(secs)
    secs_occurances.extend(secs)
#secs_unique
Counter(secs_occurances).most_common()

[('Health', 31900),
 ('Cross', 26227),
 ('Protection', 24543),
 ('Livelihoods', 14773),
 ('Food Security', 13371),
 ('WASH', 10019),
 ('Education', 9417),
 ('Shelter', 8621),
 ('Nutrition', 4631),
 ('Agriculture', 2595),
 ('Logistics', 2256)]

In [66]:
pills_unique = set()
pills_occurances = list()
for pills in final_df["pillars"]:
    pills_unique.update(pills)
    pills_occurances.extend(pills)
#pills_unique
Counter(pills_occurances).most_common()

[('Humanitarian Conditions', 37784),
 ('Impact', 30056),
 ('Capacities & Response', 13287),
 ('People At Risk', 8382),
 ('Priority Needs', 2323),
 ('Priority Interventions', 1880)]

In [67]:
subpills_unique = set()
subpills_occurances = list()
for subpills in final_df["subpillars"]:
    subpills_unique.update(subpills)
    subpills_occurances.extend(subpills)
Counter(subpills_occurances).most_common()

[('Humanitarian Conditions', 37784),
 ('Impact', 30056),
 ('Capacities & Response', 13287),
 ('People At Risk', 8382),
 ('Priority Needs', 2323),
 ('Priority Interventions', 1880)]

In [68]:
pills_unique_1d = set()
pills_occurances_1d = list()
for pills_1d in final_df["pillars_1d"]:
    pills_unique_1d.update(pills_1d)
    pills_occurances_1d.extend(pills_1d)
#pills_unique_1d
Counter(pills_occurances_1d).most_common()

[('Context', 22431),
 ('Humanitarian Profile', 8661),
 ('Displacement', 5330),
 ('Casualties', 3270),
 ('Humanitarian Access', 2021),
 ('Information', 1178)]

In [69]:
subpills_unique_1d = set()
subpills_occurances_1d = list()
for subpills_1d in final_df["subpillars_1d"]:
    subpills_unique_1d.update(subpills_1d)
    subpills_occurances_1d.extend(subpills_1d)
#subpills_unique_1d
Counter(subpills_occurances_1d).most_common()

[('Context->Security & Stability', 5180),
 ('Humanitarian Profile->Affected Groups', 4979),
 ('Displacement->Push/Pull Factors', 3804),
 ('Context->Demography', 3627),
 ('Context->Economy', 3485),
 ('Casualties->Dead', 2515),
 ('Humanitarian Profile->Casualties', 2231),
 ('Context->Hazard & Threats', 2055),
 ('Context->Politics', 1699),
 ('Humanitarian Access->Physical Constraints', 1617),
 ('Humanitarian Profile->Population Movement', 1451),
 ('Context->Overview', 1364),
 ('Context->Key Event', 1149),
 ('Context->Socio Cultural', 997),
 ('Context->Legal  & Policy', 855),
 ('Displacement->Type/Numbers', 842),
 ('Context->Environment', 745),
 ('Information->Information Gaps', 736),
 ('Context->Stakeholders', 680),
 ('Context->Response gap', 595),
 ('Casualties->Injured', 537),
 ('Displacement->Local Integration', 444),
 ('Humanitarian Access->Humanitarian Access Gaps', 404),
 ('Information->Channels & Means', 314),
 ('Casualties->Missing', 218),
 ('Displacement->Intentions', 169),
 ('In

### Splitting

In [70]:
# add columns for sectors
for sector in secs_unique:
    final_df[sector] = 0
# add columns for dimensions
for pillar in pills_unique:
    final_df[pillar] = 0
# add columns for subdimensions
for subpillar in subpills_unique:
    final_df[subpillar] = 0
## add columns for 1d pillars
for pillar_1d in pills_unique_1d:
    final_df[pillar_1d] = 0
## add columns for 1d subpillars
for subpillar_1d in subpills_unique_1d:
    final_df[subpillar_1d] = 0

In [71]:
for row in tqdm(final_df.iterrows(), total=final_df.shape[0]):
    sectors_i = row[1]["sectors"]
    pillars_i = row[1]["pillars"]
    subpillars_i = row[1]["subpillars"]
    pillars_1d_i = row[1]["pillars_1d"]
    subpillars_1d_i = row[1]["subpillars_1d"]
    idx = row[0]
    for sector in sectors_i:
        final_df.loc[idx, sector] = 1
    for pillar in pillars_i:
        final_df.loc[idx, pillar] = 1
    for subpillar in subpillars_i:
        final_df.loc[idx, subpillar] = 1
    for pillar_1d in pillars_1d_i:
        final_df.loc[idx, pillar_1d] = 1
    for subpillar_1d in subpillars_1d_i:
        final_df.loc[idx, subpillar_1d] = 1

100%|██████████| 137801/137801 [02:31<00:00, 906.87it/s] 


In [72]:
classes = list(secs_unique) + list(pills_unique) + list(
    subpills_unique) + list(pills_unique_1d) + list(subpills_unique_1d)
len(classes)

57

In [73]:
final_df[classes].sum().astype(int)

WASH                                             10019
Education                                         9417
Agriculture                                       2595
Logistics                                         2256
Shelter                                           8617
Food Security                                    13369
Nutrition                                         4631
Cross                                            26225
Health                                           31900
Livelihoods                                      14773
Protection                                       24543
Priority Interventions                            1871
Priority Needs                                    2269
Humanitarian Conditions                          33048
People At Risk                                    8338
Capacities & Response                            10846
Impact                                           23857
Priority Interventions                            1871
Priority N

### Stratified Split

In [74]:
from skmultilearn.model_selection import iterative_train_test_split
from skmultilearn.model_selection.measures import get_combination_wise_output_matrix

In [75]:
def train_val_test_df_split(df):
    class_to_id = {clss: i for i, clss in enumerate(classes)}
    num_classes = len(classes)
    labels = np.zeros([len(df), num_classes])
    for i, (sectors_i, pillars_i, subpillars_i, pillars_1d_i,
            subpillars_1d_i) in enumerate(
                zip(df["sectors"], df["pillars_2d"], df["subpillars_2d"],
                    df["pillars_1d"], df["subpillars_1d"])):
        for sec in sectors_i:
            labels[i, class_to_id[sec]] = 1
        for pil in pillars_i:
            labels[i, class_to_id[pil]] = 1
        for subpil in subpillars_i:
            labels[i, class_to_id[subpil]] = 1
        for pil_1d in pillars_1d_i:
            labels[i, class_to_id[pil_1d]] = 1
        for subpil_1d in subpillars_1d_i:
            labels[i, class_to_id[subpil_1d]] = 1
    ##
    X_train, y_train, X_test, y_test = iterative_train_test_split(
        df["excerpt"].to_numpy().reshape(-1, 1), labels, test_size=0.1)
    X_train, y_train, X_val, y_val = iterative_train_test_split(X_train,
                                                                y_train,
                                                                test_size=0.1)
    return {
        "X_train": X_train,
        "y_train": y_train,
        "X_test": X_test,
        "y_test": y_test,
        "X_val": X_val,
        "y_val": y_val
    }

In [76]:
df = final_df[[
    'entry_id',
    'lead_id',
    'project_id',
    'analysis_framework_id',
    'excerpt',
    'dropped_excerpt',
    'created_by_id',
    'modified_by_id',
    'verified',
    'sectors',
    'pillars',
    'subpillars',
    'pillars_1d',
    'subpillars_1d',
]]
df.columns = [
    'entry_id',
    'lead_id',
    'project_id',
    'analysis_framework_id',
    'excerpt',
    'dropped_excerpt',
    'created_by_id',
    'modified_by_id',
    'verified',
    'sectors',
    'pillars_2d',
    'subpillars_2d',
    'pillars_1d',
    'subpillars_1d',
]

In [77]:
X_train, y_train, X_test, y_test, X_val, y_val = list(
    train_val_test_df_split(df).values())

In [78]:
final_df_train = final_df[final_df["excerpt"].isin(X_train.reshape(-1))]
final_df_val = final_df[final_df["excerpt"].isin(X_val.reshape(-1))]
final_df_test = final_df[final_df["excerpt"].isin(X_test.reshape(-1))]

In [79]:
df_train = df[df["excerpt"].isin(X_train.reshape(-1))]
df_val = df[df["excerpt"].isin(X_val.reshape(-1))]
df_test = df[df["excerpt"].isin(X_test.reshape(-1))]

In [80]:
df_train.index & df_val.index, df_train.index & df_test.index, df_test.index & df_val.index 

(Int64Index([], dtype='int64'),
 Int64Index([], dtype='int64'),
 Int64Index([], dtype='int64'))

In [81]:
df_train.to_csv("data_v0.5_train.csv", index=None)
df_val.to_csv("data_v0.5_val.csv", index=None)
df_test.to_csv("data_v0.5_test.csv", index=None)

In [82]:
df.to_csv("data_v0.5_full.csv", index=None)