In [1]:
import re
import json
import random
from pprint import pprint
from ast import literal_eval
from collections import Counter

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

Random seeding for reproducibility

In [2]:
random.seed(2021)
np.random.seed(2021)

In [3]:
# each line in this table correspond to single tagging of an entry.
# an entry may have more than one tagging (e.g. a tag in 2D matrix, another in 1D matrix, 
# secondary tags, etc. )
exportdata = pd.read_csv("exportdata.csv")
# I pulled entries related to projects chosen by Patrice
entries = pd.read_csv("entries_of_projects_chosen_by_patrice.csv")
# The widgets of AF's, i.e. 2D matrices, 1D matrices, secondary tagging widgets, etc.
af_widgets = pd.read_csv("af_widgets_of_interest.csv")
# Projects chosen by Patrice
projects = pd.read_csv("projects_chosen_by_patrice.csv")
# 
exportables = pd.read_csv("af_exportables.csv")
# AF details 
afs = pd.read_csv("analysis_frameworks.csv")
# user ids, first names and last names
users = pd.read_csv("user_names.csv")
# The matchin of pillar/sub-pillar names from different AFs into unified pillar/sub-pillar names
matching_2d_mat = pd.read_csv("mat_2d_matching.csv")
matching_1d_mat = pd.read_csv("mat_1d_matching.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [4]:
# Pandas will read json objects as strings,
# here I am converting them into Python dict objects
af_widgets["properties"] = af_widgets["properties"].apply(json.loads)

In [5]:
raw_name_to_tag_dict = {
    "Severity":"Severity",
    "Severity (Needs assessment entries only)":"Severity",
    "SEVERITY":"Severity",
    
    "Reliability":"Reliability",
    "RELIABILITY":"Reliability",
    
    "Demographic Groups":"Demographic Groups",
    "DEMOGRAPHIC GROUPS":"Demographic Groups",
    
    "Information date":"Information Date",
    "Information Date":"Information Date",
    "Date":"Information Date",
    "DATE OF INFORMATION":"Information Date",
    "Date range":"Information Date",
    
    "Geo Location":"Geo Location",
    "LOCATION":"Geo Location",
    "Geo location":"Geo Location",
    "Country":"Geo Location",
    "GEOLOCATIONS":"Geo Location",
    
    "Affected groups":"Affected Groups",
    "AFFECTED GROUPS":"Affected Groups",
    
    "Specific Needs Groups":"Specific Needs Groups",
    "SPECIFIC NEEDS GROUPS":"Specific Needs Groups",
}


In [6]:
def clean_titles(x):
    if x in list(raw_name_to_tag_dict.keys()):
        return raw_name_to_tag_dict[x]
    return x

af_widgets['title'] = af_widgets['title'].apply(clean_titles)

In [7]:
af_widgets['title'].value_counts()[:40]

Geo Location                                        233
Excerpt                                             210
Information Date                                    200
Demographic Groups                                  176
Affected Groups                                     169
Severity                                            169
Reliability                                         167
Specific Needs Groups                               164
Cross sector                                         92
Sectors                                              91
EXCERPT                                              71
Sectoral Information                                 52
Operational Environment                              49
Flag                                                 43
Matrix 1D                                            32
Matrix 2D                                            26
Organigram                                           15
Conditional                                     

In [8]:
# widget names of 2D matrices in the chosen AFs's
mat2d_titles = [
    s.upper() for s in [
        "Pre-Crisis",
        "Shock/Event",
        "In-Crisis",
        "Sectors",
        "Sectoral Information",
        "Matrix 2D",

    ]
]

mat2d_properties_ids = af_widgets[(
    af_widgets["title"].str.upper()).isin(mat2d_titles)][[
        "properties", "analysis_framework_id"
    ]]
mat2d_properties = mat2d_properties_ids["properties"].tolist()
mat2d_ids = mat2d_properties_ids["analysis_framework_id"].tolist()
## 1D Matrices
# widget names of 1D matrices in the chosen AFs's
mat1d_titles = [
    s.upper() for s in [
        "Operational Environment",  # iMMAP, 2020 Okular, Okular Generic, Colombia-AF, Nigeria Situation Analysis (OA), IFRC Master Framework 2019
        "Cross Sector",  #Rohingya Framework, IFRC Analytical Framework 2018, 
        "Matrix 1D",  # Situation Analysis Generic Yemen, 
        "Cross Sectors",  # Situation Analysis Generic Libya

    ]
]
mat1d_titles = mat1d_titles + [
    sub_s.title() for sub_s in ('Operational Environment - Dimension',
                                'Operational Environment - Subdimension')
]
mat1d_properties_ids = af_widgets[(
    af_widgets["title"].str.upper()).isin(mat1d_titles)][[
        "properties", "analysis_framework_id"
    ]]
mat1d_properties = mat1d_properties_ids["properties"].tolist()
mat1d_ids = mat1d_properties_ids["analysis_framework_id"].tolist()

In [9]:
afids_pillars_subpillars = dict()
errors = []
for mat, af_id in zip(mat2d_properties, mat2d_ids):
    try:
        dims = mat["data"]['dimensions']
        afids_pillars_subpillars[af_id] = {}
        for dim in dims:
            pillar = dim["title"]
            sub_pillars = []
            for sub_pillar_dict in dim["subdimensions"]:
                sub_pillars.append(sub_pillar_dict["title"])
            afids_pillars_subpillars[af_id][pillar] = sub_pillars
    except KeyError:
        errors.append([mat, af_id])
##
afids_rows_cells = dict()
for mat, af_id in zip(mat1d_properties, mat1d_ids):
    try:
        rows = mat["data"]['rows']
        afids_rows_cells[af_id] = {}
        for row in rows:
            pillar = row["title"]
            sub_pillars = []
            for sub_pillar_dict in row["cells"]:
                sub_pillars.append(sub_pillar_dict["value"])
            afids_rows_cells[af_id][pillar] = sub_pillars
    except KeyError:
        errors.append([mat, af_id])

In [10]:
afids_pillars_subpillars

{1189: {'Scope and Scale': ['Drivers/ Aggravating Factors',
   'System Disruption',
   'Damage',
   'Losses',
   'Lesson Learnt'],
  'Humanitarian Conditions': ['Access to basic services or goods',
   'Impact on physical and mental wellbeing',
   'Risks/Vulnerabilities',
   'Specific Needs',
   'Unmet Needs',
   'Lessons Learnt'],
  'Capacities and Response': ['Coping Mechanisms',
   'National Response',
   'International Response',
   'Systems Functionality',
   'Response Gaps',
   'Lesson Learnt']},
 551: {'Scope & Scale': ['Drivers/Aggravating Factors',
   'System Disruption',
   'Damages & Losses',
   'Lessons Learnt'],
  'Humanitarian Conditions': ['Living Standards',
   'Coping Mechanisms',
   'Physical & mental wellbeing',
   'Risks & Vulnerabilities',
   'People with Specific Needs',
   'Unmet Needs',
   'Lessons Learnt'],
  'Capacities & Response': ['System Functionality',
   'Government',
   'LNGO',
   'International',
   'Response Gaps',
   'Lessons Learnt']},
 359: {'XX': [

In [11]:
def extract_title(x):
    if x["excel"].get("title"):
        return x["excel"]["title"]
    elif x["excel"].get("type") == "multiple":
        return x["excel"]["titles"]
##
exportables["data"] = exportables["data"].apply(literal_eval)
af_titles = exportables["data"].apply(extract_title).tolist()

In [12]:
entries.shape, exportdata.shape

((200243, 21), (1134710, 4))

In [13]:
entries[~entries["excerpt"].isna()]["excerpt"].unique().shape

(153382,)

In [14]:
exid_to_exdata = dict()
for ex_id, ex_data in zip(exportables["id"], exportables["data"]):
    exid_to_exdata[ex_id] = ex_data

In [15]:
exid_to_exdata

{2414: {'excel': {'title': 'SPECIFIC NEEDS GROUPS'}},
 2430: {'excel': {'title': 'DATE OF INFORMATION', 'col_type': 'date'}},
 2472: {'excel': {'title': 'DEMOGRAPHIC GROUPS'}},
 2659: {'excel': {'title': 'RELIABILITY'}},
 2667: {'excel': {'title': 'SEVERITY'}},
 5153: {'excel': {'type': 'multiple',
   'titles': ['Organigram - Level 0',
    'Organigram - Level 1',
    'Organigram - Level 2',
    'Organigram - Level 3']}},
 2662: {'excel': {'type': 'multiple',
   'titles': ['Matrix 1D - Dimension', 'Matrix 1D - Subdimension']},
  'report': {'levels': [{'id': 'pillar-0',
     'title': 'Context',
     'sublevels': [{'id': 'pillar-0-subpillar-0', 'title': 'Overview'},
      {'id': 'pillar-0-subpillar-4', 'title': 'Politics'},
      {'id': 'pillar-0-subpillar-22', 'title': 'Security'},
      {'id': 'pillar-0-subpillar-5', 'title': 'Stakeholders'},
      {'id': 'pillar-0-subpillar-11', 'title': 'Economy'},
      {'id': 'pillar-0-subpillar-6', 'title': 'Society and Community'},
      {'id': 'p

In [16]:
widget_key_id_to_title = dict()
for w_key, w_id, title in zip(af_widgets["key"], af_widgets["widget_id"], af_widgets["title"]):
    widget_key_id_to_title[(w_key, w_id)] = title

In [17]:
def exportdata_to_tag_title(row):
    data = json.loads(row[1])
    wkey, wid = None, None
    if data.get("common"):
        wkey = data.get("common").get("widget_key")
        wid = data["common"].get("widget_id")
    if wkey and wid:
        if widget_key_id_to_title.get((wkey, wid)):
            return widget_key_id_to_title.get((wkey, wid))
    if data.get("report") and data["report"].get("other"):
            if len(data["report"]["other"]) == 1 and data["report"]["other"][0].get("title"):
                if data["report"]["other"][0]["title"]:
                    return data["report"]["other"][0]["title"]
    if isinstance(data.get("excel"), list) and len(data["excel"])==1 and \
     data["excel"][0].get("widget_key") and data["excel"][0].get("widget_id"):
        wkey = data["excel"][0]["widget_key"]
        wid = data["excel"][0]["widget_id"]
        if widget_key_id_to_title.get((wkey, wid)):
            return widget_key_id_to_title.get((wkey, wid))
    exportable_id = row[3]
    if exid_to_exdata[exportable_id]["excel"].get("title"):
        return exid_to_exdata[exportable_id]["excel"]["title"]
    elif exid_to_exdata[exportable_id]["excel"].get("type") == "multiple":
        return exid_to_exdata[exportable_id]["excel"]["titles"]
    raise

In [18]:
exportdata["tag_title"] = exportdata.apply(exportdata_to_tag_title, axis=1)

In [19]:
def exportdata_to_tag_value(row):
    data = json.loads(row[1])
    if isinstance(data["excel"], list) and len(data["excel"])==1:
        return data["excel"][0]["value"]
    if data["excel"].get("type") == "lists":
        return data["excel"]["values"]
    if isinstance(data["excel"], dict) and "value" in data["excel"]:
        return data["excel"].get("value")
    if isinstance(data["excel"], dict) and "values" in data["excel"]:
        return data["excel"].get("values")
    elif "values" in data["common"]:
        return data["common"]["values"]
    elif "value" in data["common"]:
        return data["common"]["value"]
    raise

In [20]:
exportdata["tag_value"] = exportdata.apply(exportdata_to_tag_value, axis=1)

In [21]:
def title_case(tag):
    if isinstance(tag, (list, tuple)):
        return tuple([x.title() for x in tag])
    return tag.title()

In [22]:
exportdata["tag_title"] = exportdata["tag_title"].apply(title_case)

In [24]:
mat_1d_and_flag_titles = [
    'Flag', 'Operational Environment', 'Cross Sector','Temática',
    ('Operational Environment - Dimension',
     'Operational Environment - Subdimension'),
    ('Flag - Dimension', 'Flag - Subdimension'),
    ('High Level Tags - Dimension', 'High Level Tags - Subdimension')
]

In [25]:
exportdata.tag_title.unique()

array(['Severity', 'Reliability', 'Affected Groups', 'Information Date',
       'Geo Location', 'Sectoral Information', 'Demographic Groups',
       'Temática', 'Operational Environment', 'Shock/Event',
       'Context Additional Tags', 'In-Crisis', 'Specific Needs Groups',
       'Crisis Type', 'Comment', 'Additional Context', 'Sectors',
       'Cross Sector', 'Displaced Pop Type', 'Cleaning Tags',
       'Displacement Profile Details',
       ('Affected Groups - Level 0', 'Affected Groups - Level 1', 'Affected Groups - Level 2', 'Affected Groups - Level 3'),
       'Cleaning Comments',
       ('Sectoral Information - Dimension', 'Sectoral Information - Subdimension', 'Sectoral Information - Sector', 'Sectoral Information - Subsectors'),
       'Location', ('Information Date (From)', 'Information Date (To)'),
       ('Operational Environment - Dimension', 'Operational Environment - Subdimension'),
       ('In-Crisis - Dimension', 'In-Crisis - Subdimension', 'In-Crisis - Sector', 'In-C

In [26]:
exportdata[exportdata.tag_title=='Crisis Type']

Unnamed: 0,id,data,entry_id,exportable_id,tag_title,tag_value
95,1930972,"{""excel"": {}, ""common"": {""type"": ""list"", ""valu...",406505,8353,Crisis Type,[Displacement]
282,1844271,"{""excel"": {}, ""common"": {""type"": ""list"", ""valu...",390938,8353,Crisis Type,[Covid-19]
389,1822847,"{""excel"": {}, ""common"": {""type"": ""list"", ""valu...",388476,8353,Crisis Type,[Displacement]
576,726359,"{""excel"": {}, ""common"": {""type"": ""list"", ""valu...",148859,8353,Crisis Type,[Covid-19]
646,733952,"{""excel"": {}, ""common"": {""type"": ""list"", ""valu...",150760,8353,Crisis Type,[Covid-19]
...,...,...,...,...,...,...
1133517,1901013,"{""excel"": {}, ""common"": {""type"": ""list"", ""valu...",401083,8353,Crisis Type,[Displacement]
1133646,1922767,"{""excel"": {}, ""common"": {""type"": ""list"", ""valu...",404919,8353,Crisis Type,[Displacement]
1133695,652815,"{""excel"": {}, ""common"": {""type"": ""list"", ""valu...",133068,8353,Crisis Type,[Covid-19]
1133781,865334,"{""excel"": {}, ""common"": {""type"": ""list"", ""valu...",173679,8353,Crisis Type,[Covid-19]


In [27]:
exportdata[exportdata.tag_title=='Context Additional Tags']

Unnamed: 0,id,data,entry_id,exportable_id,tag_title,tag_value
22,1061513,"{""excel"": {""type"": ""lists"", ""values"": [[""Effec...",218132,9455,Context Additional Tags,"[[Effects, Direct effect]]"
49,935815,"{""excel"": {""type"": ""lists"", ""values"": [[""Gener...",188931,9455,Context Additional Tags,"[[General , Response gap]]"
356,1010477,"{""excel"": {""type"": ""lists"", ""values"": [[""Gener...",204270,9455,Context Additional Tags,"[[General , Risk]]"
376,958623,"{""excel"": {""type"": ""lists"", ""values"": []}, ""co...",193878,9455,Context Additional Tags,[]
482,1352479,"{""excel"": {""type"": ""lists"", ""values"": [[""Effec...",224722,9455,Context Additional Tags,"[[Effects, Direct effect]]"
...,...,...,...,...,...,...
1133765,1240192,"{""excel"": {""type"": ""lists"", ""values"": [[""Gener...",255331,9455,Context Additional Tags,"[[General , Risk]]"
1134159,1185757,"{""excel"": {""type"": ""lists"", ""values"": [[""Human...",243151,9455,Context Additional Tags,"[[Humanitarian conditions, Negative coping mec..."
1134168,1422396,"{""excel"": {""type"": ""lists"", ""values"": [[""Gener...",295941,9455,Context Additional Tags,"[[General , Response gap]]"
1134193,1071675,"{""excel"": {""type"": ""lists"", ""values"": [[""Effec...",198652,9455,Context Additional Tags,"[[Effects, Direct effect]]"


In [28]:
#exportdata[exportdata["tag_title"].isin([('Operational Environment - Dimension', 'Operational Environment - Subdimension')])]["tag_value"].tolist()
#exportdata[exportdata["tag_title"].eq('High Level Tags')]["tag_value"].tolist()

In [29]:
mat2d_titles = [s.title() for s in mat2d_titles]
mat1d_titles = [
    s.title() if isinstance(s, str) else (sub_s.title() for sub_s in s)
    for s in mat1d_titles
]
mat_1d_and_flag_titles = [
    s.title() if isinstance(s, str) else (sub_s.title() for sub_s in s)
    for s in mat_1d_and_flag_titles
]

In [30]:
exportdata_of_interest = exportdata[exportdata["tag_title"].isin(
    mat2d_titles + mat_1d_and_flag_titles)]

In [31]:
#exportdata_of_interest

In [32]:
exportdata_of_interest.shape, entries.shape

((247385, 6), (200243, 21))

In [33]:
entries_labeled = pd.merge(entries,
                           exportdata_of_interest,
                           how="left",
                           left_on="id",
                           right_on="entry_id",
                           suffixes=('_entry', '_exportdata'))

In [34]:
entries_labeled["id_entry"].shape, entries_labeled["id_entry"].unique().shape

((249325,), (200243,))

In [35]:
def agg_group(group):
    tag_titles = group["tag_title"]
    tag_titles = ["Mat1D" if title in mat_1d_and_flag_titles else "Mat2D" for title in tag_titles]
    tag_values = group["tag_value"]
    return {f"{title}_{i}":value for i, (title, value) in enumerate(zip(tag_titles, tag_values))}

In [36]:
tags = entries_labeled.groupby("id_entry").apply(agg_group)

In [37]:
entryids_tags = pd.DataFrame({"entry_id": tags.index, "tags": tags.values})

In [38]:
entryids_tags.shape

(200243, 2)

In [39]:
entries_labeled.columns

Index(['id_entry', 'created_at', 'modified_at', 'excerpt', 'image_raw',
       'analysis_framework_id', 'created_by_id', 'lead_id', 'modified_by_id',
       'entry_type', 'information_date', 'order', 'client_id', 'project_id',
       'tabular_field_id', 'dropped_excerpt', 'highlight_hidden', 'verified',
       'verification_last_changed_by_id', 'image_id', 'title', 'id_exportdata',
       'data', 'entry_id', 'exportable_id', 'tag_title', 'tag_value'],
      dtype='object')

In [40]:
print(entries_labeled.shape)
entries_labeled = entries_labeled[[
    'entry_id',
    'created_at',
    'modified_at',
    'excerpt',
    'entry_type',
    'analysis_framework_id',
    'created_by_id',
    'lead_id',
    'modified_by_id',
    'information_date',
    'order',
    'project_id',
    'dropped_excerpt',
    'verified',
]].drop_duplicates()
print(entries_labeled.shape)

(249325, 27)
(200243, 14)


In [41]:
entries_labeled = pd.merge(entries_labeled, entryids_tags, on="entry_id")

In [42]:
entries_labeled.shape, entries_labeled.columns

((198301, 15),
 Index(['entry_id', 'created_at', 'modified_at', 'excerpt', 'entry_type',
        'analysis_framework_id', 'created_by_id', 'lead_id', 'modified_by_id',
        'information_date', 'order', 'project_id', 'dropped_excerpt',
        'verified', 'tags'],
       dtype='object'))

In [43]:
entries_labeled["tags_str"] = entries_labeled["tags"].apply(str)
entries_labeled.duplicated(subset=["entry_id", "tags_str"]).sum()

0

In [44]:
entries_labeled["entry_id"].duplicated().sum()

0

In [45]:
entries_labeled.shape

(198301, 16)

In [46]:
entries_labeled["tags"].isna().sum()

0

In [47]:
entries.shape, entries[~entries["excerpt"].isna()]["excerpt"].unique().shape

((200243, 21), (153382,))

In [48]:
entries_labeled.shape, entries_labeled.columns

((198301, 16),
 Index(['entry_id', 'created_at', 'modified_at', 'excerpt', 'entry_type',
        'analysis_framework_id', 'created_by_id', 'lead_id', 'modified_by_id',
        'information_date', 'order', 'project_id', 'dropped_excerpt',
        'verified', 'tags', 'tags_str'],
       dtype='object'))

In [49]:
dimensions = []
for k, v in entries_labeled["tags"].tolist()[3].items():
    if k.startswith("Mat2D"):
        tag_value = v
        if len(tag_value):
            for t in tag_value:
                if t[0]:
                    dimensions.append(str(af_id)+"->"+t[0].title() +"->"+t[1].title())

list(set(dimensions))

['1717->Impact->Status Of Essential Infrastructure, Systems, Markets And Networks',
 '1717->Humanitarian Conditions->Living Standards']

In [50]:
def tags_to_pillars(x):
    tags = x[0]
    af_id = x[1]
    pillars = []
    for tag_key, tag_value in tags.items():
        if not tag_key.startswith("Mat2D"):
            continue
        if len(tag_value):
            for t in tag_value:
                if t[0]:
                    pillars.append(str(af_id)+"->"+t[0].title() +"->"+t[1].title())
    return list(set(pillars))

def tags_to_sectors(x):
    tags = x[0]
    af_id = x[1]
    sectors = []
    for tag_key, tag_value in tags.items():
        if not tag_key.startswith("Mat2D"):
            continue
        if len(tag_value):
            for t in tag_value:
                if len(t) > 2 and t[2]:
                    sectors.append(t[2].title())
    return list(set(sectors))

def tags_to_subsectors(x):
    tags = x[0]
    af_id = x[1]
    subsectors = []
    for tag_key, tag_value in tags.items():
        if not tag_key.startswith("Mat2D"):
            continue
        if len(tag_value):
            subsectors = []
            for t in tag_value:
                if len(t) >3 and t[3]:
                    if isinstance(t[3], list):
                        subsectors.extend([x.title() for x in t[3]])
                    else:
                        subsectors.append(t[3].title())
    return list(set(subsectors))
##
def tags_to_pillars_1d(x):
    tags = x[0]
    af_id = x[1]
    pillars = []
    for tag_key, tag_value in tags.items():
        if not tag_key.startswith("Mat1D"):
            continue
        if len(tag_value):
            for t in tag_value:
                if t[0]:
                    pillars.append(str(af_id)+"->"+t[0].title() +"->"+t[1].title())
    return list(set(pillars))

In [51]:
entries_labeled["pillars"] = entries_labeled[["tags", 'analysis_framework_id'
                                              ]].apply(tags_to_pillars, axis=1)
entries_labeled["sectors"] = entries_labeled[["tags", 'analysis_framework_id'
                                              ]].apply(tags_to_sectors, axis=1)
entries_labeled["subsectors"] = entries_labeled[[
    "tags", 'analysis_framework_id'
]].apply(tags_to_subsectors, axis=1)
##
entries_labeled["pillars_1d"] = entries_labeled[["tags", 'analysis_framework_id'
                                              ]].apply(tags_to_pillars_1d, axis=1)

In [52]:
entries_labeled[~entries_labeled["excerpt"].isna()]["excerpt"].unique().shape

(152295,)

In [53]:
pills_unique_1d = set()
pills_occurances_1d = list()
for pills_1d in entries_labeled["pillars_1d"]:
    pills_unique_1d.update(pills_1d)
    pills_occurances_1d.extend(pills_1d)
len(pills_unique_1d), Counter(pills_occurances_1d).most_common()

(240,
 [('1306->Covid-19 Overview->Cases', 7165),
  ('1306->Displacement->Type/Numbers/Movements', 5490),
  ('829->Displacement ->Displacement', 4278),
  ('699->Humanitarian Profile->Affected Groups', 4188),
  ('1306->Context->Security & Stability', 4016),
  ('1306->Covid-19 Overview->Deaths', 3906),
  ('1306->Covid-19 Containment Measures->Public Health Measures', 2994),
  ('1306->Context->Economy', 2870),
  ('1306->Covid-19 Overview->Vaccination', 2856),
  ('1306->Casualties->Dead', 2639),
  ('699->Context->Demography', 2542),
  ('829->Context->Demographics', 2081),
  ('1306->Covid-19 Overview->Testing', 2030),
  ('829->Context->Legal Or Normative Framework', 1920),
  ('1306->Displacement->Push Factors', 1889),
  ('537->Humanitarian Profile->Affected Groups', 1768),
  ('1306->Covid-19 Containment Measures->Movement Restrictions', 1683),
  ('552->Humanitarian Profile->Affected Groups', 1607),
  ('1004->Displacement Profile->Type/Numbers', 1575),
  ('1306->Flag->Response Gap', 1430),
 

In [54]:
pills_unique = set()
pills_occurances = list()
for pills in entries_labeled["pillars"]:
    pills_unique.update(pills)
    pills_occurances.extend(pills)
Counter(pills_occurances).most_common()

[('1306->Humanitarian Conditions->Living Standards', 18932),
 ('1306->Humanitarian Conditions->Physical & Mental Wellbeing', 11641),
 ('1306->Impact->Impact On System & Services', 10276),
 ('1306->Impact->Drivers/Aggravating Factors', 10176),
 ('1306->Impact->Impact On People', 10094),
 ('829->Humanitarian Conditions->Living Standards', 10052),
 ('1465->Humanitarian Conditions->Humanitarian Condition', 8690),
 ('1465->Operational Environment->International Humanitarian Response And Capacities',
  7816),
 ('1306->At Risk->People At Risk / Vulnerable', 5737),
 ('1306->Capacities & Response->International', 4718),
 ('829->Response And Capacities->National And Local', 4358),
 ('699->Capacities & Response->International Actors', 4190),
 ('1465->Humanitarian Conditions->Risk', 3646),
 ('1465->Humanitarian Situation->Humanitarian Profile', 3642),
 ('829->Humanitarian Conditions->Physical And Mental Well-Being', 3596),
 ('1465->Stated Priorities And Recommendations->Recommendations From Aid/Re

In [55]:
secs_unique = set()
secs_occurances = list()
for secs in entries_labeled["sectors"]:
    secs_unique.update(secs)
    secs_occurances.extend(secs)
#secs_unique
Counter(secs_occurances).most_common()

[('Health', 44995),
 ('Protection', 34224),
 ('Livelihoods', 21106),
 ('Cross-Sector', 17853),
 ('Food Security', 16162),
 ('Cross', 15321),
 ('Wash', 14677),
 ('Education', 13520),
 ('Shelter', 10518),
 ('Nutrition', 6684),
 ('Food', 3854),
 ('Agriculture', 3576),
 ('Logistics', 3142),
 ('Shelter And Nfis', 2130),
 ('Cross Cutting', 770),
 ('Cccm', 750),
 ('Logistic', 277),
 ('Nfi', 137),
 ('Livelihood', 134)]

In [56]:
entries_labeled.columns

Index(['entry_id', 'created_at', 'modified_at', 'excerpt', 'entry_type',
       'analysis_framework_id', 'created_by_id', 'lead_id', 'modified_by_id',
       'information_date', 'order', 'project_id', 'dropped_excerpt',
       'verified', 'tags', 'tags_str', 'pillars', 'sectors', 'subsectors',
       'pillars_1d'],
      dtype='object')

In [57]:
final_df = entries_labeled[entries_labeled["entry_type"].eq("excerpt")
                           & (~entries_labeled["excerpt"].isna())]

In [58]:
def remove_newlines(excerpt):
    if not isinstance(excerpt, str):
        return excerpt
    return re.sub("\s+", " ", excerpt)

final_df.loc[:, 'excerpt'] = final_df['excerpt'].apply(remove_newlines)
final_df.loc[:, 'dropped_excerpt'] = final_df['dropped_excerpt'].apply(remove_newlines)
##
final_df = final_df.sort_values("verified").reset_index(drop=True).drop_duplicates(subset='excerpt', keep="last")
final_df.shape, final_df.duplicated(subset="excerpt").sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


((152169, 20), 0)

In [59]:
sector_name_mapper = {
    "Agriculture": "Agriculture",
    "Cccm": "",
    "Cross": "Cross",
    "Cross Cutting": "Cross",
    "Cross-Sector": "Cross",
    "Education": "Education",
    "Food": "Food Security",
    "Food Security": "Food Security",
    "Nutrition": "Nutrition",
    "Health": "Health",
    "Livelihood": "Livelihoods",
    "Livelihoods": "Livelihoods",
    "Logistic": "Logistics",
    "Logistics": "Logistics",
    "Protection": "Protection",
    "Shelter": "Shelter",
    "Shelter And Nfis": "Shelter",
    "Nfi": "",
    "Wash": "WASH",
}

In [60]:
af_title_to_id = dict()
for afid, title in zip(afs["id"], afs["title"]):
    af_title_to_id["title"] = afid

In [61]:
af_id_title = afs[["id", "title"]]
af_id_title.columns = ["analysis_framework_id", "Framework Name"]
matching_2d_mat = pd.merge(matching_2d_mat,
                    af_id_title,
                    how="left",
                    left_on="Framework Title",
                    right_on="Framework Name")
matching_2d_mat.columns

Index(['Unnamed: 0', 'Framework Title', 'Type', 'Pillar', 'Sub-pillar',
       'Virtual Type', 'Virtual Pillar', 'Virtual Sub-pillar', 'First-pass',
       'Verified', 'Reversible', 'Cover', 'analysis_framework_id',
       'Framework Name'],
      dtype='object')

In [62]:
matching_2d_mat["Pillar"] = matching_2d_mat["Pillar"].apply(
    lambda x: x.strip().title())
matching_2d_mat["Sub-pillar"] = matching_2d_mat["Sub-pillar"].apply(
    lambda x: x.strip().title())
matching_2d_mat["Final Pillar Name"] = matching_2d_mat[
    "Virtual Pillar"].apply(lambda x: x.strip().title())
matching_2d_mat["Final Sub-pillar Name"] = matching_2d_mat[
    "Virtual Sub-pillar"].apply(lambda x: x.strip().title())

In [63]:
matching_2d_mat["Final Sub-pillar Name"].unique()

array(['Driver/Aggravating Factors',
       'Impact On Systems, Services And Networks',
       'Number Of People Affected', 'Living Standards',
       'Coping Mechanisms', 'Physical And Mental Well Being',
       'Risk And Vulnerabilities', 'Number Of People In Need',
       'Expressed By Population', 'Expressed By Humanitarian Staff',
       'National Response', 'International Response',
       'Number Of People Reached/Response Gaps', 'Impact On People',
       'Number Of People At Risk', 'Local Response',
       'Number Of People Facing Humanitarian Access Constraints/Humanitarian Access Gaps',
       'Economy', 'Socio Cultural', 'Security & Stability', 'Politics',
       'Legal & Policy', 'Demography', 'Environment',
       'Underlying/Aggravating Factors', 'Restriction Measures'],
      dtype=object)

In [64]:
matching_2d_mat["Final Pillar Name"].unique()

array(['Impact', 'Humanitarian Conditions', 'At Risk', 'Priority Needs',
       'Priority Interventions', 'Capacities & Response',
       'Humanitarian Access', 'Context', 'Shock/Event', 'Covid-19'],
      dtype=object)

In [65]:
matching_2d_mat.loc[
    matching_2d_mat["Final Pillar Name"].eq('Humanitatian Conditions'),
    "Final Pillar Name"] = 'Humanitarian Conditions'

In [66]:
matching_2d_mat = matching_2d_mat[~matching_2d_mat["analysis_framework_id"].isna()]

matching_2d_mat = matching_2d_mat.astype({
    'Framework Name': str,
    'Pillar': str,
    'Sub-pillar': str,
    'Final Pillar Name': str,
    'Final Sub-pillar Name': str,
    'analysis_framework_id': int
})

In [67]:

matching_2d_mat["analysis_framework_id"] = matching_2d_mat["analysis_framework_id"].apply(str)

In [68]:
matching_2d_mat["original_pillar"] = matching_2d_mat[
    "analysis_framework_id"] + "->" + matching_2d_mat[
        "Pillar"] + "->" + matching_2d_mat["Sub-pillar"]
matching_2d_mat["original_subpillar"] = matching_2d_mat[
    "analysis_framework_id"] + "->" + matching_2d_mat[
        "Pillar"] + "->" + matching_2d_mat["Sub-pillar"]
##
matching_2d_mat["target_pillar"] = matching_2d_mat["Final Pillar Name"]
matching_2d_mat["target_subpillar"] = matching_2d_mat[
    "Final Pillar Name"] + "->" + matching_2d_mat["Final Sub-pillar Name"]

In [69]:
pillar_name_mapper = dict()
subpillar_name_mapper = dict()
for pillar, fpillar, subpillar, fsubpillar, af_id in zip(
        matching_2d_mat["original_pillar"], matching_2d_mat["target_pillar"],
        matching_2d_mat["original_subpillar"],
        matching_2d_mat["target_subpillar"],
        matching_2d_mat["analysis_framework_id"]):
    pillar_name_mapper[pillar] = fpillar
    subpillar_name_mapper[subpillar] = fsubpillar

In [70]:
pillar_name_mapper

{'1004->Scope & Scale->Drivers/Aggravating Factors': 'Impact',
 '1004->Scope & Scale->System Disruption': 'Impact',
 '1004->Scope & Scale->People Affected': 'Impact',
 '1004->Humanitarian Conditions->Living Standards': 'Humanitarian Conditions',
 '1004->Humanitarian Conditions->Coping Mechanisms': 'Humanitarian Conditions',
 '1004->Humanitarian Conditions->Physical & Mental Wellbeing': 'Humanitarian Conditions',
 '1004->Humanitarian Conditions->People At Risk / Vulnerable': 'At Risk',
 '1004->Humanitarian Conditions->People In Need': 'Humanitarian Conditions',
 '1004->Priorities->Priority Problems (Pop)': 'Priority Needs',
 '1004->Priorities->Priority Problems (Staff)': 'Priority Needs',
 '1004->Priorities->Priority Interventions (Pop)': 'Priority Interventions',
 '1004->Priorities->Priority Interventions (Staff)': 'Priority Interventions',
 '1004->Capacities & Response->System Functionality': 'Impact',
 '1004->Capacities & Response->National': 'Capacities & Response',
 '1004->Capaciti

In [71]:
matching_1d_mat.columns

Index(['Unnamed: 0', 'Framework Title', 'Type', 'Pillar', 'Sub-pillar',
       'Virtual Type', 'Virtual Pillar', 'Virtual Sub-pillar', 'First-pass',
       'Verified', 'Reversible', 'Cover'],
      dtype='object')

In [72]:
matching_1d_mat = pd.merge(matching_1d_mat,
                           afs[["id", "title"]],
                           how="left",
                           left_on="Framework Title",
                           right_on="title")


In [73]:
matching_1d_mat = matching_1d_mat[~matching_1d_mat["id"].isna()]

matching_1d_mat["id"] = matching_1d_mat["id"].apply(int)

In [74]:
matching_1d_mat.columns

Index(['Unnamed: 0', 'Framework Title', 'Type', 'Pillar', 'Sub-pillar',
       'Virtual Type', 'Virtual Pillar', 'Virtual Sub-pillar', 'First-pass',
       'Verified', 'Reversible', 'Cover', 'id', 'title'],
      dtype='object')

In [75]:
pillar_name_mapper_1d = dict()
subpillar_name_mapper_1d = dict()
for pillar, fpillar, subpillar, fsubpillar, af_id in zip(
        matching_1d_mat["Pillar"], matching_1d_mat["Virtual Pillar"],
        matching_1d_mat["Sub-pillar"],
        matching_1d_mat["Virtual Sub-pillar"], matching_1d_mat["id"]):
    pillar = str(af_id) + "->" + pillar + "->" + subpillar
    pillar_name_mapper_1d[pillar] = fpillar
    subpillar_name_mapper_1d[pillar] = fpillar + "->" + fsubpillar

In [76]:
def sector_mapper(sec):
    if sec == sec:
        sectors =  [sector_name_mapper.get(s, "") for s in sec]
        return [sec for sec in sectors if sec]
    return []
##
def pillar_mapper(dim):
    if dim == dim:
        dim = [pillar_name_mapper.get(d, "") for d in dim]
        return [d for d in dim if d]
    return []
##
def subpillar_mapper(subdim):
    if subdim == subdim:
        subdim = [subpillar_name_mapper.get(d, "") for d in subdim]
        return [d for d in subdim if d]
    return []
##
def pillar_mapper_1d(pill):
    if pill == pill:
        pill = [pillar_name_mapper_1d.get(p, "") for p in pill]
        return [p for p in pill if p]
    return []
##
def subpillar_mapper_1d(subpill):
    if subpill == subpill:
        subpill = [subpillar_name_mapper_1d.get(p, "") for p in subpill]
        return [p for p in subpill if p]
    return []

In [77]:
final_df["sectors"] = final_df["sectors"].apply(sector_mapper)
final_df["subpillars_2d"] = final_df["pillars"].apply(subpillar_mapper)
final_df["pillars_2d"] = final_df["pillars"].apply(pillar_mapper)
##
final_df["subpillars_1d"] = final_df["pillars_1d"].apply(subpillar_mapper_1d)
final_df["pillars_1d"] = final_df["pillars_1d"].apply(pillar_mapper_1d)

In [78]:
final_df.columns

Index(['entry_id', 'created_at', 'modified_at', 'excerpt', 'entry_type',
       'analysis_framework_id', 'created_by_id', 'lead_id', 'modified_by_id',
       'information_date', 'order', 'project_id', 'dropped_excerpt',
       'verified', 'tags', 'tags_str', 'pillars', 'sectors', 'subsectors',
       'pillars_1d', 'subpillars_2d', 'pillars_2d', 'subpillars_1d'],
      dtype='object')

In [79]:
secs_unique = set()
secs_occurances = list()
for secs in final_df["sectors"]:
    secs_unique.update(secs)
    secs_occurances.extend(secs)
#secs_unique
Counter(secs_occurances).most_common()

[('Health', 34455),
 ('Protection', 27482),
 ('Cross', 27244),
 ('Livelihoods', 16157),
 ('Food Security', 14554),
 ('WASH', 10924),
 ('Education', 10347),
 ('Shelter', 9525),
 ('Nutrition', 4957),
 ('Agriculture', 2906),
 ('Logistics', 2617)]

In [80]:
pills_unique = set()
pills_occurances = list()
for pills in final_df["pillars_2d"]:
    pills_unique.update(pills)
    pills_occurances.extend(pills)
#pills_unique
Counter(pills_occurances).most_common()

[('Humanitarian Conditions', 49552),
 ('Impact', 37839),
 ('Capacities & Response', 25039),
 ('At Risk', 9272),
 ('Priority Interventions', 5830),
 ('Context', 3410),
 ('Priority Needs', 3219)]

In [81]:
subpills_unique = set()
subpills_occurances = list()
for subpills in final_df["subpillars_2d"]:
    subpills_unique.update(subpills)
    subpills_occurances.extend(subpills)
Counter(subpills_occurances).most_common()

[('Humanitarian Conditions->Living Standards', 26040),
 ('Humanitarian Conditions->Physical And Mental Well Being', 17263),
 ('Capacities & Response->International Response', 14995),
 ('Impact->Impact On Systems, Services And Networks', 12360),
 ('Impact->Driver/Aggravating Factors', 11817),
 ('Impact->Impact On People', 11411),
 ('At Risk->Risk And Vulnerabilities', 9271),
 ('Capacities & Response->National Response', 7440),
 ('Priority Interventions->Expressed By Humanitarian Staff', 5555),
 ('Humanitarian Conditions->Coping Mechanisms', 5105),
 ('Capacities & Response->Number Of People Reached/Response Gaps', 2346),
 ('Impact->Number Of People Affected', 2251),
 ('Priority Needs->Expressed By Population', 1726),
 ('Priority Needs->Expressed By Humanitarian Staff', 1493),
 ('Humanitarian Conditions->Number Of People In Need', 1144),
 ('Context->Security & Stability', 1024),
 ('Context->Economy', 858),
 ('Context->Socio Cultural', 577),
 ('Context->Environment', 327),
 ('Priority Inte

In [82]:
pills_unique_1d = set()
pills_occurances_1d = list()
for pills_1d in final_df["pillars_1d"]:
    pills_unique_1d.update(pills_1d)
    pills_occurances_1d.extend(pills_1d)
#pills_unique_1d
Counter(pills_occurances_1d).most_common()

[('Context', 17674),
 ('Displacement', 5717),
 ('Casualties', 3885),
 ('Humanitarian Access', 2184),
 ('Information and Communication', 824),
 ('Shock/Event', 760)]

In [83]:
subpills_unique_1d = set()
subpills_occurances_1d = list()
for subpills_1d in final_df["subpillars_1d"]:
    subpills_unique_1d.update(subpills_1d)
    subpills_occurances_1d.extend(subpills_1d)
#subpills_unique_1d
Counter(subpills_occurances_1d).most_common()

[('Context->Security & Stability', 5710),
 ('Context->Demography', 3782),
 ('Context->Economy', 3776),
 ('Casualties->Dead', 2995),
 ('Displacement->Type/Numbers/Movements', 2741),
 ('Displacement->Push factors', 1907),
 ('Context->Politics', 1809),
 ('Humanitarian Access->Physical constraints', 1737),
 ('Context->Socio Cultural', 1053),
 ('Context->Environment', 861),
 ('Context->Legal & Policy', 683),
 ('Casualties->Injured', 635),
 ('Displacement->Local integration', 534),
 ('Shock/Event->Type and characteristics', 451),
 ('Humanitarian Access->Number of people facing humanitarian access constraints/Humanitarian access gaps',
  447),
 ('Information and Communication->Knowledge and info gaps (pop)', 357),
 ('Displacement->Pull factors', 327),
 ('Information and Communication->Communication means and preferences', 314),
 ('Shock/Event->Hazard & Threats', 309),
 ('Casualties->Missing', 255),
 ('Displacement->Intentions', 208),
 ('Information and Communication->Information challenges an

In [84]:
final_df["subpillars_1d"].apply(lambda x:x!=[]).sum(),\
final_df["subpillars_2d"].apply(lambda x:x!=[]).sum(),\
final_df["sectors"].apply(lambda x:x!=[]).sum()

(25507, 96662, 120702)

### Splitting

In [None]:
# add columns for sectors
for sector in secs_unique:
    final_df[sector] = 0
# add columns for dimensions
for pillar in pills_unique:
    final_df[pillar] = 0
# add columns for subdimensions
for subpillar in subpills_unique:
    final_df[subpillar] = 0
## add columns for 1d pillars
for pillar_1d in pills_unique_1d:
    final_df[pillar_1d] = 0
## add columns for 1d subpillars
for subpillar_1d in subpills_unique_1d:
    final_df[subpillar_1d] = 0

In [None]:
for row in tqdm(final_df.iterrows(), total=final_df.shape[0]):
    sectors_i = row[1]["sectors"]
    pillars_i = row[1]["pillars"]
    subpillars_i = row[1]["subpillars"]
    pillars_1d_i = row[1]["pillars_1d"]
    subpillars_1d_i = row[1]["subpillars_1d"]
    idx = row[0]
    for sector in sectors_i:
        final_df.loc[idx, sector] = 1
    for pillar in pillars_i:
        final_df.loc[idx, pillar] = 1
    for subpillar in subpillars_i:
        final_df.loc[idx, subpillar] = 1
    for pillar_1d in pillars_1d_i:
        final_df.loc[idx, pillar_1d] = 1
    for subpillar_1d in subpillars_1d_i:
        final_df.loc[idx, subpillar_1d] = 1

In [None]:
classes = list(secs_unique) + list(pills_unique) + list(
    subpills_unique) + list(pills_unique_1d) + list(subpills_unique_1d)
len(classes)

In [None]:
final_df[classes].sum().astype(int)

### Stratified Split

In [None]:
from skmultilearn.model_selection import iterative_train_test_split
from skmultilearn.model_selection.measures import get_combination_wise_output_matrix

In [None]:
def train_val_test_df_split(df):
    class_to_id = {clss: i for i, clss in enumerate(classes)}
    num_classes = len(classes)
    labels = np.zeros([len(df), num_classes])
    for i, (sectors_i, pillars_i, subpillars_i, pillars_1d_i,
            subpillars_1d_i) in enumerate(
                zip(df["sectors"], df["pillars_2d"], df["subpillars_2d"],
                    df["pillars_1d"], df["subpillars_1d"])):
        for sec in sectors_i:
            labels[i, class_to_id[sec]] = 1
        for pil in pillars_i:
            labels[i, class_to_id[pil]] = 1
        for subpil in subpillars_i:
            labels[i, class_to_id[subpil]] = 1
        for pil_1d in pillars_1d_i:
            labels[i, class_to_id[pil_1d]] = 1
        for subpil_1d in subpillars_1d_i:
            labels[i, class_to_id[subpil_1d]] = 1
    ##
    X_train, y_train, X_test, y_test = iterative_train_test_split(
        df["excerpt"].to_numpy().reshape(-1, 1), labels, test_size=0.1)
    X_train, y_train, X_val, y_val = iterative_train_test_split(X_train,
                                                                y_train,
                                                                test_size=0.1)
    return {
        "X_train": X_train,
        "y_train": y_train,
        "X_test": X_test,
        "y_test": y_test,
        "X_val": X_val,
        "y_val": y_val
    }

In [None]:
df = final_df[[
    'entry_id',
    'lead_id',
    'project_id',
    'analysis_framework_id',
    'excerpt',
    'created_by_id',
    'modified_by_id',
    'verified',
    'sectors',
    'pillars',
    'subpillars',
    'pillars_1d',
    'subpillars_1d',
]]
df.columns = [
    'entry_id',
    'lead_id',
    'project_id',
    'analysis_framework_id',
    'excerpt',
    'created_by_id',
    'modified_by_id',
    'verified',
    'sectors',
    'pillars_2d',
    'subpillars_2d',
    'pillars_1d',
    'subpillars_1d',
]

In [None]:
X_train, y_train, X_test, y_test, X_val, y_val = list(
    train_val_test_df_split(df).values())

In [None]:
final_df_train = final_df[final_df["excerpt"].isin(X_train.reshape(-1))]
final_df_val = final_df[final_df["excerpt"].isin(X_val.reshape(-1))]
final_df_test = final_df[final_df["excerpt"].isin(X_test.reshape(-1))]

In [None]:
df_train = df[df["excerpt"].isin(X_train.reshape(-1))]
df_val = df[df["excerpt"].isin(X_val.reshape(-1))]
df_test = df[df["excerpt"].isin(X_test.reshape(-1))]

In [None]:
df_train.index & df_val.index, df_train.index & df_test.index, df_test.index & df_val.index 

In [None]:
df_train.to_csv("data_v0.5.1_train.csv", index=None)
df_val.to_csv("data_v0.5.1_val.csv", index=None)
df_test.to_csv("data_v0.5.1_test.csv", index=None)

In [None]:
df.to_csv("data_v0.5.1_full.csv", index=None)