In [1]:
import re
import os
import json
import random
from pprint import pprint
from ast import literal_eval
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [2]:
random.seed(2021)
np.random.seed(2021)

In [3]:
# each line in this table correspond to single tagging of an entry.
# an entry may have more than one tagging (e.g. a tag in 2D matrix, another in 1D matrix,
# secondary tags, etc. )
exportdata = pd.read_csv("exportdata.csv")
# I pulled entries related to projects chosen by Patrice
entries = pd.read_csv("entries.csv")
# The widgets of AF's, i.e. 2D matrices, 1D matrices, secondary tagging widgets, etc.
af_widgets = pd.read_csv("af_widgets_of_interest.csv")
# Projects chosen by Patrice
projects = pd.read_csv("projects.csv")
#
exportables = pd.read_csv("af_exportables.csv")
# AF details
afs = pd.read_csv("analysis_frameworks.csv")
# user ids, first names and last names
users = pd.read_csv("user_names.csv")
# The matchin of pillar/sub-pillar names from different AFs into unified pillar/sub-pillar names
matching_2d_mat = pd.read_csv("mat_2d_matching.csv")
matching_1d_mat = pd.read_csv("mat_1d_matching.csv")

In [4]:
dirName = 'generated_entries'
if not os.path.exists(dirName):
    os.mkdir(dirName)

In [5]:
# remove "GIMAC Generic"'s data
entries = entries[~entries["analysis_framework_id"].eq(1465)]

In [6]:
raw_name_to_tag_dict = {
    "Severity":"Severity",
    "Severity (Needs assessment entries only)":"Severity",
    "SEVERITY":"Severity",
    
    "Reliability":"Reliability",
    "RELIABILITY":"Reliability",
    
    "Demographic Groups":"Demographic Groups",
    "DEMOGRAPHIC GROUPS":"Demographic Groups",
    
    "Information date":"Information Date",
    "Information Date":"Information Date",
    "Date":"Information Date",
    "DATE OF INFORMATION":"Information Date",
    "Date range":"Information Date",
    
    "Geo Location":"Geo Location",
    "LOCATION":"Geo Location",
    "Geo location":"Geo Location",
    "Country":"Geo Location",
    "GEOLOCATIONS":"Geo Location",
    
    "Affected groups":"Affected Groups",
    "AFFECTED GROUPS":"Affected Groups",
    
    "Specific Needs Groups":"Specific Needs Groups",
    "SPECIFIC NEEDS GROUPS":"Specific Needs Groups",
}


In [7]:
entries.shape

(170071, 21)

In [8]:
def clean_titles(x):
    if x in list(raw_name_to_tag_dict.keys()):
        return raw_name_to_tag_dict[x]
    return x


af_widgets['title'] = af_widgets['title'].apply(clean_titles)

In [9]:
# widget names of 2D matrices in the chosen AFs's
mat2d_titles = [
    s.upper() for s in [
        "Sectors",
        "Sectoral Information",
        "Matrix 2D",
    ]
]
# Pandas will read json objects as strings,
# here I am converting them into Python dict objects
af_widgets["properties"] = af_widgets["properties"].apply(json.loads)
##
mat2d_properties_ids = af_widgets[(
    af_widgets["title"].str.upper()).isin(mat2d_titles)][[
        "properties", "analysis_framework_id"
    ]]
mat2d_properties = mat2d_properties_ids["properties"].tolist()
mat2d_ids = mat2d_properties_ids["analysis_framework_id"].tolist()
## 1D Matrices
# widget names of 1D matrices in the chosen AFs's
mat1d_titles = [
    s.upper() for s in [
        "Operational Environment",  # iMMAP, 2020 Okular, Okular Generic, Colombia-AF, Nigeria Situation Analysis (OA), IFRC Master Framework 2019
        "Cross Sector",  #Rohingya Framework, IFRC Analytical Framework 2018, 
        "Matrix 1D",  # Situation Analysis Generic Yemen, 
        "Cross Sectors",  # Situation Analysis Generic Libya
    ]
]
mat1d_titles = mat1d_titles + [
    sub_s.title() for sub_s in ('Operational Environment - Dimension',
                                'Operational Environment - Subdimension')
]
mat1d_properties_ids = af_widgets[(
    af_widgets["title"].str.upper()).isin(mat1d_titles)][[
        "properties", "analysis_framework_id"
    ]]
mat1d_properties = mat1d_properties_ids["properties"].tolist()
mat1d_ids = mat1d_properties_ids["analysis_framework_id"].tolist()

In [10]:
afids_pillars_subpillars = dict()
errors = []
for mat, af_id in zip(mat2d_properties, mat2d_ids):
    try:
        dims = mat["data"]['dimensions']
        afids_pillars_subpillars[af_id] = {}
        for dim in dims:
            pillar = dim["title"]
            sub_pillars = []
            for sub_pillar_dict in dim["subdimensions"]:
                sub_pillars.append(sub_pillar_dict["title"])
            afids_pillars_subpillars[af_id][pillar] = sub_pillars
    except KeyError:
        errors.append([mat, af_id])
##
afids_rows_cells = dict()
for mat, af_id in zip(mat1d_properties, mat1d_ids):
    try:
        rows = mat["data"]['rows']
        afids_rows_cells[af_id] = {}
        for row in rows:
            pillar = row["title"]
            sub_pillars = []
            for sub_pillar_dict in row["cells"]:
                sub_pillars.append(sub_pillar_dict["value"])
            afids_rows_cells[af_id][pillar] = sub_pillars
    except KeyError:
        errors.append([mat, af_id])

In [11]:
def extract_title(x):
    if x["excel"].get("title"):
        return x["excel"]["title"]
    elif x["excel"].get("type") == "multiple":
        return x["excel"]["titles"]
##
exportables["data"] = exportables["data"].apply(literal_eval)
af_titles = exportables["data"].apply(extract_title).tolist()

In [12]:
print(entries.shape, exportdata.shape)
entries = entries[entries["entry_type"].eq("excerpt") & (~entries["excerpt"].isna())]
print(entries.shape, exportdata.shape)

(170071, 21) (1203311, 4)
(125863, 21) (1203311, 4)


In [13]:
exid_to_exdata = dict()
for ex_id, ex_data in zip(exportables["id"], exportables["data"]):
    exid_to_exdata[ex_id] = ex_data

In [14]:
widget_key_id_to_title = dict()
for w_key, w_id, title in zip(af_widgets["key"], af_widgets["widget_id"], af_widgets["title"]):
    widget_key_id_to_title[(w_key, w_id)] = title

In [15]:
def exportdata_to_tag_title(row):
    data = json.loads(row[1])
    wkey, wid = None, None
    if data.get("common"):
        wkey = data.get("common").get("widget_key")
        wid = data["common"].get("widget_id")
    if wkey and wid:
        if widget_key_id_to_title.get((wkey, wid)):
            return widget_key_id_to_title.get((wkey, wid))
    if data.get("report") and data["report"].get("other"):
            if len(data["report"]["other"]) == 1 and data["report"]["other"][0].get("title"):
                if data["report"]["other"][0]["title"]:
                    return data["report"]["other"][0]["title"]
    if isinstance(data.get("excel"), list) and len(data["excel"])==1 and \
     data["excel"][0].get("widget_key") and data["excel"][0].get("widget_id"):
        wkey = data["excel"][0]["widget_key"]
        wid = data["excel"][0]["widget_id"]
        if widget_key_id_to_title.get((wkey, wid)):
            return widget_key_id_to_title.get((wkey, wid))
    exportable_id = row[3]
    if exid_to_exdata[exportable_id]["excel"].get("title"):
        return exid_to_exdata[exportable_id]["excel"]["title"]
    elif exid_to_exdata[exportable_id]["excel"].get("type") == "multiple":
        return exid_to_exdata[exportable_id]["excel"]["titles"]
    raise

In [16]:
exportdata["tag_title"] = exportdata.apply(exportdata_to_tag_title, axis=1)

In [17]:
def exportdata_to_tag_value(row):
    data = json.loads(row[1])
    if isinstance(data["excel"], list) and len(data["excel"])==1:
        return data["excel"][0]["value"]
    if data["excel"].get("type") == "lists":
        return data["excel"]["values"]
    if isinstance(data["excel"], dict) and "value" in data["excel"]:
        return data["excel"].get("value")
    if isinstance(data["excel"], dict) and "values" in data["excel"]:
        return data["excel"].get("values")
    elif "values" in data["common"]:
        return data["common"]["values"]
    elif "value" in data["common"]:
        return data["common"]["value"]
    raise

In [18]:
exportdata["tag_value"] = exportdata.apply(exportdata_to_tag_value, axis=1)

In [19]:
def title_case(tag):
    if isinstance(tag, (list, tuple)):
        return tuple([x.title() for x in tag])
    return tag.title()

In [20]:
exportdata["tag_title"] = exportdata["tag_title"].apply(title_case)

In [21]:
mat_1d_and_flag_titles = [
    'Flag', 'Operational Environment', 'Cross Sector','Temática',
    ('Operational Environment - Dimension',
     'Operational Environment - Subdimension'),
    ('Flag - Dimension', 'Flag - Subdimension'),
    ('High Level Tags - Dimension', 'High Level Tags - Subdimension')
]

In [22]:
#exportdata.tag_title.unique()

In [23]:
#exportdata[exportdata["tag_title"].isin([('Operational Environment - Dimension', 'Operational Environment - Subdimension')])]["tag_value"].tolist()
#exportdata[exportdata["tag_title"].eq('High Level Tags')]["tag_value"].tolist()

In [24]:
mat2d_titles = [s.title() for s in mat2d_titles]
mat1d_titles = [
    s.title() if isinstance(s, str) else (sub_s.title() for sub_s in s)
    for s in mat1d_titles
]
mat_1d_and_flag_titles = [
    s.title() if isinstance(s, str) else (sub_s.title() for sub_s in s)
    for s in mat_1d_and_flag_titles
]

In [25]:
exportdata_of_interest = exportdata[exportdata["tag_title"].isin(
    mat2d_titles + mat_1d_and_flag_titles)]

In [26]:
exportdata_of_interest.shape, entries.shape

((224492, 6), (125863, 21))

In [27]:
entries_labeled = pd.merge(entries,
                           exportdata_of_interest,
                           how="inner",
                           left_on="id",
                           right_on="entry_id",
                           suffixes=('_entry', '_exportdata'))

In [28]:
entries_labeled["id_entry"].shape, entries_labeled["id_entry"].unique().shape

((160835,), (124841,))

In [29]:
def agg_group(group):
    tag_titles = group["tag_title"]
    tag_values = group["tag_value"]
    #
    grouped = defaultdict(list)
    for title, value in zip(tag_titles, tag_values):
        if title in mat_1d_and_flag_titles:
            grouped["Mat1D"].extend(value)
        elif title in mat2d_titles:
            grouped["Mat2D"].extend(value)
    return grouped

In [30]:
entries_labeled.columns

Index(['id_entry', 'created_at', 'modified_at', 'excerpt', 'image_raw',
       'analysis_framework_id', 'created_by_id', 'lead_id', 'modified_by_id',
       'entry_type', 'information_date', 'order', 'client_id', 'project_id',
       'tabular_field_id', 'dropped_excerpt', 'highlight_hidden', 'verified',
       'verification_last_changed_by_id', 'image_id', 'title', 'id_exportdata',
       'data', 'entry_id', 'exportable_id', 'tag_title', 'tag_value'],
      dtype='object')

In [31]:
tags = entries_labeled.groupby("id_entry").apply(agg_group)

In [32]:
entryids_tags = pd.DataFrame({"entry_id": tags.index, "tags": tags.values})

In [33]:
entryids_tags.shape

(124841, 2)

In [34]:
entries_ = entries.rename({"id": "entry_id"}, axis=1, inplace=False)
#
print(entries.shape)
#
entries_ = entries_[[
    'entry_id',
    'created_at',
    'modified_at',
    'excerpt',
    'entry_type',
    'analysis_framework_id',
    'created_by_id',
    'lead_id',
    'modified_by_id',
    'information_date',
    'order',
    'project_id',
    'title',
    'verified',
]].drop_duplicates()
#
print(entries_.shape)

(125863, 21)
(125863, 14)


In [35]:
entries_labeled = pd.merge(entries_, entryids_tags, on="entry_id", how="inner")
entries_labeled.shape, entries_labeled.columns

((124841, 15),
 Index(['entry_id', 'created_at', 'modified_at', 'excerpt', 'entry_type',
        'analysis_framework_id', 'created_by_id', 'lead_id', 'modified_by_id',
        'information_date', 'order', 'project_id', 'title', 'verified', 'tags'],
       dtype='object'))

In [36]:
entries_labeled["tags_str"] = entries_labeled["tags"].apply(str)
entries_labeled.duplicated(subset=["entry_id", "tags_str"]).sum()

0

In [37]:
entries_labeled["entry_id"].duplicated().sum()

0

In [38]:
entries_labeled.shape

(124841, 16)

In [39]:
entries_labeled["tags"].isna().sum()

0

In [40]:
entries.shape, entries[~entries["excerpt"].isna()]["excerpt"].unique().shape

((125863, 21), (123745,))

In [41]:
entries_labeled.shape, entries_labeled.columns

((124841, 16),
 Index(['entry_id', 'created_at', 'modified_at', 'excerpt', 'entry_type',
        'analysis_framework_id', 'created_by_id', 'lead_id', 'modified_by_id',
        'information_date', 'order', 'project_id', 'title', 'verified', 'tags',
        'tags_str'],
       dtype='object'))

In [42]:
def tags_to_pillars(x):
    tags = x[0]
    af_id = x[1]
    pillars = []
    for tag_key, tag_value in tags.items():
        if tag_key != "Mat2D":
            continue
        if len(tag_value):
            for t in tag_value:
                if t[0] and t[1]:
                    pillars.append(
                        str(af_id) + "->" + t[0].title() + "->" + t[1].title())
    return list(set(pillars))


def tags_to_sectors(x):
    tags = x[0]
    af_id = x[1]
    sectors = []
    for tag_key, tag_value in tags.items():
        if tag_key != "Mat2D":
            continue
        if len(tag_value):
            for t in tag_value:
                if len(t) > 2 and t[2]:
                    sectors.append(t[2].title())
    return list(set(sectors))


def tags_to_subsectors(x):
    tags = x[0]
    af_id = x[1]
    subsectors = []
    for tag_key, tag_value in tags.items():
        if tag_key != "Mat2D":
            continue
        if len(tag_value):
            subsectors = []
            for t in tag_value:
                if len(t) > 3 and t[3]:
                    if isinstance(t[3], list):
                        subsectors.extend([x.title() for x in t[3]])
                    else:
                        subsectors.append(t[3].title())
    return list(set(subsectors))


##
def tags_to_label2d(x):
    tags = x[0]
    af_id = x[1]
    label2d = []
    for tag_key, tag_value in tags.items():
        if tag_key != "Mat2D":
            continue
        if len(tag_value):
            for t in tag_value:
                if t[0] and t[1]:
                    label2d.append("AF-ID:" + str(af_id) + "->Sector:" +
                                   t[2].title() + "->Pillar:" + t[0].title() +
                                   "->Sub-pillar:" + t[1].title())
    return list(set(label2d))


##
def tags_to_pillars_1d(x):
    tags = x[0]
    af_id = x[1]
    pillars_1d = []
    for tag_key, tag_value in tags.items():
        if tag_key != "Mat1D":
            continue
        if len(tag_value):
            for t in tag_value:
                if t[0] and t[1]:
                    pillars_1d.append(
                        str(af_id) + "->" + t[0].title() + "->" + t[1].title())
    return list(set(pillars_1d))

In [43]:
entries_labeled["pillars"] = entries_labeled[["tags", 'analysis_framework_id'
                                              ]].apply(tags_to_pillars, axis=1)
entries_labeled["sectors"] = entries_labeled[["tags", 'analysis_framework_id'
                                              ]].apply(tags_to_sectors, axis=1)
entries_labeled["subsectors"] = entries_labeled[[
    "tags", 'analysis_framework_id'
]].apply(tags_to_subsectors, axis=1)
##
entries_labeled["label_2d"] = entries_labeled[[
    "tags", 'analysis_framework_id'
]].apply(tags_to_label2d, axis=1)
##
entries_labeled["pillars_1d"] = entries_labeled[["tags", 'analysis_framework_id'
                                              ]].apply(tags_to_pillars_1d, axis=1)

In [44]:
entries_labeled[~entries_labeled["excerpt"].isna()]["excerpt"].unique().shape

(122786,)

In [45]:
pills_unique_1d = set()
pills_occurances_1d = list()
for pills_1d in entries_labeled["pillars_1d"]:
    pills_unique_1d.update(pills_1d)
    pills_occurances_1d.extend(pills_1d)
len(pills_unique_1d), Counter(pills_occurances_1d).most_common()

(289,
 [('1306->Covid-19 Overview->Cases', 5423),
  ('1306->Displacement->Type/Numbers/Movements', 3857),
  ('1306->Context->Security & Stability', 3776),
  ('1306->Covid-19 Overview->Deaths', 3046),
  ('1306->Covid-19 Overview->Vaccination', 2743),
  ('829->Displacement ->Displacement', 2502),
  ('699->Humanitarian Profile->Affected Groups', 2439),
  ('1306->Covid-19 Containment Measures->Public Health Measures', 2237),
  ('1306->Context->Economy', 2235),
  ('1306->Casualties->Dead', 2180),
  ('1306->Displacement->Push Factors', 1671),
  ('829->Context->Legal Or Normative Framework', 1659),
  ('1306->Covid-19 Overview->Testing', 1601),
  ('699->Context->Demography', 1299),
  ('1306->Covid-19 Containment Measures->Movement Restrictions', 1252),
  ('829->Displacement ->Local Integration', 1128),
  ('537->Humanitarian Profile->Affected Groups', 1114),
  ('699->Humanitarian Profile->Casualties', 991),
  ('552->Humanitarian Profile->Affected Groups', 987),
  ('1306->Covid-19 Containment Me

In [46]:
pills_unique = set()
pills_occurances = list()
for pills in entries_labeled["pillars"]:
    pills_unique.update(pills)
    pills_occurances.extend(pills)
Counter(pills_occurances).most_common()

[('1306->Humanitarian Conditions->Living Standards', 14428),
 ('1306->Impact->Drivers/Aggravating Factors', 9384),
 ('1306->Humanitarian Conditions->Physical & Mental Wellbeing', 8860),
 ('1306->Impact->Impact On System & Services', 8778),
 ('1306->Impact->Impact On People', 8121),
 ('829->Humanitarian Conditions->Living Standards', 7361),
 ('1306->At Risk->People At Risk / Vulnerable', 5112),
 ('1306->Capacities & Response->International', 4245),
 ('829->Response And Capacities->National And Local', 3634),
 ('699->Capacities & Response->International Actors', 2602),
 ('699->Humanitarian Conditions->Living Standards', 2401),
 ('829->Humanitarian Conditions->Physical And Mental Well-Being', 2343),
 ('1306->Humanitarian Conditions->Coping Mechanisms', 2278),
 ('1306->Priorities->Priority Interventions (Staff)', 2198),
 ('699->Capacities & Response->Government And Local Authorities', 2148),
 ('1306->Capacities & Response->Number Of People Reached', 2072),
 ('829->Response And Capacities->

In [47]:
secs_unique = set()
secs_occurances = list()
for secs in entries_labeled["sectors"]:
    secs_unique.update(secs)
    secs_occurances.extend(secs)
#secs_unique
Counter(secs_occurances).most_common()

[('Health', 28135),
 ('Protection', 22520),
 ('Cross', 13603),
 ('Livelihoods', 13225),
 ('Wash', 8404),
 ('Food Security', 8203),
 ('Shelter', 8164),
 ('Education', 7991),
 ('Nutrition', 3277),
 ('Food', 3227),
 ('Agriculture', 3164),
 ('Logistics', 2667),
 ('Cross-Sector', 2632),
 ('Cross Cutting', 678),
 ('Nfi', 611),
 ('Logistic', 248),
 ('Livelihood', 132)]

In [48]:
entries_labeled.columns

Index(['entry_id', 'created_at', 'modified_at', 'excerpt', 'entry_type',
       'analysis_framework_id', 'created_by_id', 'lead_id', 'modified_by_id',
       'information_date', 'order', 'project_id', 'title', 'verified', 'tags',
       'tags_str', 'pillars', 'sectors', 'subsectors', 'label_2d',
       'pillars_1d'],
      dtype='object')

In [49]:
final_df = entries_labeled[entries_labeled["entry_type"].eq("excerpt")
                           & (~entries_labeled["excerpt"].isna())]
final_df.shape

(124841, 21)

In [50]:
def remove_newlines(excerpt):
    if not isinstance(excerpt, str):
        return excerpt
    return re.sub("\s+", " ", excerpt)

final_df.loc[:, 'excerpt'] = final_df['excerpt'].apply(remove_newlines)
#final_df.loc[:, 'dropped_excerpt'] = final_df['dropped_excerpt'].apply(remove_newlines)
##
final_df = final_df.sort_values("verified").reset_index(drop=True).drop_duplicates(subset='excerpt', keep="last")
final_df.shape, final_df.duplicated(subset="excerpt").sum()

((122676, 21), 0)

In [51]:
sector_name_mapper = {
    "Agriculture": "Agriculture",
    "Cccm": "",
    "Cross": "Cross",
    "Cross Cutting": "Cross",
    "Cross-Sector": "Cross",
    "Education": "Education",
    "Food": "Food Security",
    "Food Security": "Food Security",
    "Nutrition": "Nutrition",
    "Health": "Health",
    "Livelihood": "Livelihoods",
    "Livelihoods": "Livelihoods",
    "Logistic": "Logistics",
    "Logistics": "Logistics",
    "Protection": "Protection",
    "Shelter": "Shelter",
    "Shelter And Nfis": "Shelter",
    "Nfi": "",
    "Wash": "WASH",
}

In [52]:
af_title_to_id = dict()
for afid, title in zip(afs["id"], afs["title"]):
    af_title_to_id["title"] = afid

In [53]:
af_id_title = afs[["id", "title"]]
af_id_title["title"] = af_id_title["title"].str.title()
##
matching_2d_mat["Framework Title"] = matching_2d_mat["Framework Title"].str.title()
##
af_id_title.columns = ["analysis_framework_id", "Framework Name"]
matching_2d_mat = pd.merge(matching_2d_mat,
                    af_id_title,
                    how="left",
                    left_on="Framework Title",
                    right_on="Framework Name")
matching_2d_mat.columns

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  af_id_title["title"] = af_id_title["title"].str.title()


Index(['Unnamed: 0', 'Framework Title', 'Type', 'Pillar', 'Sub-pillar',
       'Virtual Type', 'Virtual Pillar', 'Virtual Sub-pillar', 'First-pass',
       'Verified', 'Reversible', 'Cover', 'analysis_framework_id',
       'Framework Name'],
      dtype='object')

In [54]:
matching_2d_mat["Pillar"] = matching_2d_mat["Pillar"].apply(
    lambda x: x.strip().title())
matching_2d_mat["Sub-pillar"] = matching_2d_mat["Sub-pillar"].apply(
    lambda x: x.strip().title())
matching_2d_mat["Final Pillar Name"] = matching_2d_mat[
    "Virtual Pillar"].apply(lambda x: x.strip().title())
matching_2d_mat["Final Sub-pillar Name"] = matching_2d_mat[
    "Virtual Sub-pillar"].apply(lambda x: x.strip().title())

In [55]:
#matching_2d_mat = matching_2d_mat[~matching_2d_mat["analysis_framework_id"].isna()]

matching_2d_mat = matching_2d_mat.astype({
    'Framework Name': str,
    'Pillar': str,
    'Sub-pillar': str,
    'Final Pillar Name': str,
    'Final Sub-pillar Name': str,
    'analysis_framework_id': int
})

In [56]:
matching_2d_mat["analysis_framework_id"] = matching_2d_mat["analysis_framework_id"].apply(str)
##
matching_2d_mat["original_pillar"] = matching_2d_mat[
    "analysis_framework_id"] + "->" + matching_2d_mat[
        "Pillar"] + "->" + matching_2d_mat["Sub-pillar"]
matching_2d_mat["original_subpillar"] = matching_2d_mat[
    "analysis_framework_id"] + "->" + matching_2d_mat[
        "Pillar"] + "->" + matching_2d_mat["Sub-pillar"]
##
matching_2d_mat["target_pillar"] = matching_2d_mat["Final Pillar Name"]
matching_2d_mat["target_subpillar"] = matching_2d_mat[
    "Final Pillar Name"] + "->" + matching_2d_mat["Final Sub-pillar Name"]

In [57]:
pillar_name_mapper = dict()
subpillar_name_mapper = dict()
for pillar, fpillar, subpillar, fsubpillar, af_id in zip(
        matching_2d_mat["original_pillar"], matching_2d_mat["target_pillar"],
        matching_2d_mat["original_subpillar"],
        matching_2d_mat["target_subpillar"],
        matching_2d_mat["analysis_framework_id"]):
    pillar_name_mapper[pillar] = fpillar
    subpillar_name_mapper[subpillar] = fsubpillar

In [58]:
matching_1d_mat["Framework Title"] = matching_1d_mat["Framework Title"].str.title()
##
matching_1d_mat = pd.merge(matching_1d_mat,
                           af_id_title,
                           how="left",
                           left_on="Framework Title",
                           right_on="Framework Name")


In [59]:
#matching_1d_mat = matching_1d_mat[~matching_1d_mat["analysis_framework_id"].isna()]
matching_1d_mat["analysis_framework_id"] = matching_1d_mat["analysis_framework_id"].apply(int)

In [60]:
matching_1d_mat.columns

Index(['Unnamed: 0', 'Framework Title', 'Type', 'Pillar', 'Sub-pillar',
       'Virtual Type', 'Virtual Pillar', 'Virtual Sub-pillar', 'First-pass',
       'Verified', 'Reversible', 'Cover', 'analysis_framework_id',
       'Framework Name'],
      dtype='object')

In [61]:
matching_1d_mat[matching_1d_mat["analysis_framework_id"].isna()]

Unnamed: 0.1,Unnamed: 0,Framework Title,Type,Pillar,Sub-pillar,Virtual Type,Virtual Pillar,Virtual Sub-pillar,First-pass,Verified,Reversible,Cover,analysis_framework_id,Framework Name


In [62]:
pillar_name_mapper_1d = dict()
subpillar_name_mapper_1d = dict()
for pillar, fpillar, subpillar, fsubpillar, af_id in zip(
        matching_1d_mat["Pillar"], matching_1d_mat["Virtual Pillar"],
        matching_1d_mat["Sub-pillar"],
        matching_1d_mat["Virtual Sub-pillar"], matching_1d_mat["analysis_framework_id"]):
    pillar = str(af_id) + "->" + pillar + "->" + subpillar
    pillar_name_mapper_1d[pillar] = fpillar
    subpillar_name_mapper_1d[pillar] = fpillar + "->" + fsubpillar

In [63]:
def sector_mapper(sec):
    if sec == sec:
        sectors =  [sector_name_mapper.get(s, "") for s in sec]
        return [sec for sec in sectors if sec]
    return []
##
def pillar_mapper(dim):
    if dim == dim:
        dim = [pillar_name_mapper.get(d, "") for d in dim]
        return [d for d in dim if d]
    return []
##
def subpillar_mapper(subdim):
    if subdim == subdim:
        subdim = [subpillar_name_mapper.get(d, "") for d in subdim]
        return [d for d in subdim if d]
    return []
##
def pillar_mapper_1d(pill):
    if pill == pill:
        pill = [pillar_name_mapper_1d.get(p, "") for p in pill]
        return [p for p in pill if p]
    return []
##
def subpillar_mapper_1d(subpill):
    if subpill == subpill:
        subpill = [subpillar_name_mapper_1d.get(p, "") for p in subpill]
        return [p for p in subpill if p]
    return []

In [64]:
final_df["sectors"] = final_df["sectors"].apply(sector_mapper)
final_df["subpillars_2d"] = final_df["pillars"].apply(subpillar_mapper)
final_df["pillars_2d"] = final_df["pillars"].apply(pillar_mapper)
##
final_df["subpillars_1d"] = final_df["pillars_1d"].apply(subpillar_mapper_1d)
final_df["pillars_1d"] = final_df["pillars_1d"].apply(pillar_mapper_1d)

In [65]:
final_df.columns

Index(['entry_id', 'created_at', 'modified_at', 'excerpt', 'entry_type',
       'analysis_framework_id', 'created_by_id', 'lead_id', 'modified_by_id',
       'information_date', 'order', 'project_id', 'title', 'verified', 'tags',
       'tags_str', 'pillars', 'sectors', 'subsectors', 'label_2d',
       'pillars_1d', 'subpillars_2d', 'pillars_2d', 'subpillars_1d'],
      dtype='object')

In [66]:
final_df = final_df[~final_df.excerpt.isna()]


In [67]:
secs_unique = set()
secs_occurances = list()
for secs in final_df["sectors"]:
    secs_unique.update(secs)
    secs_occurances.extend(secs)
#secs_unique
Counter(secs_occurances).most_common()

[('Health', 27574),
 ('Protection', 22007),
 ('Cross', 16425),
 ('Livelihoods', 13135),
 ('Food Security', 11225),
 ('WASH', 8226),
 ('Shelter', 7974),
 ('Education', 7873),
 ('Nutrition', 3198),
 ('Agriculture', 3116),
 ('Logistics', 2849)]

In [68]:
pills_unique = set()
pills_occurances = list()
for pills in final_df["pillars_2d"]:
    pills_unique.update(pills)
    pills_occurances.extend(pills)
#pills_unique
Counter(pills_occurances).most_common()

[('Humanitarian Conditions', 50918),
 ('Impact', 37539),
 ('Capacities & Response', 16559),
 ('At Risk', 11934),
 ('Priority Needs', 3179),
 ('Priority Interventions', 2531)]

In [69]:
subpills_unique = set()
subpills_occurances = list()
for subpills in final_df["subpillars_2d"]:
    subpills_unique.update(subpills)
    subpills_occurances.extend(subpills)
Counter(subpills_occurances).most_common()

[('Humanitarian Conditions->Living Standards', 28585),
 ('Humanitarian Conditions->Physical And Mental Well Being', 16520),
 ('Impact->Impact On Systems, Services And Networks', 13320),
 ('Impact->Driver/Aggravating Factors', 12878),
 ('At Risk->Risk And Vulnerabilities', 11654),
 ('Capacities & Response->International Response', 9106),
 ('Impact->Impact On People', 9004),
 ('Capacities & Response->National Response', 4780),
 ('Humanitarian Conditions->Coping Mechanisms', 4481),
 ('Capacities & Response->Number Of People Reached/Response Gaps', 2415),
 ('Impact->Number Of People Affected', 2337),
 ('Priority Interventions->Expressed By Humanitarian Staff', 2242),
 ('Priority Needs->Expressed By Population', 1592),
 ('Priority Needs->Expressed By Humanitarian Staff', 1587),
 ('Humanitarian Conditions->Number Of People In Need', 1332),
 ('Priority Interventions->Expressed By Population', 289),
 ('At Risk->Number Of People At Risk', 280),
 ('Capacities & Response->Local Response', 258)]

In [70]:
pills_unique_1d = set()
pills_occurances_1d = list()
for pills_1d in final_df["pillars_1d"]:
    pills_unique_1d.update(pills_1d)
    pills_occurances_1d.extend(pills_1d)
#pills_unique_1d
Counter(pills_occurances_1d).most_common()

[('Context', 19714),
 ('Displacement', 5831),
 ('Casualties', 3984),
 ('Humanitarian Access', 2363),
 ('Information and Communication', 868),
 ('Shock/Event', 828)]

In [71]:
subpills_unique_1d = set()
subpills_occurances_1d = list()
for subpills_1d in final_df["subpillars_1d"]:
    subpills_unique_1d.update(subpills_1d)
    subpills_occurances_1d.extend(subpills_1d)
#subpills_unique_1d
Counter(subpills_occurances_1d).most_common()

[('Context->Security & Stability', 6872),
 ('Context->Demography', 4181),
 ('Context->Economy', 4043),
 ('Casualties->Dead', 3066),
 ('Displacement->Type/Numbers/Movements', 2734),
 ('Displacement->Push factors', 1984),
 ('Context->Politics', 1966),
 ('Humanitarian Access->Physical constraints', 1891),
 ('Context->Socio Cultural', 1062),
 ('Context->Environment', 905),
 ('Context->Legal & Policy', 685),
 ('Casualties->Injured', 650),
 ('Displacement->Local integration', 550),
 ('Humanitarian Access->Number of people facing humanitarian access constraints/Humanitarian access gaps',
  472),
 ('Shock/Event->Type and characteristics', 450),
 ('Shock/Event->Hazard & Threats', 378),
 ('Information and Communication->Knowledge and info gaps (pop)', 365),
 ('Displacement->Pull factors', 338),
 ('Information and Communication->Communication means and preferences', 325),
 ('Casualties->Missing', 268),
 ('Displacement->Intentions', 225),
 ('Information and Communication->Information challenges an

In [72]:
final_df["subpillars_1d"].apply(lambda x:x!=[]).sum(),\
final_df["subpillars_2d"].apply(lambda x:x!=[]).sum(),\
final_df["sectors"].apply(lambda x:x!=[]).sum()

(27674, 83020, 89829)

In [75]:
final_df = final_df[['entry_id', 'excerpt', 'entry_type',
        'analysis_framework_id', 'lead_id','title',
        'project_id', 'verified',
        'sectors', 'subpillars_2d', 'subpillars_1d'
        ]]

final_df = final_df[~(
    final_df.analysis_framework_id.isna() | 
    final_df.entry_id.isna() |
    final_df.lead_id.isna() |
    final_df.excerpt.isna())
]
final_df

Unnamed: 0,entry_id,excerpt,entry_type,analysis_framework_id,lead_id,title,project_id,verified,sectors,subpillars_2d,subpillars_1d
0,16851,"During the reporting week, IOM provided medica...",excerpt,137,6334,Situation Analysis Generic Yemen,322,False,[Health],[Capacities & Response->International Response],[]
1,454418,El porcentaje de personas desempleadas ha aume...,excerpt,829,64535,UNHCR Peru,1185,False,[Livelihoods],"[Humanitarian Conditions->Living Standards, Im...",[]
2,187289,"As per GIFMM monitoring, between 12 June and 1...",excerpt,829,42134,UNHCR Colombia,1183,False,[Shelter],"[Humanitarian Conditions->Living Standards, Im...",[]
3,215154,"This was the opposite among non-beneficiaries,...",excerpt,699,24416,IFRC Turkey,1142,False,[Education],[Humanitarian Conditions->Living Standards],[]
4,237955,El Centro de Orientación para el Empleo y el E...,excerpt,1306,47317,IMMAP/DFS Colombia,2311,False,[Livelihoods],[Priority Interventions->Expressed By Humanita...,[]
...,...,...,...,...,...,...,...,...,...,...,...
124836,200856,"Todos los migrantes, de cualquier nacionalidad...",excerpt,1306,44408,IMMAP/DFS Colombia,2311,True,[Health],[Priority Interventions->Expressed By Humanita...,[]
124837,200854,Desde el pasado 17 de marzo Colombia cerró sus...,excerpt,1306,44408,IMMAP/DFS Colombia,2311,True,[],[],[Context->Economy]
124838,300624,"[31 March 2021, GoS] The Ministry said that th...",excerpt,1306,52805,IMMAP/DFS Syria,2028,True,[],[],[]
124839,300623,"[31 March 2021, GoS] Health Ministry announced...",excerpt,1306,52805,IMMAP/DFS Syria,2028,True,[],[],[]


In [76]:
final_df.to_csv('generated_entries/primary_tags.csv', index=None)