In [1]:
import re
import json
import random
from pprint import pprint
from ast import literal_eval
from collections import Counter

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [2]:
# Seed both random number generators from one shared constant.
SEED = 2021
np.random.seed(SEED)
random.seed(SEED)

In [3]:
# Load every CSV input used by the notebook (paths are relative to the working directory).
# NOTE(review): the file names suggest a 04-06-2021 to 14-06-2021 export window — confirm.
exportdata = pd.read_csv("exportdata_from_04062021_to_14062021.csv")
entries = pd.read_csv("entries_from_04062021_to_14062021.csv")
leads = pd.read_csv("leads_from_04062021_to_14062021.csv")
af_widgets = pd.read_csv("af_widgets_of_interest.csv")
projects = pd.read_csv("projects_chosen_by_patrice.csv")
exportables = pd.read_csv("af_exportables.csv")
## Lookup tables: analysis frameworks and user names.
afs = pd.read_csv("analysis_frameworks.csv")
users = pd.read_csv("user_names.csv")
## Table used below to map framework-specific pillars/sub-pillars onto final names.
matching = pd.read_csv("pillars_subpillars_matching.csv")

In [4]:
# Parse the JSON-encoded widget properties. Values that are no longer strings
# (cell already executed once) are left untouched so the cell can be re-run.
af_widgets["properties"] = af_widgets["properties"].apply(
    lambda props: json.loads(props) if isinstance(props, str) else props)

In [5]:
# Upper-cased widget titles that are treated as 2D-matrix widgets.
mat2d_titles = list(map(str.upper, (
    "Pre-Crisis",
    "Shock/Event",
    "In-Crisis",
    "Sectors",
    "Sectoral Information",
    "Matrix 2D",
)))
# Keep only the widgets whose (upper-cased) title is one of the above.
is_mat2d_widget = af_widgets["title"].str.upper().isin(mat2d_titles)
mat2d_properties_ids = af_widgets.loc[
    is_mat2d_widget, ["properties", "analysis_framework_id"]]
mat2d_properties = mat2d_properties_ids["properties"].tolist()
mat2d_ids = mat2d_properties_ids["analysis_framework_id"].tolist()

In [6]:
# framework id -> {pillar title: [sub-pillar titles]}, read from each matrix
# widget's dimensions; a later widget of the same framework replaces an earlier one.
afids_pillars_subpillars = {
    af_id: {
        dim["title"]: [sub["title"] for sub in dim["subdimensions"]]
        for dim in mat["data"]["dimensions"]
    }
    for mat, af_id in zip(mat2d_properties, mat2d_ids)
}


In [7]:
# Turn the Python-literal strings in "data" into objects. Values that are no
# longer strings (cell already executed once) are kept so the cell can be re-run.
exportables["data"] = exportables["data"].apply(
    lambda raw: literal_eval(raw) if isinstance(raw, str) else raw)

In [8]:
def extract_title(x):
    """Return the title(s) stored in an exportable's ``excel`` section.

    Gives ``excel["title"]`` when it is truthy, otherwise ``excel["titles"]``
    when ``excel["type"] == "multiple"``, otherwise ``None``.
    """
    excel = x["excel"]
    single_title = excel.get("title")
    if single_title:
        return single_title
    if excel.get("type") == "multiple":
        return excel["titles"]
    return None

# Title (or list of titles) of every exportable, in row order.
af_titles = [extract_title(exportable) for exportable in exportables["data"]]

In [9]:
# (rows, columns) of the entries and export-data tables.
entries.shape, exportdata.shape

((730, 23), (3832, 4))

In [10]:
# Number of distinct non-null excerpts, shown as a 1-tuple.
(entries["excerpt"].dropna().nunique(),)

(728,)

In [11]:
# exportable id -> parsed exportable "data" payload.
exid_to_exdata = dict(zip(exportables["id"], exportables["data"]))

In [12]:
# Distinct widget titles present in af_widgets.
af_widgets["title"].unique()

array(['Sectoral Information', 'LOCATION', 'EXCERPT',
       'Operational Environment', 'Information Date', 'RELIABILITY',
       'Flag', 'DEMOGRAPHIC GROUPS', 'SPECIFIC NEEDS GROUPS',
       'AFFECTED GROUPS', 'Severity', 'Sectors', 'Cross sector',
       'Affected groups', 'Specific Needs Groups', 'Demographic Groups',
       'Reliability', 'Geo Location', 'Excerpt', 'Information date',
       'PRE-CRISIS', 'IN-CRISIS', 'Crisis type',
       'Context additional tags', 'Crisis Type', 'SHOCK/EVENT',
       'Additional Context', 'DISPLACED POP TYPE', 'CLEANING tags',
       'POPULATION GROUPS', 'Cleaning comments', 'HIGH LEVEL TAGS',
       'Comment'], dtype=object)

In [13]:
# (widget key, widget id) -> widget title.
widget_key_id_to_title = {
    (widget_key, widget_id): widget_title
    for widget_key, widget_id, widget_title in zip(
        af_widgets["key"], af_widgets["widget_id"], af_widgets["title"])
}

In [14]:
def exportdata_to_tag_title(row):
    """Resolve the widget title an export-data row belongs to.

    Sources are tried in this order and the first truthy title wins:
      1. ``common.widget_key`` / ``common.widget_id`` looked up in
         ``widget_key_id_to_title``;
      2. the title of the only item in ``report.other``;
      3. the key/id of the only item of an ``excel`` list, looked up in
         ``widget_key_id_to_title``;
      4. the exportable's own ``excel.title``, or ``excel.titles`` when its
         type is ``"multiple"`` (via ``exid_to_exdata``).

    Args:
        row: an export-data row exposing the ``"data"`` (JSON string) and
            ``"exportable_id"`` labels, as produced by
            ``exportdata.apply(..., axis=1)``. Labels replace the previous
            positional ``row[1]`` / ``row[3]`` access, which pandas deprecates
            for label-indexed Series.

    Returns:
        The title string, or the ``titles`` value for "multiple" exportables.

    Raises:
        RuntimeError: when no source yields a title.
    """
    data = json.loads(row["data"])

    common = data.get("common")
    if common:
        wkey, wid = common.get("widget_key"), common.get("widget_id")
        if wkey and wid:
            title = widget_key_id_to_title.get((wkey, wid))
            if title:
                return title

    report = data.get("report")
    other = report.get("other") if report else None
    if other and len(other) == 1 and other[0].get("title"):
        return other[0]["title"]

    excel = data.get("excel")
    if isinstance(excel, list) and len(excel) == 1:
        wkey, wid = excel[0].get("widget_key"), excel[0].get("widget_id")
        if wkey and wid:
            title = widget_key_id_to_title.get((wkey, wid))
            if title:
                return title

    exportable_id = row["exportable_id"]
    exportable_excel = exid_to_exdata[exportable_id]["excel"]
    if exportable_excel.get("title"):
        return exportable_excel["title"]
    if exportable_excel.get("type") == "multiple":
        return exportable_excel["titles"]
    # Explicit error instead of a bare `raise` with no active exception.
    raise RuntimeError(
        f"Could not resolve a tag title for exportable_id={exportable_id!r}")

In [15]:
# Resolve a tag title for every export-data row (row-wise apply).
exportdata["tag_title"] = exportdata.apply(exportdata_to_tag_title, axis=1)

In [16]:
def exportdata_to_tag_value(row):
    """Extract the tag value(s) stored in an export-data row.

    Lookup order:
      1. ``excel`` is a one-item list -> that item's ``"value"``;
      2. ``excel`` is a dict -> ``"values"`` when its type is ``"lists"``,
         else ``"value"``, else ``"values"``;
      3. otherwise ``common["values"]``, then ``common["value"]``.

    Args:
        row: an export-data row exposing the ``"data"`` label (JSON string),
            as produced by ``exportdata.apply(..., axis=1)``. The label
            replaces the previous positional ``row[1]`` access, which pandas
            deprecates for label-indexed Series.

    Returns:
        Whatever is stored under the matching key.

    Raises:
        RuntimeError: when none of the locations holds a value.
    """
    data = json.loads(row["data"])
    excel = data.get("excel")

    if isinstance(excel, list) and len(excel) == 1:
        return excel[0]["value"]

    # Only call dict methods once we know `excel` is a dict; a list of any
    # other length previously crashed on `.get`.
    if isinstance(excel, dict):
        if excel.get("type") == "lists":
            return excel["values"]
        if "value" in excel:
            return excel["value"]
        if "values" in excel:
            return excel["values"]

    common = data.get("common") or {}
    if "values" in common:
        return common["values"]
    if "value" in common:
        return common["value"]
    # Explicit error instead of a bare `raise` with no active exception.
    raise RuntimeError("Could not find a tag value in export data: %r" % (data,))

In [17]:
# Extract the tag value(s) for every export-data row (row-wise apply).
exportdata["tag_value"] = exportdata.apply(exportdata_to_tag_value, axis=1)

In [18]:
def title_case(tag):
    """Title-case a tag; a list/tuple of tags becomes a tuple of title-cased tags."""
    if not isinstance(tag, (list, tuple)):
        return tag.title()
    return tuple(item.title() for item in tag)

In [19]:
# Normalise tag titles to title case so differently-cased widget titles compare equal.
exportdata["tag_title"] = exportdata["tag_title"].map(title_case)

In [20]:
# Analysis framework --> title(s) of its 2D-matrix widget (inspect with exportdata["tag_title"].unique()):
# 1. GIMAC --> Pre-Crisis, Shock/Event, In-Crisis
# 2. 2020 Okular --> Sectoral Information
# 3. Okular Analytics Generic --> Sectoral Information
# 4. Rohingya Framework --> Sectors
# 5. IFRC 2018 --> Sectors
# 6. Colombia AF --> Sectoral Information
# 7. Nigeria Situation Analysis (OA) --> Sectoral Information
# 8. Situation Analysis Generic Yemen --> Matrix 2D
# 9. Situation Analysis Generic Libya --> Sectors

In [21]:
# Title-cased 2D-matrix widget titles, used to filter exportdata["tag_title"].
# Note: this rebinds the upper-cased `mat2d_titles` defined earlier.
mat2d_titles = list(map(str.title, (
    "Pre-Crisis",
    "Shock/Event",
    "In-Crisis",
    "Sectors",
    "Sectoral Information",
    "Matrix 2D",
)))

In [22]:
# Keep only the export-data rows that come from a 2D-matrix widget.
has_mat2d_title = exportdata["tag_title"].isin(mat2d_titles)
exportdata_mat2d = exportdata[has_mat2d_title]

In [23]:
# Shapes of the matrix-only export data, the full export data and the entries.
exportdata_mat2d.shape, exportdata.shape, entries.shape

((405, 6), (3832, 6), (730, 23))

In [24]:
# Attach matrix labels to entries; the inner join drops entries without a matrix row.
entries_labeled_mat2d = entries.merge(
    exportdata_mat2d,
    how="inner",
    left_on="id",
    right_on="entry_id",
    suffixes=('_entry', '_exportdata'),
)

In [25]:
# Entries shape next to the number of distinct non-null excerpts (as a 1-tuple).
entries.shape, (entries["excerpt"].dropna().nunique(),)

((730, 23), (728,))

In [26]:
# Shape and column names of the merged (entry + matrix label) frame.
entries_labeled_mat2d.shape, entries_labeled_mat2d.columns

((405, 29),
 Index(['id_entry', 'created_at', 'modified_at', 'excerpt', 'image_raw',
        'analysis_framework_id', 'created_by_id', 'lead_id', 'modified_by_id',
        'entry_type', 'information_date', 'order', 'client_id', 'project_id',
        'tabular_field_id', 'dropped_excerpt', 'highlight_hidden', 'verified',
        'verification_last_changed_by_id', 'image_id', 'tagger_name',
        'project_title', 'analysis_framework_title', 'id_exportdata', 'data',
        'entry_id', 'exportable_id', 'tag_title', 'tag_value'],
       dtype='object'))

In [27]:
def mat2d_labels_to_dimension(x):
    """Build ``"<af_id>-><Dimension>-><Subdimension>"`` keys for one entry.

    Args:
        x: two-item row ``(tag_value, analysis_framework_id)`` — a pandas row
            from ``DataFrame.apply(axis=1)`` or any iterable. ``tag_value`` is
            a list of matrix cells whose first two items are the dimension and
            sub-dimension titles.

    Returns:
        Sorted list of unique keys for cells with a truthy dimension. Sorting
        makes the order reproducible, unlike the previous ``list(set(...))``.

    Raises:
        RuntimeError: if ``tag_value`` is not a list.
    """
    # Iterating yields the values for both pandas rows and plain sequences,
    # avoiding positional integer indexing on a label-indexed Series.
    tag_value, af_id = list(x)[:2]
    if not isinstance(tag_value, list):
        raise RuntimeError(
            "tag_value must be a list, got %s: %r" % (type(tag_value).__name__, tag_value))
    dimensions = {
        # `or ""` keeps a missing sub-dimension from crashing `.title()`.
        str(af_id) + "->" + t[0].title() + "->" + (t[1] or "").title()
        for t in tag_value
        if t[0]
    }
    return sorted(dimensions)

def mat2d_labels_to_subdimension(x):
    """Build ``"<af_id>-><Dimension>-><Subdimension>"`` keys for one entry.

    Args:
        x: two-item row ``(tag_value, analysis_framework_id)`` — a pandas row
            from ``DataFrame.apply(axis=1)`` or any iterable. ``tag_value`` is
            a list of matrix cells whose first two items are the dimension and
            sub-dimension titles.

    Returns:
        Sorted list of unique keys for cells with a truthy sub-dimension.
        Sorting makes the order reproducible, unlike the previous
        ``list(set(...))``.

    Raises:
        RuntimeError: if ``tag_value`` is not a list.
    """
    # Iterating yields the values for both pandas rows and plain sequences,
    # avoiding positional integer indexing on a label-indexed Series.
    tag_value, af_id = list(x)[:2]
    if not isinstance(tag_value, list):
        raise RuntimeError(
            "tag_value must be a list, got %s: %r" % (type(tag_value).__name__, tag_value))
    subdimensions = {
        # `or ""` keeps a missing dimension from crashing `.title()`.
        str(af_id) + "->" + (t[0] or "").title() + "->" + t[1].title()
        for t in tag_value
        if t[1]
    }
    return sorted(subdimensions)

def mat2d_labels_to_sector(tag_value):
    """Collect the title-cased sectors (third item of each matrix cell).

    Args:
        tag_value: list of matrix cells.

    Returns:
        Sorted list of unique sector names. Cells with fewer than three items
        or a falsy sector are skipped. Sorting makes the order reproducible,
        unlike the previous ``list(set(...))``.

    Raises:
        RuntimeError: if ``tag_value`` is not a list.
    """
    if not isinstance(tag_value, list):
        raise RuntimeError(
            "tag_value must be a list, got %s: %r" % (type(tag_value).__name__, tag_value))
    return sorted({t[2].title() for t in tag_value if len(t) > 2 and t[2]})

def mat2d_labels_to_subsector(tag_value):
    """Collect the title-cased sub-sectors (fourth item of four-item matrix cells).

    Args:
        tag_value: list of matrix cells; the fourth item may be a single
            string or a list of strings.

    Returns:
        Sorted list of unique sub-sector names. Sorting makes the order
        reproducible, unlike the previous ``list(set(...))``.

    Raises:
        RuntimeError: if ``tag_value`` is not a list.
    """
    if not isinstance(tag_value, list):
        raise RuntimeError(
            "tag_value must be a list, got %s: %r" % (type(tag_value).__name__, tag_value))
    subsectors = set()
    for t in tag_value:
        if len(t) == 4 and t[3]:
            if isinstance(t[3], list):
                subsectors.update(name.title() for name in t[3])
            else:
                subsectors.add(t[3].title())
    return sorted(subsectors)

In [28]:
# Derive the Dimension / Subdimension / Sector label lists for every entry.
tag_and_framework = entries_labeled_mat2d[["tag_value", 'analysis_framework_id']]
entries_labeled_mat2d["Dimension"] = tag_and_framework.apply(
    mat2d_labels_to_dimension, axis=1)
entries_labeled_mat2d["Subdimension"] = tag_and_framework.apply(
    mat2d_labels_to_subdimension, axis=1)
entries_labeled_mat2d["Sector"] = entries_labeled_mat2d["tag_value"].apply(
    mat2d_labels_to_sector)
# Sub-sectors are not extracted here (mat2d_labels_to_subsector is not applied).

In [29]:
# Number of distinct non-null excerpts among the labeled entries (as a 1-tuple).
(entries_labeled_mat2d["excerpt"].dropna().nunique(),)

(405,)

In [30]:
# Flatten the per-entry dimension lists, then count how often each key occurs.
dims_occurances = [
    dim for entry_dims in entries_labeled_mat2d["Dimension"] for dim in entry_dims
]
dims_unique = set(dims_occurances)
Counter(dims_occurances).most_common()

[('1306->Impact->Impact On System & Services', 139),
 ('1306->Humanitarian Conditions->Physical & Mental Wellbeing', 117),
 ('1306->Impact->Impact On People', 93),
 ('1306->Impact->Drivers/Aggravating Factors', 77),
 ('1306->Humanitarian Conditions->Living Standards', 70),
 ('1306->At Risk->People At Risk / Vulnerable', 24),
 ('1306->Humanitarian Conditions->Coping Mechanisms', 13),
 ('1306->Impact->Number Of People Affected', 10),
 ('1306->Priorities->Priority Interventions (Staff)', 9),
 ('1306->Priorities->Priority Needs (Staff)', 8),
 ('1306->Humanitarian Conditions->Number Of People In Need', 6),
 ('1306->Capacities & Response->International', 5),
 ('1306->Priorities->Priority Needs (Pop)', 5),
 ('1306->Capacities & Response->Number Of People Reached', 3),
 ('1306->Capacities & Response->National & Local Actors', 2),
 ('1306->Capacities & Response->Government & Local Authorities', 2)]

In [31]:
# Flatten the per-entry sub-dimension lists, then count how often each key occurs.
subdims_occurances = [
    subdim
    for entry_subdims in entries_labeled_mat2d["Subdimension"]
    for subdim in entry_subdims
]
subdims_unique = set(subdims_occurances)
Counter(subdims_occurances).most_common()

[('1306->Impact->Impact On System & Services', 139),
 ('1306->Humanitarian Conditions->Physical & Mental Wellbeing', 117),
 ('1306->Impact->Impact On People', 93),
 ('1306->Impact->Drivers/Aggravating Factors', 77),
 ('1306->Humanitarian Conditions->Living Standards', 70),
 ('1306->At Risk->People At Risk / Vulnerable', 24),
 ('1306->Humanitarian Conditions->Coping Mechanisms', 13),
 ('1306->Impact->Number Of People Affected', 10),
 ('1306->Priorities->Priority Interventions (Staff)', 9),
 ('1306->Priorities->Priority Needs (Staff)', 8),
 ('1306->Humanitarian Conditions->Number Of People In Need', 6),
 ('1306->Capacities & Response->International', 5),
 ('1306->Priorities->Priority Needs (Pop)', 5),
 ('1306->Capacities & Response->Number Of People Reached', 3),
 ('1306->Capacities & Response->National & Local Actors', 2),
 ('1306->Capacities & Response->Government & Local Authorities', 2)]

In [32]:
# Flatten the per-entry sector lists, then count how often each sector occurs.
secs_occurances = [
    sector for entry_secs in entries_labeled_mat2d["Sector"] for sector in entry_secs
]
secs_unique = set(secs_occurances)
Counter(secs_occurances).most_common()

[('Protection', 136),
 ('Health', 100),
 ('Cross', 69),
 ('Food Security', 64),
 ('Education', 39),
 ('Shelter', 31),
 ('Logistics', 26),
 ('Agriculture', 21),
 ('Livelihoods', 16),
 ('Wash', 12),
 ('Nutrition', 4)]

In [33]:
# Column names after adding Dimension / Subdimension / Sector.
entries_labeled_mat2d.columns

Index(['id_entry', 'created_at', 'modified_at', 'excerpt', 'image_raw',
       'analysis_framework_id', 'created_by_id', 'lead_id', 'modified_by_id',
       'entry_type', 'information_date', 'order', 'client_id', 'project_id',
       'tabular_field_id', 'dropped_excerpt', 'highlight_hidden', 'verified',
       'verification_last_changed_by_id', 'image_id', 'tagger_name',
       'project_title', 'analysis_framework_title', 'id_exportdata', 'data',
       'entry_id', 'exportable_id', 'tag_title', 'tag_value', 'Dimension',
       'Subdimension', 'Sector'],
      dtype='object')

In [34]:
# Keep only text entries that actually have an excerpt. The explicit copy makes
# final_df an independent frame, so later column assignments neither touch
# entries_labeled_mat2d nor raise SettingWithCopyWarning.
final_df = entries_labeled_mat2d[
    entries_labeled_mat2d["entry_type"].eq("excerpt")
    & entries_labeled_mat2d["excerpt"].notna()].copy()

In [35]:
def remove_newlines(excerpt):
    """Collapse every run of whitespace (including newlines) into one space.

    Non-string values (e.g. NaN for a missing excerpt) are returned unchanged.
    """
    if not isinstance(excerpt, str):
        return excerpt
    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    return re.sub(r"\s+", " ", excerpt)

# Normalise whitespace in both excerpt columns, then keep one row per excerpt.
# `assign` builds a new frame rather than writing into a filtered slice with
# `.loc`, which avoids SettingWithCopyWarning.
# Sorting by "verified" with keep="last" keeps the row that sorts last on that column.
final_df = (
    final_df
    .assign(
        excerpt=final_df['excerpt'].apply(remove_newlines),
        dropped_excerpt=final_df['dropped_excerpt'].apply(remove_newlines),
    )
    .sort_values("verified")
    .reset_index(drop=True)
    .drop_duplicates(subset='excerpt', keep="last")
)
final_df.shape, final_df.duplicated(subset="excerpt").sum()

((405, 32), 0)

In [36]:
# Title-cased sector label -> canonical sector name.
# Labels mapped to "" (and labels missing from this dict) are dropped by
# sector_mapper, which filters out empty results.
sector_name_mapper = {
    "Agriculture": "Agriculture",
    "Cccm": "",
    "Cross": "Cross",
    "Cross Cutting": "Cross",
    "Cross-Sector": "Cross",
    "Education": "Education",
    "Food": "Food Security",
    "Food Security": "Food Security",
    "Nutrition": "Nutrition",
    "Health": "Health",
    "Livelihood": "Livelihoods",
    "Livelihoods": "Livelihoods",
    "Logistic": "Logistics",
    "Logistics": "Logistics",
    "Protection": "Protection",
    "Shelter": "Shelter",
    "Shelter And Nfis": "Shelter",
    "Nfi": "",
    "Wash": "WASH",
}

In [37]:
# analysis framework title -> id.
# The previous loop wrote every id under the literal key "title", leaving a
# single entry; key on the actual framework title instead.
af_title_to_id = dict(zip(afs["title"], afs["id"]))

In [38]:
# Attach each framework's id to the matching table via the framework name.
af_id_title = afs[["id", "title"]].rename(
    columns={"id": "analysis_framework_id", "title": "Framework Name"})
matching = matching.merge(af_id_title, how="left", on="Framework Name")
matching.columns

Index(['Framework Name', 'Pillar', 'Sub-pillar', 'Final Pillar Name',
       'Final Sub-pillar Name', 'analysis_framework_id'],
      dtype='object')

In [39]:
# Strip surrounding whitespace and title-case every pillar / sub-pillar name column.
for name_column in ("Pillar", "Sub-pillar", "Final Pillar Name", "Final Sub-pillar Name"):
    matching[name_column] = matching[name_column].apply(
        lambda name: name.strip().title())

In [40]:
# Distinct final sub-pillar names after normalisation.
matching["Final Sub-pillar Name"].unique()

array(['Living Standards', 'Physical And Mental Well Being',
       'Coping Mechanisms', 'Impact On People Or Impact On Services',
       'Driver/Aggravating Factors', 'Impact On People',
       'Impact On Systems And Services', 'International Response',
       'National Response', 'Number Of People Affected',
       'Number Of People At Risk', 'Number Of People In Need',
       'Number Of People Reached', 'Expressed By Population',
       'Expressed By Humanitarian Staff', 'Response Gaps',
       'Risk And Vulnerabilities', 'Impact On Services'], dtype=object)

In [41]:
# Distinct final pillar names after normalisation.
matching["Final Pillar Name"].unique()

array(['Humanitatian Conditions', 'Impact', 'Capacities & Response',
       'People At Risk', 'Humanitarian Conditions', 'Priority Needs',
       'Priority Interventions'], dtype=object)

In [42]:
# Correct the misspelled pillar name that appears in the matching table.
is_misspelled = matching["Final Pillar Name"] == 'Humanitatian Conditions'
matching.loc[is_misspelled, "Final Pillar Name"] = 'Humanitarian Conditions'

In [43]:
# Cast every column used to build the mapping keys to str so they can be concatenated.
string_columns = [
    'Framework Name', 'Pillar', 'Sub-pillar', 'Final Pillar Name',
    'Final Sub-pillar Name', 'analysis_framework_id',
]
matching = matching.astype(dict.fromkeys(string_columns, str))

In [44]:
# Source key "<af_id>-><Pillar>-><Sub-pillar>"; the pillar and sub-pillar
# lookups both use this same key.
framework_and_pillar = matching["analysis_framework_id"] + "->" + matching["Pillar"]
source_key = framework_and_pillar + "->" + matching["Sub-pillar"]
matching["original_pillar"] = source_key
matching["original_subpillar"] = source_key.copy()
## Target names: the final pillar, and "<Final Pillar>-><Final Sub-pillar>".
final_pillar = matching["Final Pillar Name"]
matching["target_pillar"] = final_pillar
matching["target_subpillar"] = final_pillar + "->" + matching["Final Sub-pillar Name"]

In [45]:
# source key -> final pillar name / final "pillar->sub-pillar" name.
pillar_name_mapper = dict(
    zip(matching["original_pillar"], matching["target_pillar"]))
subpillar_name_mapper = dict(
    zip(matching["original_subpillar"], matching["target_subpillar"]))

In [46]:
# Inspect the source-key -> final pillar mapping.
pillar_name_mapper

{'136->Humanitarian Conditions->1St Level Outcome': 'Humanitarian Conditions',
 '137->Humanitarian Conditions->1St Level Outcome': 'Humanitarian Conditions',
 '14->Humanitarian Conditions->Access To Basic Services Or Goods': 'Humanitarian Conditions',
 '273->Humanitarian Conditions->Impact On Accessibility, Availability, Quality, Use And Awareness Of Goods And Services': 'Humanitarian Conditions',
 '1004->Humanitarian Conditions->Living Standards': 'Humanitarian Conditions',
 '829->Humanitarian Conditions->Living Standards': 'Humanitarian Conditions',
 '1306->Humanitarian Conditions->Living Standards': 'Humanitarian Conditions',
 '552->Humanitarian Conditions->Living Standards': 'Humanitarian Conditions',
 '495->Humanitarian Conditions->Living Standards': 'Humanitarian Conditions',
 '537->Humanitarian Conditions->Living Standards': 'Humanitarian Conditions',
 '273->Humanitarian Conditions->Impact On Physical And Mental Wellbeing': 'Humanitarian Conditions',
 '14->Humanitarian Condition

In [47]:
def sector_mapper(sec):
    """Map raw sector labels to canonical names via ``sector_name_mapper``.

    Labels that are unknown or mapped to "" are dropped; a NaN input
    (a value that is not equal to itself) yields an empty list.
    """
    if not (sec == sec):
        return []
    canonical = (sector_name_mapper.get(label, "") for label in sec)
    return [name for name in canonical if name]
##
def pillar_mapper(dim):
    """Map source dimension keys to final pillar names via ``pillar_name_mapper``.

    Keys without a mapping are dropped; a NaN input (a value that is not
    equal to itself) yields an empty list.
    """
    if not (dim == dim):
        return []
    final_names = (pillar_name_mapper.get(key, "") for key in dim)
    return [name for name in final_names if name]
##
def subpillar_mapper(subdim):
    """Map source sub-dimension keys to final names via ``subpillar_name_mapper``.

    Keys without a mapping are dropped; a NaN input (a value that is not
    equal to itself) yields an empty list.
    """
    if not (subdim == subdim):
        return []
    final_names = (subpillar_name_mapper.get(key, "") for key in subdim)
    return [name for name in final_names if name]

In [48]:
# Add the harmonised label columns next to the raw ones.
final_df = final_df.assign(
    sectors=final_df["Sector"].apply(sector_mapper),
    pillars=final_df["Dimension"].apply(pillar_mapper),
    subpillars=final_df["Subdimension"].apply(subpillar_mapper),
)

In [49]:
# Column names after adding sectors / pillars / subpillars.
final_df.columns

Index(['id_entry', 'created_at', 'modified_at', 'excerpt', 'image_raw',
       'analysis_framework_id', 'created_by_id', 'lead_id', 'modified_by_id',
       'entry_type', 'information_date', 'order', 'client_id', 'project_id',
       'tabular_field_id', 'dropped_excerpt', 'highlight_hidden', 'verified',
       'verification_last_changed_by_id', 'image_id', 'tagger_name',
       'project_title', 'analysis_framework_title', 'id_exportdata', 'data',
       'entry_id', 'exportable_id', 'tag_title', 'tag_value', 'Dimension',
       'Subdimension', 'Sector', 'sectors', 'pillars', 'subpillars'],
      dtype='object')

In [50]:
# Columns written to the exported dataset, in output order. Selecting with
# this one list already yields these names, so the identical second list
# that was reassigned to df.columns is no longer needed.
export_columns = [
    'entry_id',
    'lead_id',
    'project_id',
    'project_title',
    'analysis_framework_id',
    'analysis_framework_title',
    'excerpt',
    'dropped_excerpt',
    'created_by_id',
    'tagger_name',
    'modified_by_id',
    'verified',
    'verification_last_changed_by_id',
    'sectors',
    'pillars',
    'subpillars',
]
df = final_df[export_columns]

In [51]:
# Count the harmonised sectors across the final entries.
secs_occurances = [
    sector for entry_secs in final_df["sectors"] for sector in entry_secs
]
secs_unique = set(secs_occurances)
Counter(secs_occurances).most_common()

[('Protection', 136),
 ('Health', 100),
 ('Cross', 69),
 ('Food Security', 64),
 ('Education', 39),
 ('Shelter', 31),
 ('Logistics', 26),
 ('Agriculture', 21),
 ('Livelihoods', 16),
 ('WASH', 12),
 ('Nutrition', 4)]

In [52]:
# Count the harmonised pillars across the final entries.
dims_occurances = [
    pillar for entry_pillars in final_df["pillars"] for pillar in entry_pillars
]
dims_unique = set(dims_occurances)
Counter(dims_occurances).most_common()

[('Impact', 319),
 ('Humanitarian Conditions', 206),
 ('People At Risk', 24),
 ('Priority Needs', 13),
 ('Capacities & Response', 12),
 ('Priority Interventions', 9)]

In [53]:
# Count the harmonised sub-pillars across the final entries.
subdims_occurances = [
    subpillar
    for entry_subpillars in final_df["subpillars"]
    for subpillar in entry_subpillars
]
subdims_unique = set(subdims_occurances)
Counter(subdims_occurances).most_common()

[('Impact->Impact On Systems And Services', 139),
 ('Humanitarian Conditions->Physical And Mental Well Being', 117),
 ('Impact->Impact On People', 93),
 ('Impact->Driver/Aggravating Factors', 77),
 ('Humanitarian Conditions->Living Standards', 70),
 ('People At Risk->Risk And Vulnerabilities', 24),
 ('Humanitarian Conditions->Coping Mechanisms', 13),
 ('Impact->Number Of People Affected', 10),
 ('Priority Interventions->Expressed By Humanitarian Staff', 9),
 ('Priority Needs->Expressed By Humanitarian Staff', 8),
 ('Humanitarian Conditions->Number Of People In Need', 6),
 ('Priority Needs->Expressed By Population', 5),
 ('Capacities & Response->International Response', 5),
 ('Capacities & Response->National Response', 4),
 ('Capacities & Response->Number Of People Reached', 3)]

In [54]:
# Write the selected columns to disk without the DataFrame index.
df.to_csv("data_exported.csv", index=False)