### Matching frameworks
0. Let's call the set of analysis frameworks that we have in our training set AF-t. We already know how to map any framework in AF-t to the NLP framework, using the mapping Excel sheet that we created with Patrice.
1. Given a user-defined analysis framework, calculate a matching score between it and all the analysis frameworks in AF-t.
2. Let's call the analysis framework from AF-t that gave the highest matching score AF-match.
3. The mapping between AF-match and the NLP framework will be suggested as the mapping between the user-defined framework and the NLP framework.

Example:
Given a user-defined analysis framework, suppose we find that the closest framework in AF-t is, for example, "IFRC ESSN Turkey 2021". The mapping between the user-defined analysis framework and the NLP framework will then be the same as (or similar to) the mapping between "IFRC ESSN Turkey 2021" and the NLP framework.

In [1]:
import re
import json
import pandas as pd

In [2]:
import os
# Directory holding the CSV files that are read with relative paths below.
# Set the DFS_DATA_DIR environment variable to run this on another machine;
# the default is the path used when the notebook was written.
os.chdir(os.environ.get("DFS_DATA_DIR", "/home/abdullah/Documents/DFS/data_raw"))

In [3]:
# Load the analysis frameworks and their widgets (paths are relative to the
# current working directory).
# Columns used further down:
#   afs        -> id, title, description
#   af_widgets -> title, properties (a JSON string, decoded later),
#                 analysis_framework_id
afs = pd.read_csv("analysis_frameworks.csv")
af_widgets = pd.read_csv("af_widgets_all.csv")

In [4]:
# Title-case the widget titles. Presumably this is so that the exact-match
# lookups further down (e.g. "Sectoral Information", "Operational Environment")
# do not depend on how each title was originally capitalised.
af_widgets['title'] = af_widgets['title'].str.title()

In [5]:
# Filter out test frameworks: titles shorter than 4 characters, or a title /
# description mentioning train, test, clone or draft (case-insensitive).
# NOTE(review): this is a substring match, so e.g. "training" or "latest" are
# excluded as well.
test_af_pattern = "train|test|clone|draft"
is_test_af = (
    (afs["title"].str.len() < 4)
    # na=False: a missing title / description counts as "no match" explicitly,
    # instead of relying on how NaN is coerced inside the `|` operator.
    | afs["title"].str.contains(test_af_pattern, regex=True, case=False, na=False)
    | afs["description"].str.contains(test_af_pattern, regex=True, case=False, na=False)
)
afs = afs[~is_test_af]
len(afs)

722

In [6]:
# Decode every widget's JSON `properties` string into Python objects.
af_widgets["properties"] = [json.loads(raw) for raw in af_widgets["properties"]]

# Attach the title of the owning framework to each widget. The join is an
# inner join (the pandas default), so widgets whose framework is not in `afs`
# are dropped here.
af_widgets = (
    af_widgets
    .merge(
        afs[["id", "title"]],
        left_on="analysis_framework_id",
        right_on="id",
        suffixes=("", "_af"),
    )
    .drop(columns="id_af")
    .rename(columns={"title_af": "analysis_framework_title"})
)

In [7]:
def process_str(s):
    """Normalise a pillar / sub-pillar / sector title to make matching easier.

    The title is upper-cased, "/" is surrounded by spaces, "&" is spelled out
    as "AND", and runs of whitespace are collapsed to a single space.

    Args:
        s: the raw title; may be None, NaN or an empty string.

    Returns:
        The normalised title, or "" when the title is missing or empty.
    """
    # `s != s` is only true for NaN, which is what pandas gives for a missing value.
    if s is None or s != s or s == "":
        return ""
    s = s.replace("/", " / ")
    s = s.replace("&", " and ")
    # Upper-case only after spelling out "&", so that "A & B" and "A and B"
    # both normalise to "A AND B" (upper-casing first left a lower-case "and").
    s = s.upper()
    return re.sub(r"\s+", " ", s).strip()

def get_2d_pillars(widget_properties):
    """Extract 2D pillar and 2D sub-pillar titles from `matrix2dWidget` properties.

    Reads `widget_properties["data"]["dimensions"]`; each dimension contributes
    its `title` as a pillar and the `title` of each entry in `subdimensions`
    as a sub-pillar. All titles go through `process_str`.

    Args:
        widget_properties: the widget's decoded properties dict, or None.

    Returns:
        A `(pillars, subpillars)` tuple of sets of strings. Sub-pillars are
        formatted as "PILLAR->SUB-PILLAR". Both sets are empty when the
        properties, their "data" entry or the dimensions are missing.
    """
    data = (widget_properties or {}).get("data") or {}
    pillars, subpillars = set(), set()
    for dim in data.get("dimensions") or []:
        pillar = process_str(dim["title"])
        pillars.add(pillar)
        # Adding straight to the set keeps the sub-pillars of every dimension,
        # even when two dimensions share the same (normalised) title.
        for sub_dim in dim.get("subdimensions") or []:
            sub_pillar = process_str(sub_dim["title"])
            subpillars.add(f"{pillar}->{sub_pillar}")
    return pillars, subpillars


def get_1d_pillars(widget_properties):
    """Extract 1D pillar and 1D sub-pillar titles from `matrix1dWidget` properties.

    Reads `widget_properties["data"]["rows"]`; each row contributes its `title`
    as a pillar and the `value` of each entry in `cells` as a sub-pillar.
    All titles go through `process_str`.

    Args:
        widget_properties: the widget's decoded properties dict, or None.

    Returns:
        A `(pillars, subpillars)` tuple of sets of strings. Sub-pillars are
        formatted as "PILLAR->SUB-PILLAR". Both sets are empty when the
        properties, their "data" entry or the rows are missing.
    """
    data = (widget_properties or {}).get("data") or {}
    pillars, subpillars = set(), set()
    for row in data.get("rows") or []:
        pillar = process_str(row["title"])
        pillars.add(pillar)
        # Adding straight to the set keeps the sub-pillars of every row,
        # even when two rows share the same (normalised) title.
        for cell in row.get("cells") or []:
            sub_pillar = process_str(cell["value"])
            subpillars.add(f"{pillar}->{sub_pillar}")
    return pillars, subpillars


def get_sectors(widget_properties):
    """Extract sector and sub-sector titles from `matrix2dWidget` properties.

    Reads `widget_properties["data"]["sectors"]`; each entry contributes its
    `title` as a sector and the `title` of each entry in `subsectors` as a
    sub-sector. All titles go through `process_str`.

    Args:
        widget_properties: the widget's decoded properties dict, or None.

    Returns:
        A `(sectors, subsectors)` tuple of sets of strings. Sub-sectors are
        formatted as "SECTOR->SUB-SECTOR". Both sets are empty when the
        properties, their "data" entry or the sectors are missing.
    """
    data = (widget_properties or {}).get("data") or {}
    sectors, subsectors = set(), set()
    for sec in data.get("sectors") or []:
        sector = process_str(sec["title"])
        sectors.add(sector)
        # Adding straight to the set keeps the sub-sectors of every sector,
        # even when two sectors share the same (normalised) title.
        for sub_sec in sec.get("subsectors") or []:
            sub_sector = process_str(sub_sec["title"])
            subsectors.add(f"{sector}->{sub_sector}")
    return sectors, subsectors



In [8]:
def match_score(af_user, af_predefined):
    """Score how well `af_predefined` covers the user-defined framework `af_user`.

    Both arguments are dicts holding widget properties under the keys
    "2D Matrix" and "1D Matrix". Three ratios are computed, each relative to
    the number of items in the *user* framework, and then summed:

    * the share of user sectors that also exist in the predefined framework,
    * the share of user 1D sub-pillars found in the predefined framework,
    * the share of user 2D sub-pillars found in the predefined framework.

    Notes:
        1. Sub-sectors are not matched since they are not used at the moment.
        2. Sometimes a 2D sub-pillar of one framework corresponds to a 1D
           sub-pillar of the other, so user sub-pillars are looked up in the
           union of the predefined 1D and 2D sub-pillars.

    Returns:
        The sum of the three ratios (between 0 and 3). A ratio is 0 when the
        user framework has no items of that kind. The score is not symmetric
        in its two arguments.
    """
    def _matched_ratio(user_items, predefined_items):
        # share of `user_items` that are also in `predefined_items`
        if not user_items:
            return 0
        return len(user_items & predefined_items) / len(user_items)

    user_sectors, _ = get_sectors(af_user["2D Matrix"])
    _, user_subpillars_1d = get_1d_pillars(af_user["1D Matrix"])
    _, user_subpillars_2d = get_2d_pillars(af_user["2D Matrix"])

    predefined_sectors, _ = get_sectors(af_predefined["2D Matrix"])
    _, predefined_subpillars_1d = get_1d_pillars(af_predefined["1D Matrix"])
    _, predefined_subpillars_2d = get_2d_pillars(af_predefined["2D Matrix"])
    predefined_subpillars = predefined_subpillars_1d | predefined_subpillars_2d

    return (
        _matched_ratio(user_sectors, predefined_sectors)
        + _matched_ratio(user_subpillars_1d, predefined_subpillars)
        + _matched_ratio(user_subpillars_2d, predefined_subpillars)
    )


In [9]:
# Test `match_score` function
def _widget_properties(af_id, widget_title):
    # properties of the first widget of framework `af_id` titled `widget_title`
    rows = af_widgets[
        af_widgets["analysis_framework_id"].eq(af_id)
        & af_widgets["title"].eq(widget_title)
    ]
    return rows["properties"].iloc[0]


af_1 = {  # IMMAP/DFS Situation Analysis Framework (id=1306)
    "2D Matrix": _widget_properties(1306, "Sectoral Information"),
    "1D Matrix": _widget_properties(1306, "Operational Environment"),
}
af_2 = {  # IFRC Master Framework 2019 (id=699)
    "2D Matrix": _widget_properties(699, "Sectoral Information"),
    "1D Matrix": _widget_properties(699, "Operational Environment"),
}
# Cross scores in both directions, then each framework against itself.
match_score(af_1, af_2), match_score(af_2, af_1), match_score(af_1, af_1), match_score(af_2, af_2)


(1.3487193920630454, 1.6090909090909091, 3.0, 3.0)

In [10]:
# Titles of the frameworks looked up below; these are the candidates that
# `map_to_nlp_framework` can return.
mapped_af_titles = [
    "2020 Okular Analytics Framework",
    "Colombia-AF",
    "GIMAC Generic",
    "IFRC Analytical Framework 2018",
    "IFRC ESSN Turkey 2021",
    "IFRC Master Framework 2019",
    "IMMAP/DFS Situation Analysis Framework",
    # "JIAF Multi Sectoral" is currently excluded
    "Nigeria Situation Analysis (OA)",
    "Okular Analytics Generic",
    "Okular Analytics Libya",
    "Rohingya Framework",
    "Situation Analysis Generic Libya",
    "Situation Analysis Generic Yemen",
]

# Widget titles under which a framework's 2D matrix may appear.
mat2d_titles = [
    "Pre-Crisis",
    "Shock/Event",
    "In-Crisis",
    "Sectors",
    "Sectoral Information",
    "Matrix 2D",
]

# Widget titles under which a framework's 1D matrix may appear.
mat1d_titles = [
    "Operational Environment",
    "Cross Sector",
    "Matrix 1D",
    "Cross Sectors",
    ('Operational Environment - Dimension', 'Operational Environment - Subdimension'),
    'Flag',
    "High Level Tags",
    ('Flag - Dimension', 'Flag - Subdimension'),
    ('High Level Tags - Dimension', 'High Level Tags - Subdimension'),
]

# For each mapped framework, keep the properties of the first widget whose
# title is a known 2D matrix title and of the first whose title is a known
# 1D matrix title.
already_mapped_frameworks = {}
for af_title in mapped_af_titles:
    af_rows = af_widgets[af_widgets["analysis_framework_title"].eq(af_title)]
    already_mapped_frameworks[af_title] = {
        "2D Matrix": af_rows[af_rows["title"].isin(mat2d_titles)]["properties"].iloc[0],
        # WARNING: this will not always give complete info, because some
        # frameworks have two 1D matrices (Flags and another one) and only
        # the first match is kept.
        "1D Matrix": af_rows[af_rows["title"].isin(mat1d_titles)]["properties"].iloc[0],
    }

In [11]:
def map_to_nlp_framework(af_user, af_candidates=None):
    """Return the title of the pre-mapped framework that best matches `af_user`.

    `match_score` is computed between `af_user` and every candidate, and the
    title of the candidate with the highest score is returned. On a tie the
    first candidate in iteration order wins.

    Args:
        af_user: the user-defined framework, a dict with "2D Matrix" and
            "1D Matrix" widget properties.
        af_candidates: optional dict mapping a framework title to its
            "2D Matrix" / "1D Matrix" widget properties. Defaults to the
            module-level `already_mapped_frameworks`.

    Returns:
        The best-matching framework title, or None when no candidate scores
        above 0 (including when there are no candidates).
    """
    if af_candidates is None:
        af_candidates = already_mapped_frameworks
    best_matching_framework = None
    highest_matching_score = 0
    for af_predefined_title, af_predefined in af_candidates.items():
        matching_score = match_score(af_user, af_predefined)
        # strict ">" keeps the earliest candidate on ties and ignores zero scores
        if matching_score > highest_matching_score:
            highest_matching_score = matching_score
            best_matching_framework = af_predefined_title
    return best_matching_framework

In [12]:
# "Syria Conflict Generic Framework (OA)"
# NOTE(review): id 1306 is labelled "IMMAP/DFS Situation Analysis Framework" in
# the `match_score` test cell, which does not agree with the heading of this
# cell - confirm which framework is meant to be the example.
_example_af_rows = af_widgets[af_widgets["analysis_framework_id"].eq(1306)]
user_defined_af_example = {
    "2D Matrix": _example_af_rows[
        _example_af_rows["title"].eq("Sectoral Information")
    ]["properties"].iloc[0],
    "1D Matrix": _example_af_rows[
        _example_af_rows["title"].eq("Operational Environment")
    ]["properties"].iloc[0],
}
map_to_nlp_framework(user_defined_af_example)

'IMMAP/DFS Situation Analysis Framework'