In [1]:
import os
import pandas as pd
from typing import List
import json
from collections import defaultdict
from copy import copy

In [2]:
def flatten(t: List[List]) -> List:
    """Concatenate the items of every sub-list of ``t`` into one flat list, in order."""
    merged = []
    for inner in t:
        merged.extend(inner)
    return merged

In [3]:
# Final mapping sheet: the columns to load, listed in the order they are kept.
ordered_columns = [
    "Original first level",
    "Original second level",
    "Original third level",
    "NLP Type",
    "NLP first level",
    "NLP second level",
    "NLP third level",
    "NLP fourth level",
    "Reversible",
]

# Read only those columns, then re-select them so the frame follows `ordered_columns`.
hum_mapping_sheet = pd.read_csv(
    "tables/mapping_sheet_original2nlp.csv", usecols=ordered_columns
)
hum_mapping_sheet = hum_mapping_sheet[ordered_columns].reset_index(drop=True)

# Separate copy of the sheet as loaded, made before any later cell modifies `hum_mapping_sheet`.
original_sheet = hum_mapping_sheet.copy()

In [4]:
# Columns holding the original (analysis-framework) tag levels.
original_levels_cols = [
    "Original first level",
    "Original second level",
    "Original third level",
]

# Columns holding the mapped NLP tag levels that get normalised here.
# NOTE(review): "NLP fourth level" is not in this list, so it is not normalised — confirm intended.
mapped_levels_cols = [
    "NLP first level",
    "NLP second level",
    "NLP third level",
]

# Original levels: lower-case, drop tabs and bullet characters, trim whitespace.
# Non-string cells (e.g. NaN for empty cells) are left untouched.
for one_level in original_levels_cols:
    hum_mapping_sheet[one_level] = hum_mapping_sheet[one_level].apply(
        lambda x: x.lower().replace("\t", "").replace("•", "").strip()
        if isinstance(x, str)
        else x
    )

# NLP levels: trim first, then capitalise. Capitalising before stripping would
# leave a value with leading whitespace entirely lower-case, because
# str.capitalize() only upper-cases the very first character.
for one_level in mapped_levels_cols:
    hum_mapping_sheet[one_level] = hum_mapping_sheet[one_level].apply(
        lambda x: x.strip().capitalize() if isinstance(x, str) else x
    )

# Reassign instead of mutating in place so the lineage of the frame stays explicit.
hum_mapping_sheet = hum_mapping_sheet.drop_duplicates()

hum_mapping_sheet.shape

(3127, 9)

In [5]:
# Keep only the rows whose 'Reversible' cell equals the string 'TRUE', without duplicates.
is_reversible = hum_mapping_sheet['Reversible'] == 'TRUE'
nlp_mapping_sheet = hum_mapping_sheet[is_reversible].drop_duplicates()
nlp_mapping_sheet.shape

(616, 9)

In [6]:
# Keep only the rows whose 'Original third level' cell stringifies to 'nan' (i.e. is empty).
third_level_is_empty = nlp_mapping_sheet['Original third level'].apply(
    lambda cell: str(cell) == 'nan'
)
nlp_mapping_sheet = nlp_mapping_sheet[third_level_is_empty]

In [7]:
# Values of the "NLP Type" column listed for reference; the string "nan" is
# included alongside the named types.
NLP_TYPES = [
    "2D", "nan", "1D", "Sector",
    "DEMOGRAPHIC GROUPS", "SPECIFIC NEEDS GROUPS", "AFFECTED GROUPS",
    "SEVERITY", "RELIABILITY",
]

#TODO: review this and add demographic groups new level

def _is_missing(value) -> bool:
    """Return True when ``value`` stringifies to 'nan' (how this notebook detects empty cells)."""
    return str(value) == "nan"


def _first_present(row: pd.Series, columns: List[str]):
    """Return the first non-missing value of ``row`` among ``columns`` (in order), or None."""
    for column in columns:
        if not _is_missing(row[column]):
            return row[column]
    return None


def get_final_nlp_name(row: pd.Series) -> List[str]:
    """Build the NLP tag name(s) for one row of the mapping sheet.

    Args:
        row: a row exposing "NLP Type" and "NLP first level" .. "NLP fourth level".
            A cell counts as empty when ``str(cell) == "nan"``.

    Returns:
        A list of "->"-separated tag names. Empty when "NLP Type" or
        "NLP first level" is empty, or for an affected-groups row whose first
        level is not "Affected" or whose second level is empty.

    Raises:
        ValueError: if the normalised "NLP Type" is not one of the handled types.
    """
    if _is_missing(row["NLP Type"]) or _is_missing(row["NLP first level"]):
        return []

    # Normalise the type, e.g. "SPECIFIC NEEDS GROUPS" -> "specific_needs_groups".
    nlp_type = (
        row["NLP Type"].lower().replace("•", "").replace("\t", "").strip().replace(" ", "_")
    )
    tags: List[str] = []

    if nlp_type == "affected_groups":
        if row["NLP first level"] == "Affected" and not _is_missing(row["NLP second level"]):
            group = row["NLP second level"].capitalize()
            # The deepest filled level (fourth, else third) refines the group.
            detail = _first_present(row, ["NLP fourth level", "NLP third level"])
            if detail is None:
                tags.append(f"first_level_tags->Affected->{group}")
            else:
                tags.append(f"secondary_tags->{group}->{detail.capitalize()}")

    elif nlp_type in ("reliability", "severity", "specific_needs_groups"):
        tags.append(f"secondary_tags->{nlp_type}->{row['NLP first level'].capitalize()}")

    elif nlp_type == "demographic_groups":
        # The first level is always emitted as the gender tag; the deepest
        # filled level below it (fourth, third, then second) is the age tag.
        tags.append(f"secondary_tags->Gender->{row['NLP first level'].capitalize()}")
        age = _first_present(
            row, ["NLP fourth level", "NLP third level", "NLP second level"]
        )
        if age is not None:
            tags.append(f"secondary_tags->Age->{age.capitalize()}")

    elif nlp_type == "sector":
        sector = row["NLP first level"].capitalize()
        tags.append(f"first_level_tags->sectors->{sector}")
        if not _is_missing(row["NLP second level"]):
            tags.append(f"subsectors->{sector}->{row['NLP second level'].capitalize()}")

    elif nlp_type in ("1d", "2d"):
        pillar = row["NLP first level"].capitalize()
        if _is_missing(row["NLP second level"]):
            # Sometimes only the pillar is given.
            tags.append(f"first_level_tags->pillars_{nlp_type}->{pillar}")
        else:
            tags.append(
                f"subpillars_{nlp_type}->{pillar}->{row['NLP second level'].capitalize()}"
            )

    else:
        # ValueError is still an Exception, so existing `except Exception` callers keep working.
        raise ValueError(
            f"Unknown NLP Type {row['NLP Type']!r} (normalised to {nlp_type!r})"
        )

    return tags

In [8]:
# Map only the rows that were kept in `nlp_mapping_sheet`. Running the mapper
# over the whole `hum_mapping_sheet` and relying on index alignment computed
# (and could fail on) rows that are discarded anyway. `assign` returns a new
# frame, so nothing is written into a filtered slice.
nlp_mapping_sheet = nlp_mapping_sheet.assign(
    mapped_nlp=nlp_mapping_sheet.apply(get_final_nlp_name, axis=1)
)

# Unique tag names produced by the mapping.
nlp_tags_mapping = list(set(flatten(nlp_mapping_sheet["mapped_nlp"])))

# Derive the parent of every sub-tag by dropping its last segment, e.g.
# "subpillars_1d->Context->Economy" -> "pillars_1d->Context".
pillars = [
    "->".join(tag.split("->")[:-1]).replace("subpillars", "pillars")
    for tag in nlp_tags_mapping
    if "subpillars" in tag
]
sectors = [
    "->".join(tag.split("->")[:-1]).replace("subsectors", "sectors")
    for tag in nlp_tags_mapping
    if "subsectors" in tag
]

# Make sure every parent pillar/sector is present as a first-level tag.
nlp_tags_mapping += [f"first_level_tags->{tag}" for tag in sectors]
nlp_tags_mapping += [f"first_level_tags->{tag}" for tag in pillars]

# De-duplicate again (parents may already be present) and sort for stable output.
nlp_tags_mapping = sorted(set(nlp_tags_mapping))

len(nlp_tags_mapping)

164

In [9]:
# Display the sorted, de-duplicated list of NLP tag names.
nlp_tags_mapping

['first_level_tags->Affected->Displaced',
 'first_level_tags->Affected->Non displaced',
 'first_level_tags->pillars_1d->Casualties',
 'first_level_tags->pillars_1d->Context',
 'first_level_tags->pillars_1d->Covid-19',
 'first_level_tags->pillars_1d->Displacement',
 'first_level_tags->pillars_1d->Humanitarian access',
 'first_level_tags->pillars_1d->Information and communication',
 'first_level_tags->pillars_1d->Shock/event',
 'first_level_tags->pillars_2d->At risk',
 'first_level_tags->pillars_2d->Capacities & response',
 'first_level_tags->pillars_2d->Humanitarian conditions',
 'first_level_tags->pillars_2d->Impact',
 'first_level_tags->pillars_2d->Priority interventions',
 'first_level_tags->pillars_2d->Priority needs',
 'first_level_tags->sectors->Agriculture',
 'first_level_tags->sectors->Cross',
 'first_level_tags->sectors->Education',
 'first_level_tags->sectors->Food security',
 'first_level_tags->sectors->Health',
 'first_level_tags->sectors->Livelihoods',
 'first_level_tags->s

In [10]:
# Output folder for the generated mapping files. `exist_ok=True` makes the
# cell safe to re-run and removes the check-then-create race of
# `os.path.exists` followed by `os.mkdir`.
nlp_mapping_folder = 'nlp_mapping_files'
os.makedirs(nlp_mapping_folder, exist_ok=True)

In [11]:
# Persist the list of NLP tags as JSON in the output folder.
with open(f"{nlp_mapping_folder}/all_nlp_tags.json", "w") as tags_file:
    json.dump(nlp_tags_mapping, tags_file)

In [12]:
def _clean_tag(tag: str):
    return (
        copy(tag).lower()
        .replace("->->", "->")
        .replace("-> ", "->")
        .replace(" ->", "->")
        .replace("->none", "")
        .replace("->n/a", "")
        .replace("\t", "")
        .replace("•", "")
        .replace('.', '')
        .replace(',', '')
    )

# Seed the lookup with the lower-cased last segment of every NLP tag.
# If two tags share a last segment, the later one in the list wins.
mapping_tags_nlp2original = {
    nlp_tag.split('->')[-1].lower(): [nlp_tag] for nlp_tag in nlp_tags_mapping
}

# Add one entry per mapping-sheet row, keyed by the original first level or,
# when a second level is present, by "first->second".
for _, sheet_row in nlp_mapping_sheet.iterrows():
    first_level = sheet_row['Original first level']
    second_level = sheet_row['Original second level']
    if str(second_level) == 'nan':
        original_key = first_level
    else:
        original_key = f"{first_level}->{second_level}"
    mapping_tags_nlp2original[original_key] = sheet_row['mapped_nlp']

# Normalise every key with `_clean_tag`; keys that clean to the same string collapse (last wins).
mapping_tags_nlp2original = {
    _clean_tag(raw_key): nlp_tags
    for raw_key, nlp_tags in mapping_tags_nlp2original.items()
}

with open(f"{nlp_mapping_folder}/mapping_tags_nlp2original.json", 'w') as mapping_file:
    json.dump(mapping_tags_nlp2original, mapping_file)

In [13]:
# Display the mapping from cleaned keys to lists of NLP tag names.
mapping_tags_nlp2original

{'displaced': ['first_level_tags->Affected->Displaced'],
 'non displaced': ['first_level_tags->Affected->Non displaced'],
 'casualties': ['first_level_tags->pillars_1d->Casualties'],
 'context': ['first_level_tags->pillars_1d->Context'],
 'covid-19': ['first_level_tags->pillars_1d->Covid-19'],
 'displacement': ['first_level_tags->pillars_1d->Displacement'],
 'humanitarian access': ['first_level_tags->pillars_1d->Humanitarian access'],
 'information and communication': ['first_level_tags->pillars_1d->Information and communication'],
 'shock/event': ['first_level_tags->pillars_1d->Shock/event'],
 'at risk': ['first_level_tags->pillars_2d->At risk'],
 'capacities & response': ['first_level_tags->pillars_2d->Capacities & response'],
 'humanitarian conditions': ['first_level_tags->pillars_2d->Humanitarian conditions'],
 'impact': ['first_level_tags->pillars_2d->Impact'],
 'priority interventions': ['first_level_tags->pillars_2d->Priority interventions'],
 'priority needs': ['first_level_tag

In [14]:
#script to be  fed in DEEP

import enchant
from copy import copy
from typing import List, Dict
from collections import Counter
import json


def _clean_tag(tag: str):
    return (
        copy(tag).lower()
        .replace("->->", "->")
        .replace("-> ", "->")
        .replace(" ->", "->")
        .replace("->none", "")
        .replace("->n/a", "")
        .replace("\t", "")
        .replace("•", "")
        .replace('.', '')
        .replace(',', '')
        .replace("!", "")
        .replace("?", "")
    )


def _short_distance_matching(tag, matching_dict: Dict[str, List[str]]):
    keys = list(matching_dict.keys())
    distances = defaultdict(list)
    for k, v in matching_dict.items():
        dist = enchant.utils.levenshtein(k, tag)
        min_distance = 1 + (len(k) // 10)
        if dist <= min_distance:
            distances[k].append(-dist) #neg value so we get the min sum in the end

    if len(distances)==0:
        return []
    else:
        distances = {k: sum(v) for k, v in distances.items()}
        min_key = min(distances, key=distances.get)
        return matching_dict[min_key]


def _one_tag2nlp_match(tag, matching_dict) -> List[str]:
    
    if type(tag) is str:
        clean_tag = _clean_tag(tag)
        mapped_tag = matching_dict.get(clean_tag)
        if mapped_tag is None:
            return _short_distance_matching(clean_tag, matching_dict)
        else:
            return mapped_tag
    else:
        return []

def af2nlp_matching(
    tags: List[Dict[str, object]],
    matching_dict_path: str = "nlp_mapping_files/mapping_tags_nlp2original.json",
) -> Dict[int, List[str]]:
    """Map original analysis-framework tags to NLP tags.

    Args:
        tags: one dict per original tag, each with an 'id' and a 'text' entry.
        matching_dict_path: path of the JSON file holding the mapping from
            cleaned original tags to lists of NLP tags.

    Returns:
        A dict mapping each tag's 'id' to the list of NLP tags matched for its
        'text' (an empty list when nothing matches). Ids are ints in the
        sample call; any value stored under 'id' is used as the key as-is.
    """
    with open(matching_dict_path, 'r') as f:
        matching_dict = json.load(f)

    return {
        one_original_af_tag['id']: _one_tag2nlp_match(
            one_original_af_tag['text'], matching_dict
        )
        for one_original_af_tag in tags
    }


In [15]:
# Sample original tags: a truncated tag, noisy punctuation, a demographic
# label, an unrelated word, and a pillar->subpillar pair with and without its pillar.
sample_texts = [
    "vulnerability/pi->children with disabilitie",
    "!health.",
    "Youth (18 to 24 years old)",
    "test",
    "CAPACITIES and response->national response capacity",
    "national response capacity",
]
af_tags_test = [
    {'id': position, 'text': text}
    for position, text in enumerate(sample_texts, start=1)
]

af2nlp_matching(af_tags_test)

{1: [],
 2: ['first_level_tags->sectors->Health'],
 3: ['secondary_tags->Gender->All', 'secondary_tags->Age->18-24 years old'],
 4: [],
 5: ['subpillars_2d->Capacities & response->National response'],
 6: []}