In [68]:
import pandas as pd
import numpy as np

from flashtext import KeywordProcessor
import inflect

In [2]:
from wb_nlp import dir_manager

In [69]:
inflect_engine = inflect.engine()

In [30]:
jdc_tags_processor = KeywordProcessor()

In [12]:
tags_sheet = pd.read_excel(dir_manager.get_data_dir("whitelists", "jdc", "List_filtering_keywords.xlsx"), header=None, index_col=0).rename(columns={1: "tag_keyword"})

### Definition of data input:

The input to the tag extractor is an excel or csv file. The first column of the data must be the intended tag keyword. To remove ambiguity, a header with name "tag_keyword" must be present. Additionally, all non-empty values in the columns to the right of the tag keyword are considered as prototypes. Occurences of these prototypes will be mapped to the tag keyword.


In [72]:
def get_keywords_mapping(tags_sheet):
    tags_mapping = tags_sheet.set_index("tag_keyword").T.apply(
        # If prototypes have "underscores" create a copy with the underscore replaced with a space.
        lambda x: [[i] if "_" not in i else [i, i.replace("_", " ")] for i in x.dropna().tolist()] +

        # Add the tag keyword as well
        [[x.name, x.name.replace("_", " ")]])

    # Clean up the keywords to remove duplicates.
    tags_mapping = tags_mapping.map(lambda x: sorted(set([j for i in x for j in i])))

    tags_mapping = tags_mapping.map(
        lambda x: x + [inflect_engine.plural(i) for i in x if "_" not in i])

    # Clean up the keywords to remove duplicates.
    tags_mapping = tags_mapping.map(
        lambda x: sorted(set(x)))

    return tags_mapping

In [73]:
tags_mapping = get_keywords_mapping(tags_sheet)
tags_mapping.pop("Kakuma (Kenya)")

['Dagahaley (Kenya)',
 'Dagahaley (Kenya)s',
 'Hagadera (Kenya)',
 'Hagadera (Kenya)s',
 'Ifo (Kenya)',
 'Ifo (Kenya)s',
 'Kakuma (Kenya)',
 'Kakuma (Kenya)s',
 'Katumba (Tanzania)',
 'Katumba (Tanzania)s',
 "Kutupalong (Ukhia, Cox's Bazar, Bangladesh)",
 "Kutupalong (Ukhia, Cox's Bazar, Bangladesh)s",
 'Mishamo (Tanzania)',
 'Mishamo (Tanzania)s',
 'Panian (Pakistan)',
 'Panian (Pakistan)s',
 'Pugnido (Ethiopia)',
 'Pugnido (Ethiopia)s',
 'Yida (South Sudan)',
 'Yida (South Sudan)s',
 'Zaatari (Jordan)',
 'Zaatari (Jordan)s']

In [74]:
tags_mapping

tag_keyword
population_of_concern              [PoC, PoCs, population of concern, population_...
refugee                                                          [refugee, refugees]
internally_displaced_population    [idp, idps, internally displaced, internally d...
stateless                                                   [stateless, statelesses]
climate_refugee                    [climate refugee, climate refugees, climate_re...
returnee                                                       [returnee, returnees]
refugee_camp                             [refugee camp, refugee camps, refugee_camp]
host_community                     [host communities, host community, host_commun...
asylum_seeker                      [asylum, asylum seeker, asylum seekers, asylum...
country_of_asylum                  [countries of asylum, country of asylum, count...
forced_displacement                [displaced people, displaced person, displaced...
ocha                                                 

In [75]:
tags_mapping["unhcr"]

['High Commission for Refugees',
 'High Commissions for Refugees',
 'United Nations High Commission for Refugees',
 'United Nations High Commissions for Refugees',
 'unhcr',
 'unhcrs']

In [31]:
jdc_tags_processor.add_keywords_from_dict(tags_mapping)

In [49]:
with open(dir_manager.get_data_dir("corpus", "WB", "TXT_ORIG", "wb_731917.txt"), "rb") as open_file:
    txt = open_file.read().decode("utf-8", errors="ignore")

In [63]:
txt += " the refugee, asylum seeker, internally displaced population  and stateless people are suffering in exile with other refugee, asylum seeker, and stateless people and climate refugees under the high Commission for refugees."

In [64]:
jdc_tags_processor.extract_keywords(txt)

['refugee',
 'asylum_seeker',
 'stateless',
 'exile',
 'refugee',
 'asylum_seeker',
 'stateless',
 'climate_refugee',
 'refugee',
 'asylum_seeker',
 'stateless',
 'exile',
 'refugee',
 'asylum_seeker',
 'stateless',
 'refugee',
 'asylum_seeker',
 'internally_displaced_population',
 'stateless',
 'exile',
 'refugee',
 'asylum_seeker',
 'stateless',
 'refugee',
 'asylum_seeker',
 'internally_displaced_population',
 'stateless',
 'exile',
 'refugee',
 'asylum_seeker',
 'stateless',
 'unhcr',
 'refugee',
 'asylum_seeker',
 'internally_displaced_population',
 'stateless',
 'exile',
 'refugee',
 'asylum_seeker',
 'stateless',
 'unhcr']

In [66]:
tags_mapping.pop("Kakuma (Kenya)")

['Dagahaley (Kenya)',
 'Hagadera (Kenya)',
 'Ifo (Kenya)',
 'Kakuma (Kenya)',
 'Katumba (Tanzania)',
 "Kutupalong (Ukhia, Cox's Bazar, Bangladesh)",
 'Mishamo (Tanzania)',
 'Panian (Pakistan)',
 'Pugnido (Ethiopia)',
 'Yida (South Sudan)',
 'Zaatari (Jordan)']

In [67]:
tags_mapping

tag_keyword
population_of_concern              [PoC, population of concern, population_of_con...
refugee                                                                    [refugee]
internally_displaced_population    [idp, internally displaced, internally displac...
stateless                                                                [stateless]
climate_refugee                                   [climate refugee, climate_refugee]
returnee                                                                  [returnee]
refugee_camp                                            [refugee camp, refugee_camp]
host_community                                      [host community, host_community]
asylum_seeker                                 [asylum, asylum seeker, asylum_seeker]
country_of_asylum                             [country of asylum, country_of_asylum]
forced_displacement                [displaced person, displaced population, displ...
ocha                                                 