In [1]:
import re
import string
import functools
from ast import literal_eval
from operator import itemgetter
from collections import Counter, defaultdict, OrderedDict
from collections.abc import Sequence

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer

from tqdm.notebook import tqdm

from num2words import num2words
                                                                
import nltk
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

from IPython.core.display import HTML

In [2]:
# Hook tqdm into pandas so that the `progress_apply` family of methods is
# available on DataFrame / Series objects in the cells that follow.
tqdm.pandas()

In [3]:
def preprocess(doc):
    """Normalise a raw excerpt into lower-cased, punctuation-free text.

    Steps, in order: lower-case, put spaces around numbers, map the various
    unicode dashes and underscores to a plain hyphen, re-join words that were
    hyphenated across a break ("inter- national"), blank out a fixed set of
    punctuation characters and finally collapse whitespace.

    Leading "[...]" / "(...)" prefixes are intentionally left in place here
    (the stripping step was disabled in the original notebook).

    Args:
        doc: The excerpt text. Missing values (NaN, None or any other
            non-string) are accepted.

    Returns:
        The normalised string, or "" for a missing / non-string value.
    """
    # Covers NaN (a float) as well as None, which would otherwise crash on
    # `.lower()`.
    if not isinstance(doc, str):
        return ""
    doc = doc.lower()
    # spaces between numbers and words ("12kg" -> "12 kg"); a number may
    # contain a single non-word separator such as "3.5" or "1,000"
    doc = re.sub(r'(\d+(\W\d+)?)', r' \1 ', doc).strip()
    # every dash-like character (and "_") becomes an ASCII hyphen
    doc = re.sub("[‐‑–—―─_]", "-", doc)
    # glue words that were split by a hyphen followed by a space
    doc = re.sub(r"(\w)\- (\w)", r"\1\2", doc)
    # replace the listed punctuation / symbol characters with a space
    # (this literal must NOT be a raw string: it relies on \u and \x escapes)
    doc = re.sub(
        "[" + re.escape(
            '_@^~.()[],"“’…<❖‐»—─|•&{≥➢\ue0e4\uf0d8\uf0fc●°#\u200b>`?�€■!‘%;̧\'›«”:≤―\uf0b7$}*´=‑▪\xad❑·–'
        ) + "]", " ", doc)
    # collapse runs of whitespace left behind by the substitutions
    doc = re.sub(r'\s+', " ", doc).strip()
    return doc
##
def preprocess_and_tokenize(doc, n=1):
    """Clean `doc` with `preprocess`, tokenize it and return the unique n-grams.

    For n == 1 the result is a set of token strings; for any other n it is a
    set of n-gram tuples.
    """
    tokens = word_tokenize(preprocess(doc))
    grams = tokens if n == 1 else ngrams(tokens, n)
    return set(grams)
##
def tokenize(words, n=1):
    """Return the unique n-grams of an already tokenized word sequence.

    n == 1 yields a set of the words themselves; any other n yields a set of
    n-gram tuples.
    """
    return set(words if n == 1 else ngrams(words, n))

In [4]:
# The same column subset is read from every split.
_usecols = [
    'entry_id', 'excerpt', 'specific_needs_groups', 'lang',
    "translation_en", "translation_fr", "translation_es"
]
df_train, df_val, df_test = [
    pd.read_csv(csv_path, usecols=list(_usecols))
    for csv_path in (
        "../data/train_v0.7.1.csv",
        "../data/val_v0.7.1.csv",
        "../data/test_v0.7.1.csv",
    )
]

In [5]:
col = 'specific_needs_groups'


def _parse_label_list(raw):
    """Turn one cell of the label column into a sorted, de-duplicated list.

    Freshly loaded cells hold the string repr of a list and are parsed with
    `literal_eval`. Cells that are already lists are only re-normalised, so
    re-running this notebook cell does not fail.
    """
    labels = literal_eval(raw) if isinstance(raw, str) else raw
    # no label filtering is applied (e.g. 'None' / 'NOT_MAPPED' are kept)
    return sorted(set(labels))


for df in [df_train, df_val, df_test]:
    df[col] = df[col].apply(_parse_label_list)

In [6]:
def _excerpt_in_english(frame):
    """Copy `frame`, replacing non-English excerpts by their English translation."""
    english = frame.copy()
    needs_translation = english["lang"].ne("en")
    english.loc[needs_translation,
                "excerpt"] = english.loc[needs_translation, "translation_en"]
    return english


df_train_en = _excerpt_in_english(df_train)
df_val_en = _excerpt_in_english(df_val)
df_test_en = _excerpt_in_english(df_test)

In [7]:
def unique_values(df, col):
    """Count how often each label occurs in a column of label lists.

    Returns a list of (label, count) pairs, most frequent first.
    """
    tally = Counter(label for labels in df[col] for label in labels)
    return tally.most_common()

In [8]:
# NOTE: the trailing comma turns this cell's value into a 1-tuple, which is
# why the displayed result is wrapped in an extra pair of parentheses.
unique_values(df_train_en, "specific_needs_groups"),

([('Pregnant or Lactating Women', 1667),
  ('Indigenous people', 1341),
  ('Persons with Disability', 1289),
  ('Minorities', 757),
  ('GBV survivors', 742),
  ('Unaccompanied or Separated Children', 638),
  ('Chronically Ill', 615),
  ('Female Head of Household', 573),
  ('LGBTQI+', 411),
  ('Single Women (including Widows)', 157),
  ('Child Head of Household', 139),
  ('Elderly Head of Household', 93)],)

In [9]:
# Short aliases for the label strings found in `specific_needs_groups`.
plw = "Pregnant or Lactating Women"
ip = "Indigenous people"
pwd = "Persons with Disability"
minorities = "Minorities"
gbv = "GBV survivors"
unaccompanied_child = "Unaccompanied or Separated Children"
chronically_ill = "Chronically Ill"
fhh = "Female Head of Household"
lgbt = "LGBTQI+"
single_women = "Single Women (including Widows)"
chh = "Child Head of Household"
ehh = "Elderly Head of Household"

In [10]:
def _has_any_label(labels):
    """True when the label list of a row is not empty."""
    return labels != []


# Training rows that carry at least one label.
df_train_en_positive = df_train_en[
    df_train_en['specific_needs_groups'].apply(_has_any_label)].copy()


def _train_rows_tagged(label):
    """Rows of `df_train_en_positive` whose label list contains `label`."""
    tagged = df_train_en_positive['specific_needs_groups'].apply(
        lambda labels: label in labels)
    return df_train_en_positive[tagged]


df_train_en_plw = _train_rows_tagged(plw)
df_train_en_ip = _train_rows_tagged(ip)
df_train_en_pwd = _train_rows_tagged(pwd)
df_train_en_minorities = _train_rows_tagged(minorities)
df_train_en_gbv = _train_rows_tagged(gbv)
df_train_en_unaccompanied_child = _train_rows_tagged(unaccompanied_child)
df_train_en_chronically_ill = _train_rows_tagged(chronically_ill)
df_train_en_fhh = _train_rows_tagged(fhh)
df_train_en_lgbt = _train_rows_tagged(lgbt)
df_train_en_single_women = _train_rows_tagged(single_women)
df_train_en_chh = _train_rows_tagged(chh)
df_train_en_ehh = _train_rows_tagged(ehh)

In [11]:
class KeywordExtractor:
    """Score the n-grams of several document classes against each other.

    On construction every document is tokenized into 1..`n_grams`-grams and
    counted, separately for a background corpus and for each class. For each
    class and each n-gram length three tables are then built:

    * ``ngram_likelihoods[c][n-1]``: count / total count within the class;
    * ``ngram_potts_scores[c][n-1]``: class likelihood divided by the sum of
      that likelihood and the summed likelihoods in all other classes;
    * ``ngram_z_score_of_the_log_odds_ratios[c][n-1]``: z-score of the log-odds
      ratio "class vs. all other classes", with the background counts added
      to both sides as a prior.

    `get_kws` returns the z-score table of one class as a ranked list.

    Args:
        docs_bg_corpus: Iterable of documents forming the background corpus.
        docs_classes: Mapping ``class name -> iterable of documents``; the
            mapping order defines the class indices.
        lang: Language code, used for fraction names and for `num2words`.
        n_grams: Largest n-gram length to extract.
        num_to_words: When true, numeric tokens are spelled out.
        stop_words: Optional iterable of tokens to drop.
    """

    def __init__(
        self,
        docs_bg_corpus,
        docs_classes,
        lang="en",
        n_grams=2,
        num_to_words=False,
        stop_words=None,
    ):
        # Accept any iterable (list, tuple, set, ...) so that the attribute
        # is always defined and always a set.
        self.stop_words = set() if stop_words is None else set(stop_words)
        self.n_grams = n_grams
        self.num_to_words = num_to_words
        self.docs_bg_corpus = docs_bg_corpus
        self.class_name_to_idx = {
            name: idx for idx, name in enumerate(docs_classes)
        }
        self.docs_classes = list(docs_classes.values())
        self.lang = lang
        ## for preprocessing - should be moved to a util class/func
        # superscript / subscript digits -> plain digits
        digits = [str(i) for i in range(10)]
        self.num_normalizer = dict(zip("⁰¹²³⁴⁵⁶⁷⁸⁹", digits))
        self.num_normalizer.update(zip("₀₁₂₃₄₅₆₇₈₉", digits))
        # vulgar fractions, spelled out per language
        fraction_names = {
            "en": {"⅓": "one-third", "¼": "one-fourth"},
            "fr": {"⅓": "un-tiers", "¼": "un-quart"},
            "es": {"⅓": "un-tercio", "¼": "un-cuarto"},
        }
        self.num_normalizer.update(fraction_names.get(lang, {}))
        ## background corpus counts, then one dict per n-gram length
        self.word_to_freq_bg_corpus = self.extract_word_counts(docs_bg_corpus)
        self.ngram_to_freq_bg_corpus = self._split_by_ngram_length(
            self.word_to_freq_bg_corpus, n_grams)
        self.bg_corpus_sizes = [
            sum(counts.values()) for counts in self.ngram_to_freq_bg_corpus
        ]
        ## per-class counts
        self.word_to_freq_classes = [
            self.extract_word_counts(corpus) for corpus in self.docs_classes
        ]
        # len(ngram_word_to_freq_classes) = number of classes
        # len(ngram_word_to_freq_classes[x]) = n_grams
        self.ngram_word_to_freq_classes = [
            self._split_by_ngram_length(class_counts, n_grams)
            for class_counts in self.word_to_freq_classes
        ]
        self.ngram_corpora_sizes = [[
            sum(counts.values()) for counts in per_length
        ] for per_length in self.ngram_word_to_freq_classes]
        ## likelihoods: one {"ngram in class": likelihood} dict per class
        ## and per n-gram length
        self.ngram_likelihoods = [[
            self.calc_likelihoods(counts, size)
            for counts, size in zip(per_length, sizes)
        ] for per_length, sizes in zip(self.ngram_word_to_freq_classes,
                                       self.ngram_corpora_sizes)]
        ## scores of each class against the union of all other classes
        n_classes = len(self.docs_classes)
        self.ngram_potts_scores = [[] for _ in range(n_classes)]
        self.ngram_z_score_of_the_log_odds_ratios = [
            [] for _ in range(n_classes)
        ]
        for c in range(n_classes):
            others = [c_other for c_other in range(n_classes) if c_other != c]
            for n in range(self.n_grams):
                # With a single class `others` is empty and the merged dicts
                # are simply empty instead of raising.
                other_lhs = self._merge_counts(
                    self.ngram_likelihoods[c_other][n] for c_other in others)
                self.ngram_potts_scores[c].append(
                    self.calc_potts_scores(self.ngram_likelihoods[c][n],
                                           other_lhs))

                word_to_freq_others = self._merge_counts(
                    self.ngram_word_to_freq_classes[c_other][n]
                    for c_other in others)
                corpus_size_others = sum(self.ngram_corpora_sizes[c_other][n]
                                         for c_other in others)
                self.ngram_z_score_of_the_log_odds_ratios[c].append(
                    self.calc_prior_modified_log_odds_ratio(
                        self.ngram_word_to_freq_classes[c][n],
                        self.ngram_corpora_sizes[c][n], word_to_freq_others,
                        corpus_size_others, self.ngram_to_freq_bg_corpus[n],
                        self.bg_corpus_sizes[n]))

    @staticmethod
    def _split_by_ngram_length(word_to_freq, n_grams):
        """Split a mixed count dict into one dict per n-gram length.

        String keys are unigrams (index 0); tuple keys of length k go to
        index k - 1.
        """
        by_length = [dict() for _ in range(n_grams)]
        for kw, count in word_to_freq.items():
            idx = 0 if isinstance(kw, str) else len(kw) - 1
            by_length[idx][kw] = count
        return by_length

    @staticmethod
    def _merge_counts(dicts):
        """Sum several ``key -> number`` dicts key-wise into a new dict."""
        merged = dict()
        for mapping in dicts:
            for key, value in mapping.items():
                merged[key] = merged.get(key, 0) + value
        return merged

    def preprocess_and_tokenize(self, doc):
        """Clean one document and return its tokens followed by its n-grams.

        Returns a list: first the unigram strings, then tuples for every
        n-gram length from 2 up to `self.n_grams`. Missing values (NaN, None
        or other non-strings) give an empty list.
        """
        if not isinstance(doc, str):
            return []
        # remove a leading "[...]" / "(...)" prefix (preceding dates)
        doc = re.sub(r"^\[.+\]", " ", doc).strip()
        doc = re.sub(r"^\(.+\)", " ", doc).strip()
        # spaces between numbers and words
        doc = re.sub(r'(\d+(\W\d+)?)', r' \1 ', doc).strip()
        # every dash-like character (and "_") becomes an ASCII hyphen
        doc = re.sub("[‐‑–—―─_]", "-", doc)
        # remove some punctuation (this literal must NOT be a raw string: it
        # relies on \u and \x escapes)
        doc = re.sub(
            "[" + re.escape(
                '_@^~.()[],"“’…<❖‐»—─|•&{≥➢\ue0e4\uf0d8\uf0fc●°#\u200b>`?�€■!‘%;̧\'›«”:≤―\uf0b7$}*+´=‑▪\xad❑·–'
            ) + "]", " ", doc)
        doc = re.sub(r'\s+', " ", doc)

        # tokenize
        words = word_tokenize(doc)
        # Drop stop words, then lower-case. Both the raw and the lower-cased
        # form are checked so that "The" is removed by a lower-case stop list.
        words = [
            word.lower() for word in words if
            word not in self.stop_words and word.lower() not in self.stop_words
        ]
        words = [self.num_normalizer.get(token, token) for token in words]
        if self.num_to_words:
            words = [
                num2words(token, lang=self.lang)
                if token.isnumeric() else token for token in words
            ]
        kw_kp = words.copy()
        for n in range(2, self.n_grams + 1):
            kw_kp.extend(list(ngrams(words, n)))
        return kw_kp

    def calc_potts_scores(self, word_to_likelihood_main,
                          word_to_likelihood_other):
        """Share of each word's likelihood that belongs to the main class.

        score = lh_main / (lh_main + lh_other), with lh_other taken as 0 for
        words absent from `word_to_likelihood_other`.
        """
        return {
            word: lh / (lh + word_to_likelihood_other.get(word, 0))
            for word, lh in word_to_likelihood_main.items()
        }

    def calc_likelihoods(self, word_to_freq, corpus_size):
        """Relative frequency (count / corpus_size) of every word."""
        return {
            word: count / corpus_size
            for word, count in word_to_freq.items()
        }

    def extract_word_counts(self, docs):
        """Count every token and n-gram over all `docs`.

        Returns a ``defaultdict(int)`` keyed by token string or n-gram tuple.
        """
        word_to_freq = defaultdict(int)
        for doc in docs:
            for word in self.preprocess_and_tokenize(doc):
                word_to_freq[word] += 1
        return word_to_freq

    def calc_prior_modified_log_odds_ratio(self, word_to_freq_c1,
                                           corpus_size_c1, word_to_freq_c2,
                                           corpus_size_c2, word_to_freq_all,
                                           corpus_size_all):
        """Z-score of the prior-smoothed log-odds ratio of c1 versus c2.

        For every word of c1 the background count is added to the counts of
        both sides, the log-odds of the word are computed on each side and
        their difference is divided by sqrt(1/n1 + 1/n2), where n1 and n2 are
        the smoothed counts.

        Words missing from `word_to_freq_all` get a prior of 0. Words for
        which a smoothed count or its complement is not positive cannot be
        scored and are left out of the result.

        Returns:
            dict mapping word -> z-score.
        """
        z_score_of_the_log_odds_ratio_c1 = dict()
        for word, count_c1 in word_to_freq_c1.items():
            prior = word_to_freq_all.get(word, 0)
            numerator_1 = count_c1 + prior
            denominator_1 = corpus_size_c1 + corpus_size_all - numerator_1
            numerator_2 = word_to_freq_c2.get(word, 0) + prior
            denominator_2 = corpus_size_c2 + corpus_size_all - numerator_2
            # log(0) and division by zero are undefined: skip such words
            if min(numerator_1, denominator_1, numerator_2,
                   denominator_2) <= 0:
                continue
            ratio_1 = np.log(numerator_1 / denominator_1)
            ratio_2 = np.log(numerator_2 / denominator_2)
            variance = (1 / numerator_1) + (1 / numerator_2)
            z_score_of_the_log_odds_ratio_c1[word] = (
                ratio_1 - ratio_2) / np.sqrt(variance)
        return z_score_of_the_log_odds_ratio_c1

    def get_kws(self, cls_name, n):
        """Ranked (ngram, z-score) pairs of length `n` for class `cls_name`.

        Raises:
            KeyError: if `cls_name` is not a known class.
            ValueError: if `n` is outside 1..`n_grams`.
        """
        cls_idx = self.class_name_to_idx[cls_name]
        # n - 1 is used as a list index; reject values that would wrap around
        if not 1 <= n <= self.n_grams:
            raise ValueError(
                f"n must be between 1 and {self.n_grams}, got {n}")
        kw_dict = self.ngram_z_score_of_the_log_odds_ratios[cls_idx][n - 1]
        return sorted(kw_dict.items(), key=itemgetter(1), reverse=True)

In [12]:
n_grams = 7
stop_words_en = None
# stop_words_en = stopwords.words("english")

# (label, training subset) pairs; the order fixes the class indices inside
# the extractor.
_class_frames = [
    (plw, df_train_en_plw),
    (ip, df_train_en_ip),
    (pwd, df_train_en_pwd),
    (minorities, df_train_en_minorities),
    (gbv, df_train_en_gbv),
    (unaccompanied_child, df_train_en_unaccompanied_child),
    (chronically_ill, df_train_en_chronically_ill),
    (fhh, df_train_en_fhh),
    (lgbt, df_train_en_lgbt),
    (single_women, df_train_en_single_women),
    (chh, df_train_en_chh),
    (ehh, df_train_en_ehh),
]
kwe_en = KeywordExtractor(
    df_train_en_positive['excerpt'].tolist(),
    {label: frame["excerpt"].tolist() for label, frame in _class_frames},
    n_grams=n_grams,
    stop_words=stop_words_en)

In [13]:
# Tokens that mark an excerpt as "Pregnant or Lactating Women".
kw_en_plw = [
    'abortion', 'abortions', 'ahi', 'ante-natal', 'anteparto', 'breastfall',
    'breastfed', 'breastfeed', 'breastfeeding', 'breastfeeds', 'childbirth',
    'childbirths', 'eclampsia', 'fetal', 'fetus', 'gestant', 'gestants',
    'gestation', 'gestational', 'gestations', 'gs/plw', 'hcw', 'impregnated',
    'infanting', 'intrapartum', 'ipt', 'lactate', 'lactates', 'lactating',
    'lactating/pregnant', 'lactation', 'lactations', 'miscarriage',
    'miscarriages', 'mothers/plw',
    # disabled: 'neo-natal', 'neonatal', 'neo-natals', 'neonatals',
    # 'neonatology', 'newborn', 'newborns', 'new-born', 'new-borns',
    'non-breastfall', 'non-breastfeeding', 'obsteric', 'obstestrium',
    'obstetric', 'obstetrician', 'obstetricians', 'obstetrics', 'parturients',
    'perinatal', 'perinatals', 'placental', 'placentarios', 'plw', 'plw/gs',
    'plw/mothers', 'plwg', 'plwgs', 'plws', 'post-birth', 'post-natal',
    'post-partum', 'postnatal', 'postpartum', 'pre-natal', 'preeclampsia',
    'pregnancies', 'pregnancy', 'pregnancy-related', 'pregnant',
    'pregnant/lactating', 'pregnants', 'prematurity-immature', 'prenatal',
]
# Keywords / n-grams that veto the label when present.
negative_kw_plw = []
# Regular expressions searched in the preprocessed excerpt text.
plw_patterns = []


def is_plw(row):
    """Decide whether `row` matches the PLW keyword rules.

    Order of checks: any negative keyword -> False; any pattern found in
    row["excerpt_pp"] -> True; any positive keyword -> True; otherwise False.
    String keywords are looked up in row['tokenized_excerpt']; a keyword of
    length 2..7 is (also) looked up in the n-gram column of that length.
    """
    ngram_columns = {
        2: 'bigram_excerpt',
        3: 'trigram_excerpt',
        4: 'fourgram_excerpt',
        5: 'fivegram_excerpt',
        6: 'sixgram_excerpt',
        7: 'sevengram_excerpt',
    }

    def found(kw):
        if isinstance(kw, str) and kw in row['tokenized_excerpt']:
            return True
        column = ngram_columns.get(len(kw))
        return column is not None and kw in row[column]

    if any(found(kw) for kw in negative_kw_plw):
        return False
    if any(re.search(p, row["excerpt_pp"]) for p in plw_patterns):
        return True
    return any(found(kw) for kw in kw_en_plw)
#list(sorted(set(kw_en_plw)))

In [14]:
# Tokens and bigrams that mark an excerpt as "Indigenous people".
kw_en_ip = [
    'indigenous', 'habitat', 'habitats', 'aboriginals',
    # disabled: 'inhabitants', 'inhabitant',
    ('aboriginal', 'people'), ('aboriginal', 'peoples'),
    ('aboriginal', 'persons'), ('aboriginal', 'community'),
    ('aboriginal', 'communities'), ('urban', 'communities'),
    ('urban', 'community'), ('rural', 'communities'),
    ('rural', 'community'), ('recipient', 'community'),
    ('recipient', 'communities'),
]
# Keywords / n-grams that veto the label when present.
negative_kw_ip = []
# Regular expressions searched in the preprocessed excerpt text.
ip_patterns = []


def is_ip(row):
    """Decide whether `row` matches the Indigenous-people keyword rules.

    Order of checks: any negative keyword -> False; any pattern found in
    row["excerpt_pp"] -> True; any positive keyword -> True; otherwise False.
    String keywords are looked up in row['tokenized_excerpt']; a keyword of
    length 2..7 is (also) looked up in the n-gram column of that length.
    """
    ngram_columns = {
        2: 'bigram_excerpt',
        3: 'trigram_excerpt',
        4: 'fourgram_excerpt',
        5: 'fivegram_excerpt',
        6: 'sixgram_excerpt',
        7: 'sevengram_excerpt',
    }

    def found(kw):
        if isinstance(kw, str) and kw in row['tokenized_excerpt']:
            return True
        column = ngram_columns.get(len(kw))
        return column is not None and kw in row[column]

    if any(found(kw) for kw in negative_kw_ip):
        return False
    if any(re.search(p, row["excerpt_pp"]) for p in ip_patterns):
        return True
    return any(found(kw) for kw in kw_en_ip)

In [33]:
# Population heads used to expand the phrase templates below.
_pwd_groups = ('persons', 'people', 'peoples', 'children', 'girls', 'boys',
               'women', 'men', 'elderly')
_pwd_groups_adults_children = ('persons', 'people', 'peoples', 'children',
                               'women', 'men', 'elderly')
_pwd_groups_with_youths = ('persons', 'people', 'peoples', 'children',
                           'girls', 'boys', 'youths', 'women', 'men',
                           'elderly')

# Tokens and n-grams that mark an excerpt as "Persons with Disability".
# The bare words 'disabilities', 'disability', 'disabled' and 'impaired' are
# deliberately not listed here. Order and duplicates are kept as authored.
kw_en_pwd = [
    'disabilites', 'handicap', 'pwds', 'pwd', 'wheelchair', 'wheelchairs',
    'disability-related', 'deaf', 'prosthesis', 'rehabilitative',
    'prosthetic', 'prostheses', 'deaf-deaf', 'atrophy',
    'disability-confidence', 'guide-dogs', 'power-wheelchairs', 'pwg',
    "pbs",  # Personnes à Besoins Spécifiques
    'psns',  # persons with specific needs (PSNs)
    "schizophrenia", 'deafness',
    # "<who> with reduced mobility"
    *[(who, 'with', 'reduced', 'mobility')
      for who in ('people', 'persons', 'children', 'men', 'women', 'elderly')],
    ('people', 'with', 'reduced', 'or', 'elderly', 'mobility'),
    # "<who> with specific needs", then "<who> with special needs"
    *[(who, 'with', kind, 'needs')
      for kind in ('specific', 'special') for who in _pwd_groups],
    *[(who, 'with', 'difficulties', 'in', 'functioning')
      for who in _pwd_groups_adults_children],
    "dementia",
    *[(who, 'with', 'certain', 'impairments')
      for who in _pwd_groups_adults_children],
    # "<kind> impairments" / "<kind> impairment" ('physical' appears twice)
    *[(kind, noun)
      for kind in ('physical', 'visual', 'sensory', 'hearing', 'physical',
                   'psychic', 'psychiatric', 'psychological', 'mobility')
      for noun in ('impairments', 'impairment')],
    # "<who> with physical", then "<who> with psychic"
    *[(who, 'with', trait)
      for trait in ('physical', 'psychic') for who in _pwd_groups_with_youths],
    ('living', 'with', 'specific', 'needs'),
    ('reduced', 'mobility'),
    ('impaired', 'mobility'),
    ('had', 'any', 'specific', 'need'),
    ('physical', 'difficulties'),
    ('physical', 'difficulty'),
    ('difficulty', 'walking'),
    ('people', 'have', 'specific', 'needs'),
    ('people', 'having', 'specific', 'needs'),
    ('difficulties', 'of', 'mobility'),
    ('difficulty', 'of', 'mobility'),
    ('barely', 'moves', 'alone'),
    # "<who> with <form>", including forms with a trailing hyphen / em dash
    *[(who, 'with', form)
      for form in ('disabilities', 'disabilities-', 'disabilities—',
                   'disability', 'disability-', 'disability—')
      for who in _pwd_groups],
    *[('disabled', who) for who in _pwd_groups],
    ('persons', 'in', 'disability'),
    ('people', 'in', 'disability'),
    ('persons', 'in', 'disabilities'),
    ('people', 'in', 'disabilities'),
    ('living', 'with', 'a', 'disability'),
    ('living', 'with', 'disability'),
    ('living', 'with', 'disabilities'),
]


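# Notes on the generic disability terms (handled by lf_pwd_pos_1):
#   'disability' / 'disabled' / 'impaired' should count as negative when the
#   excerpt is about payments, sales or the labor market,
#   e.g. "... impaired the labor ...".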
##
def lf_pwd_pos_1(row):
    """Labelling function for generic disability words.

    Returns 1 when row['tokenized_excerpt'] contains one of the generic
    disability terms and none of the economic-context terms (labor market,
    sales, payments); returns -1 in every other case.

    The context terms include both 'labor' and 'labour'; the misspelling
    'labuor' is kept so that previously excluded excerpts stay excluded.
    """
    tokens = row['tokenized_excerpt']
    disability_terms = {'disability', 'disabilities', 'disabled', 'impaired'}
    economic_terms = {
        'labor', 'labour', 'labuor', 'sales', 'market', 'payment', 'payments'
    }
    if (disability_terms & tokens) and not (economic_terms & tokens):
        return 1
    return -1


##
def lf_pwd_pos_2(row):
    """Labelling function for phrases seen in French-origin excerpts.

    Returns 1 when row["lang"] is "fr" and one of three fixed n-grams occurs
    in the row's four-gram / trigram sets; returns -1 otherwise.
    """
    if row["lang"] != "fr":
        return -1
    fourgrams = row["fourgram_excerpt"]
    hit = (('women', 'and', 'girls', 'stupids') in fourgrams
           or ('of', 'a', 'certain', 'vulnerability') in fourgrams
           or ('with', 'other', 'vulnerabilities') in row["trigram_excerpt"])
    return 1 if hit else -1


##

##
# Keywords / n-grams that veto the label when present.
negative_kw_pwd = []
# Regular expressions searched in the preprocessed excerpt text.
pwd_patterns = []


def is_pwd(row):
    """Decide whether `row` matches the Persons-with-Disability rules.

    Order of checks: any negative keyword -> False; any pattern found in
    row["excerpt_pp"] -> True; any labelling function voting 1 -> True; any
    positive keyword -> True; otherwise False. String keywords are looked up
    in row['tokenized_excerpt']; a keyword of length 2..7 is (also) looked up
    in the n-gram column of that length.
    """
    ngram_columns = {
        2: 'bigram_excerpt',
        3: 'trigram_excerpt',
        4: 'fourgram_excerpt',
        5: 'fivegram_excerpt',
        6: 'sixgram_excerpt',
        7: 'sevengram_excerpt',
    }

    def found(kw):
        if isinstance(kw, str) and kw in row['tokenized_excerpt']:
            return True
        column = ngram_columns.get(len(kw))
        return column is not None and kw in row[column]

    if any(found(kw) for kw in negative_kw_pwd):
        return False
    if any(re.search(p, row["excerpt_pp"]) for p in pwd_patterns):
        return True
    if any(lf(row) == 1 for lf in [lf_pwd_pos_1, lf_pwd_pos_2]):
        return True
    return any(found(kw) for kw in kw_en_pwd)

In [16]:
# Tokens and n-grams that mark an excerpt as "GBV survivors".
# Order and duplicates are kept exactly as authored.
kw_en_gbv = [
    'gbv', 'gbvs', 'raping', 'rape', 'rapes', 'raped', 'rapist', 'rapists',
    'sgbv', 'sgbvs', 'vbg', 'vbgs', 'vsbg', 'vbgs', 'gbbs', 'gbv-covid-',
    'gbvsc-nwsw-cameroon', 'sgtvbg', 'sgt', 'gbvv', 'gbvims', 'gbv-related',
    'gbv-specific', 'sgbv/covid', 'protection/gbv', 'gender-based', 'fgm',
    'fgms', 'fpcs',
    ('sexually', 'harassed'), ('survival', 'sex'),
    'vawg', 'bushwives', 'bushwife',
    ('sexual', 'slaves'), ('sexual', 'slave'),
    ('attacks', 'on', 'integrity'), ('infringement', 'of', 'integrity'),
    ('infringements', 'of', 'the', 'integrity'),
    'vsbgs',
    ('domestic', 'abuse'), ('intimate', 'violence'),
    ('inappropriate', 'intimate', 'behaviours'),
    ('females', 'facing', 'attacks'), ('girls', 'facing', 'attacks'),
    ('women', 'facing', 'attacks'), ('survivors', 'of', 'violence'),
    ('sexual', 'and', 'emotional', 'violence'),
    ('genital', 'mutilation'), ('sexual', 'assignments'),
    ('sexual', 'operating'),  # fr
    'vawg', 'vawgs',  # Violence Against Women and Girls (VAWG)
    'gbvie',  # GBV in Emergencies (GBViE)
    'psea',  # Protection against sexual exploitation and abuse (PSEA)
    ('sexual', 'violence'), ('genderbased', 'violence'),
    ('gender-based', 'violence'),
    ('sexual', 'abuse'), ('genderbased', 'abuse'), ('gender-based', 'abuse'),
    ('sexual', 'assault'), ('genderbased', 'assault'),
    ('gender-based', 'assault'),
    ('sexual', 'assault'), ('sexual', 'abuse'), ('sexual', 'exploitation'),
    ('sexual', 'harassment'), ('sexual', 'gender-based'),
    ('sexual', 'assaults'), ('sexual', 'exploitation'),
    ('integrity', 'abuse'), ('integrity', 'abuses'),
    ('integrity', 'attack'), ('integrity', 'attacks'),
    ('taking', 'over', 'the', 'integrity'), ('victims', 'of', 'infringement'),
    ('integrity', 'damage'), ('bacha', 'bazi'), ('teen', 'pregnancies'),
    ('forced', 'marriage'), ('forced', 'marriages'),
    ('domestic', 'violence'), ('victims', 'of', 'sexual'),
    ('intimate', 'partner', 'violence'),
    ('violence', 'against', 'women'), ('violence', 'against', 'girls'),
    ('abuse', 'and', 'sexual'), ('forced', 'into', 'child', 'marriage'),
    ('sexual', 'violence'), ('sexual', 'assault'), ('sexual', 'exploitation'),
    ('sexual', 'abuse'), ('sexual', 'maltreatment'),
    ('emotional', 'maltreatment'),
    ('gender', 'based', 'violence'),
    ('female', 'genital', 'mutilation'), ('female', 'genital', 'mutilations'),
]


def lf_pos_gbv_1(row):
    """Labeling function: a female term co-occurs with a violence/abuse term.

    Args:
        row: mapping whose "tokenized_excerpt" entry is a set of tokens.

    Returns:
        1 when the excerpt tokens contain at least one female-related term
        and at least one harm-related term, otherwise -1.
    """
    female_terms = {'women', 'woman', 'girls', 'girl', 'female', 'females'}
    harm_terms = {
        'violence', 'exploitation', 'exploited', 'assault', 'abuse',
        'abuses', 'maltreatment', 'maltreatments', 'survivors',
        'survivor', 'harassment', 'harassments',
        # The comma after 'harassments' used to be missing, which silently
        # fused it with 'discrimination' into one token that never matches.
        'discrimination', 'discriminations', 'trafficking',
        'aggressions', 'aggression'
    }
    tokens = row["tokenized_excerpt"]
    if female_terms & tokens and harm_terms & tokens:
        return 1
    return -1


def lf_pos_gbv_2(row):
    """Labeling function: a sex-related term co-occurs with a harm term.

    Reads the token set in row["tokenized_excerpt"] and returns 1 when both
    term groups are represented, otherwise -1.
    """
    tokens = row["tokenized_excerpt"]
    sex_terms = {'sex', 'sexual', 'sexually'}
    if not (sex_terms & tokens):
        return -1
    harm_terms = {
        'violence', 'exploitation', 'exploited', 'assault',
        'abuse', 'abuses', 'maltreatment', 'maltreatments',
        'survivors', 'survivor', 'harassment', 'harassments',
        'aggressions', 'aggression',
    }
    return 1 if harm_terms & tokens else -1


def lf_pos_gbv_3(row):
    """Labeling function: 'survivor(s)' co-occurs with a harm term.

    Reads the token set in row["tokenized_excerpt"] and returns 1 when both
    term groups are represented, otherwise -1.
    """
    tokens = row["tokenized_excerpt"]
    if not ({'survivor', 'survivors'} & tokens):
        return -1
    harm_terms = {
        'violence', 'exploitation', 'exploited', 'assault',
        'abuse', 'abuses', 'maltreatment', 'maltreatments',
        'harassment', 'harassments', 'trafficking',
        'aggressions', 'aggression',
    }
    return 1 if harm_terms & tokens else -1


def lf_pos_gbv_4(row):
    """Labeling function: the preprocessed excerpt contains a 'gbv...' term.

    Searches row['excerpt_pp'] for a word starting with "gbv" (optionally
    followed by a hyphen) that has at least one more character after it.
    Returns 1 on a match, otherwise -1.
    """
    # gbv-*
    hit = re.search(r"\bgbv\-?.+?\b", row['excerpt_pp'])
    return -1 if hit is None else 1


##
# Keywords / n-grams whose presence vetoes the tag (currently none).
negative_kw_gbv = []
##
# Regexes searched in the preprocessed excerpt (currently none).
gbv_patterns = []


##
def is_gbv(row):
    """Decide whether an excerpt row matches the GBV rules.

    Checks run in order: negative keywords (any hit -> False), regex
    patterns on row["excerpt_pp"], the positive labeling functions, then the
    kw_en_gbv keyword list. String keywords are looked up in
    row['tokenized_excerpt']; keywords of length 2-7 in the corresponding
    n-gram entry of the row.

    Returns:
        True on the first positive hit, False otherwise.
    """
    ngram_field = {
        2: 'bigram_excerpt',
        3: 'trigram_excerpt',
        4: 'fourgram_excerpt',
        5: 'fivegram_excerpt',
        6: 'sixgram_excerpt',
        7: 'sevengram_excerpt',
    }

    def found(kw):
        # Single tokens are tried against the unigram set first.
        if isinstance(kw, str) and kw in row['tokenized_excerpt']:
            return True
        field = ngram_field.get(len(kw))
        return field is not None and kw in row[field]

    if any(found(kw) for kw in negative_kw_gbv):
        return False
    if any(re.search(p, row["excerpt_pp"]) for p in gbv_patterns):
        return True
    labeling_functions = (lf_pos_gbv_1, lf_pos_gbv_2, lf_pos_gbv_3,
                          lf_pos_gbv_4)
    if any(lf(row) == 1 for lf in labeling_functions):
        return True
    return any(found(kw) for kw in kw_en_gbv)

In [17]:
# Positive keywords (single tokens or n-gram tuples) for the
# unaccompanied / separated children tag.
kw_en_unaccompanied_child = [
    ('unaccompanied', 'nna'),
    ('nna', 'unaccompanied'),
    ('nnas', 'unaccompanied'),
    'orphans',
    ('orphan', 'children'),
    ('orphaned', 'children'),
    ('separated', 'nna'),
    ('separated', 'nnas'),
    ('nna', 'separated'),
    ('nnas', 'separated'),
    ('separate', 'nna'),
    ('separate', 'nnas'),
    ('nna', 'separate'),
    ('nnas', 'separate'),
    ('undecreamed', 'minors'),
    ('children', 'alone'),
    ('unaccompanied', 'children'),
    # Was the garbled ('unaccochildrenmpanied', 'unaccompanied'): the word
    # "children" had been typed into the middle of "unaccompanied".
    ('children', 'unaccompanied'),
    ('non-accompanied', 'children'),
    ('unaccompanied', 'minors'),
    ('non-accompanied', 'minors'),
    ('minors', 'unaccompanied'),
    ('separated', 'children'),
    ('children', 'separated'),
    ('separate', 'children'),
    ('children', 'separate'),
    ('separated', 'minors'),
    ('minors', 'separated'),
    ('separate', 'minors'),
    ('minors', 'separate'),
    ('separation', 'of', 'children'),
    ('street', 'children'),
    ('abandoning', 'children'),
    ('abandoned', 'children'),
    ('deprived', 'of', 'their', 'childhood'),
    ('deprived', 'of', 'their', 'adolescence'),
    ('wish', 'to', 'find', 'their', 'families'),
    'uasc',  # unaccompanied or asylum-seeking children
]


def lf_pos_unaccompanied_child_1(row):
    """Labeling function: a minor term co-occurs with a separation term.

    Both groups are matched against the token set in
    row["tokenized_excerpt"]. Returns 1 when both are present, else -1.
    """
    tokens = row["tokenized_excerpt"]
    minor_terms = {
        'children', 'child', 'girls', 'boys', 'adolescents',
        'minor', 'minors', 'teen', 'teens', 'nna', 'nnas',
    }
    if not (minor_terms & tokens):
        return -1
    separation_terms = {
        'separated', 'unaccompanied', 'non-accompanied', 'not-accompanied',
        'unaccompanied/separated', 'separated/unaccompanied',
        'household/unaccompanied', 'unaccompanied/household',
        'orphan', 'orphans', 'orphaned',
    }
    return 1 if separation_terms & tokens else -1


def lf_pos_unaccompanied_child_2(row):
    """Labeling function: a minor term plus a 'being alone' bigram.

    The minor term is matched in row["tokenized_excerpt"]; the bigram set in
    row["bigram_excerpt"] is consulted only when a minor term was found.
    Returns 1 when both match, else -1.
    """
    minor_terms = {
        'children', 'child', 'girls', 'boys', 'adolescents',
        'minor', 'minors', 'teen', 'teens', 'nna', 'nnas',
    }
    if not (minor_terms & row["tokenized_excerpt"]):
        return -1
    alone_bigrams = {
        ('no', 'accompaniment'),
        ('no', 'accompanied'),
        ('arrived', 'alone'),
        ('family', 'separation'),
        ('family', 'separations'),
        ('outside', 'family'),
        ('not', 'accompanied'),
        ('live', 'alone'),
        ('without', 'parents'),
    }
    return 1 if alone_bigrams & row["bigram_excerpt"] else -1


def lf_pos_unaccompanied_child_3(row):
    """Labeling function: a minor term plus a 'without family' trigram.

    The minor term is matched in row["tokenized_excerpt"]; the trigram set in
    row["trigram_excerpt"] is consulted only when a minor term was found.
    Returns 1 when both match, else -1.
    """
    minor_terms = {
        'children', 'child', 'girls', 'boys', 'adolescents',
        'minor', 'minors', 'teen', 'teens', 'nna', 'nnas',
    }
    if not (minor_terms & row["tokenized_excerpt"]):
        return -1
    no_family_trigrams = {
        ('without', 'the', 'company'),
        ('without', 'their', 'parents'),
        ('without', 'their', 'accompanying'),
        ('without', 'a', 'company'),
        ('outside', 'the', 'family'),
        ('loss', 'of', 'parents'),
        ('loss', 'of', 'guardians'),
        ('outside', 'of', 'family'),
        ('lost', 'their', 'family'),
        ('lost', 'their', 'parents'),
        ('without', 'a', 'parent'),
        ('without', 'a', 'guardian'),
        ('live', 'with', 'none'),
    }
    return 1 if no_family_trigrams & row["trigram_excerpt"] else -1


def lf_pos_unaccompanied_child_4(row):
    """Labeling function: a minor term plus a separation four-gram.

    The minor term is matched in row["tokenized_excerpt"]; the four-gram set
    in row["fourgram_excerpt"] is consulted only when a minor term was found.
    Returns 1 when both match, else -1.
    """
    minor_terms = {
        'children', 'child', 'girls', 'boys', 'adolescents',
        'minor', 'minors', 'teen', 'teens', 'nna', 'nnas',
    }
    if not (minor_terms & row["tokenized_excerpt"]):
        return -1
    separation_fourgrams = {
        ('without', 'their', 'legal', 'learners'),
        ('without', 'their', 'legal', 'learner'),
        ('without', 'their', 'legal', 'counseling'),
        ('separated', 'from', 'both', 'parents'),
        ('not', 'live', 'with', 'relatives'),
        ('separated', 'from', 'their', 'families'),
    }
    return 1 if separation_fourgrams & row["fourgram_excerpt"] else -1


def lf_pos_unaccompanied_child_5(row):
    """Labeling function: a minor term plus a positive is_stateless(row).

    is_stateless is defined elsewhere in the notebook and is called only when
    a minor term is present in row["tokenized_excerpt"].
    Returns 1 when both conditions hold, else -1.
    """
    minor_terms = {
        'children', 'child', 'girls', 'boys', 'adolescents',
        'minor', 'minors', 'teen', 'teens', 'nna', 'nnas',
    }
    if not (minor_terms & row["tokenized_excerpt"]):
        return -1
    return 1 if is_stateless(row) else -1


def lf_pos_unaccompanied_child_6(row):  # to be used for now
    """Labeling function: minors, armed groups/forces and recruitment together.

    Requires a minor term in row["tokenized_excerpt"], an 'armed ...' bigram
    in row["bigram_excerpt"], and a recruitment term in
    row["tokenized_excerpt"], evaluated in that order.
    Returns 1 when all three match, else -1.
    """
    minor_terms = {
        'children', 'child', 'girls', 'boys', 'adolescents',
        'minor', 'minors', 'teen', 'teens', 'nna', 'nnas',
    }
    if not (minor_terms & row["tokenized_excerpt"]):
        return -1
    armed_bigrams = {(
        'armed', noun) for noun in ('groups', 'group', 'forces', 'force')}
    if not (armed_bigrams & row["bigram_excerpt"]):
        return -1
    recruitment_terms = {'recruitment', 'recruitments', 'recruited'}
    return 1 if recruitment_terms & row["tokenized_excerpt"] else -1


##
# Keywords / n-grams whose presence vetoes the tag (currently none).
negative_kw_unaccompanied_child = []
##
# Regexes searched in the preprocessed excerpt (currently none).
unaccompanied_child_patterns = []


##
def is_unaccompanied_child(row):
    """Decide whether an excerpt row matches the unaccompanied-child rules.

    Checks run in order: negative keywords (any hit -> False), regex
    patterns on row["excerpt_pp"], labeling functions 1-4, then the
    kw_en_unaccompanied_child keyword list. String keywords are looked up in
    row['tokenized_excerpt']; keywords of length 2-7 in the corresponding
    n-gram entry of the row.

    Returns:
        True on the first positive hit, False otherwise.
    """
    ngram_field = {
        2: 'bigram_excerpt',
        3: 'trigram_excerpt',
        4: 'fourgram_excerpt',
        5: 'fivegram_excerpt',
        6: 'sixgram_excerpt',
        7: 'sevengram_excerpt',
    }

    def found(kw):
        # Single tokens are tried against the unigram set first.
        if isinstance(kw, str) and kw in row['tokenized_excerpt']:
            return True
        field = ngram_field.get(len(kw))
        return field is not None and kw in row[field]

    if any(found(kw) for kw in negative_kw_unaccompanied_child):
        return False
    if any(
            re.search(p, row["excerpt_pp"])
            for p in unaccompanied_child_patterns):
        return True
    labeling_functions = (
        lf_pos_unaccompanied_child_1,
        lf_pos_unaccompanied_child_2,
        lf_pos_unaccompanied_child_3,
        lf_pos_unaccompanied_child_4,
        #lf_pos_unaccompanied_child_5,
    )
    if any(lf(row) == 1 for lf in labeling_functions):
        return True
    return any(found(kw) for kw in kw_en_unaccompanied_child)

In [18]:
# Positive keywords (single tokens or n-gram tuples) for the chronically-ill
# tag. Regular families of n-grams are generated; order and duplicates of the
# hand-written list are kept as they were.
kw_en_chronically_ill = (
    [
        'hiv', 'hypertension', 'cancer', 'aids', 'dpvih', 'asthma',
        'dialysis', 'tuberculosis', 'tb', 'diabetic', 'dialyzed', 'ncds',
        'arthritis', 'plhiv',
        #'paralyzed',
        'immunodeficiency', 'immunodeficient', 'hypertensive', 'hepatitis',
        'chemotherapy', 'parkinson', 'hiv-aids', 'hiv/aids', 'schizophrenia',
        'nephropathy', 'neoplasm', 'rheumatism', 'osteoporosis', 'rheumatic',
        'snyder-robinson', 'leukemia', 'immunodeficiencies', 'pvvih', 'pvvs',
        'transplantation', 'transplant', 'transplants', 'transplanted',
        'poliomyelitis', 'marasmus', 'tbc',
        ('prostatic', 'hyperplasia'),
        ('prostate', 'hyperplasia'),
        ('snyder', 'robinson'),
        ('robinson', 'syndrome'),
    ]
    # (chronic|permanent) (psychiatric|psychological|physical) disorder(s)
    + [(span, kind, noun)
       for span in ('chronic', 'permanent')
       for kind in ('psychiatric', 'psychological', 'physical')
       for noun in ('disorders', 'disorder')]
    # (chronic|permanent) disease(s) / illness(es)
    + [(span, noun)
       for span in ('chronic', 'permanent')
       for noun in ('diseases', 'disease', 'illnesses', 'illness')]
    + [
        ('chronic', 'sick'),
        ('chronic', 'ill'),
        ('chronically', 'sick'),
        ('chronically', 'ill'),
        ('chronic', 'sickness'),
        ('chronic', 'sicknesses'),
        ('permanent', 'vulnerabilities'),
        ('permanent', 'vulnerability'),
        ('chronic', 'malnutrition'),
        ('chronic', 'conditions'),
        ('chronic', 'health', 'conditions'),
        ('chronic', 'health', 'issues'),
        ('chronic', 'medical', 'health', 'condition'),
        ('chronic', 'medical', 'health', 'conditions'),
        ('chronic', 'medical', 'condition'),
        ('chronic', 'medical', 'conditions'),
        ('chronic', 'patients'),
        'paraling', 'diabetics', 'diabetologist', 'obese', 'diabetes',
        'infarction', 'hiv/aids', 'unaids', 'obesity',
        ('require', 'daily', 'medicine'),
        ('people', 'require', 'medications', 'daily'),
        ('retrooviral', 'treatment'),
        ('retroviral', 'therapy'),
        ('antiretroviral', 'treatment'),
        ('gastrointestinal', 'diseases'),
        ('acute', 'respiratory', 'diseases'),
        ('gestational', 'syphilis'),
        ('congenital', 'syphilis'),
        ('maternal', 'morbidity'),
        ('long-term', 'treatment'),
        ('daily', 'medication'),
        ('acute', 'malnutrition'),
        ('medical', 'or', 'chronic', 'condition'),
        'acv',
        ('daily', 'medical', 'attention'),
        ('medications', 'daily'),
        ('daily', 'medications'),
        # 'art',
        ## infectious and serious illnesses
        ('advanced', 'disease'),
        ('pre-existing', 'diseases'),
        'contagion',
        ('infectious', 'diseases'),
        ('contagious', 'diseases'),
        ('critical', 'medical', 'conditions'),
        ('critical', 'medical', 'condition'),
        ('serious', 'medical', 'condition'),
        ('serious', 'medical', 'conditions'),
        'dengue',
        'malaria',
        ('conditions', 'that', 'increase', 'the', 'risks', 'of', 'mortality'),
        ('illness', 'which', 'lasted', '3', 'months', 'or', 'longer'),
        'hernias',
        ('joint', 'pain'),
        ('affected', 'medical', 'problems'),
        ('stroke', 'of', 'ischemic', 'type'),
        'cholera',
    ]
    # <group> with mental health problem(s) / disorder(s)
    + [(who, 'with', 'mental', 'health', issue)
       for who in ('persons', 'children', 'girls', 'boys', 'youths', 'women',
                   'men', 'elderly', 'people', 'peoples')
       for issue in ('problems', 'problem', 'disorders', 'disorder')]
    + [
        ('chronic', 'psychiatric', 'disorders'),
        ('chronic', 'psychological', 'disorders'),
        ('suffering', 'from', 'mental', 'disorders'),
        ('suffering', 'from', 'mental', 'illnesses'),
        ('suffering', 'from', 'mental', 'illness'),
        ('chronic', 'psychiatric', 'disorders'),
        ('chronic', 'psychiatric', 'disorder'),
    ]
)


def lf_chronically_ill_pos_1(row):
    """Labeling function: specific chronic-condition n-grams are present.

    Looks for one trigram in row["trigram_excerpt"] and, failing that, any of
    three bigrams in row["bigram_excerpt"]. Returns 1 on a hit, else -1.
    """
    if ('serious', 'chronic', 'conditions') in row["trigram_excerpt"]:
        return 1
    bigrams = row["bigram_excerpt"]
    wanted = (
        ('chronic', 'illness'),
        ('chronic', 'disorders'),
        ('transplanted', 'persons'),
    )
    return 1 if any(bigram in bigrams for bigram in wanted) else -1


def lf_chronically_ill_pos_2(row):
    """Labeling function for rows with lang == "fr": 'mental ...' n-grams.

    For such rows, checks a bigram, then a five-gram, then a four-gram in the
    respective n-gram entries of the row. Returns 1 on a hit, else -1.
    """
    if row["lang"] != "fr":
        return -1
    if ('mental', 'patients') in row["bigram_excerpt"]:
        return 1
    if ('mental', 'sick', 'women', 'and', 'girls') in row["fivegram_excerpt"]:
        return 1
    if ('mental', 'women', 'and', 'girls') in row["fourgram_excerpt"]:
        return 1
    return -1


def lf_chronically_ill_pos_3(row):
    """Labeling function for rows with lang == "es": chronic-condition n-grams.

    For such rows, checks one trigram in row["trigram_excerpt"], then three
    bigrams in row["bigram_excerpt"]. Returns 1 on a hit, else -1.
    """
    if row["lang"] != "es":
        return -1
    if ('serious', 'chronic', 'conditions') in row["trigram_excerpt"]:
        return 1
    bigrams = row["bigram_excerpt"]
    wanted = (
        ('chronic', 'illness'),
        ('chronic', 'disorders'),
        ('transplanted', 'persons'),
    )
    return 1 if any(bigram in bigrams for bigram in wanted) else -1


##
# Keywords / n-grams whose presence vetoes the tag (currently none).
negative_kw_chronically_ill = []
##
# Regexes searched in the preprocessed excerpt (currently none).
chronically_ill_patterns = []


##
def is_chronically_ill(row):
    """Decide whether an excerpt row matches the chronically-ill rules.

    Checks run in order: negative keywords (any hit -> False), regex
    patterns on row["excerpt_pp"], the three positive labeling functions,
    then the kw_en_chronically_ill keyword list. String keywords are looked
    up in row['tokenized_excerpt']; keywords of length 2-7 in the
    corresponding n-gram entry of the row.

    Returns:
        True on the first positive hit, False otherwise.
    """
    ngram_field = {
        2: 'bigram_excerpt',
        3: 'trigram_excerpt',
        4: 'fourgram_excerpt',
        5: 'fivegram_excerpt',
        6: 'sixgram_excerpt',
        7: 'sevengram_excerpt',
    }

    def found(kw):
        # Single tokens are tried against the unigram set first.
        if isinstance(kw, str) and kw in row['tokenized_excerpt']:
            return True
        field = ngram_field.get(len(kw))
        return field is not None and kw in row[field]

    if any(found(kw) for kw in negative_kw_chronically_ill):
        return False
    if any(re.search(p, row["excerpt_pp"]) for p in chronically_ill_patterns):
        return True
    labeling_functions = (
        lf_chronically_ill_pos_1,
        lf_chronically_ill_pos_2,
        lf_chronically_ill_pos_3,
    )
    if any(lf(row) == 1 for lf in labeling_functions):
        return True
    return any(found(kw) for kw in kw_en_chronically_ill)

In [19]:
# Positive keywords (single tokens or n-gram tuples) for the
# female-headed-household tag.
kw_en_fhh = [
    'fhh',
    'fhhs',
    ('women', 's', 'household', 'women'),
]


def lf_fhh_pos_1(row):
    """Labeling function: a 'female headed/led/run' bigram plus 'household(s)'.

    The bigram is matched in row['bigram_excerpt']; the household token in
    row['tokenized_excerpt'] is checked only after a bigram hit.
    Returns 1 when both match, else -1.
    """
    headed_or_led = {
        (who, verb)
        for who in ('female', 'women', 'woman', 'female-', 'women-', 'woman-')
        for verb in ('headed', 'led')
    }
    run_by = {(who, 'run') for who in ('women', 'woman', 'female', 'females')}
    if not ((headed_or_led | run_by) & row['bigram_excerpt']):
        return -1
    return 1 if {'households', 'household'} & row['tokenized_excerpt'] else -1


def lf_fhh_pos_2(row):
    """Labeling function: a compound 'female-headed'-style token plus 'household(s)'.

    Both are matched against the token set in row['tokenized_excerpt'].
    Returns 1 when both are present, else -1.
    """
    tokens = row['tokenized_excerpt']
    compound_terms = {
        'female-headed', 'female-led', 'women-led', 'women-headed',
        'woman-headed',
        'femaleheaded', 'femaleled', 'womenled', 'womenheaded',
        'womanheaded',
        'womenrun', 'womanrun', 'femalerun', 'femalesrun',
    }
    if not (compound_terms & tokens):
        return -1
    return 1 if {'households', 'household'} & tokens else -1


def lf_fhh_pos_3(row):
    """Labeling function: female, head-like and household-like tokens co-occur.

    All three groups are matched against the token set in
    row['tokenized_excerpt'], with no requirement that they be adjacent.
    Returns 1 when every group is represented, else -1.
    """
    tokens = row['tokenized_excerpt']
    required_groups = (
        {'female', 'females', 'woman', 'women'},
        {
            'headed', 'head', 'heads', 'headquarters', 'headquarter', 'led',
            'supplier', 'suppliers', 'chefts', 'chiefs', 'chief', 'cheffes',
            'breadwinner', 'breadwinners', 'headache',
        },
        {'households', 'household', 'family', 'families', 'home', 'homes'},
    )
    return 1 if all(group & tokens for group in required_groups) else -1


def lf_fhh_pos_4(row):
    """Labeling function: the row's existing labels already contain ``fhh``.

    Returns 1 when the module-level ``fhh`` value is contained in
    row["specific_needs_groups"], else -1.
    NOTE(review): ``fhh`` is not defined in this part of the notebook;
    presumably it holds the tag label from an earlier cell -- confirm.
    """
    return 1 if fhh in row["specific_needs_groups"] else -1


##
# Keywords / n-grams whose presence vetoes the tag (currently none).
negative_kw_fhh = []
##
# Regexes searched in the preprocessed excerpt (currently none).
fhh_patterns = []


##
def is_fhh(row):
    """Decide whether an excerpt row matches the female-headed-household rules.

    Checks run in order: negative keywords (any hit -> False), regex
    patterns on row["excerpt_pp"], the four positive labeling functions,
    then the kw_en_fhh keyword list. String keywords are looked up in
    row['tokenized_excerpt']; keywords of length 2-7 in the corresponding
    n-gram entry of the row.

    Returns:
        True on the first positive hit, False otherwise.
    """
    ngram_field = {
        2: 'bigram_excerpt',
        3: 'trigram_excerpt',
        4: 'fourgram_excerpt',
        5: 'fivegram_excerpt',
        6: 'sixgram_excerpt',
        7: 'sevengram_excerpt',
    }

    def found(kw):
        # Single tokens are tried against the unigram set first.
        if isinstance(kw, str) and kw in row['tokenized_excerpt']:
            return True
        field = ngram_field.get(len(kw))
        return field is not None and kw in row[field]

    if any(found(kw) for kw in negative_kw_fhh):
        return False
    if any(re.search(p, row["excerpt_pp"]) for p in fhh_patterns):
        return True
    labeling_functions = (lf_fhh_pos_1, lf_fhh_pos_2, lf_fhh_pos_3,
                          lf_fhh_pos_4)
    if any(lf(row) == 1 for lf in labeling_functions):
        return True
    return any(found(kw) for kw in kw_en_fhh)

In [20]:
# Positive keywords (single tokens or n-gram tuples) for the single-women tag.
kw_en_single_women = [
    'widows',
    'widow',
    'widowed',
    'cpf',
    'widowhood',
    'wrc',
    'widowers',
    'widower',
    'unmarried',
    'single-parent',
    'singleparent',
    'single-mother',
    'single-mothers',
    'singlemother',
    'singlemothers',
    ('single', 'women'),
    ('single', 'mothers'),
    ('single', 'mother'),
    ('single', 'woman'),
    ('single', 'female'),
    ('single', 'females'),
    ('women', 'traveling', 'alone'),
    ('woman', 'traveling', 'alone'),
    ('unmarried', 'women'),
    # Was a second ('unmarried', 'women'); the singular form was missing from
    # the women / woman / females / female family.
    ('unmarried', 'woman'),
    ('unmarried', 'females'),
    ('unmarried', 'female'),
    ('females', 'traveling', 'alone'),
    ('female', 'traveling', 'alone'),
    ('women', 'living', 'alone'),
    ('woman', 'living', 'alone'),
    ('females', 'living', 'alone'),
    ('female', 'living', 'alone'),
    ('divorced', 'or', 'separated'),
    ('women-only', 'places'),
    ('womenonly', 'places'),
    ('women', 'only', 'places'),
    ('female-only', 'places'),
    ('femaleonly', 'places'),
    ('female', 'only', 'places'),
    ('females-only', 'places'),
    ('femalesonly', 'places'),
    ('females', 'only', 'places'),
]


def lf_single_women_1(row):
    """Labeling function: the row's existing labels already contain ``single_women``.

    Returns 1 when the module-level ``single_women`` value is contained in
    row['specific_needs_groups'], else -1.
    NOTE(review): ``single_women`` is not defined in this part of the
    notebook; presumably it holds the tag label from an earlier cell -- confirm.
    """
    return 1 if single_women in row['specific_needs_groups'] else -1


##
# Keywords / n-grams whose presence vetoes the tag (currently none).
negative_kw_single_women = []
##
# Regexes searched in the preprocessed excerpt (currently none).
single_women_patterns = []


##
def is_single_women(row):
    """Decide whether an excerpt row matches the single-women rules.

    Checks run in order: negative keywords (any hit -> False), regex
    patterns on row["excerpt_pp"], the positive labeling function, then the
    kw_en_single_women keyword list. String keywords are looked up in
    row['tokenized_excerpt']; keywords of length 2-7 in the corresponding
    n-gram entry of the row.

    Returns:
        True on the first positive hit, False otherwise.
    """
    ngram_field = {
        2: 'bigram_excerpt',
        3: 'trigram_excerpt',
        4: 'fourgram_excerpt',
        5: 'fivegram_excerpt',
        6: 'sixgram_excerpt',
        7: 'sevengram_excerpt',
    }

    def found(kw):
        # Single tokens are tried against the unigram set first.
        if isinstance(kw, str) and kw in row['tokenized_excerpt']:
            return True
        field = ngram_field.get(len(kw))
        return field is not None and kw in row[field]

    if any(found(kw) for kw in negative_kw_single_women):
        return False
    if any(re.search(p, row["excerpt_pp"]) for p in single_women_patterns):
        return True
    labeling_functions = (lf_single_women_1, )
    if any(lf(row) == 1 for lf in labeling_functions):
        return True
    return any(found(kw) for kw in kw_en_single_women)

In [21]:
# When a tag's original labels are already good enough, I usually keep them
# as they are and add no keywords, so that noisy extracted keywords cannot
# lower the precision.
kw_en_lgbt = []


def lf_lgbt_1(row):
    """Labeling function: the row's existing labels already contain ``lgbt``.

    Returns 1 when the module-level ``lgbt`` value is contained in
    row['specific_needs_groups'], else -1.
    NOTE(review): ``lgbt`` is not defined in this part of the notebook;
    presumably it holds the tag label from an earlier cell -- confirm.
    """
    return 1 if lgbt in row['specific_needs_groups'] else -1


##
# Keywords / n-grams whose presence vetoes the tag (currently none).
negative_kw_lgbt = []
##
# Regexes searched in the preprocessed excerpt (currently none).
lgbt_patterns = []


##
def is_lgbt(row):
    """Decide whether an excerpt row matches the LGBT rules.

    Checks run in order: negative keywords (any hit -> False), regex
    patterns on row["excerpt_pp"], the positive labeling function, then the
    kw_en_lgbt keyword list. String keywords are looked up in
    row['tokenized_excerpt']; keywords of length 2-7 in the corresponding
    n-gram entry of the row.

    Returns:
        True on the first positive hit, False otherwise.
    """
    ngram_field = {
        2: 'bigram_excerpt',
        3: 'trigram_excerpt',
        4: 'fourgram_excerpt',
        5: 'fivegram_excerpt',
        6: 'sixgram_excerpt',
        7: 'sevengram_excerpt',
    }

    def found(kw):
        # Single tokens are tried against the unigram set first.
        if isinstance(kw, str) and kw in row['tokenized_excerpt']:
            return True
        field = ngram_field.get(len(kw))
        return field is not None and kw in row[field]

    if any(found(kw) for kw in negative_kw_lgbt):
        return False
    if any(re.search(p, row["excerpt_pp"]) for p in lgbt_patterns):
        return True
    labeling_functions = (lf_lgbt_1, )
    if any(lf(row) == 1 for lf in labeling_functions):
        return True
    return any(found(kw) for kw in kw_en_lgbt)

In [22]:
# Positive keywords (single tokens or n-gram tuples) for the
# child-headed-household tag.
kw_en_chh = [
    'chh',
    # The comma after 'chhs' used to be missing, which fused it with the next
    # literal into 'chhschild-headed' and dropped both real keywords.
    'chhs',
    'child-headed',
    'childheaded',
    'children-headed',
    'childrenheaded',
    ('children', 'heads'),
    ('child', 'heads'),
    ('children', 'headed'),
    ('child', 'household'),
    ('child', 'hh'),
    ('children', 'hh'),
    ('child', 'headed'),
]


def lf_chh_pos_1(row):
    """Labeling function: a 'child headed/led/run' bigram plus a household term.

    The bigram (with or without a trailing hyphen on the first word) is
    matched in row['bigram_excerpt']; the household-like token in
    row['tokenized_excerpt'] is checked only after a bigram hit.
    Returns 1 when both match, else -1.
    """
    child_led_bigrams = {
        (first, verb)
        for who in ('child', 'children', 'adolescent', 'adolescents')
        for verb in ('headed', 'led', 'run')
        for first in (who, who + '-')
    }
    if not (child_led_bigrams & row['bigram_excerpt']):
        return -1
    household_terms = {
        'households', 'household', 'family', 'families', 'home', 'homes'
    }
    return 1 if household_terms & row['tokenized_excerpt'] else -1


def lf_chh_pos_2(row):
    """Labeling function: a compound 'child-headed'-style token plus a household term.

    Both are matched against the token set in row['tokenized_excerpt'].
    Returns 1 when both are present, else -1.
    """
    tokens = row['tokenized_excerpt']
    compound_terms = {
        who + joiner + verb
        for who in ('child', 'children', 'adolescent', 'adolescents')
        for verb in ('headed', 'led', 'run')
        for joiner in ('-', '')
    }
    if not (compound_terms & tokens):
        return -1
    household_terms = {
        'households', 'household', 'family', 'families', 'home', 'homes'
    }
    return 1 if household_terms & tokens else -1


def lf_chh_pos_3(row):
    """Labeling function: child, head-like and household-like tokens co-occur.

    All three groups are matched against the token set in
    row['tokenized_excerpt'], with no requirement that they be adjacent.
    Returns 1 when every group is represented, else -1.
    """
    tokens = row['tokenized_excerpt']
    required_groups = (
        {'child', 'children', 'adolescent', 'adolescents'},
        {
            'headed', 'head', 'heads', 'headquarters', 'headquarter', 'led',
            'supplier', 'suppliers', 'chefts', 'chiefs', 'chief', 'cheffes',
            'breadwinner', 'breadwinners', 'headache',
        },
        {'households', 'household', 'family', 'families', 'home', 'homes'},
    )
    return 1 if all(group & tokens for group in required_groups) else -1


def lf_chh_pos_4(row):
    """Labeling function: the row's existing labels already contain ``chh``.

    Returns 1 when the module-level ``chh`` value is contained in
    row['specific_needs_groups'], else -1.
    NOTE(review): ``chh`` is not defined in this part of the notebook;
    presumably it holds the tag label from an earlier cell -- confirm.
    """
    return 1 if chh in row['specific_needs_groups'] else -1


##
# Keywords / n-grams whose presence vetoes the tag (currently none).
negative_kw_chh = []
##
# Regexes searched in the preprocessed excerpt (currently none).
chh_patterns = []


##
def is_chh(row):
    """Decide whether an excerpt row matches the child-headed-household rules.

    Checks run in order: negative keywords (any hit -> False), regex
    patterns on row["excerpt_pp"], labeling functions 1, 2 and 4
    (lf_chh_pos_3 is not used), then the kw_en_chh keyword list. String
    keywords are looked up in row['tokenized_excerpt']; keywords of length
    2-7 in the corresponding n-gram entry of the row.

    Returns:
        True on the first positive hit, False otherwise.
    """
    ngram_field = {
        2: 'bigram_excerpt',
        3: 'trigram_excerpt',
        4: 'fourgram_excerpt',
        5: 'fivegram_excerpt',
        6: 'sixgram_excerpt',
        7: 'sevengram_excerpt',
    }

    def found(kw):
        # Single tokens are tried against the unigram set first.
        if isinstance(kw, str) and kw in row['tokenized_excerpt']:
            return True
        field = ngram_field.get(len(kw))
        return field is not None and kw in row[field]

    if any(found(kw) for kw in negative_kw_chh):
        return False
    if any(re.search(p, row["excerpt_pp"]) for p in chh_patterns):
        return True
    labeling_functions = (lf_chh_pos_1, lf_chh_pos_2, lf_chh_pos_4)
    if any(lf(row) == 1 for lf in labeling_functions):
        return True
    return any(found(kw) for kw in kw_en_chh)

In [23]:
# Positive keywords for the elderly-headed-household tag: the two acronyms,
# then every "<who>-<verb>" / "<who><verb>" compound token, then every
# (<who>, <verb>) bigram.
kw_en_ehh = (
    ['ehh', 'ehhs']
    + [
        compound
        for verb in ('headed', 'led', 'run')
        for who in ('elderly', 'retiree', 'retirees', 'pensioner',
                    'pensioners')
        for compound in (who + '-' + verb, who + verb)
    ]
    + [
        (who, verb)
        for verb in ('headed', 'led', 'run')
        for who in ('elderly', 'retiree', 'retirees', 'pensioner',
                    'pensioners')
    ]
)


def lf_ehh_pos_1(row):
    """Labelling function: co-occurrence vote for elderly-headed households.

    Votes 1 when ``row['tokenized_excerpt']`` (a set of tokens) contains at
    least one word from each of the three groups below -- an older-person
    term, a "head of ..." role term and a household term -- no matter where
    in the excerpt they occur. Votes -1 otherwise.

    NOTE(review): the role group contains words such as 'headache',
    'headquarters', 'supplier' and 'chefts' that look like noise or typos;
    confirm whether they are intended. Within this cell, is_ehh only calls
    lf_ehh_pos_2, not this function.
    """
    older_person_terms = {
        'elderly', 'retiree', 'retirees', 'pensioner', 'pensioners',
    }
    role_terms = {
        'headed', 'head', 'heads', 'headquarters', 'headquarter',
        'led', 'run',
        'supplier', 'suppliers',
        'chefts', 'chiefs', 'chief', 'cheffes',
        'breadwinner', 'breadwinners',
        'headache',
    }
    household_terms = {
        'households', 'household', 'family', 'families', 'home', 'homes',
    }

    tokens = row['tokenized_excerpt']
    # every group must share at least one word with the excerpt's tokens
    groups = (older_person_terms, role_terms, household_terms)
    if all(group & tokens for group in groups):
        return 1
    return -1


def lf_ehh_pos_2(row):
    """Labelling function: vote 1 when the row's existing
    ``specific_needs_groups`` value already contains the ``ehh`` label,
    otherwise vote -1."""
    return 1 if ehh in row['specific_needs_groups'] else -1


##
# Inputs read by is_ehh, both still empty:
#   negative_kw_ehh -- tokens / n-gram tuples whose presence makes is_ehh return False
#   ehh_patterns    -- regexes searched in row["excerpt_pp"]; a hit makes is_ehh return True
negative_kw_ehh, ehh_patterns = [], []


##
def is_ehh(row):
    """Return True when the row should be tagged as elderly-headed household.

    Checks run in this order and the first decisive one wins:

    1. any entry of ``negative_kw_ehh`` found in the row -> False
    2. any regex of ``ehh_patterns`` matching ``row["excerpt_pp"]`` -> True
    3. ``lf_ehh_pos_2(row)`` voting 1 -> True
    4. any entry of ``kw_en_ehh`` found in the row -> True

    If nothing fires the result is False.

    A keyword that is a ``str`` is first looked up in
    ``row['tokenized_excerpt']``; if that fails (or the keyword is a tuple)
    it is looked up in the n-gram column matching its length (2 to 7).
    """
    ngram_column_by_size = {
        2: 'bigram_excerpt',
        3: 'trigram_excerpt',
        4: 'fourgram_excerpt',
        5: 'fivegram_excerpt',
        6: 'sixgram_excerpt',
        7: 'sevengram_excerpt',
    }

    def occurs(keyword):
        # plain token hit
        if isinstance(keyword, str) and keyword in row['tokenized_excerpt']:
            return True
        # otherwise try the n-gram set whose order equals the keyword length
        column = ngram_column_by_size.get(len(keyword))
        return column is not None and keyword in row[column]

    if any(occurs(keyword) for keyword in negative_kw_ehh):
        return False
    if any(re.search(pattern, row["excerpt_pp"]) for pattern in ehh_patterns):
        return True
    if any(vote(row) == 1 for vote in (lf_ehh_pos_2,)):
        return True
    return any(occurs(keyword) for keyword in kw_en_ehh)

In [24]:
# Keywords for the "minorities" group.
# Plain strings are matched against single tokens, tuples against n-grams of
# the same length.
#
# The preprocessing used to build the token columns replaces "-" with a
# space, so a hyphenated word in the text ("ex-combatants") arrives as two
# tokens and can only be caught by a tuple. The hyphenated / fused spellings
# are kept for texts that already contain the fused form (or for a
# tokenisation that keeps hyphens); the tuple spellings at the end of the
# list cover the hyphen-split case, mirroring what kw_en_ehh does.
kw_en_minorities = [
    'ex-combatants',
    'excombatants',
    'afro-descendant',
    'afrodescendant',
    'haitians',
    'haitian',
    #'ethnic',
    'afro-descendants',
    'afrodescendants',
    'peasant',
    'peasants',
    'guyanese',
    'boudouma',
    'bahamian',
    'kanembou',
    'racism',
    'bojayá',
    'bojaya',
    'minority',
    'minorities',
    'bahamians',
    'afroodescending',
    'afrodescending',
    'afro-descending',
    #'black',
    #'ethnicity',
    #'victimizing',
    ('ethnic', 'communities'),
    ('ethnic', 'groups'),
    ('religious', 'groups'),
    'afro-colombians',
    'afrocolombians',
    'afro-colombian',
    'afrocolombian',
    'guaviare',
    # "meta-guaviare" split on the hyphen is already caught by 'guaviare'
    'meta-guaviare',
    'metaguaviare',
    'bilwi',
    'nariño',
    'narino',
    'raizals',
    'raizal',
    'gaitanistas',
    ('ethnic', 'organizational', 'processes'),
    'gao',
    ('ethnic', 'territories'),
    'wounaan',
    ('mesopotamia', 'communities'),
    ('opogadó', 'bocas'),
    ('opogado', 'bocas'),
    # two-token spellings of the hyphenated keywords above
    ('ex', 'combatants'),
    ('afro', 'descendant'),
    ('afro', 'descendants'),
    ('afro', 'descending'),
    ('afro', 'colombians'),
    ('afro', 'colombian'),
]


def lf_minorities_pos_1(row):
    """Labelling function: vote 1 when the row's existing
    ``specific_needs_groups`` value already contains the ``minorities``
    label, otherwise vote -1."""
    return 1 if minorities in row['specific_needs_groups'] else -1


##
# Inputs read by is_minorities, both still empty:
#   negative_kw_minorities -- tokens / n-gram tuples whose presence makes is_minorities return False
#   minorities_patterns    -- regexes searched in row["excerpt_pp"]; a hit makes is_minorities return True
negative_kw_minorities, minorities_patterns = [], []

##
def is_minorities(row):
    """Return True when the row should be tagged with the minorities group.

    Checks run in this order and the first decisive one wins:

    1. any entry of ``negative_kw_minorities`` found in the row -> False
    2. any regex of ``minorities_patterns`` matching ``row["excerpt_pp"]`` -> True
    3. ``lf_minorities_pos_1(row)`` voting 1 -> True
    4. any entry of ``kw_en_minorities`` found in the row -> True

    If nothing fires the result is False.

    A keyword that is a ``str`` is first looked up in
    ``row['tokenized_excerpt']``; if that fails (or the keyword is a tuple)
    it is looked up in the n-gram column matching its length (2 to 7).
    """
    ngram_column_by_size = {
        2: 'bigram_excerpt',
        3: 'trigram_excerpt',
        4: 'fourgram_excerpt',
        5: 'fivegram_excerpt',
        6: 'sixgram_excerpt',
        7: 'sevengram_excerpt',
    }

    def occurs(keyword):
        # plain token hit
        if isinstance(keyword, str) and keyword in row['tokenized_excerpt']:
            return True
        # otherwise try the n-gram set whose order equals the keyword length
        column = ngram_column_by_size.get(len(keyword))
        return column is not None and keyword in row[column]

    if any(occurs(keyword) for keyword in negative_kw_minorities):
        return False
    if any(re.search(pattern, row["excerpt_pp"]) for pattern in minorities_patterns):
        return True
    if any(vote(row) == 1 for vote in (lf_minorities_pos_1,)):
        return True
    return any(occurs(keyword) for keyword in kw_en_minorities)

In [25]:
def specific_needs_groups(row):
    """Collect the specific-needs-group labels whose detector fires for ``row``.

    Each ``is_*`` detector is called once with the row, always in the order
    listed below; the label paired with every detector that returns a truthy
    value is included. The result is a new list (possibly empty) that keeps
    that same order.
    """
    detector_and_label = (
        (is_plw, plw),
        (is_ip, ip),
        (is_pwd, pwd),
        (is_minorities, minorities),
        (is_gbv, gbv),
        (is_unaccompanied_child, unaccompanied_child),
        (is_chronically_ill, chronically_ill),
        (is_fhh, fhh),
        (is_lgbt, lgbt),
        (is_single_women, single_women),
        (is_chh, chh),
        (is_ehh, ehh),
    )
    return [label for detector, label in detector_and_label if detector(row)]

In [26]:
def preprocess(doc):
    """Normalise one excerpt before tokenisation and keyword matching.

    Steps, in order: lower-case; put spaces around numbers (including a
    number pair joined by one non-word character, e.g. "3.5"); map dash
    look-alikes and "_" to "-"; re-join words broken as "xxx- yyy"; replace
    the listed punctuation/symbol characters -- "-" included -- with a
    space; collapse runs of whitespace.

    This redefines the ``preprocess`` from the top of the notebook; the only
    difference in the cleaning rules is that "-" is now stripped as well.

    Parameters
    ----------
    doc : str or missing value
        Raw excerpt. Anything that is not a ``str`` (NaN, None, pd.NA, ...)
        is treated as a missing excerpt.

    Returns
    -------
    str
        The cleaned text, or "" for a missing excerpt.
    """
    # `doc != doc` only recognised float NaN and crashed on None / pd.NA;
    # checking the type covers every kind of missing value.
    if not isinstance(doc, str):
        return ""
    doc = doc.lower()
    # remove preceding dates (disabled)
    #doc = re.sub(r"^\[.+\]", " ", doc).strip()
    #doc = re.sub(r"^\(.+\)", " ", doc).strip()
    # spaces between numbers and words
    doc = re.sub(r'(\d+(\W\d+)?)', r' \1 ', doc).strip()
    # unify the various dash characters (and "_") into a plain hyphen
    doc = re.sub("[‐‑–—―─_]", "-", doc)
    # "inter- national" -> "international"
    doc = re.sub(r"(\w)\- (\w)", r"\1\2", doc)
    # NOTE: I have added "-"
    # remove some punctuation; this literal stays a normal (non-raw) string
    # because it relies on the \uXXXX / \xad escapes
    doc = re.sub(
        "[" + re.escape(
            '-_@^~.()[],"“’…<❖‐»—─|•&{≥➢\ue0e4\uf0d8\uf0fc●°#\u200b>`?�€■!‘%;̧\'›«”:≤―\uf0b7$}*´=‑▪\xad❑·–'
        ) + "]", " ", doc)
    doc = re.sub(r'\s+', " ", doc).strip()
    return doc

In [27]:
def preprocess_and_tokenize(doc, n=1):
    """Clean ``doc`` with ``preprocess`` and split it with ``word_tokenize``.

    For ``n == 1`` the tokens are returned as produced by ``word_tokenize``
    (order and duplicates kept, not a set) so that n-grams can still be
    derived from them afterwards. For any other ``n`` the set of ``n``-grams
    over those tokens is returned.
    """
    tokens = word_tokenize(preprocess(doc))
    return tokens if n == 1 else set(ngrams(tokens, n))

In [28]:
def tokenize(words, n=1):
    """Return the distinct ``n``-grams of the token sequence ``words`` as a set.

    With ``n == 1`` this is simply the set of distinct tokens; otherwise the
    set of ``n``-tuples produced by ``ngrams(words, n)``.
    """
    return set(words if n == 1 else ngrams(words, n))

In [29]:
# Cleaned text; the regex patterns of the is_* detectors are searched in this column.
df_train_en["excerpt_pp"] = df_train_en["excerpt"].progress_apply(preprocess)
# Tokens of the cleaned text, used as the source of the n-gram columns below.
df_train_en["tokenized_excerpt"] = df_train_en["excerpt"].progress_apply(preprocess_and_tokenize)
# One column of distinct n-grams for every n from 2 to 7.
for _size, _column in enumerate(
    ("bigram_excerpt", "trigram_excerpt", "fourgram_excerpt",
     "fivegram_excerpt", "sixgram_excerpt", "sevengram_excerpt"),
    start=2,
):
    df_train_en[_column] = df_train_en["tokenized_excerpt"].progress_apply(
        lambda tokens, size=_size: tokenize(tokens, size))
# Once the n-grams exist, store the tokens as a set (the keyword checks use membership / intersection).
df_train_en["tokenized_excerpt"] = df_train_en["tokenized_excerpt"].progress_apply(
    lambda tokens: set(tokens))

  0%|          | 0/126323 [00:00<?, ?it/s]

  0%|          | 0/126323 [00:00<?, ?it/s]

  0%|          | 0/126323 [00:00<?, ?it/s]

  0%|          | 0/126323 [00:00<?, ?it/s]

  0%|          | 0/126323 [00:00<?, ?it/s]

  0%|          | 0/126323 [00:00<?, ?it/s]

  0%|          | 0/126323 [00:00<?, ?it/s]

  0%|          | 0/126323 [00:00<?, ?it/s]

  0%|          | 0/126323 [00:00<?, ?it/s]

In [30]:
# Cleaned text; the regex patterns of the is_* detectors are searched in this column.
df_val_en["excerpt_pp"] = df_val_en["excerpt"].progress_apply(preprocess)
# Tokens of the cleaned text, used as the source of the n-gram columns below.
df_val_en["tokenized_excerpt"] = df_val_en["excerpt"].progress_apply(preprocess_and_tokenize)
# One column of distinct n-grams for every n from 2 to 7.
for _size, _column in enumerate(
    ("bigram_excerpt", "trigram_excerpt", "fourgram_excerpt",
     "fivegram_excerpt", "sixgram_excerpt", "sevengram_excerpt"),
    start=2,
):
    df_val_en[_column] = df_val_en["tokenized_excerpt"].progress_apply(
        lambda tokens, size=_size: tokenize(tokens, size))
# Once the n-grams exist, store the tokens as a set (the keyword checks use membership / intersection).
df_val_en["tokenized_excerpt"] = df_val_en["tokenized_excerpt"].progress_apply(
    lambda tokens: set(tokens))

  0%|          | 0/14425 [00:00<?, ?it/s]

  0%|          | 0/14425 [00:00<?, ?it/s]

  0%|          | 0/14425 [00:00<?, ?it/s]

  0%|          | 0/14425 [00:00<?, ?it/s]

  0%|          | 0/14425 [00:00<?, ?it/s]

  0%|          | 0/14425 [00:00<?, ?it/s]

  0%|          | 0/14425 [00:00<?, ?it/s]

  0%|          | 0/14425 [00:00<?, ?it/s]

  0%|          | 0/14425 [00:00<?, ?it/s]

In [31]:
# Cleaned text; the regex patterns of the is_* detectors are searched in this column.
df_test_en["excerpt_pp"] = df_test_en["excerpt"].progress_apply(preprocess)
# Tokens of the cleaned text, used as the source of the n-gram columns below.
df_test_en["tokenized_excerpt"] = df_test_en["excerpt"].progress_apply(preprocess_and_tokenize)
# One column of distinct n-grams for every n from 2 to 7.
for _size, _column in enumerate(
    ("bigram_excerpt", "trigram_excerpt", "fourgram_excerpt",
     "fivegram_excerpt", "sixgram_excerpt", "sevengram_excerpt"),
    start=2,
):
    df_test_en[_column] = df_test_en["tokenized_excerpt"].progress_apply(
        lambda tokens, size=_size: tokenize(tokens, size))
# Once the n-grams exist, store the tokens as a set (the keyword checks use membership / intersection).
df_test_en["tokenized_excerpt"] = df_test_en["tokenized_excerpt"].progress_apply(
    lambda tokens: set(tokens))

  0%|          | 0/17200 [00:00<?, ?it/s]

  0%|          | 0/17200 [00:00<?, ?it/s]

  0%|          | 0/17200 [00:00<?, ?it/s]

  0%|          | 0/17200 [00:00<?, ?it/s]

  0%|          | 0/17200 [00:00<?, ?it/s]

  0%|          | 0/17200 [00:00<?, ?it/s]

  0%|          | 0/17200 [00:00<?, ?it/s]

  0%|          | 0/17200 [00:00<?, ?it/s]

  0%|          | 0/17200 [00:00<?, ?it/s]

In [34]:
# Run the keyword-based labeller on every row of each split (train, val, test)
# and store the resulting label list in a new column.
for _frame in (df_train_en, df_val_en, df_test_en):
    _frame["specific_needs_groups_kw"] = _frame.progress_apply(specific_needs_groups, axis=1)

  0%|          | 0/126323 [00:00<?, ?it/s]

  0%|          | 0/14425 [00:00<?, ?it/s]

  0%|          | 0/17200 [00:00<?, ?it/s]

In [35]:
# Write the keyword-derived labels of each split to its own CSV, keeping
# only the entry id and the label column (no index column).
_kw_label_exports = {
    "train_0.7.1_keyword_specific_needs_groups.csv": df_train_en,
    "val_0.7.1_keyword_specific_needs_groups.csv": df_val_en,
    "test_0.7.1_keyword_specific_needs_groups.csv": df_test_en,
}
for _path, _frame in _kw_label_exports.items():
    _frame.to_csv(_path,
                  columns=['entry_id', "specific_needs_groups_kw"],
                  index=False)

In [None]:
# VPI: gender-based violence
# NSAGs: non-state armed groups