In [1]:
import re
import string
import functools
from ast import literal_eval
from operator import itemgetter
from collections import Counter, defaultdict, OrderedDict
from collections.abc import Sequence

import fasttext
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer

from tqdm.notebook import tqdm

from num2words import num2words
                                                                
import nltk
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
# Enable tqdm's pandas integration (per the tqdm docs this registers the
# `progress_apply` family on pandas objects) before any frames are processed.
tqdm.pandas()

In [3]:
def preprocess(doc):
    """Normalise a raw excerpt into lower-cased, punctuation-light text.

    Steps, in order: lower-case; pad every number (or number pair joined by a
    single non-word character, e.g. ``6-59``) with spaces; map Unicode dashes
    and underscores to ``-``; re-join words split as ``"hyphen- ated"``;
    replace the listed punctuation characters with spaces; collapse runs of
    whitespace.

    Args:
        doc: The text to clean. Anything that is not a ``str`` (NaN, ``None``,
            ``pd.NA`` ...) is treated as a missing document.

    Returns:
        The cleaned string, or ``""`` for a missing document.
    """
    # Missing cells arrive as float NaN (or None / pd.NA); none of them can
    # be lower-cased, so treat every non-string as an empty document.
    if not isinstance(doc, str):
        return ""
    doc = doc.lower()
    # Removal of a leading "[...]" / "(...)" prefix is intentionally disabled
    # in this module-level version.
    # spaces between numbers and words
    doc = re.sub(r'(\d+(\W\d+)?)', r' \1 ', doc).strip()
    # unify dash-like characters (and underscores) to a plain hyphen
    doc = re.sub("[‐‑–—―─_]", "-", doc)
    # "word- word" (hyphen followed by a space) is glued back together
    doc = re.sub(r"(\w)\- (\w)", r"\1\2", doc)
    # replace selected punctuation / symbols with spaces
    doc = re.sub(
        "[" + re.escape(
            '_@^~.()[],"“’…<❖‐»—─|•&{≥➢\ue0e4\uf0d8\uf0fc●°#\u200b>`?�€■!‘%;̧\'›«”:≤―\uf0b7$}*´=‑▪\xad❑·–'
        ) + "]", " ", doc)
    # collapse whitespace
    doc = re.sub(r'\s+', " ", doc).strip()
    return doc
##
def preprocess_and_tokenize(doc, n=1):
    """Return the distinct tokens or n-grams of a cleaned document.

    The document is cleaned with ``preprocess`` and split with
    ``word_tokenize``.

    Args:
        doc: Raw document text.
        n: ``1`` for single tokens; any other value yields ``ngrams(tokens, n)``.

    Returns:
        A set of token strings when ``n == 1``, otherwise a set of n-gram tuples.
    """
    tokens = word_tokenize(preprocess(doc))
    return set(tokens) if n == 1 else set(ngrams(tokens, n))

In [4]:
# Only these columns are read from each of the v0.7.1 splits.
_EXCERPT_COLUMNS = [
    'entry_id', 'excerpt', 'age', 'lang',
    "translation_en", "translation_fr", "translation_es",
]
df_train = pd.read_csv("../data/train_v0.7.1.csv", usecols=_EXCERPT_COLUMNS)
df_val = pd.read_csv("../data/val_v0.7.1.csv", usecols=_EXCERPT_COLUMNS)
df_test = pd.read_csv("../data/test_v0.7.1.csv", usecols=_EXCERPT_COLUMNS)

In [5]:
col = "age"


def _parse_age_labels(raw):
    """Turn one ``age`` cell into a sorted list of unique labels.

    Cells read from CSV hold the Python-literal text of a list and are parsed
    with ``literal_eval``. Cells that are no longer strings (because this
    cell already ran once) are only de-duplicated and re-sorted, so
    re-running the cell is harmless instead of raising.
    """
    labels = literal_eval(raw) if isinstance(raw, str) else raw
    return sorted(set(labels))


# Parse the serialized label lists in place on every split.
for df in [df_train, df_val, df_test]:
    df[col] = df[col].apply(_parse_age_labels)

In [6]:
def _with_translated_excerpts(frame, lang):
    """Copy ``frame`` and replace foreign-language excerpts with their translation.

    Rows whose ``lang`` differs from ``lang`` get their ``excerpt`` overwritten
    with the value of the ``translation_<lang>`` column; other rows are kept.
    """
    translated = frame.copy()
    foreign = translated["lang"].ne(lang)
    translated.loc[foreign, "excerpt"] = translated.loc[foreign,
                                                        f"translation_{lang}"]
    return translated


# NOTE: the next cell rebinds df_train_en / df_train_fr / df_train_es to
# native-language subsets, so these translated frames are discarded when the
# notebook is run top to bottom.
df_train_en = _with_translated_excerpts(df_train, "en")
##
df_train_fr = _with_translated_excerpts(df_train, "fr")
##
df_train_es = _with_translated_excerpts(df_train, "es")

In [7]:
# Keep only the rows originally written in each language. Filtering first and
# copying afterwards avoids duplicating the whole training frame three times
# and leaves each subset as an independent frame that can safely receive new
# columns later.
df_train_en = df_train[df_train["lang"].eq("en")].copy()
##
df_train_fr = df_train[df_train["lang"].eq("fr")].copy()
##
df_train_es = df_train[df_train["lang"].eq("es")].copy()

In [8]:
def unique_values(df, col):
    """Count how often each value occurs across the iterables stored in ``df[col]``.

    Args:
        df: Object indexable by ``col`` (here a DataFrame).
        col: Name of the column whose cells are iterables of labels.

    Returns:
        A list of ``(value, count)`` pairs, most frequent first
        (``Counter.most_common`` order).
    """
    tally = Counter()
    for labels in df[col]:
        tally.update(labels)
    return tally.most_common()

In [9]:
# Age-label frequencies among the English training rows.
unique_values(df_train_en, "age")

[('Children/Youth (5 to 17 years old)', 7629),
 ('Adult (18 to 59 years old)', 3505),
 ('Older Persons (60+ years old)', 2708),
 ('Infants/Toddlers (<5 years old)', 1497)]

In [10]:
# Age-label frequencies among the French training rows.
unique_values(df_train_fr, "age")

[('Children/Youth (5 to 17 years old)', 3286),
 ('Adult (18 to 59 years old)', 1798),
 ('Older Persons (60+ years old)', 864),
 ('Infants/Toddlers (<5 years old)', 750)]

In [11]:
# Age-label frequencies among the Spanish training rows.
unique_values(df_train_es, "age")

[('Children/Youth (5 to 17 years old)', 2390),
 ('Adult (18 to 59 years old)', 1710),
 ('Older Persons (60+ years old)', 614),
 ('Infants/Toddlers (<5 years old)', 544)]

In [12]:
# Age-group labels, spelled exactly as they appear in the `age` column
# (compare with the label counts printed above).
child = 'Children/Youth (5 to 17 years old)'
adult = 'Adult (18 to 59 years old)'
old = 'Older Persons (60+ years old)'
infant = 'Infants/Toddlers (<5 years old)'

In [13]:
# English rows that carry at least one age label.
df_train_en_age = df_train_en.loc[df_train_en['age'].apply(
    lambda labels: labels != [])].copy()
_labels_en = df_train_en_age['age']
## rows whose labels include the given age group (possibly among others)
df_train_en_age_infant = df_train_en_age.loc[_labels_en.apply(
    lambda labels: infant in labels)]
df_train_en_age_child = df_train_en_age.loc[_labels_en.apply(
    lambda labels: child in labels)]
df_train_en_age_adult = df_train_en_age.loc[_labels_en.apply(
    lambda labels: adult in labels)]
df_train_en_age_old = df_train_en_age.loc[_labels_en.apply(
    lambda labels: old in labels)]
## rows labelled with exactly one age group
df_train_en_age_infant_only = df_train_en_age.loc[_labels_en.apply(
    lambda labels: labels == [infant])]
df_train_en_age_child_only = df_train_en_age.loc[_labels_en.apply(
    lambda labels: labels == [child])]
df_train_en_age_adult_only = df_train_en_age.loc[_labels_en.apply(
    lambda labels: labels == [adult])]
df_train_en_age_old_only = df_train_en_age.loc[_labels_en.apply(
    lambda labels: labels == [old])]

In [14]:
# French rows that carry at least one age label.
df_train_fr_age = df_train_fr.loc[df_train_fr['age'].apply(
    lambda labels: labels != [])].copy()
_labels_fr = df_train_fr_age['age']
## rows whose labels include the given age group (possibly among others)
df_train_fr_age_infant = df_train_fr_age.loc[_labels_fr.apply(
    lambda labels: infant in labels)]
df_train_fr_age_child = df_train_fr_age.loc[_labels_fr.apply(
    lambda labels: child in labels)]
df_train_fr_age_adult = df_train_fr_age.loc[_labels_fr.apply(
    lambda labels: adult in labels)]
df_train_fr_age_old = df_train_fr_age.loc[_labels_fr.apply(
    lambda labels: old in labels)]
## rows labelled with exactly one age group
df_train_fr_age_infant_only = df_train_fr_age.loc[_labels_fr.apply(
    lambda labels: labels == [infant])]
df_train_fr_age_child_only = df_train_fr_age.loc[_labels_fr.apply(
    lambda labels: labels == [child])]
df_train_fr_age_adult_only = df_train_fr_age.loc[_labels_fr.apply(
    lambda labels: labels == [adult])]
df_train_fr_age_old_only = df_train_fr_age.loc[_labels_fr.apply(
    lambda labels: labels == [old])]

In [15]:
# Spanish rows that carry at least one age label.
df_train_es_age = df_train_es.loc[df_train_es['age'].apply(
    lambda labels: labels != [])].copy()
_labels_es = df_train_es_age['age']
## rows whose labels include the given age group (possibly among others)
df_train_es_age_infant = df_train_es_age.loc[_labels_es.apply(
    lambda labels: infant in labels)]
df_train_es_age_child = df_train_es_age.loc[_labels_es.apply(
    lambda labels: child in labels)]
df_train_es_age_adult = df_train_es_age.loc[_labels_es.apply(
    lambda labels: adult in labels)]
df_train_es_age_old = df_train_es_age.loc[_labels_es.apply(
    lambda labels: old in labels)]
## rows labelled with exactly one age group
df_train_es_age_infant_only = df_train_es_age.loc[_labels_es.apply(
    lambda labels: labels == [infant])]
df_train_es_age_child_only = df_train_es_age.loc[_labels_es.apply(
    lambda labels: labels == [child])]
df_train_es_age_adult_only = df_train_es_age.loc[_labels_es.apply(
    lambda labels: labels == [adult])]
df_train_es_age_old_only = df_train_es_age.loc[_labels_es.apply(
    lambda labels: labels == [old])]

In [16]:
class KeywordExtractor:
    """Rank n-grams by how characteristic they are of each document class.

    For every class and every n-gram length ``1..n_grams`` the constructor
    computes and stores:

    * ``ngram_likelihoods[c][n-1]``: relative frequency of each n-gram
      within class ``c``;
    * ``ngram_potts_scores[c][n-1]``: the class likelihood divided by the
      sum of that likelihood and the likelihoods in all other classes;
    * ``ngram_z_score_of_the_log_odds_ratios[c][n-1]``: z-scored log-odds
      ratio of class ``c`` against all other classes, with the background
      corpus counts added as a prior (this is what ``get_kws`` ranks by).
    """

    def __init__(
        self,
        docs_bg_corpus,
        docs_classes,
        lang="en",
        n_grams=2,
        num_to_words=False,
        stop_words=None,
    ):
        """Tokenise all corpora and pre-compute every score.

        Args:
            docs_bg_corpus: Iterable of raw documents forming the background
                (prior) corpus. Every n-gram that occurs in a class must also
                occur here, otherwise the log-odds computation raises
                ``KeyError``.
            docs_classes: Mapping ``class name -> iterable of raw documents``.
                Its iteration order defines the class indices.
            lang: ``"en"``, ``"fr"`` or ``"es"``; selects the spelling used
                for the fraction characters and is passed to ``num2words``.
            n_grams: Largest n-gram length to extract and score.
            num_to_words: When true, numeric tokens are spelled out with
                ``num2words``.
            stop_words: Optional iterable of tokens to drop (list, tuple,
                set, ...). ``None`` means no stop words.
        """
        # Any iterable is accepted. Previously a `set` argument fell through
        # both branches and left `self.stop_words` undefined.
        self.stop_words = set(stop_words) if stop_words is not None else set()
        self.n_grams = n_grams
        self.num_to_words = num_to_words
        self.docs_bg_corpus = docs_bg_corpus
        self.class_name_to_idx = {
            name: idx for idx, name in enumerate(docs_classes.keys())
        }
        self.docs_classes = list(docs_classes.values())
        self.lang = lang
        ## for preprocessing - should be moved to a util class/func
        # Superscript / subscript digits map to plain digits.
        digits = [str(i) for i in range(10)]
        self.num_normalizer = dict(zip("⁰¹²³⁴⁵⁶⁷⁸⁹", digits))
        self.num_normalizer.update(zip("₀₁₂₃₄₅₆₇₈₉", digits))
        if lang == "en":
            self.num_normalizer.update({"⅓": "one-third", "¼": "one-fourth"})
        elif lang == "fr":
            self.num_normalizer.update({"⅓": "un-tiers", "¼": "un-quart"})
        elif lang == "es":
            # "¼" is a quarter ("un-cuarto"), not a third.
            self.num_normalizer.update({"⅓": "un-tercio", "¼": "un-cuarto"})
        ## background corpus counts, split per n-gram length
        self.word_to_freq_bg_corpus = self.extract_word_counts(docs_bg_corpus)
        self.ngram_to_freq_bg_corpus = self._split_by_ngram_length(
            self.word_to_freq_bg_corpus)
        self.bg_corpus_sizes = [
            sum(counts.values()) for counts in self.ngram_to_freq_bg_corpus
        ]
        ## per-class counts
        self.word_to_freq_classes = [
            self.extract_word_counts(corpus) for corpus in self.docs_classes
        ]
        # len(ngram_word_to_freq_classes) = number of classes
        # len(ngram_word_to_freq_classes[x]) = n_grams
        self.ngram_word_to_freq_classes = [
            self._split_by_ngram_length(word_to_freq)
            for word_to_freq in self.word_to_freq_classes
        ]
        self.ngram_corpora_sizes = [[
            sum(counts.values()) for counts in per_class
        ] for per_class in self.ngram_word_to_freq_classes]
        ## likelihoods: one dict {"ngram in class": likelihood} per class
        ## and per n-gram length
        self.ngram_likelihoods = [[
            self.calc_likelihoods(per_class[n], sizes[n])
            for n in range(self.n_grams)
        ] for per_class, sizes in zip(self.ngram_word_to_freq_classes,
                                      self.ngram_corpora_sizes)]
        ## potts scores and z-scored log-odds ratios, class vs. all others
        n_classes = len(self.class_name_to_idx)
        self.ngram_potts_scores = [[] for _ in range(n_classes)]
        self.ngram_z_score_of_the_log_odds_ratios = [
            [] for _ in range(n_classes)
        ]
        for c in range(n_classes):
            others = [c_other for c_other in range(n_classes) if c_other != c]
            for n in range(self.n_grams):
                # The `{}` initial value keeps reduce() valid when there is
                # only one class (no "other" classes to merge).
                other_lhs = functools.reduce(
                    self._add_dicts,
                    (self.ngram_likelihoods[o][n] for o in others), {})
                self.ngram_potts_scores[c].append(
                    self.calc_potts_scores(self.ngram_likelihoods[c][n],
                                           other_lhs))

                word_to_freq_others = functools.reduce(
                    self._add_dicts,
                    (self.ngram_word_to_freq_classes[o][n] for o in others),
                    {})
                corpus_size_others = sum(self.ngram_corpora_sizes[o][n]
                                         for o in others)
                self.ngram_z_score_of_the_log_odds_ratios[c].append(
                    self.calc_prior_modified_log_odds_ratio(
                        self.ngram_word_to_freq_classes[c][n],
                        self.ngram_corpora_sizes[c][n], word_to_freq_others,
                        corpus_size_others, self.ngram_to_freq_bg_corpus[n],
                        self.bg_corpus_sizes[n]))

    @staticmethod
    def _add_dicts(a, b):
        """Merge two ``key -> number`` dicts, summing values of shared keys."""
        merged = {**a, **b}
        for key in a.keys() & b.keys():
            merged[key] = a[key] + b[key]
        return merged

    def _split_by_ngram_length(self, word_to_freq):
        """Split a mixed count dict into one dict per n-gram length.

        String keys are unigrams (index 0); tuple keys of length ``k`` go to
        index ``k - 1``.
        """
        buckets = [dict() for _ in range(self.n_grams)]
        for kw, count in word_to_freq.items():
            n = 0 if isinstance(kw, str) else len(kw) - 1
            buckets[n][kw] = count
        return buckets

    def preprocess_and_tokenize(self, doc):
        """Clean ``doc`` and return its tokens followed by its n-grams.

        Args:
            doc: Raw document text. Non-strings (NaN, ``None``) are treated
                as empty documents.

        Returns:
            A list with the lower-cased tokens (str) followed by every
            n-gram (tuple of tokens) for ``n = 2..self.n_grams``. An empty
            list for a missing document.
        """
        if not isinstance(doc, str):
            return []
        # remove a leading "[...]" / "(...)" prefix (preceding dates).
        # NOTE: `.+` is greedy, so the match runs to the last closing
        # bracket on the first line.
        doc = re.sub(r"^\[.+\]", " ", doc).strip()
        doc = re.sub(r"^\(.+\)", " ", doc).strip()
        # spaces between numbers and words
        doc = re.sub(r'(\d+(\W\d+)?)', r' \1 ', doc).strip()
        # unify dash-like characters (and underscores) to a plain hyphen
        doc = re.sub("[‐‑–—―─_]", "-", doc)
        # replace selected punctuation / symbols with spaces
        doc = re.sub(
            "[" + re.escape(
                '_@^~.()[],"“’…<❖‐»—─|•&{≥➢\ue0e4\uf0d8\uf0fc●°#\u200b>`?�€■!‘%;̧\'›«”:≤―\uf0b7$}*+´=‑▪\xad❑·–'
            ) + "]", " ", doc)
        # collapse whitespace
        doc = re.sub(r'\s+', " ", doc)

        # tokenize, drop stop words, lower-case. The stop-word test is done
        # on both the raw and the lower-cased token so that capitalised
        # occurrences ("The") are removed by a lower-case stop list too.
        words = [
            word.lower() for word in word_tokenize(doc)
            if word not in self.stop_words
            and word.lower() not in self.stop_words
        ]
        words = [self.num_normalizer.get(token, token) for token in words]
        if self.num_to_words:
            words = [
                num2words(token, lang=self.lang)
                if token.isnumeric() else token for token in words
            ]
        kw_kp = words.copy()
        for n in range(2, self.n_grams + 1):
            kw_kp.extend(list(ngrams(words, n)))
        return kw_kp

    def calc_potts_scores(self, word_to_likelihood_main,
                          word_to_likelihood_other):
        """Return ``likelihood / (likelihood + other likelihood)`` per word.

        Words missing from ``word_to_likelihood_other`` count as 0 there, so
        their score is 1.
        """
        return {
            word: likelihood /
            (likelihood + word_to_likelihood_other.get(word, 0))
            for word, likelihood in word_to_likelihood_main.items()
        }

    def calc_likelihoods(self, word_to_freq, corpus_size):
        """Return ``count / corpus_size`` for every entry of ``word_to_freq``."""
        return {
            word: count / corpus_size
            for word, count in word_to_freq.items()
        }

    def extract_word_counts(self, docs):
        """Count tokens and n-grams over ``docs``.

        Returns:
            A ``defaultdict(int)`` mapping each token / n-gram produced by
            ``preprocess_and_tokenize`` to its total number of occurrences.
        """
        word_to_freq = defaultdict(int)
        for doc in docs:
            for word in self.preprocess_and_tokenize(doc):
                word_to_freq[word] += 1
        return word_to_freq

    def calc_prior_modified_log_odds_ratio(self, word_to_freq_c1,
                                           corpus_size_c1, word_to_freq_c2,
                                           corpus_size_c2, word_to_freq_all,
                                           corpus_size_all):
        """Z-score of the prior-modified log-odds ratio of c1 against c2.

        For each word of ``word_to_freq_c1`` the background count is added to
        the count in each corpus, the log-odds of the word in c1 and in c2
        are subtracted, and the difference is divided by the square root of
        ``1 / numerator_1 + 1 / numerator_2``.

        Args:
            word_to_freq_c1: counts in the class of interest.
            corpus_size_c1: total count of that class.
            word_to_freq_c2: counts in the comparison corpus (missing words
                count as 0).
            corpus_size_c2: total count of the comparison corpus.
            word_to_freq_all: background counts; indexed directly, so a word
                absent from a plain dict raises ``KeyError``.
            corpus_size_all: total count of the background corpus.

        Returns:
            dict mapping each word of ``word_to_freq_c1`` to its z-score.
        """
        z_scores = dict()
        for word, count_c1 in word_to_freq_c1.items():
            prior = word_to_freq_all[word]
            ##
            numerator_1 = count_c1 + prior
            denominator_1 = corpus_size_c1 + corpus_size_all - numerator_1
            ##
            numerator_2 = word_to_freq_c2.get(word, 0) + prior
            denominator_2 = corpus_size_c2 + corpus_size_all - numerator_2
            ##
            log_odds_ratio = (np.log(numerator_1 / denominator_1) -
                              np.log(numerator_2 / denominator_2))
            variance = (1 / numerator_1) + (1 / numerator_2)
            z_scores[word] = log_odds_ratio / np.sqrt(variance)
        return z_scores

    def get_kws(self, cls_name, n):
        """Return the ``n``-grams of class ``cls_name`` ranked by z-score.

        Args:
            cls_name: a key of the ``docs_classes`` mapping given to the
                constructor.
            n: n-gram length, ``1..n_grams``.

        Returns:
            List of ``(ngram, z_score)`` pairs, highest score first.
        """
        cls_idx = self.class_name_to_idx[cls_name]
        kw_dict = self.ngram_z_score_of_the_log_odds_ratios[cls_idx][n - 1]
        return sorted(kw_dict.items(), key=itemgetter(1), reverse=True)

In [17]:
# English keyword lexicon for the infant age group, consumed by `is_infant`.
# Each entry is either a single token (str), looked up in
# row['tokenized_excerpt'], or an n-gram (tuple of tokens), looked up in the
# n-gram column matching its length.
# NOTE(review): `is_infant` only dispatches on tuple lengths 2-7, so the two
# nine-token "children between the ages of ..." entries below are never
# checked as keywords.
# NOTE(review): '(24-59m)' contains parentheses; confirm that the tokenised
# text can still contain them, otherwise this entry cannot match.
kw_en_infant = [
    '(24-59m)',
    ('0', '-', '59', 'months'),
    ('0-59', 'months'),
    ('1', '-year-old'),
    '1-year-old',
    ('12-15', 'month'),
    ('2', '-year-old'),
    '2-year-old',
    ('24-59', 'm'),
    ('3', '-year-old'),
    '3-year-old',
    ('4', '-year-old'),
    '4-year-old',
    ('5', '-year-old'),
    '5-year-old',
    ('6', '-', '23', 'months', 'old'),
    ('6-', '23', 'months', 'old'),
    ('6-23', 'months', 'old'),
    ('age', 'of', '5'),
    ('age', 'of', 'five'),
    ('aged', '0', '-', '23', 'months'),
    ('aged', '0-23', 'months'),
    ('aged', '0-', '23', 'months'),
    ('aged', 'zero', '-', 'twenty-three', 'months'),
    'babies',
    'baby',
    'born',
    'breastfed',
    ('children', '6-', '59', 'months', 'of', 'age'),
    ('children', '6-59', 'months'),
    ('children', 'under', 'the', 'age', 'of', 'five', 'years'),
    ('children', 'below', 'five', 'years', 'of', 'age'),
    ('children', '6-', '59', 'months'),
    ('children', '0-23', 'months'),
    ('children', 'aged', '0-', '23', 'months'),
    ('children', 'under', '5', 'years', 'of', 'age'),
    ('children', 'of', '6-', '59', 'months'),
    ('children', 'six', '-', 'fifty-nine', 'months'),
    ('children', 'aged', '0-23', 'months'),
    ('children', 'aged', '0', '-', '23', 'months'),
    ('children', 'under', 'five', 'years', 'of', 'age'),
    ('children', 'aged', 'six', 'to', 'fifty-nine', 'months'),
    ('children', 'between', 'the', 'ages', 'of', 'twenty-four', 'and',
     'fifty-nine', 'months'),
    ('children', 'aged', '6-59', 'months'),
    ('children', '6-59', 'months', 'of', 'age'),
    ('children', 'of', '6-59', 'months'),
    ('children', 'under', 'the', 'age', 'of', '5'),
    ('children', 'below', '5', 'years', 'of', 'age'),
    ('children', 'between', 'six', 'to', 'twenty-three', 'months'),
    ('children', 'from', '6', 'to', '59', 'months'),
    ('children', '6', '-', '59', 'months'),
    ('children', 'under', '5', 'years', 'old'),
    ('children', 'under', 'the', 'age', 'of', '5', 'years'),
    ('children', 'aged', 'six', '-', 'fifty-nine', 'months'),
    ('children', 'aged', '6-', '59', 'months'),
    ('children', 'six', '-', 'fifty-nine', 'months', 'of', 'age'),
    ('children', 'aged', '6', '-', '59', 'months'),
    ('children', '0', '-', '23', 'months'),
    ('children', 'from', 'six', 'to', 'fifty-nine', 'months'),
    ('children', 'zero', '-', 'twenty-three', 'months'),
    ('children', 'of', '6', '-', '59', 'months'),
    ('children', 'under', 'five', 'years'),
    ('children', 'of', 'six', '-', 'fifty-nine', 'months'),
    ('children', 'aged', 'zero', '-', 'twenty-three', 'months'),
    ('children', '0-', '23', 'months'),
    ('children', 'between', 'the', 'ages', 'of', '24', 'and', '59', 'months'),
    ('children', 'under', 'the', 'age', 'of', 'five'),
    ('children', '6', '-', '59', 'months', 'of', 'age'),
    ('children', 'under', 'five', 'years', 'old'),
    ('children', 'aged', '6', 'to', '59', 'months'),
    ('children', 'between', '6', 'to', '23', 'months'),
    'cmam',
    'congenital',
    ('infant', 'and', 'young', 'child'),
    'infant',
    ('infant', 'and', 'young', 'child', 'feeding'),
    'infantile',
    'infants',
    'iycf',
    ('less', 'than', 'five', 'years'),
    ('less', 'than', '5', 'years'),
    ('live', 'births'),
    ('live', 'birth'),
    'measles',
    'neonatal',
    'newborn',
    'newborns',
    'perinatal',
    'post-natal',
    ('six', '-', 'twenty-three', 'months', 'old'),
    'stunted',
    'stunting',
    ('under', 'the', 'age', 'of', 'five', 'years'),
    ('under', 'the', 'age', 'of', '5'),
    ('under', 'the', 'age', 'of', 'five'),
    ('under', 'the', 'age', 'of', '5', 'years'),
    ("aged", "0-9", "years"),
]
# Snippet (kept for reference) that prints the list de-duplicated and sorted:
# list(
#     sorted(list(set(kw_en_infant)),
#            key=lambda x: x if isinstance(x, str) else x[0]))
# Entries that veto an infant match: `is_infant` returns False as soon as one
# of them is found, before any pattern or keyword is tried.
negative_kw_infant = [("women","with","babies"),]
##
# Whole-number fragments reused by the patterns below. Both are wrapped in
# \b so that they only match complete numbers.
# r_0_59 previously read (5[0-9]|[0-9]), which skipped 10-49 and therefore
# missed ranges such as "6-23 months" or "12 - 23 months old".
r_0_59 = r"\b([1-5]?[0-9])\b"  # 0->59
r_0_5 = r"\b([0-5])\b"  # 0->5
##
# Regular expressions searched in row["excerpt_pp"] by `is_infant`; a single
# hit is enough to flag the row.
infant_patterns = [
    f"less than {r_0_5} years",
    f"younger than {r_0_5} years",
    f"under the age of {r_0_5}",
    f"below the age of {r_0_5}",
    f"{r_0_59} ?- ?{r_0_59} months old",
    f"aged {r_0_5} ?- ?[0-9] years",
    f"aged {r_0_59} ?- ?{r_0_59} months?",
    f"{r_0_59} ?- ?{r_0_59} ?m",
    f"{r_0_59} ?- ?{r_0_59} ?months?",
    f"{r_0_5}-year-old",
    f"children {r_0_59} ?- ?{r_0_59} months?",
    f"children of {r_0_59} ?- ?{r_0_59} months?",
    f"children under {r_0_5}",
    f"children under the age of {r_0_5}",
    f"children under {r_0_5} years",
    f"children below {r_0_5} years",
    f"children below the age of {r_0_5}",
    f"children below {r_0_5}",
    f"children aged {r_0_59} ?- ?{r_0_59} months?",
    f"children aged {r_0_59} to {r_0_59} months?",
    f"children between {r_0_59} ?- ?{r_0_59} months?",
    f"children between {r_0_59} to {r_0_59} months?",
    f"children between the ages of {r_0_59} and {r_0_59} months?",
    f"children from {r_0_59} ?- ?{r_0_59} months?",
    f"children from {r_0_59} to {r_0_59} months?",
    f"children of {r_0_59} ?- ?{r_0_59} months?",
]
##
def is_infant(row):
    """Decide whether ``row`` mentions the infant age group.

    Order of checks: any entry of ``negative_kw_infant`` present -> False;
    any regex of ``infant_patterns`` found in ``row["excerpt_pp"]`` -> True;
    any entry of ``kw_en_infant`` present -> True; otherwise False.

    Args:
        row: mapping with ``excerpt_pp``, ``tokenized_excerpt`` and the
            ``bigram_excerpt`` ... ``sevengram_excerpt`` n-gram containers.
    """
    ngram_columns = {
        2: 'bigram_excerpt',
        3: 'trigram_excerpt',
        4: 'fourgram_excerpt',
        5: 'fivegram_excerpt',
        6: 'sixgram_excerpt',
        7: 'sevengram_excerpt',
    }

    def present(kw):
        # Strings are first looked up among the single tokens; anything not
        # found there is tried against the n-gram column matching its length.
        if isinstance(kw, str) and kw in row['tokenized_excerpt']:
            return True
        column = ngram_columns.get(len(kw))
        return column is not None and kw in row[column]

    if any(present(kw) for kw in negative_kw_infant):
        return False
    if any(re.search(pattern, row["excerpt_pp"]) for pattern in infant_patterns):
        return True
    return any(present(kw) for kw in kw_en_infant)

In [18]:
# English keyword lexicon for the child/youth age group, consumed by
# `is_child`. Strings are single tokens; tuples are n-grams looked up in the
# n-gram column matching their length.
# NOTE(review): ('under', 'the', 'age', 'of', '18') is listed twice below;
# the duplicate has no effect on matching.
kw_en_child = [
    ('14', 'years'), ('17', 'years'), 'adolescent', 'adolescents',
    ('age', 'group', '5', '-', '14', 'years'),
    ('age', 'group', '5-14', 'years'), ('aged', 'ten'), ('aged', '10'),
    ('attending', 'school'), ('below', '18', 'years'), ('below', '18'),
    ('below', 'the', 'age', 'of', '14'), ('below', '14'),
    ('below', '18', 'years', 'of', 'age'), ('below', 'the', 'age', 'of', '17'),
    ('below', 'the', 'age', 'of','18'), ('below', '17'),
    ('below', 'eighteen', 'years'), ('below', 'eighteen'),
    ('below', 'the', 'age', 'of','19'), 'boy', 'boys', 'child',
    ('child', 'marriages'), ('child', 'labor'), ('child', 'abuse'),
    ('child', 'marriage'), ('child', 'labour'), ('child', 'friendly'),
    'child-friendly', ('child-friendly', 'spaces'), 'child-headed',
    ('children', 'with', 'disabilities'), 'children',
    ('children', 'under', 'eighteen'),
    ('children', 'below', 'the', 'age', 'of', 'twelve'),
    ('children', 'dropping', 'out', 'of', 'school'),
    ('children', 'separated', 'from', 'their', 'parents'),
    ('children', 'below', 'the', 'age', 'of', '12'),
    ('children', 'not', 'attending', 'school'), ('children', 'school'),
    ('children', 'under', '18'), ('children', 'caregivers'),
    ('early', 'marriage'),
    ('eighteen', 'years'), ('for', 'refugee', 'children'),
    ('for', 'displaced', 'children'), ('fourteen', 'years'), 'girl', 'girls',
    ('girls', 'children'), 'grades', 'in-school', ('labor', 'child'),
    ('minor', 'child'), ('minor', 'children'), 'minors', ('mixed', 'school'),
    ('out', 'of', 'education'), ('out', 'of', 'school'),
    ('out-of-school', 'children'), 'out-of-school', 'pediatric',
    ('primary', 'students'), ('primary', 'student'), ('refugee', 'children'),
    ('rohingya', 'children'), ('school', 'students'),
    ('school', 'aged', 'children'), ('school', 'children'),
    ('school', 'aged'), 'school-age', ('school-age', 'children'),
    ('school-aged', 'children'), 'school-aged', 'school-going', 'schoolboys',
    'schoolchildren', 'schoolgirls', ('separated', 'children'),
    ('seventeen', 'years'), 'students', 'teenagers', 'uasc', ('under', '18'),
    ('under', 'the', 'age', 'of','14'), ('under', '18', 'years'), ('under', '14'),
    ('under', 'the', 'age', 'of', 'eighteen'), ('under', 'the', 'age', 'of', '17'),
    ('under', '17'), ('under', 'the', 'age', 'of', '18'),
    ('under', 'the', 'age', 'of', '19'), ('under', 'eighteen'),
    ('under', 'eighteen', 'years'), ('under', '18', 'years', 'of', 'age'),
    ('under', 'the', 'age', 'of', '18'), ('venezuelan', 'minors'),
    ('violence', 'against', 'children'), ('vulnerable', 'children'),
    ('young', 'girls'), ("teenage","girl"), ("teenage","girls"),
    ("teenage","boy"), ("teenage","boys"), ("older", "than", "5", "years")
]
# Snippet (kept for reference) that prints the list de-duplicated and sorted:
# list(
#     sorted(list(set(kw_en_child)),
#            key=lambda x: x if isinstance(x, str) else x[0]))
# Entries that veto a child match: `is_child` returns False as soon as one of
# them is found, before any pattern or keyword is tried.
# NOTE(review): ('above', 'the', 'age', '18', 'years') has no 'of' between
# 'age' and '18' — confirm this is the intended phrase.
negative_kw_child = [
    "gam",
    "sam",
    "muac",
    "24-59m",
    ('above', 'the', 'age', '18', 'years'),
    ('above', '18', 'years', 'of', 'age'),
    ('moderate', 'acute', 'malnutrition', 'mam'),
    ('moderate', 'acute', 'malnutrition'),
    'mam',
    ('aged', '6-59', 'months'),
    ('aged', '6-59'),
]
##
##
# Whole-number fragments reused by the patterns below; each is wrapped in \b
# so that only complete numbers match.
r_6_18 = r"\b(1[0-8]|[6-9])\b"  # 6->18
r_5_15 = r"\b(1[0-5]|[5-9])\b"  # 5->15
r_5_18 = r"\b(1[0-8]|[5-9])\b"  # 5->18
r_14_18 = r"\b(1[4-8])\b"  # 14->18
##
# Regular expressions searched in row["excerpt_pp"] by `is_child`; a single
# hit is enough to flag the row. A few entries are repeated verbatim
# ("age {r_6_18} ?- ?{r_6_18}", "below {r_14_18} years",
# "below the age of {r_14_18}"); the repeats do not change the result.
child_patterns = [
    f"between {r_6_18} ?- ?{r_6_18}",
    f"between {r_6_18} to {r_6_18}",
    f"between the age of {r_6_18} and {r_6_18}",
    f"the {r_6_18} ?- ?{r_6_18} age",
    f"{r_6_18} ?- ?{r_6_18} years",
    f"{r_6_18} to {r_6_18} years",
    f"{r_6_18} ?-year-old",
    f"{r_6_18} ?-year-olds",
    f"{r_6_18} ?- ?{r_6_18} years old",
    f"{r_6_18} ?- ?{r_6_18} age",
    f"ages of {r_6_18} ?- ?{r_6_18}",
    f"ages of {r_6_18} and {r_6_18}",
    f"age of {r_6_18} ?- ?{r_6_18}",
    f"age of {r_6_18} years",
    f"age {r_6_18}",
    f"aged {r_6_18}",
    f"age {r_6_18} ?- ?{r_6_18}",
    f"ages {r_6_18} ?- ?{r_6_18}",
    f"aged {r_6_18} ?- ?{r_6_18}",
    f"age {r_6_18} ?- ?{r_6_18}",
    f"{r_6_18} years",
    f"age group {r_5_18} ?- ?{r_6_18} years",
    f"below {r_14_18} years",
    f"below the age of {r_14_18}",
    f"children under {r_14_18}",
    f"under the age of {r_14_18}",
    f"under {r_14_18} years",
    f"below {r_14_18} years",
    f"below the age of {r_14_18}",
    f"younger than {r_14_18}",
    f"less than {r_14_18}",
    f"older than {r_5_15} years",
]
##
def is_child(row):
    """Decide whether ``row`` mentions the child/youth age group.

    Order of checks: any entry of ``negative_kw_child`` present -> False;
    any regex of ``child_patterns`` found in ``row["excerpt_pp"]`` -> True;
    any entry of ``kw_en_child`` present -> True; otherwise False.

    Args:
        row: mapping with ``excerpt_pp``, ``tokenized_excerpt`` and the
            ``bigram_excerpt`` ... ``sevengram_excerpt`` n-gram containers.
    """
    ngram_columns = {
        2: 'bigram_excerpt',
        3: 'trigram_excerpt',
        4: 'fourgram_excerpt',
        5: 'fivegram_excerpt',
        6: 'sixgram_excerpt',
        7: 'sevengram_excerpt',
    }

    def present(kw):
        # Strings are first looked up among the single tokens; anything not
        # found there is tried against the n-gram column matching its length.
        if isinstance(kw, str) and kw in row['tokenized_excerpt']:
            return True
        column = ngram_columns.get(len(kw))
        return column is not None and kw in row[column]

    if any(present(kw) for kw in negative_kw_child):
        return False
    if any(re.search(pattern, row["excerpt_pp"]) for pattern in child_patterns):
        return True
    return any(present(kw) for kw in kw_en_child)

In [19]:
# English keyword lexicon for the adult age group, consumed by `is_adult`.
# Strings are single tokens; tuples are n-grams looked up in the n-gram
# column matching their length.
kw_en_adult = [
    'adult', 'adults', ('asylum', 'seekers'), 'breadwinner',
    ('care', 'givers'), 'care-giving', 'caregivers', 'child-bearing',
    'childbearing', 'employability', 'employed', 'employees', 'employers',
    'employment', 'entrepreneurs', ('female', 'detainees'), 'female-headed',
    ('foreign', 'health', 'professionals'), 'gbv', 'graduates',
    ('had', 'given', 'a', 'life', 'birth'), 'husbands', 'jobs',
    ('killing', 'of', 'teachers'), ('labor', 'market'), ('life', 'birth'),
    ('male', 'headed'), 'male-headed', 'maternal', 'men', 'mothers', 'parents',
    ('reproductive', 'age'), 'scholarships', ('stateless', 'persons'),
    ('to',
     'work'), 'underemployed', 'underemployment', 'unemployed', 'unemployment',
    ('unintended', 'pregnancies'), 'university', 'widow',
    'widows', 'wives', 'woman', 'women', 'women-headed', 'workers',
    ('working', 'in'), 'working-age', 'youth', 'youths'
]
# Snippet (kept for reference) that prints the list de-duplicated and sorted:
# list(
#     sorted(list(set(kw_en_adult)),
#            key=lambda x: x if isinstance(x, str) else x[0]))
# No vetoing entries are defined for adults, so the negative check in
# `is_adult` never fires.
negative_kw_adult = []
##
# Whole-number fragments reused by the patterns below; each is wrapped in \b
# so that only complete numbers match.
r_18_59 = r"\b([2-5][0-9]|1[8-9])\b"  # 18->59
r_20_59 = r"\b([2-5][0-9])\b"  # 20->59
r_15_19 = r"\b(1[5-9])\b"  # 15->19
r_11_19 = r"\b(1[1-9])\b"  # 11->19
r_18_49 = r"\b([2-4][0-9]|1[8-9])\b"  # 18->49
##
# Regular expressions searched in row["excerpt_pp"] by `is_adult`; a single
# hit is enough to flag the row. Patterns containing regex escapes (\+, \d)
# are raw f-strings so that the backslashes are not parsed as (invalid)
# string escapes; the resulting pattern text is unchanged.
# NOTE(review): the last two patterns require a literal ">" — confirm that
# the text in "excerpt_pp" can still contain that character.
adult_patterns = [
    f"between {r_11_19} ?- ?{r_20_59}",
    f"between {r_18_59} ?- ?{r_18_59}",
    f"between {r_18_59} to {r_18_59}",
    f"between {r_11_19} to {r_20_59}",
    f"between the age of {r_18_59} and {r_18_59}",
    f"between the age of {r_11_19} and {r_20_59}",
    f"the {r_18_59} ?- ?{r_18_59} age",
    f"{r_18_59} ?- ?{r_18_59} years",
    f"{r_18_59} to {r_18_59} years",
    rf"{r_18_59}\+ years",
    f"{r_18_59} ?-year-old",
    f"{r_18_59} ?-year-olds",
    f"{r_18_59} ?- ?{r_18_59} years old",
    f"{r_18_59} ?- ?{r_18_59} age",
    f"ages of {r_18_59} ?- ?{r_18_59}",
    f"ages of {r_11_19} ?- ?{r_20_59}",
    f"ages of {r_11_19} and {r_20_59}",
    f"age of {r_18_59} ?- ?{r_18_59}",
    f"age of {r_18_59} ?- ?{r_18_59}",
    f"age of {r_18_59} years",
    f"age {r_18_59}",
    f"age {r_18_59} ?- ?{r_18_59}",
    f"ages {r_11_19} ?- ?{r_20_59}",
    f"aged {r_18_59} ?- ?{r_18_59}",
    f"age {r_11_19} ?- ?{r_20_59}",
    r"\d\d+,?\d* female",
    r"\d\d+,?\d* females",
    r"\d\d+,?\d* male",
    r"\d\d+,?\d* males",
    #r"\d\d+,?\d* people",
    #r"\d\d+,?\d* persons",
    f"older than {r_18_49} years",
    f"{r_18_49} years and above",
    f"{r_18_49} years old and above",
    f"above the age of {r_18_49}",
    f"over the age of {r_18_49}",
    f"over >{r_18_59} ?years",
    f"above >{r_18_59} ?years",
]
##
def is_adult(row):
    """Tell whether a row's excerpt mentions adults.

    The row must expose 'excerpt_pp' (text searched with the regexes in
    `adult_patterns`), 'tokenized_excerpt' (looked up for string keywords)
    and the 'bigram_excerpt' ... 'sevengram_excerpt' collections (looked up
    for keywords of length 2 to 7).

    Checks run in this order:
      1. any entry of `negative_kw_adult` present -> False
      2. any regex of `adult_patterns` found      -> True
      3. any entry of `kw_en_adult` present       -> True
    Otherwise the result is False.
    """
    column_for_size = {
        2: 'bigram_excerpt',
        3: 'trigram_excerpt',
        4: 'fourgram_excerpt',
        5: 'fivegram_excerpt',
        6: 'sixgram_excerpt',
        7: 'sevengram_excerpt',
    }

    def mentions(keyword):
        # String keywords are tried against the unigram tokens first.
        if isinstance(keyword, str) and keyword in row['tokenized_excerpt']:
            return True
        # Anything else (or a string miss) falls back to the n-gram column
        # matching the keyword's length; other lengths never match.
        column = column_for_size.get(len(keyword))
        if column is None:
            return False
        return keyword in row[column]

    if any(mentions(keyword) for keyword in negative_kw_adult):
        return False
    if any(re.search(pattern, row["excerpt_pp"]) for pattern in adult_patterns):
        return True
    return any(mentions(keyword) for keyword in kw_en_adult)

In [20]:
# English keywords that flag an excerpt as mentioning older people.
# `is_old` looks up a plain string in the row's 'tokenized_excerpt' and a
# tuple of length 2-7 in the n-gram column of the same length.
# NOTE(review): `preprocess` inserts a space after every digit run, so the
# single-token entries '60-year-old', '70-year-old' and '80-year-old'
# presumably never appear in the tokenized text -- confirm against real
# tokenizer output (the `patterns_old` regexes cover the "N -year-old" form).
kw_en_old = [
    ('55', 'years', 'and', 'above'),
    ('59', 'years', 'and', 'above'),
    ('60', 's'),
    ('60', 'years'),
    ('60', 'years', 'and', 'above'),
    '60-year-old',
    ('62', 'older'),
    ('65', 'years'),
    ('65', 'years', 'and', 'above'),
    ('70', 'years'),
    ('70', 'years', 'and', 'above'),
    ('70', '-year-old'),
    '70-year-old',
    ('75', 'years', 'and', 'above'),
    ('80', 'years'),
    '80-year-old',
    ('above', '65', 'years'),
    ('above', '65', 'years', 'of', 'age'),
    ('above', '60', 'years'),
    ('above', '55', 'years'),
    ('above', '70'),
    ('above', '80'),
    ('above', '59'),
    ('above', '70', 'years'),
    ('above', '59', 'years'),
    ('above', '75', 'years'),
    ('above', '60'),
    ('above', '65'),
    ('adults', 'aged', '65'),
    ('adults', 'aged', '70'),
    ('aged', 'above', '59'),
    ('aged', '80', 'and', 'above'),
    ('aged', 'above', '60'),
    ('aged', '65'),
    ('aged', 'above', '85'),
    ('aged', 'above', '55'),
    ('aged', '70', 'and', 'above'),
    ('aged', '75', 'and', 'above'),
    ('aged', 'above', '65'),
    ('aged', '80'),
    ('aged', 'above', '75'),
    ('aged', '70'),
    ('aged', 'above', '50'),
    ('aged', 'above', '70'),
    ('aged', '60', 'and', 'above'),
    ('aged', '65', 'and', 'above'),
    ('aged', '60'),
    ('aged', 'above', '80'),
    'ageing',
    ('elderly', 'community'),
    ('elderly', 'population'),
    ('elderly', 'men'),
    ('elderly', 'women'),
    ('elderly', 'persons'),
    'elderly',
    ('elderly', 'people'),
    ('elders', 'people'),
    ('older', 'persons'),
    ('older', 'people'),
    ('older', 'population'),
    ('older', 'person'),
    ('older', 'men'),
    ('older', 'women'),
    ('over', '59'),
    ('over', '70', 'years'),
    ('over', '59', 'years'),
    ('over', '55'),
    ('over', '60', 'years', 'old'),
    ('over', '80', 'years', 'old'),
    ('over', '60'),
    ('over', '65', 'years'),
    ('over', '60', 'years'),
    ('over', '55', 'years'),
    ('over', '80', 'years'),
    ('people', 'aged', '59'),
    ('people', 'over', '65'),
    ('people', 'aged', '55'),
    ('people', 'aged', '60'),
    ('people', 'over', '80'),
    ('people', 'over', '70'),
    ('the', '59', '+', 'population'),
    "elders",
    ('50', '+', 'years'),
    ("geriatric", "population"),
]
# Disabled snippet kept for reference: it de-duplicates the list above and
# sorts it by the keyword (or by a tuple's first element).
# list(
#     sorted(list(set(kw_en_old)),
#            key=lambda x: x if isinstance(x, str) else x[0]))
# Keywords / n-grams that make `is_old` return False; currently empty.
negative_kw_old = []
# Age-range building blocks. Each one matches a number in the stated range
# that is delimited by word boundaries.
r_59_100 = r"\b(100|[6-9][0-9]|59)\b"  # 59-100
r_50_100 = r"\b(100|[6-9][0-9]|5[0-9])\b"  # 50-100
# The previous `[6-9][1-9]` form silently skipped 70, 80 and 90.
r_61_100 = r"\b(100|6[1-9]|[7-9][0-9])\b"  # 61-100
r_60_100 = r"\b(100|[6-9][0-9])\b"  # 60-100
# The previous `5[0-9]|[0-9][0-9]` form matched every two-digit number
# (00-99) and no single-digit one; this accepts 0-59, with or without a
# leading zero.
r_0_59 = r"\b([0-5]?[0-9])\b"  # 0-59
# Regexes that `is_old` searches for in the *preprocessed* excerpt
# (`excerpt_pp`). Raw f-strings keep regex escapes such as `\+` from being
# read as (invalid) Python string escapes. Exact duplicates were removed.
patterns_old = [
    rf"between {r_59_100} ?- ?{r_59_100}",
    rf"between {r_50_100} ?- ?{r_61_100}",
    rf"{r_59_100} years and older",
    rf"{r_59_100} years and above",
    rf"{r_59_100} years old and above",
    rf"above {r_59_100} years",
    rf"over {r_59_100} years",
    rf"above the age of {r_50_100}",
    rf"over the age of {r_50_100}",
    rf"between {r_50_100} ?- ?{r_59_100}",
    rf"between {r_50_100} to {r_59_100}",
    rf"between the age of {r_50_100} and {r_59_100}",
    rf"between the ages of {r_50_100} and {r_59_100}",
    rf"the {r_59_100} ?- ?{r_59_100} age",
    rf"{r_50_100} ?- ?{r_61_100} years",
    rf"{r_50_100} to {r_61_100} years",
    # `preprocess` pads every digit run with spaces, so "60+ years" reaches
    # this point as "60 + years"; the optional space makes the pattern match.
    rf"{r_50_100} ?\+ years",
    rf"{r_60_100} ?-year-old",
    rf"{r_60_100} ?-year-olds",
    rf"{r_50_100} ?- ?{r_61_100} years old",
    rf"{r_50_100} ?- ?{r_61_100} age",
    rf"ages of {r_50_100} ?- ?{r_61_100}",
    rf"ages of {r_50_100} and {r_61_100}",
    rf"age of {r_50_100} ?- ?{r_61_100}",
    rf"age of {r_60_100} years",
    rf"age {r_60_100}",
    rf"age {r_50_100} ?- ?{r_61_100}",
    rf"ages {r_50_100} ?- ?{r_61_100}",
    rf"aged {r_50_100} ?- ?{r_61_100}",
    rf"aged {r_50_100} years and older",
    rf"older than {r_50_100}",
]


##
def is_old(row):
    """Tell whether a row's excerpt mentions older people.

    Reads 'excerpt_pp' (searched with the regexes in `patterns_old`),
    'tokenized_excerpt' (string keywords) and the 'bigram_excerpt' ...
    'sevengram_excerpt' collections (keywords of length 2 to 7) from `row`.

    A hit in `negative_kw_old` short-circuits to False; otherwise a regex
    hit or a hit in `kw_en_old` yields True, and no hit at all yields False.
    """
    # Index i holds the column used for keywords of length i + 2.
    ngram_columns = (
        'bigram_excerpt',
        'trigram_excerpt',
        'fourgram_excerpt',
        'fivegram_excerpt',
        'sixgram_excerpt',
        'sevengram_excerpt',
    )

    def found(kw):
        # Strings are matched against the unigram tokens first ...
        if isinstance(kw, str) and kw in row['tokenized_excerpt']:
            return True
        # ... then, like tuples, against the n-gram column of equal length.
        size = len(kw)
        return 2 <= size <= 7 and kw in row[ngram_columns[size - 2]]

    for blocked in negative_kw_old:
        if found(blocked):
            return False
    for regex in patterns_old:
        if re.search(regex, row["excerpt_pp"]):
            return True
    for wanted in kw_en_old:
        if found(wanted):
            return True
    return False

In [None]:
# Preprocess the raw excerpt once; `is_adult` / `is_old` run their regexes
# on this column.
df_train_en["excerpt_pp"] = df_train_en["excerpt"].progress_apply(preprocess)

# Tokenize once and derive every n-gram set from the same token list.
# This yields the same sets as calling `preprocess_and_tokenize(x, n)` for
# each n, without re-running `preprocess` and `word_tokenize` seven times
# per row.
excerpt_tokens = df_train_en["excerpt_pp"].progress_apply(word_tokenize)
df_train_en["tokenized_excerpt"] = excerpt_tokens.progress_apply(
    lambda words: set(words))

ngram_column_by_size = {
    2: "bigram_excerpt",
    3: "trigram_excerpt",
    4: "fourgram_excerpt",
    5: "fivegram_excerpt",
    6: "sixgram_excerpt",
    7: "sevengram_excerpt",
}
for size, column in ngram_column_by_size.items():
    # `size=size` binds the current n-gram length to the lambda explicitly.
    df_train_en[column] = excerpt_tokens.progress_apply(
        lambda words, size=size: set(ngrams(words, size)))