In [1]:
import os
cwd = "/home/abdullah/Documents/DFS/kw_and_snorkel_based_labeling/age"
os.chdir(cwd)

In [2]:
import re
from ast import literal_eval

import pandas as pd
from tqdm.notebook import tqdm
from nltk.corpus import stopwords
from nltk import word_tokenize, ngrams
from IPython.core.display import HTML

In [3]:
tqdm.pandas()

In [4]:
df_train = pd.read_csv("../data/train_v0.7.1.csv",
                       usecols=[
                           'entry_id', 'excerpt', 'age', 'lang',
                           "translation_en", "translation_fr", "translation_es"
                       ])
df_val = pd.read_csv("../data/val_v0.7.1.csv",
                     usecols=[
                         'entry_id', 'excerpt', 'age', 'lang',
                         "translation_en", "translation_fr", "translation_es"
                     ])
df_test = pd.read_csv("../data/test_v0.7.1.csv",
                      usecols=[
                          'entry_id', 'excerpt', 'age', 'lang',
                          "translation_en", "translation_fr", "translation_es"
                      ])

In [5]:
col = "age"
for df in [df_train, df_val, df_test]:
    df[col] = df[col].apply(lambda x: list(sorted(list(set(literal_eval(x))))))

In [6]:
df_train.loc[df_train["lang"].eq("en"),
                "translation_en"] = df_train.loc[
                    df_train["lang"].eq("en"), "excerpt"]
# df_train["excerpt"] = df_train["translation_en"]
# df_train = df_train[['entry_id', 'excerpt', 'age', 'lang']]

In [7]:
df_val.loc[df_val["lang"].eq("en"),
                "translation_en"] = df_val.loc[
                    df_val["lang"].eq("en"), "excerpt"]
# df_val["excerpt"] = df_val["translation_en"]
# df_val = df_val[['entry_id', 'excerpt', 'age', 'lang']]

In [8]:
df_test.loc[df_test["lang"].eq("en"),
                "translation_en"] = df_test.loc[
                    df_test["lang"].eq("en"), "excerpt"]
# df_test["excerpt"] = df_test["translation_en"]
# df_test = df_test[['entry_id', 'excerpt', 'age', 'lang']]

In [9]:
def preprocess(doc):
    if doc != doc:
        return ""
    doc = doc.lower()
    # remove preceeding dates
    #doc = re.sub("^\[.+\]", " ", doc).strip()
    #doc = re.sub("^\(.+\)", " ", doc).strip()
    # spaces btw numbers and words
    doc = re.sub('(\d+(\W\d+)?)', r' \1 ', doc).strip()
    doc = re.sub("[‐‑–—―─_]", "-", doc)
    doc = re.sub("(\w)\- (\w)", r"\1\2", doc)
    doc = re.sub(
        "[" + re.escape(
            '_@^~.()[],"“’…<❖‐»—─|•&{≥➢\ue0e4\uf0d8\uf0fc●°#\u200b>`?�€■!‘%;̧\'›«”:≤―\uf0b7$}*´=‑▪\xad❑·–'
        ) + "]", " ", doc)
    #remove some puncs
    doc = re.sub('\s+', " ", doc).strip()
    return doc
##
def preprocess_and_tokenize(doc, n=1):
    doc = preprocess(doc)
    # tokenize
    words = word_tokenize(doc)
    if n == 1:
        return set(words)
    return set(ngrams(words, n))

In [10]:
kw_en_infant = [
    '(24-59m)',
    ('0', '-', '59', 'months'),
    ('0-59', 'months'),
    ('1', '-year-old'),
    '1-year-old',
    ('12-15', 'month'),
    ('2', '-year-old'),
    '2-year-old',
    ('24-59', 'm'),
    ('3', '-year-old'),
    '3-year-old',
    ('4', '-year-old'),
    '4-year-old',
    ('5', '-year-old'),
    '5-year-old',
    ('6', '-', '23', 'months', 'old'),
    ('6-', '23', 'months', 'old'),
    ('6-23', 'months', 'old'),
    ('age', 'of', '5'),
    ('age', 'of', 'five'),
    ('aged', '0', '-', '23', 'months'),
    ('aged', '0-23', 'months'),
    ('aged', '0-', '23', 'months'),
    ('aged', 'zero', '-', 'twenty-three', 'months'),
    'babies',
    'baby',
    'born',
    'breastfed',
    ('children', '6-', '59', 'months', 'of', 'age'),
    ('children', '6-59', 'months'),
    ('children', 'under', 'the', 'age', 'of', 'five', 'years'),
    ('children', 'below', 'five', 'years', 'of', 'age'),
    ('children', '6-', '59', 'months'),
    ('children', '0-23', 'months'),
    ('children', 'aged', '0-', '23', 'months'),
    ('children', 'under', '5', 'years', 'of', 'age'),
    ('children', 'of', '6-', '59', 'months'),
    ('children', 'six', '-', 'fifty-nine', 'months'),
    ('children', 'aged', '0-23', 'months'),
    ('children', 'aged', '0', '-', '23', 'months'),
    ('children', 'under', 'five', 'years', 'of', 'age'),
    ('children', 'aged', 'six', 'to', 'fifty-nine', 'months'),
    ('children', 'between', 'the', 'ages', 'of', 'twenty-four', 'and',
     'fifty-nine', 'months'),
    ('children', 'aged', '6-59', 'months'),
    ('children', '6-59', 'months', 'of', 'age'),
    ('children', 'of', '6-59', 'months'),
    ('children', 'under', 'the', 'age', 'of', '5'),
    ('children', 'below', '5', 'years', 'of', 'age'),
    ('children', 'between', 'six', 'to', 'twenty-three', 'months'),
    ('children', 'from', '6', 'to', '59', 'months'),
    ('children', '6', '-', '59', 'months'),
    ('children', 'under', '5', 'years', 'old'),
    ('children', 'under', 'the', 'age', 'of', '5', 'years'),
    ('children', 'aged', 'six', '-', 'fifty-nine', 'months'),
    ('children', 'aged', '6-', '59', 'months'),
    ('children', 'six', '-', 'fifty-nine', 'months', 'of', 'age'),
    ('children', 'aged', '6', '-', '59', 'months'),
    ('children', '0', '-', '23', 'months'),
    ('children', 'from', 'six', 'to', 'fifty-nine', 'months'),
    ('children', 'zero', '-', 'twenty-three', 'months'),
    ('children', 'of', '6', '-', '59', 'months'),
    ('children', 'under', 'five', 'years'),
    ('children', 'of', 'six', '-', 'fifty-nine', 'months'),
    ('children', 'aged', 'zero', '-', 'twenty-three', 'months'),
    ('children', '0-', '23', 'months'),
    ('children', 'between', 'the', 'ages', 'of', '24', 'and', '59', 'months'),
    ('children', 'under', 'the', 'age', 'of', 'five'),
    ('children', '6', '-', '59', 'months', 'of', 'age'),
    ('children', 'under', 'five', 'years', 'old'),
    ('children', 'aged', '6', 'to', '59', 'months'),
    ('children', 'between', '6', 'to', '23', 'months'),
    'cmam',
    'congenital',
    ('infant', 'and', 'young', 'child'),
    'infant',
    ('infant', 'and', 'young', 'child', 'feeding'),
    'infantile',
    'infants',
    'iycf',
    ('less', 'than', 'five', 'years'),
    ('less', 'than', '5', 'years'),
    ('live', 'births'),
    ('live', 'birth'),
    'measles',
    'neonatal',
    'newborn',
    'newborns',
    'perinatal',
    'post-natal',
    ('six', '-', 'twenty-three', 'months', 'old'),
    'stunted',
    'stunting',
    ('under', 'the', 'age', 'of', 'five', 'years'),
    ('under', 'the', 'age', 'of', '5'),
    ('under', 'the', 'age', 'of', 'five'),
    ('under', 'the', 'age', 'of', '5', 'years'),
    ("aged", "0-9", "years"),
]
# list(
#     sorted(list(set(kw_en_infant)),
#            key=lambda x: x if isinstance(x, str) else x[0]))
negative_kw_infant = [
    ("women", "with", "babies"),
]
##
r_0_59 = r"\b(5[0-9]|[0-9])\b"  # 0->59
r_0_5 = r"\b([0-5])\b"  # 0->5
##
infant_patterns = [
    f"less than {r_0_5} years",
    f"younger than {r_0_5} years",
    f"under the age of {r_0_5}",
    f"below the age of {r_0_5}",
    f"{r_0_59} ?- ?{r_0_59} months old",
    f"aged {r_0_5} ?- ?[0-9] years",
    f"aged {r_0_59} ?- ?{r_0_59} months?",
    f"{r_0_59} ?- ?{r_0_59} ?m",
    f"{r_0_59} ?- ?{r_0_59} ?months?",
    f"{r_0_5}-year-old",
    f"children {r_0_59} ?- ?{r_0_59} months?",
    f"children of {r_0_59} ?- ?{r_0_59} months?",
    f"children under {r_0_5}",
    f"children under the age of {r_0_5}",
    f"children under {r_0_5} years",
    f"children below {r_0_5} years",
    f"children below the age of {r_0_5}",
    f"children below {r_0_5}",
    f"children aged {r_0_59} ?- ?{r_0_59} months?",
    f"children aged {r_0_59} to {r_0_59} months?",
    f"children between {r_0_59} ?- ?{r_0_59} months?",
    f"children between {r_0_59} to {r_0_59} months?",
    f"children between the ages of {r_0_59} and {r_0_59} months?",
    f"children from {r_0_59} ?- ?{r_0_59} months?",
    f"children from {r_0_59} to {r_0_59} months?",
    f"children of {r_0_59} ?- ?{r_0_59} months?",
]


##
def is_infant(row):
    for kw in negative_kw_infant:
        if isinstance(kw, str) and kw in row['tokenized_excerpt']: return False
        elif len(kw) == 2 and kw in row['bigram_excerpt']: return False
        elif len(kw) == 3 and kw in row['trigram_excerpt']: return False
        elif len(kw) == 4 and kw in row['fourgram_excerpt']: return False
        elif len(kw) == 5 and kw in row['fivegram_excerpt']: return False
        elif len(kw) == 6 and kw in row['sixgram_excerpt']: return False
        elif len(kw) == 7 and kw in row['sevengram_excerpt']: return False
    for p in infant_patterns:
        if re.search(p, row["excerpt_pp"]): return True
    for kw in kw_en_infant:
        if isinstance(kw, str) and kw in row['tokenized_excerpt']: return True
        elif len(kw) == 2 and kw in row['bigram_excerpt']: return True
        elif len(kw) == 3 and kw in row['trigram_excerpt']: return True
        elif len(kw) == 4 and kw in row['fourgram_excerpt']: return True
        elif len(kw) == 5 and kw in row['fivegram_excerpt']: return True
        elif len(kw) == 6 and kw in row['sixgram_excerpt']: return True
        elif len(kw) == 7 and kw in row['sevengram_excerpt']: return True
    return False

In [11]:
kw_en_child = [
    ('14', 'years'), ('17', 'years'), 'adolescent', 'adolescents',
    ('age', 'group', '5', '-', '14', 'years'),
    ('age', 'group', '5-14', 'years'), ('aged', 'ten'), ('aged', '10'),
    ('attending', 'school'), ('below', '18', 'years'), ('below', '18'),
    ('below', 'the', 'age', 'of', '14'), ('below', '14'),
    ('below', '18', 'years', 'of', 'age'), ('below', 'the', 'age', 'of', '17'),
    ('below', 'the', 'age', 'of','18'), ('below', '17'),
    ('below', 'eighteen', 'years'), ('below', 'eighteen'),
    ('below', 'the', 'age', 'of','19'), 'boy', 'boys', 'child',
    ('child', 'marriages'), ('child', 'labor'), ('child', 'abuse'),
    ('child', 'marriage'), ('child', 'labour'), ('child', 'friendly'),
    'child-friendly', ('child-friendly', 'spaces'), 'child-headed',
    ('children', 'with', 'disabilities'), 'children',
    ('children', 'under', 'eighteen'),
    ('children', 'below', 'the', 'age', 'of', 'twelve'),
    ('children', 'dropping', 'out', 'of', 'school'),
    ('children', 'separated', 'from', 'their', 'parents'),
    ('children', 'below', 'the', 'age', 'of', '12'),
    ('children', 'not', 'attending', 'school'), ('children', 'school'),
    ('children', 'under', '18'), ('children', 'caregivers'),
    ('early', 'marriage'),
    ('eighteen', 'years'), ('for', 'refugee', 'children'),
    ('for', 'displaced', 'children'), ('fourteen', 'years'), 'girl', 'girls',
    ('girls', 'children'), 'grades', 'in-school', ('labor', 'child'),
    ('minor', 'child'), ('minor', 'children'), 'minors', ('mixed', 'school'),
    ('out', 'of', 'education'), ('out', 'of', 'school'),
    ('out-of-school', 'children'), 'out-of-school', 'pediatric',
    ('primary', 'students'), ('primary', 'student'), ('refugee', 'children'),
    ('rohingya', 'children'), ('school', 'students'),
    ('school', 'aged', 'children'), ('school', 'children'),
    ('school', 'aged'), 'school-age', ('school-age', 'children'),
    ('school-aged', 'children'), 'school-aged', 'school-going', 'schoolboys',
    'schoolchildren', 'schoolgirls', ('separated', 'children'),
    ('seventeen', 'years'), 'students', 'teenagers', 'uasc', ('under', '18'),
    ('under', 'the', 'age', 'of','14'), ('under', '18', 'years'), ('under', '14'),
    ('under', 'the', 'age', 'of', 'eighteen'), ('under', 'the', 'age', 'of', '17'),
    ('under', '17'), ('under', 'the', 'age', 'of', '18'),
    ('under', 'the', 'age', 'of', '19'), ('under', 'eighteen'),
    ('under', 'eighteen', 'years'), ('under', '18', 'years', 'of', 'age'),
    ('under', 'the', 'age', 'of', '18'), ('venezuelan', 'minors'),
    ('violence', 'against', 'children'), ('vulnerable', 'children'),
    ('young', 'girls'), ("teenage","girl"), ("teenage","girls"),
    ("teenage","boy"), ("teenage","boys"), ("older", "than", "5", "years")
]
# list(
#     sorted(list(set(kw_en_child)),
#            key=lambda x: x if isinstance(x, str) else x[0]))
negative_kw_child = [
    "gam",
    "sam",
    "muac",
    "24-59m",
    ('above', 'the', 'age', '18', 'years'),
    ('above', '18', 'years', 'of', 'age'),
    ('moderate', 'acute', 'malnutrition', 'mam'),
    ('moderate', 'acute', 'malnutrition'),
    'mam',
    ('aged', '6-59', 'months'),
    ('aged', '6-59'),
]
##
##
r_6_18 = r"\b(1[0-8]|[6-9])\b"  # 6->18
r_5_15 = r"\b(1[0-5]|[5-9])\b"  # 5->15
r_5_18 = r"\b(1[0-8]|[5-9])\b"  # 5->18
r_14_18 = r"\b(1[4-8])\b"  # 4->18
##
child_patterns = [
    f"between {r_6_18} ?- ?{r_6_18}",
    f"between {r_6_18} to {r_6_18}",
    f"between the age of {r_6_18} and {r_6_18}",
    f"the {r_6_18} ?- ?{r_6_18} age",
    f"{r_6_18} ?- ?{r_6_18} years",
    f"{r_6_18} to {r_6_18} years",
    f"{r_6_18} ?-year-old",
    f"{r_6_18} ?-year-olds",
    f"{r_6_18} ?- ?{r_6_18} years old",
    f"{r_6_18} ?- ?{r_6_18} age",
    f"ages of {r_6_18} ?- ?{r_6_18}",
    f"ages of {r_6_18} and {r_6_18}",
    f"age of {r_6_18} ?- ?{r_6_18}",
    f"age of {r_6_18} years",
    f"age {r_6_18}",
    f"aged {r_6_18}",
    f"age {r_6_18} ?- ?{r_6_18}",
    f"ages {r_6_18} ?- ?{r_6_18}",
    f"aged {r_6_18} ?- ?{r_6_18}",
    f"age {r_6_18} ?- ?{r_6_18}",
    f"{r_6_18} years",
    f"age group {r_5_18} ?- ?{r_6_18} years",
    f"below {r_14_18} years",
    f"below the age of {r_14_18}",
    f"children under {r_14_18}",
    f"under the age of {r_14_18}",
    f"under {r_14_18} years",
    f"below {r_14_18} years",
    f"below the age of {r_14_18}",
    f"younger than {r_14_18}",
    f"less than {r_14_18}",
    f"older than {r_5_15} years",
]
##
def is_child(row):
    for kw in negative_kw_child:
        if isinstance(kw, str) and kw in row['tokenized_excerpt']: return False
        elif len(kw) == 2 and kw in row['bigram_excerpt']: return False
        elif len(kw) == 3 and kw in row['trigram_excerpt']: return False
        elif len(kw) == 4 and kw in row['fourgram_excerpt']: return False
        elif len(kw) == 5 and kw in row['fivegram_excerpt']: return False
        elif len(kw) == 6 and kw in row['sixgram_excerpt']: return False
        elif len(kw) == 7 and kw in row['sevengram_excerpt']: return False
    for p in child_patterns:
        if re.search(p, row["excerpt_pp"]): return True
    for kw in kw_en_child:
        if isinstance(kw, str) and kw in row['tokenized_excerpt']: return True
        elif len(kw) == 2 and kw in row['bigram_excerpt']: return True
        elif len(kw) == 3 and kw in row['trigram_excerpt']: return True
        elif len(kw) == 4 and kw in row['fourgram_excerpt']: return True
        elif len(kw) == 5 and kw in row['fivegram_excerpt']: return True
        elif len(kw) == 6 and kw in row['sixgram_excerpt']: return True
        elif len(kw) == 7 and kw in row['sevengram_excerpt']: return True
    return False

In [12]:
kw_en_adult = [
    'adult', 'adults', ('asylum', 'seekers'), 'breadwinner',
    ('care', 'givers'), 'care-giving', 'caregivers', 'child-bearing',
    'childbearing', 'employability', 'employed', 'employees', 'employers',
    'employment', 'entrepreneurs', ('female', 'detainees'), 'female-headed',
    ('foreign', 'health', 'professionals'), 'gbv', 'graduates',
    ('had', 'given', 'a', 'life', 'birth'), 'husbands', 'jobs',
    ('killing', 'of', 'teachers'), ('labor', 'market'), ('life', 'birth'),
    ('male', 'headed'), 'male-headed', 'maternal', 'men', 'mothers', 'parents',
    ('reproductive', 'age'), 'scholarships', ('stateless', 'persons'),
    ('to',
     'work'), 'underemployed', 'underemployment', 'unemployed', 'unemployment',
    ('unintended', 'pregnancies'), 'university', 'widow',
    'widows', 'wives', 'woman', 'women', 'women-headed', 'workers',
    ('working', 'in'), 'working-age', 'youth', 'youths'
]
# list(
#     sorted(list(set(kw_en_adult)),
#            key=lambda x: x if isinstance(x, str) else x[0]))
negative_kw_adult = []
##
r_18_59 = r"\b([2-5][0-9]|1[8-9])\b"  # 18->59
r_20_59 = r"\b([2-5][0-9])\b"  # 20->59
r_15_19 = r"\b(1[5-9])\b"  # 15->19
r_11_19 = r"\b(1[1-9])\b"  # 11->19
r_18_49 = r"\b([2-4][0-9]|1[8-9])\b"  # 18->49
##
adult_patterns = [
    f"between {r_11_19} ?- ?{r_20_59}",
    f"between {r_18_59} ?- ?{r_18_59}",
    f"between {r_18_59} to {r_18_59}",
    f"between {r_11_19} to {r_20_59}",
    f"between the age of {r_18_59} and {r_18_59}",
    f"between the age of {r_11_19} and {r_20_59}",
    f"the {r_18_59} ?- ?{r_18_59} age",
    f"{r_18_59} ?- ?{r_18_59} years",
    f"{r_18_59} to {r_18_59} years",
    f"{r_18_59}\+ years",
    f"{r_18_59} ?-year-old",
    f"{r_18_59} ?-year-olds",
    f"{r_18_59} ?- ?{r_18_59} years old",
    f"{r_18_59} ?- ?{r_18_59} age",
    f"ages of {r_18_59} ?- ?{r_18_59}",
    f"ages of {r_11_19} ?- ?{r_20_59}",
    f"ages of {r_11_19} and {r_20_59}",
    f"age of {r_18_59} ?- ?{r_18_59}",
    f"age of {r_18_59} ?- ?{r_18_59}",
    f"age of {r_18_59} years",
    f"age {r_18_59}",
    f"age {r_18_59} ?- ?{r_18_59}",
    f"ages {r_11_19} ?- ?{r_20_59}",
    f"aged {r_18_59} ?- ?{r_18_59}",
    f"age {r_11_19} ?- ?{r_20_59}",
    f"\d\d+,?\d* female",
    f"\d\d+,?\d* females",
    f"\d\d+,?\d* male",
    f"\d\d+,?\d* males",
    #f"\d\d+,?\d* people",
    #f"\d\d+,?\d* persons",
    f"older than {r_18_49} years",
    f"{r_18_49} years and above",
    f"{r_18_49} years old and above",
    f"above the age of {r_18_49}",
    f"over the age of {r_18_49}",
    f"over >{r_18_59} ?years",
    f"above >{r_18_59} ?years",
]
##
def is_adult(row):
    for kw in negative_kw_adult:
        if isinstance(kw, str) and kw in row['tokenized_excerpt']: return False
        elif len(kw) == 2 and kw in row['bigram_excerpt']: return False
        elif len(kw) == 3 and kw in row['trigram_excerpt']: return False
        elif len(kw) == 4 and kw in row['fourgram_excerpt']: return False
        elif len(kw) == 5 and kw in row['fivegram_excerpt']: return False
        elif len(kw) == 6 and kw in row['sixgram_excerpt']: return False
        elif len(kw) == 7 and kw in row['sevengram_excerpt']: return False
    for p in adult_patterns:
        if re.search(p, row["excerpt_pp"]): return True
    for kw in kw_en_adult:
        if isinstance(kw, str) and kw in row['tokenized_excerpt']: return True
        elif len(kw) == 2 and kw in row['bigram_excerpt']: return True
        elif len(kw) == 3 and kw in row['trigram_excerpt']: return True
        elif len(kw) == 4 and kw in row['fourgram_excerpt']: return True
        elif len(kw) == 5 and kw in row['fivegram_excerpt']: return True
        elif len(kw) == 6 and kw in row['sixgram_excerpt']: return True
        elif len(kw) == 7 and kw in row['sevengram_excerpt']: return True
    return False

In [13]:
kw_en_old = [
    ('55', 'years', 'and', 'above'),
    ('59', 'years', 'and', 'above'),
    ('60', 's'),
    ('60', 'years'),
    ('60', 'years', 'and', 'above'),
    '60-year-old',
    ('62', 'older'),
    ('65', 'years'),
    ('65', 'years', 'and', 'above'),
    ('70', 'years'),
    ('70', 'years', 'and', 'above'),
    ('70', '-year-old'),
    '70-year-old',
    ('75', 'years', 'and', 'above'),
    ('80', 'years'),
    '80-year-old',
    ('above', '65', 'years'),
    ('above', '65', 'years', 'of', 'age'),
    ('above', '60', 'years'),
    ('above', '55', 'years'),
    ('above', '70'),
    ('above', '80'),
    ('above', '59'),
    ('above', '70', 'years'),
    ('above', '59', 'years'),
    ('above', '75', 'years'),
    ('above', '60'),
    ('above', '65'),
    ('adults', 'aged', '65'),
    ('adults', 'aged', '70'),
    ('aged', 'above', '59'),
    ('aged', '80', 'and', 'above'),
    ('aged', 'above', '60'),
    ('aged', '65'),
    ('aged', 'above', '85'),
    ('aged', 'above', '55'),
    ('aged', '70', 'and', 'above'),
    ('aged', '75', 'and', 'above'),
    ('aged', 'above', '65'),
    ('aged', '80'),
    ('aged', 'above', '75'),
    ('aged', '70'),
    ('aged', 'above', '50'),
    ('aged', 'above', '70'),
    ('aged', '60', 'and', 'above'),
    ('aged', '65', 'and', 'above'),
    ('aged', '60'),
    ('aged', 'above', '80'),
    'ageing',
    ('elderly', 'community'),
    ('elderly', 'population'),
    ('elderly', 'men'),
    ('elderly', 'women'),
    ('elderly', 'persons'),
    'elderly',
    ('elderly', 'people'),
    ('elders', 'people'),
    ('older', 'persons'),
    ('older', 'people'),
    ('older', 'population'),
    ('older', 'person'),
    ('older', 'men'),
    ('older', 'women'),
    ('over', '59'),
    ('over', '70', 'years'),
    ('over', '59', 'years'),
    ('over', '55'),
    ('over', '60', 'years', 'old'),
    ('over', '80', 'years', 'old'),
    ('over', '60'),
    ('over', '65', 'years'),
    ('over', '60', 'years'),
    ('over', '55', 'years'),
    ('over', '80', 'years'),
    ('people', 'aged', '59'),
    ('people', 'over', '65'),
    ('people', 'aged', '55'),
    ('people', 'aged', '60'),
    ('people', 'over', '80'),
    ('people', 'over', '70'),
    ('the', '59', '+', 'population'),
    "elders",
    ('50', '+', 'years'),
    ("geriatric", "population"),
]
# list(
#     sorted(list(set(kw_en_old)),
#            key=lambda x: x if isinstance(x, str) else x[0]))
negative_kw_old = []
r_59_100 = r"\b(100|[6-9][0-9]|59)\b"  # 59-100
r_50_100 = r"\b(100|[6-9][0-9]|5[0-9])\b"  # 50-100
r_61_100 = r"\b(100|[6-9][1-9])\b"  # 61-100
r_60_100 = r"\b(100|[6-9][0-9])\b"  # 60-100
r_0_59 = r"\b(5[0-9]|[0-9][0-9])\b"  # 0-59
patterns_old = [
    f"between {r_59_100} ?- ?{r_59_100}",
    f"between {r_50_100} ?- ?{r_61_100}",
    f"{r_59_100} years and older",
    f"{r_59_100} years and above",
    f"{r_59_100} years old and above",
    f"above {r_59_100} years",
    f"over {r_59_100} years",
    f"above the age of {r_50_100}",
    f"over the age of {r_50_100}",
    f"between {r_50_100} ?- ?{r_59_100}",
    f"between {r_50_100} to {r_59_100}",
    f"between the age of {r_50_100} and {r_59_100}",
    f"between the ages of {r_50_100} and {r_59_100}",
    f"the {r_59_100} ?- ?{r_59_100} age",
    f"{r_50_100} ?- ?{r_61_100} years",
    f"{r_50_100} to {r_61_100} years",
    f"{r_50_100}\+ years",
    f"{r_60_100} ?-year-old",
    f"{r_60_100} ?-year-olds",
    f"{r_50_100} ?- ?{r_61_100} years old",
    f"{r_50_100} ?- ?{r_61_100} age",
    f"ages of {r_50_100} ?- ?{r_61_100}",
    f"ages of {r_50_100} ?- ?{r_61_100}",
    f"ages of {r_50_100} and {r_61_100}",
    f"age of {r_50_100} ?- ?{r_61_100}",
    f"age of {r_50_100} ?- ?{r_61_100}",
    f"age of {r_60_100} years",
    f"age {r_60_100}",
    f"age {r_50_100} ?- ?{r_61_100}",
    f"ages {r_50_100} ?- ?{r_61_100}",
    f"aged {r_50_100} ?- ?{r_61_100}",
    f"age {r_50_100} ?- ?{r_61_100}",
    f"aged {r_50_100} years and older",
    f"older than {r_50_100}",
]


##
def is_old(row):
    for kw in negative_kw_old:
        if isinstance(kw, str) and kw in row['tokenized_excerpt']: return False
        elif len(kw) == 2 and kw in row['bigram_excerpt']: return False
        elif len(kw) == 3 and kw in row['trigram_excerpt']: return False
        elif len(kw) == 4 and kw in row['fourgram_excerpt']: return False
        elif len(kw) == 5 and kw in row['fivegram_excerpt']: return False
        elif len(kw) == 6 and kw in row['sixgram_excerpt']: return False
        elif len(kw) == 7 and kw in row['sevengram_excerpt']: return False
    for p in patterns_old:
        if re.search(p, row["excerpt_pp"]): return True
    for kw in kw_en_old:
        if isinstance(kw, str) and kw in row['tokenized_excerpt']: return True
        elif len(kw) == 2 and kw in row['bigram_excerpt']: return True
        elif len(kw) == 3 and kw in row['trigram_excerpt']: return True
        elif len(kw) == 4 and kw in row['fourgram_excerpt']: return True
        elif len(kw) == 5 and kw in row['fivegram_excerpt']: return True
        elif len(kw) == 6 and kw in row['sixgram_excerpt']: return True
        elif len(kw) == 7 and kw in row['sevengram_excerpt']: return True
    return False

In [14]:
# preprocess
df_train["excerpt_pp"] = df_train["translation_en"].progress_apply(preprocess)
# tokenize and cacl ngrams
df_train["tokenized_excerpt"] = df_train["translation_en"].progress_apply(preprocess_and_tokenize)
df_train["bigram_excerpt"] = df_train["translation_en"].progress_apply(lambda x: preprocess_and_tokenize(x, 2))
df_train["trigram_excerpt"] = df_train["translation_en"].progress_apply(lambda x: preprocess_and_tokenize(x, 3))
df_train["fourgram_excerpt"] = df_train["translation_en"].progress_apply(lambda x: preprocess_and_tokenize(x, 4))
df_train["fivegram_excerpt"] = df_train["translation_en"].progress_apply(lambda x: preprocess_and_tokenize(x, 5))
df_train["sixgram_excerpt"] = df_train["translation_en"].progress_apply(lambda x: preprocess_and_tokenize(x, 6))
df_train["sevengram_excerpt"] = df_train["translation_en"].progress_apply(lambda x: preprocess_and_tokenize(x, 7))

  0%|          | 0/126323 [00:00<?, ?it/s]

  0%|          | 0/126323 [00:00<?, ?it/s]

  0%|          | 0/126323 [00:00<?, ?it/s]

  0%|          | 0/126323 [00:00<?, ?it/s]

  0%|          | 0/126323 [00:00<?, ?it/s]

  0%|          | 0/126323 [00:00<?, ?it/s]

  0%|          | 0/126323 [00:00<?, ?it/s]

  0%|          | 0/126323 [00:00<?, ?it/s]

In [15]:
# preprocess
df_val["excerpt_pp"] = df_val["translation_en"].progress_apply(preprocess)
# tokenize and cacl ngrams
df_val["tokenized_excerpt"] = df_val["translation_en"].progress_apply(preprocess_and_tokenize)
df_val["bigram_excerpt"] = df_val["translation_en"].progress_apply(lambda x: preprocess_and_tokenize(x, 2))
df_val["trigram_excerpt"] = df_val["translation_en"].progress_apply(lambda x: preprocess_and_tokenize(x, 3))
df_val["fourgram_excerpt"] = df_val["translation_en"].progress_apply(lambda x: preprocess_and_tokenize(x, 4))
df_val["fivegram_excerpt"] = df_val["translation_en"].progress_apply(lambda x: preprocess_and_tokenize(x, 5))
df_val["sixgram_excerpt"] = df_val["translation_en"].progress_apply(lambda x: preprocess_and_tokenize(x, 6))
df_val["sevengram_excerpt"] = df_val["translation_en"].progress_apply(lambda x: preprocess_and_tokenize(x, 7))

  0%|          | 0/14425 [00:00<?, ?it/s]

  0%|          | 0/14425 [00:00<?, ?it/s]

  0%|          | 0/14425 [00:00<?, ?it/s]

  0%|          | 0/14425 [00:00<?, ?it/s]

  0%|          | 0/14425 [00:00<?, ?it/s]

  0%|          | 0/14425 [00:00<?, ?it/s]

  0%|          | 0/14425 [00:00<?, ?it/s]

  0%|          | 0/14425 [00:00<?, ?it/s]

In [17]:
# preprocess
df_test["excerpt_pp"] = df_test["translation_en"].progress_apply(preprocess)
# tokenize and cacl ngrams
df_test["tokenized_excerpt"] = df_test["translation_en"].progress_apply(preprocess_and_tokenize)
df_test["bigram_excerpt"] = df_test["translation_en"].progress_apply(lambda x: preprocess_and_tokenize(x, 2))
df_test["trigram_excerpt"] = df_test["translation_en"].progress_apply(lambda x: preprocess_and_tokenize(x, 3))
df_test["fourgram_excerpt"] = df_test["translation_en"].progress_apply(lambda x: preprocess_and_tokenize(x, 4))
df_test["fivegram_excerpt"] = df_test["translation_en"].progress_apply(lambda x: preprocess_and_tokenize(x, 5))
df_test["sixgram_excerpt"] = df_test["translation_en"].progress_apply(lambda x: preprocess_and_tokenize(x, 6))
df_test["sevengram_excerpt"] = df_test["translation_en"].progress_apply(lambda x: preprocess_and_tokenize(x, 7))

  0%|          | 0/17200 [00:00<?, ?it/s]

  0%|          | 0/17200 [00:00<?, ?it/s]

  0%|          | 0/17200 [00:00<?, ?it/s]

  0%|          | 0/17200 [00:00<?, ?it/s]

  0%|          | 0/17200 [00:00<?, ?it/s]

  0%|          | 0/17200 [00:00<?, ?it/s]

  0%|          | 0/17200 [00:00<?, ?it/s]

  0%|          | 0/17200 [00:00<?, ?it/s]

In [18]:
INFANT = 'Infants/Toddlers (<5 years old)'
CHILD = 'Children/Youth (5 to 17 years old)'
ADULT = 'Adult (18 to 59 years old)'
OLD = 'Older Persons (60+ years old)'

In [19]:
def pred_age(row):
    lbls = []
    if is_infant(row):
        lbls.append(INFANT)
    if is_child(row):
        lbls.append(CHILD)
    if is_adult(row):
        lbls.append(ADULT)
    if is_old(row):
        lbls.append(OLD)
    return lbls

In [20]:
i = 11255
row = df_test[df_test['entry_id'].eq(i)].iloc[0]
row["excerpt"]

'In 2018, UNHCR provided solutions to 2,520 refugees and asylum-seekers. Since 1 September 2017, 701 individuals were submitted for resettlement to 8 States (Canada, France, Germany, Italy, Netherlands, Norway, Sweden, Switzerland). A total of 70 individuals departed on resettlement directly from Libya to Canada, France, Sweden and the Netherlands. A total of 10 individuals have been accepted for resettlement and are pending departure directly from Libya to Sweden. In addition, a total of 1,858 persons (including unaccompanied children) were evacuated to Niger (1,536), Italy (312) and the Romania (10). Currently, the total remaining capacity to accommodate evacuees in the Emergency Transit Mechanism in Niamey is 322 persons.'

In [21]:
x = "sth"
for kw in negative_kw_child:
    if isinstance(kw, str) and kw in row['tokenized_excerpt']: x = False
    elif len(kw) == 2 and kw in row['bigram_excerpt']: x= False
    elif len(kw) == 3 and kw in row['trigram_excerpt']: x= False
    elif len(kw) == 4 and kw in row['fourgram_excerpt']: x= False
    elif len(kw) == 5 and kw in row['fivegram_excerpt']: x= False
    elif len(kw) == 6 and kw in row['sixgram_excerpt']: x= False
    elif len(kw) == 7 and kw in row['sevengram_excerpt']: x= False
for p in child_patterns:
    if re.search(p, row["excerpt_pp"]): x= True
for kw in kw_en_child:
    if isinstance(kw, str) and kw in row['tokenized_excerpt']: x= True
    elif len(kw) == 2 and kw in row['bigram_excerpt']: x= True
    elif len(kw) == 3 and kw in row['trigram_excerpt']: x= True
    elif len(kw) == 4 and kw in row['fourgram_excerpt']: x= True
    elif len(kw) == 5 and kw in row['fivegram_excerpt']: x= True
    elif len(kw) == 6 and kw in row['sixgram_excerpt']: x= True
    elif len(kw) == 7 and kw in row['sevengram_excerpt']: x= True
# x=False

In [22]:
is_child(row)

True

In [23]:
df_train["age_kw_pred"] = df_train.progress_apply(pred_age, axis=1)

  0%|          | 0/126323 [00:00<?, ?it/s]

In [24]:
df_val["age_kw_pred"] = df_val.progress_apply(pred_age, axis=1)

  0%|          | 0/14425 [00:00<?, ?it/s]

In [25]:
df_test["age_kw_pred"] = df_test.progress_apply(pred_age, axis=1)

  0%|          | 0/17200 [00:00<?, ?it/s]

In [50]:
df_test.columns

Index(['entry_id', 'excerpt', 'age', 'lang', 'translation_en',
       'translation_fr', 'translation_es', 'excerpt_pp', 'tokenized_excerpt',
       'bigram_excerpt', 'trigram_excerpt', 'fourgram_excerpt',
       'fivegram_excerpt', 'sixgram_excerpt', 'sevengram_excerpt',
       'age_kw_pred'],
      dtype='object')

In [26]:
df_train.to_csv("train_0.7.1_keyword_age.csv",
               index=False,
               columns=['entry_id', 'age_kw_pred'])
##
df_val.to_csv("val_0.7.1_keyword_age.csv",
               index=False,
               columns=['entry_id', 'age_kw_pred'])
##
df_test.to_csv("test_0.7.1_keyword_age.csv",
               index=False,
               columns=['entry_id', 'age_kw_pred'])

In [23]:
(df_train["age_kw_pred"] == df_train["age"]).sum(), df_train.shape

(95919, (126323, 16))

In [24]:
(df_val["age_kw_pred"] == df_val["age"]).sum(), df_val.shape

(10689, (14425, 16))

In [25]:
(df_test["age_kw_pred"] == df_test["age"]).sum(), df_val.shape

(12843, (14425, 16))

### Create a test set

In [26]:
import random
import numpy as np
random.seed(2021)
np.random.seed(2021)

In [27]:
df_test[(df_test["age_kw_pred"] != df_test["age"]) & (df_test["age_kw_pred"].apply(lambda x: x!=[]))  & (df_test["age"].apply(lambda x: x==[]))].shape

(2739, 16)

In [28]:
df_part_1 = df_test[(df_test["age_kw_pred"] == df_test["age"]) & (df_test["age_kw_pred"].apply(lambda x: x!=[]))  & (df_test["age"].apply(lambda x: x!=[]))].sample(n=50, replace=False)
#df_part_2 = df_test[(df_test["age_kw_pred"] == df_test["age"]) & (df_test["age_kw_pred"].apply(lambda x: x==[]))  & (df_test["age"].apply(lambda x: x!=[]))].sample(n=10)
#df_part_3 = df_test[(df_test["age_kw_pred"] == df_test["age"]) & (df_test["age_kw_pred"].apply(lambda x: x!=[]))  & (df_test["age"].apply(lambda x: x==[]))].sample(n=10)
df_part_4 = df_test[(df_test["age_kw_pred"] == df_test["age"]) & (df_test["age_kw_pred"].apply(lambda x: x==[]))  & (df_test["age"].apply(lambda x: x==[]))].sample(n=50, replace=False)
##
df_part_5 = df_test[(df_test["age_kw_pred"] != df_test["age"]) & (df_test["age_kw_pred"].apply(lambda x: x!=[]))  & (df_test["age"].apply(lambda x: x!=[]))].sample(n=50, replace=False)
df_part_6 = df_test[(df_test["age_kw_pred"] != df_test["age"]) & (df_test["age_kw_pred"].apply(lambda x: x==[]))  & (df_test["age"].apply(lambda x: x!=[]))].sample(n=50, replace=False)
df_part_7 = df_test[(df_test["age_kw_pred"] != df_test["age"]) & (df_test["age_kw_pred"].apply(lambda x: x!=[]))  & (df_test["age"].apply(lambda x: x==[]))].sample(n=50, replace=False)
#df_part_8 = df_test[(df_test["age_kw_pred"] != df_test["age"]) & (df_test["age_kw_pred"].apply(lambda x: x==[]))  & (df_test["age"].apply(lambda x: x==[]))].sample(n=50)
##
df_test_to_be_tagged = pd.concat([df_part_1, df_part_4, df_part_5, df_part_6, df_part_7], axis=0).sample(frac=1., replace=False)
df_test_to_be_tagged.shape

(250, 16)

In [30]:
df_test_to_be_tagged[['entry_id', "translation_en",
                      "age_kw_pred"]].to_csv("kw_tagged_test_taggers.csv",
                                             index=False)

In [61]:
max_rows = 100
df_x = df_test_to_be_tagged[["translation_en", "age_kw_pred"]][:max_rows]
display(HTML(df_x.to_html()))

Unnamed: 0,translation_en,age_kw_pred
12135,"In response to the needs faced by migrant children, the Ecuador Red Cross implements 14 safe spaces in 11 provinces across the country. It also has additional mobile safe spaces it deploys when the number of migrants increase. The mobile spaces support children as they walk for long distances in hard to access locations.",[Children/Youth (5 to 17 years old)]
13935,"The situation of gender-based violence (VBG) in the region has grown with the safe crisis. In this month, two incidents of rapes were perpetrated against women by a non-state armed group (Gane) in the municipality of Matiacoali, province of Gourma. Indeed, during the night of 31/05/2021, a gane burst into the village of Ougarou in Matiacoali in a courtyard where two women aged about 26 and 29 years old were removed and raped. They were then released in the morning of the next day.",[Adult (18 to 59 years old)]
11103,"69 percent of in-school Colombians and 65 percent of in-school Venezuelans performed only at or below the benchmark for ORF. Similarly, for reading comprehension, 64 percent of in-school Colombians, 61 percent of in-school Venezuelans and 86 percent of OOS Venezuelans fell below the benchmark.",[Children/Youth (5 to 17 years old)]
5728,"Vulnerabilites: Discontinuation of education and or learning opportunities (short, medium & longterm) and vaccination and other childhood care including ECD (early childhood development); • Food shortage/security, resulting in malnutrition; • Domestic violence, abuse, child labour and involvement in crime for livelihood (mugging/snatching, commercial sex, drug courier). • Trafficking",[Children/Youth (5 to 17 years old)]
16840,"According to the UN, Al-Shabaab remains the main threat to security and stability in Somalia today. The group exerts effective control over large parts of rural areas in southern and central Somalia. Al-Shabaab has resorted to largely asymmetrical means of warfare, such as suicide attacks and the use of improvised explosive devices (IEDs). Its presence has exacerbated unrest in Somalia and represents a specific threat to children, between 2016 and 2019 Al-Shabaab committed 10,462 grave violations against children. During the same period2,916 children were killed or maimed in armed violence in Somalia. In 2019 alone, some 222 children were killed and 481 were maimeddue to armed conflict in Somalia, the majority of which were perpetrated by Al-Shabaab.",[Children/Youth (5 to 17 years old)]
3653,"Saint Vincent del Caganá is a non-Covid municipality, but 86 percent of families declare not to be willing to bring their children to school under the alternating model",[Children/Youth (5 to 17 years old)]
17060,"Lack of education, secure employment and life prospects combine to drive young men and women onward across borders, towards Egypt, Libya and Europe.",[Adult (18 to 59 years old)]
6966,"UNICEF continued to strengthen interventions to protect Almajiri children and support their family reunification in the northern states as well as enhancing community engagement to address violence against children, FGM and other protection concerns affecting during the COVID-19 pandemic.",[Children/Youth (5 to 17 years old)]
3369,"In 2016 and 2017, the Constitutional Court of Colombia concluded that corruption was one of the main factors that affected the access of children to water, school feeding programs and health services in La Guajira. The Court determined that the hiring of these services was carried out with little transparency and pointed out that the Prosecutor's Office, the Attorney General's Office should prioritize the measures to sanction corruption in the department.",[Children/Youth (5 to 17 years old)]
9678,"[1st -31st Oct 2020, North east Nigeria]In October 2020, a total of 189 boys and girls aged 6-59 months were screened from UNICEF-supported reception centers in IDP Camps across hard to reach areas of Bama, Gwoza, Mobbar and Ngala LGAs of Borno state. Out of the 189 children, 11 (5.8%) were found with SAM while 15 (7.9%) were found with MAM. A total of 47 (25%%) were from other accessible LGAs, 105 (55%) came from neighboring countries of Niger, Cameroon and Chad while the remaining 37 (20%) were from inaccessible locations. All children with SAM were enrolled in the therapeutic feeding and treatment service delivery sites",[Infants/Toddlers (<5 years old)]


In [68]:
df_test_to_be_tagged[['entry_id', 'translation_en']].to_excel("AgeData.xlsx", index=False)

In [20]:
df_train.to_csv("train_0.7.1_kw_age.csv", index=False)
df_val.to_csv("val_0.7.1_kw_age.csv", index=False)

In [20]:
df_train.columns

Index(['entry_id', 'excerpt', 'age', 'lang', 'translation_en',
       'translation_fr', 'translation_es', 'excerpt_pp', 'tokenized_excerpt',
       'bigram_excerpt', 'trigram_excerpt', 'fourgram_excerpt',
       'fivegram_excerpt', 'sixgram_excerpt', 'sevengram_excerpt',
       'age_kw_pred'],
      dtype='object')

In [22]:
df_train[['entry_id', 'age_kw_pred']].to_csv("train_0.7.1_keyword_age.csv", index=False)
df_val[['entry_id', 'age_kw_pred']].to_csv("val_0.7.1_keyword_age.csv", index=False)
df_test[['entry_id', 'age_kw_pred']].to_csv("test_0.7.1_keyword_age.csv", index=False)

In [21]:
len(df_train), len(df_val)

(126323, 14425)