In [1]:
import re
import string
import functools
from ast import literal_eval
from operator import itemgetter
from collections.abc import Sequence
from collections import Counter, defaultdict, OrderedDict

import numpy as np
import pandas as pd
from sklearn import metrics
from tqdm.notebook import tqdm
from num2words import num2words
                                                                
import nltk
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

from IPython.core.display import HTML

In [2]:
tqdm.pandas()

In [3]:
def preprocess(doc):
    if doc != doc:
        return ""
    doc = doc.lower()
    # remove preceeding dates
    #doc = re.sub("^\[.+\]", " ", doc).strip()
    #doc = re.sub("^\(.+\)", " ", doc).strip()
    # spaces btw numbers and words
    doc = re.sub('(\d+(\W\d+)?)', r' \1 ', doc).strip()
    doc = re.sub("[‐‑–—―─_]", "-", doc)
    doc = re.sub("(\w)\- (\w)", r"\1\2", doc)
    doc = re.sub(
        "[" + re.escape(
            '_@^~.()[],"“’…<❖‐»—─|•&{≥➢\ue0e4\uf0d8\uf0fc●°#\u200b>`?�€■!‘%;̧\'›«”:≤―\uf0b7$}*´=‑▪\xad❑·–'
        ) + "]", " ", doc)
    #remove some puncs
    doc = re.sub('\s+', " ", doc).strip()
    return doc
##
def preprocess_and_tokenize(doc, n=1):
    doc = preprocess(doc)
    # tokenize
    words = word_tokenize(doc)
    if n == 1:
        return set(words)
    return set(ngrams(words, n))

In [4]:
df_train = pd.read_csv("../data/train_v0.7.1.csv",
                       usecols=[
                           'entry_id', 'excerpt', 'affected_groups_level_0',
                           'affected_groups_level_1',
                           'affected_groups_level_2',
                           'affected_groups_level_3', 'lang', "translation_en",
                           "translation_fr", "translation_es"
                       ])
df_val = pd.read_csv("../data/val_v0.7.1.csv",
                     usecols=[
                         'entry_id', 'excerpt', 'affected_groups_level_0',
                         'affected_groups_level_1', 'affected_groups_level_2',
                         'affected_groups_level_3', 'lang', "translation_en",
                         "translation_fr", "translation_es"
                     ])
df_test = pd.read_csv("../data/test_v0.7.1.csv",
                      usecols=[
                          'entry_id', 'excerpt', 'affected_groups_level_0',
                          'affected_groups_level_1', 'affected_groups_level_2',
                          'affected_groups_level_3', 'lang', "translation_en",
                          "translation_fr", "translation_es"
                      ])

In [5]:
cols = [
    'affected_groups_level_0', 'affected_groups_level_1',
    'affected_groups_level_2', 'affected_groups_level_3'
]
for col in cols:
    for df in [df_train, df_val, df_test]:
        df[col] = df[col].apply(lambda x: [
            e for e in list(sorted(list(set(literal_eval(x)))))
            if e not in ['None', 'NOT_MAPPED']
        ])

In [6]:
df_train_en = df_train.copy()
df_train_en.loc[df_train_en["lang"].ne("en"),
                "excerpt"] = df_train_en.loc[df_train_en["lang"].ne("en"),
                                             "translation_en"]

In [7]:
df_val_en = df_val.copy()
df_val_en.loc[df_val_en["lang"].ne("en"),
                "excerpt"] = df_val_en.loc[df_val_en["lang"].ne("en"),
                                             "translation_en"]
##
df_test_en = df_test.copy()
df_test_en.loc[df_test_en["lang"].ne("en"),
                "excerpt"] = df_test_en.loc[df_test_en["lang"].ne("en"),
                                             "translation_en"]

In [8]:
def unique_values(df, col):
    vals = Counter()
    for val in df[col]:
        vals.update(val)
    return vals.most_common()

In [10]:
unique_values(df_train_en, "affected_groups_level_1")

[('Affected', 64334), ('Not Affected', 292)]

In [11]:
affected = "Affected"
not_affected = "Not Affected"

In [13]:
df_train_en_af1 = df_train_en[df_train_en['affected_groups_level_1'].apply(
    lambda x: x != [])].copy()
##
df_train_en_af1_affected = df_train_en_af1[df_train_en_af1['affected_groups_level_1'].apply(
    lambda x: affected in x)]
df_train_en_af1_not_affected = df_train_en_af1[df_train_en_af1['affected_groups_level_1'].apply(
    lambda x: not_affected in x)]

In [14]:
class KeywordExtractor:
    def __init__(
        self,
        docs_bg_corpus,
        docs_classes,
        lang="en",
        n_grams=2,
        num_to_words=False,
        stop_words=None,
    ):
        if isinstance(stop_words, Sequence) and not isinstance(stop_words, set):
            self.stop_words = set(stop_words)
        elif stop_words is None:
            self.stop_words = set()
        self.n_grams = n_grams
        self.num_to_words = num_to_words
        self.docs_bg_corpus = docs_bg_corpus
        self.class_name_to_idx = dict(
            zip(list(docs_classes.keys()), range(len(docs_classes))))
        self.docs_classes = list(docs_classes.values())
        self.lang = lang
        ## for preprocessing - should be moved to a util class/func
        self.num_normalizer = dict(
            zip("⁰¹²³⁴⁵⁶⁷⁸⁹", [str(i) for i in range(10)]))
        self.num_normalizer.update(
            dict(zip("₀₁₂₃₄₅₆₇₈₉", [str(i) for i in range(10)])))
        if lang == "en":
            self.num_normalizer.update({"⅓": "one-third", "¼": "one-fourth"})
        elif lang == "fr":
            self.num_normalizer.update({"⅓": "un-tiers", "¼": "un-quart"})
        elif lang == "es":
            self.num_normalizer.update({"⅓": "un-tercio", "¼": "un-tercio"})
        ##
        self.word_to_freq_bg_corpus = self.extract_word_counts(docs_bg_corpus)
        # split word to freq dict into multiple dicts according to ngram len
        self.ngram_to_freq_bg_corpus = [dict() for _ in range(n_grams)]
        for kw, count in self.word_to_freq_bg_corpus.items():
            if isinstance(kw, str):
                self.ngram_to_freq_bg_corpus[0][kw] = count
            else:
                self.ngram_to_freq_bg_corpus[len(kw) - 1][kw] = count
        self.bg_corpus_sizes = [
            sum(self.ngram_to_freq_bg_corpus[i].values())
            for i in range(n_grams)
        ]
        ##
        self.word_to_freq_classes = [
            self.extract_word_counts(corpus) for corpus in self.docs_classes
        ]
        # split word to freq dict of each class into multiple dicts
        # len(ngram_word_to_freq_classes) = c
        # len(ngram_word_to_freq_classes[x]) = n
        self.ngram_word_to_freq_classes = [[dict() for _ in range(n_grams)]
                                           for _ in self.class_name_to_idx]
        for c, word_to_freq_cls in enumerate(self.word_to_freq_classes):
            for kw, count in word_to_freq_cls.items():
                n = 0 if isinstance(kw, str) else len(kw) - 1
                self.ngram_word_to_freq_classes[c][n][kw] = count
        self.ngram_corpora_sizes = [[
            sum(word_to_freq[i].values()) for i in range(n_grams)
        ] for word_to_freq in self.ngram_word_to_freq_classes]

        self.ngram_corpora_sizes = [[
            sum(word_to_freq[i].values()) for i in range(n_grams)
        ] for word_to_freq in self.ngram_word_to_freq_classes]
        ##
        # calc likelihoods for each ngram length in each class separately
        # a list of lists of dicts
        # each represents a class
        # each class is represented by n dicts
        # each dict is {"ngram in class": likelihood}
        self.ngram_likelihoods = [[] for _ in self.class_name_to_idx]
        for c, cls_name in enumerate(self.class_name_to_idx):
            for n in range(self.n_grams):
                w_to_f = self.ngram_word_to_freq_classes[c][n]
                corpus_size = self.ngram_corpora_sizes[c][n]
                self.ngram_likelihoods[c].append(
                    self.calc_likelihoods(w_to_f, corpus_size))
        ##
        # calc potts scores for each ngram in each class
        add_two_dict = lambda a, b: {
            **a,
            **b,
            **{k: a[k] + b[k]
               for k in a.keys() & b}
        }
        self.ngram_potts_scores = [[] for _ in self.class_name_to_idx]
        self.ngram_z_score_of_the_log_odds_ratios = [
            [] for _ in self.class_name_to_idx
        ]
        for c, cls_name in enumerate(self.class_name_to_idx):
            for n in range(self.n_grams):
                lh = self.ngram_likelihoods[c][n]
                other_lhs = [
                    self.ngram_likelihoods[c_other][n]
                    for c_other, _ in enumerate(self.class_name_to_idx)
                    if c != c_other
                ]
                other_lhs = functools.reduce(add_two_dict, other_lhs)
                self.ngram_potts_scores[c].append(
                    self.calc_potts_scores(lh, other_lhs))

                word_to_freq = self.ngram_word_to_freq_classes[c][n]
                corpus_size = self.ngram_corpora_sizes[c][n]
                word_to_freq_others = [
                    self.ngram_word_to_freq_classes[c_other][n]
                    for c_other, _ in enumerate(self.class_name_to_idx)
                    if c != c_other
                ]
                word_to_freq_others = functools.reduce(add_two_dict,
                                                       word_to_freq_others)
                corpus_size_others = sum([
                    self.ngram_corpora_sizes[c_other][n]
                    for c_other, _ in enumerate(self.class_name_to_idx)
                    if c != c_other
                ])

                self.ngram_z_score_of_the_log_odds_ratios[c].append(
                    self.calc_prior_modified_log_odds_ratio(
                        word_to_freq, corpus_size, word_to_freq_others,
                        corpus_size_others, self.ngram_to_freq_bg_corpus[n],
                        self.bg_corpus_sizes[n]))

    def preprocess_and_tokenize(self, doc):
        if doc != doc:
            return ""
        # remove preceeding dates
        doc = re.sub("^\[.+\]", " ", doc).strip()
        doc = re.sub("^\(.+\)", " ", doc).strip()
        # spaces btw numbers and words
        doc = re.sub('(\d+(\W\d+)?)', r' \1 ', doc).strip()
        doc = re.sub("[‐‑–—―─_]", "-", doc)
        doc = re.sub(
            "[" + re.escape(
                '_@^~.()[],"“’…<❖‐»—─|•&{≥➢\ue0e4\uf0d8\uf0fc●°#\u200b>`?�€■!‘%;̧\'›«”:≤―\uf0b7$}*+´=‑▪\xad❑·–'
            ) + "]", " ", doc)
        #remove some puncs
        doc = re.sub('\s+', " ", doc)

        # tokenize
        words = word_tokenize(doc)
        # lower and remove non-words
        words = [word.lower() for word in words if word not in self.stop_words]
        words = [self.num_normalizer.get(token, token) for token in words]
        if self.num_to_words:
            words = [
                num2words(token, lang=self.lang)
                if token.isnumeric() else token for token in words
            ]
        kw_kp = words.copy()
        for n in range(2, self.n_grams + 1):
            kw_kp.extend(list(ngrams(words, n)))
        return kw_kp

    def calc_potts_scores(self, word_to_likelihood_main,
                          word_to_likelihood_other):
        potts_scores = dict()
        for word in word_to_likelihood_main.keys():
            potts_scores[word] = word_to_likelihood_main[word] / (
                word_to_likelihood_main[word] +
                word_to_likelihood_other.get(word, 0))
        return potts_scores

    def calc_likelihoods(self, word_to_freq, corpus_size):
        likelihoods = dict()
        for word, count in word_to_freq.items():
            likelihoods[word] = count / corpus_size
        return likelihoods

    def extract_word_counts(self, docs):
        word_to_freq = defaultdict(int)
        for doc in docs:
            words = self.preprocess_and_tokenize(doc)
            for word in words:
                #if word in stopwords, then do not add it
                word_to_freq[word] += 1

        return word_to_freq

    def calc_prior_modified_log_odds_ratio(self, word_to_freq_c1,
                                           corpus_size_c1, word_to_freq_c2,
                                           corpus_size_c2, word_to_freq_all,
                                           corpus_size_all):

        prior_modified_log_odds_ratio_c1 = dict()
        variance_of_the_log_odds_ratio = dict()
        z_score_of_the_log_odds_ratio_c1 = dict()
        ##
        for word in word_to_freq_c1.keys():
            numerator_1 = word_to_freq_c1[word] + word_to_freq_all[word]
            denomerator_1 = corpus_size_c1 + corpus_size_all - (numerator_1)
            ratio_1 = np.log(numerator_1 / denomerator_1)
            ##
            numerator_2 = word_to_freq_c2.get(word, 0) + word_to_freq_all[word]
            denomerator_2 = corpus_size_c2 + corpus_size_all - (numerator_2)
            ratio_2 = np.log(numerator_2 / denomerator_2)
            ##
            prior_modified_log_odds_ratio_c1[word] = ratio_1 - ratio_2
            ##
            variance_of_the_log_odds_ratio[word] = (1 / numerator_1) + (
                1 / numerator_2)
            ##
            z_score_of_the_log_odds_ratio_c1[
                word] = prior_modified_log_odds_ratio_c1[word] / np.sqrt(
                    variance_of_the_log_odds_ratio[word])
        return z_score_of_the_log_odds_ratio_c1

    def get_kws(self, cls_name, n):
        cls_idx = self.class_name_to_idx[cls_name]
        kw_dict = self.ngram_z_score_of_the_log_odds_ratios[cls_idx][n - 1]
        return list(
            sorted([(word, score) for word, score in kw_dict.items()],
                   key=itemgetter(1),
                   reverse=True))

In [17]:
end = -1
n_grams = 5
stop_words_en = None
# stop_words_en = stopwords.words("english")
kwe_en = KeywordExtractor(
    df_train_en_af1['excerpt'].tolist(),
    {
        affected: df_train_en_af1_affected["excerpt"].tolist()[:end],
        not_affected: df_train_en_af1_not_affected["excerpt"].tolist()[:end],
    },
    n_grams=n_grams,
    stop_words=stop_words_en)

In [19]:
n = 1
cls_name = not_affected
kwe_en.get_kws(cls_name, n)

[('older', 2.0700037625880863),
 ('pps', 1.87791927437606),
 ('companies', 1.596202679924283),
 ('taxist', 1.5953593415974385),
 ('loin', 1.5953593415974385),
 ('self-acceptance', 1.539477140067048),
 ('\uf02d', 1.4604759470231827),
 ('con-', 1.4408955160881438),
 ('guatire', 1.4408955160881438),
 ('screented', 1.3267097873594487),
 ('liquidity', 1.3227029025242527),
 ('per-', 1.26910673565632),
 ('ageing', 1.26910673565632),
 ('dimensions', 1.195246502545197),
 ('ly', 1.192472873879054),
 ('ol-', 1.192472873879054),
 ('age-friendly', 1.192472873879054),
 ('ipnh', 1.192472873879054),
 ('vt', 1.182119338706167),
 ('3010/2020', 1.1280890573354432),
 ('quecolombia', 1.1280890573354432),
 ('famosacation', 1.1280890573354432),
 ('singeralcides', 1.1280890573354432),
 ('encolombiapor', 1.1280890573354432),
 ('unperitonisy', 1.1280890573354432),
 ('dedenaspor', 1.1280890573354432),
 ('deceso', 1.1280890573354432),
 ('portalntn', 1.1280890573354432),
 ('rocha', 1.1280890573354432),
 ('jumper',

In [20]:
display(HTML(df_train_en_af1_not_affected[["excerpt", "lang"]].to_html()))

Unnamed: 0,excerpt,lang
585,"A total of 1,740 traffic traffic traffickers were screented this 3010/2020; No alert has been identified.",fr
685,"This 25/10/2020, 1,613 international travelers were recorded and screed. None of them were warned. At the end of S43, 12,237 travelers were registered and screened with international POEs",fr
816,"A total of 113 samples were received at the laboratory this 25/10/2020. At the end of the day, 139 samples thus at the end of S43, 881 samples were received at the laboratory; In the end, 1,050 samples were analyzed, a positivity around 9.6% (101/1050). For reminder at the end of S42, 1,293 samples were received and 1,247 (96.4%) had been analyzed; The positivity rate was then 15.0% (187/1247) (Figure 10).",fr
2106,"Day after day are more students and teachers who add to face-to-face returns to educational institutions throughout the national territory, filling again the educational spaces of joy and new opportunities to learn, grow and develop; And the Ministry of Education continues to accompany the regions to reaffirm the message of trust in the return process.",es
2161,"Two thirds (66%) of older persons on the move in Andean countries had no monthly income and 57% considered their income insufficient to meet their basic needs, a ﬁgure that rises to 70% in Colombia. Women and persons with disabilities are less likely to have month- ly income.",en
2604,A total of 934 traffic traffic passengers were screented this 23/07/2020; No alert has been identified. This same result was obtained after 38 102 travelers were screented on the side of the national traffic.,fr
2952,"Approximately half of those who said they were Venezuelan (46%) said they had not applied for asy- lum. A third of the sample (34%) said this was because they did not know how to. Thirty per cent of older per- sons said that the ID they had did not give them access to humanitari- an services, and seven per cent said they were unsure. These results may reﬂect a lack of available ser- vices or a lack of knowledge about what services respondents are en- titled to (HelpAge, 2020c).",en
3581,"• WFP’s latest food security estimates found 48 percent of Colombians and 71 percent of the Venezuelan migrant population affected by moderate to severe food insecurity in January. This compares to 22 and 55 percent, respectively, before the COVID-19 pandemic.",en
3773,"The point of coronavirus disease in this area gives 7 recorded suspect cases, 4 samples, all negative.",fr
5102,"In addition, the closing of schools as part of the prevention measures results in the suspension of school feeding programs, which particularly impacts children from insecure households (R4V 02/04/2020). According to the WFP, more than 4 million children in Colombia (Colombians and Venezuelans) do not have access to school meals due to restrictions (WFP 04/2020). Alternative Modalities of Food Assistance are necessary to continue ensuring food access to children.",es


In [18]:

kw_en_migrants = [
    'immigrant', 'immigrants', 'immigration', 'migracion', 'migrant', 'migrants', 'migrate',
    'migration', 'migratory', 'out-migrants', 'migrations', 'non-migrant', 
    'emigrated', 'emigrate', 'emigrates',
]
##
negative_kw_migrant = []
##
migrant_patterns = []
##
def is_migrant(row):
    for kw in negative_kw_migrant:
        if isinstance(kw, str) and kw in row['tokenized_excerpt']: return False
        elif len(kw) == 2 and kw in row['bigram_excerpt']: return False
        elif len(kw) == 3 and kw in row['trigram_excerpt']: return False
        elif len(kw) == 4 and kw in row['fourgram_excerpt']: return False
        elif len(kw) == 5 and kw in row['fivegram_excerpt']: return False
        elif len(kw) == 6 and kw in row['sixgram_excerpt']: return False
        elif len(kw) == 7 and kw in row['sevengram_excerpt']: return False
    for p in migrant_patterns:
        if re.search(p, row["excerpt_pp"]): return True
    for kw in kw_en_migrants:
        if isinstance(kw, str) and kw in row['tokenized_excerpt']: return True
        elif len(kw) == 2 and kw in row['bigram_excerpt']: return True
        elif len(kw) == 3 and kw in row['trigram_excerpt']: return True
        elif len(kw) == 4 and kw in row['fourgram_excerpt']: return True
        elif len(kw) == 5 and kw in row['fivegram_excerpt']: return True
        elif len(kw) == 6 and kw in row['sixgram_excerpt']: return True
        elif len(kw) == 7 and kw in row['sevengram_excerpt']: return True
    return False
##
# list(
#     sorted(list(set(kw_en_migrants)),
#            key=lambda x: x if isinstance(x, str) else x[0]))

In [19]:
kw_en_host = [('aboriginal', 'people'), ('camps', 'and', 'host'),
              ('community', 'members'), ('community', 'children'),
              ('community', 'volunteers'), ('community', 'engagement'),
              ('community', 'awareness'), ('community', 'of', 'origin'),
              ('community', 'consultation'), ('community', 'population'),
              ('community', 'families'), ('community', 'people'),
              ('community', 'feedback/complaints'), ('community', 'areas'),
              ('community', 'households'), ('community', 'response'),
              ('displaced', 'people', 'and', 'hosts'),
              ('displaced', 'and', 'host'),
              ('displaced', 'persons', 'and', 'hosts'),
              ('host', 'and', 'displaced'), ('host', 'men'),
              ('host', 'and', 'idp'), ('host', 'community'),
              ('host', 'and', 'idps'), ('host', 'women'),
              ('host', 'and', 'camps'), ('host', 'population'),
              ('host', 'and', 'refugees'), ('host', 'populations'),
              ('host', 'communities'), ('host', 'families'),
              ('host', 'family'), ('host', 'households'), 'host-communities',
              'host-community', 'host/refugee', 'hostmen',
              ('hosts', 'of', 'the', 'communes'), ('idp', 'and', 'host'),
              ('idps', 'and', 'host'), 'indigenous',
              ('inhabitants', 'of', 'the', 'region'), 'intercommunities',
              'intercommunity', ('local', 'population'),
              ('local', 'populations'), ('local', 'people'),
              ('national', 'citizens'), 'refugee/host',
              ('refugees', 'and', 'host'), ('resident', 'households'),
              ('reception', 'communities'), ('welcome','communities'),
              ('welcome','community'),
             ]
##
negative_kw_host = []
##
host_patterns = []
##
def is_host(row):
    for kw in negative_kw_host:
        if isinstance(kw, str) and kw in row['tokenized_excerpt']: return False
        elif len(kw) == 2 and kw in row['bigram_excerpt']: return False
        elif len(kw) == 3 and kw in row['trigram_excerpt']: return False
        elif len(kw) == 4 and kw in row['fourgram_excerpt']: return False
        elif len(kw) == 5 and kw in row['fivegram_excerpt']: return False
        elif len(kw) == 6 and kw in row['sixgram_excerpt']: return False
        elif len(kw) == 7 and kw in row['sevengram_excerpt']: return False
    for p in host_patterns:
        if re.search(p, row["excerpt_pp"]): return True
    for kw in kw_en_host:
        if isinstance(kw, str) and kw in row['tokenized_excerpt']: return True
        elif len(kw) == 2 and kw in row['bigram_excerpt']: return True
        elif len(kw) == 3 and kw in row['trigram_excerpt']: return True
        elif len(kw) == 4 and kw in row['fourgram_excerpt']: return True
        elif len(kw) == 5 and kw in row['fivegram_excerpt']: return True
        elif len(kw) == 6 and kw in row['sixgram_excerpt']: return True
        elif len(kw) == 7 and kw in row['sevengram_excerpt']: return True
    return False
# list(
#     sorted(list(set(kw_en_host)),
#            key=lambda x: x if isinstance(x, str) else x[0]))
################
# 'hospitalization','community', 'communities','host', 'hosts',  ('community', 'members'),
# ('community', 'people'), ('community', 'families'), ('community', 'areas'),
# ('community', 'children'), ('community', 'population'), ('community', 'engagement'),
#     ('community', 'response'), ('community', 'consultation'), ('community', 'feedback/complaints'),
#     ('community', 'children'), ('community', 'volunteers'), ('community', 'awareness'),
#, ('community', 'engagement', 'activities'), ('community', 'members'),
#  ('through', 'community', 'engagement'),
#     ('community', 'engagement', 'activities'), ('national', 'level'), ('local', 'authority'),
# ('integrating', 'into', 'the', 'labor', 'market'),('educational', 'integration'),

In [20]:
# kw_en_permanent = [
#     'argentina', 'arrest', 'arrested', 'arrests', 'budget', ('care', 'center'),
#     ('care', 'centers'), 'chile', 'colombia', 'companies', 'company', 'crime',
#     'crimes', 'discrimination', ('double', 'nationality'), 'education',
#     ('educational', 'integration'), 'educational', 'employed', 'employer',
#     'employers', 'employing', 'employment', 'expelled', 'expulsion',
#     ('family', 'reunion'), 'food', ('foreign', 'persons'),
#     ('foreign', 'population'), ('foreign', 'persons',
#                                 'residing'), 'harassment',
#     ('head', 'of', 'family'), 'health', 'homes', 'households',
#     'inclusion', 'inclusive', 'income',
#     ('integrating', 'into', 'the', 'labor',
#      'market'), 'integration', 'job', 'jobs', 'kill', 'killed', 'labor',
#     ('labor', 'market'), 'labour', 'lease', 'live', 'lived', 'living',
#     ('long', 'stay'), ('longer', 'stays'), 'occupation', 'occupations',
#     'panama', 'personnel', 'peru', 'poverty', 'pregnancy', 'pregnant',
#     'pregnants', 'ptp', 'qualification', 'qualifications', 'quarantine',
#     'regularization', 'regularizations', 'regularize', 'regularized',
#     'regularizes', 'regulated', 'reintegration', 'remittance', 'remittances',
#     'remuneration', 'rent', 'rental', 'rents', 'reside', 'resident',
#     'residents', 'resides', 'reunification', 'robbery', 'salaried', 'salaries',
#     'salary', 'self-employing', 'settlement',
#     'settlements', 'stigmatization', 'student', 'students',
#     ('temporary', 'stay', 'permit'), 'trafficking', 'underemployment',
#     'university', 'violence', 'work', 'worker', 'workers',
#     ('working', 'conditions'), 'workplace', 'xenophobia'
# ]
# #
# ##
# pre_kw_permanent = [
#     'venezuelan', 'venezuelans', 'foreigner', 'foreigners', 'foreign'
# ]
# ##
# negative_kw_permanent = []
# ##
# permanent_patterns = []


# ##
# def is_permanent(row):
#     if not (is_migrant(row) or 'venezuelan' in row['tokenized_excerpt']
#             or 'venezuelans' in row['tokenized_excerpt']
#             or 'foreigner' in row['tokenized_excerpt']
#             or 'foreigners' in row['tokenized_excerpt']
#             or 'foreign' in row['tokenized_excerpt']):
#         return False
#     for kw in negative_kw_permanent:
#         if isinstance(kw, str) and kw in row['tokenized_excerpt']: return False
#         elif len(kw) == 2 and kw in row['bigram_excerpt']: return False
#         elif len(kw) == 3 and kw in row['trigram_excerpt']: return False
#         elif len(kw) == 4 and kw in row['fourgram_excerpt']: return False
#         elif len(kw) == 5 and kw in row['fivegram_excerpt']: return False
#         elif len(kw) == 6 and kw in row['sixgram_excerpt']: return False
#         elif len(kw) == 7 and kw in row['sevengram_excerpt']: return False
#     for p in permanent_patterns:
#         if re.search(p, row["excerpt_pp"]): return True
#     for kw in kw_en_permanent:
#         if isinstance(kw, str) and kw in row['tokenized_excerpt']: return True
#         elif len(kw) == 2 and kw in row['bigram_excerpt']: return True
#         elif len(kw) == 3 and kw in row['trigram_excerpt']: return True
#         elif len(kw) == 4 and kw in row['fourgram_excerpt']: return True
#         elif len(kw) == 5 and kw in row['fivegram_excerpt']: return True
#         elif len(kw) == 6 and kw in row['sixgram_excerpt']: return True
#         elif len(kw) == 7 and kw in row['sevengram_excerpt']: return True
#     return False

In [21]:
kw_en_refugees = [
    'hijra', 'host/refugee', 'refuges', 'refuge', 'refugee', 'refugee-hosting',
    'refugee/host', 'refugees', 'unhcr', 'unrwa', 'hijra',
    # 'camp', 'camp-in-charge', 'camps',
]
##
negative_kw_refugees = []
##
refugees_patterns = []
##
def is_refugee(row):
    for kw in negative_kw_refugees:
        if isinstance(kw, str) and kw in row['tokenized_excerpt']: return False
        elif len(kw) == 2 and kw in row['bigram_excerpt']: return False
        elif len(kw) == 3 and kw in row['trigram_excerpt']: return False
        elif len(kw) == 4 and kw in row['fourgram_excerpt']: return False
        elif len(kw) == 5 and kw in row['fivegram_excerpt']: return False
        elif len(kw) == 6 and kw in row['sixgram_excerpt']: return False
        elif len(kw) == 7 and kw in row['sevengram_excerpt']: return False
    for p in refugees_patterns:
        if re.search(p, row["excerpt_pp"]): return True
    for kw in kw_en_refugees:
        if isinstance(kw, str) and kw in row['tokenized_excerpt']: return True
        elif len(kw) == 2 and kw in row['bigram_excerpt']: return True
        elif len(kw) == 3 and kw in row['trigram_excerpt']: return True
        elif len(kw) == 4 and kw in row['fourgram_excerpt']: return True
        elif len(kw) == 5 and kw in row['fivegram_excerpt']: return True
        elif len(kw) == 6 and kw in row['sixgram_excerpt']: return True
        elif len(kw) == 7 and kw in row['sevengram_excerpt']: return True
    return False
#
# kw_en_refugees = [
#     'camp', 'camp-in-charge', 'camps', 'hijra', 'host/refugee', 'newcomers',
#     'refuge', 'refugee', 'refugee-hosting', 'refugee/host', 'refugees',
#     'relocate', 'relocated', 'relocation', 'relocations', 'repatriation',
#     'resettlement', 'self-settlements', 'transferred', 'unhcr', 'unregistered',
#     'unrwa'
# ]
#
# list(
#     sorted(list(set(kw_en_refugees)),
#            key=lambda x: x if isinstance(x, str) else x[0]))

In [22]:
kw_en_idp = [
    'idps',
    'idp',
    'pdi',
    'pdis',
    'displacement',
    'displacements',
    'displaced',
    ('population','movements'),
    ('population','movement'),
#     ('displaced', 'persons'),
#     ('displaced', 'people'),
#     ('displaced', 'peoples'),
#     ('displaced', 'population'),
#     ('displaced', 'populations'),
#     ('internally', 'displaced', 'persons'),
#     ('internally', 'displaced', 'people'),
#     ('internally', 'displaced', 'peoples'),
#     ('internally', 'displaced', 'population'),
#     ('internally', 'displaced', 'populations'),
#     ('internally', 'displaced'),
#     ('displaced','inside','the','country'),
]
negative_kw_idp = [
    'non-idp',
    'non-idps',
    ('non', 'idp'),
    ('non', 'idps'),
    #('personnes', 'déplacées', 'internes'),
    'non-displaced',
    ('non', 'displaced'),
]
##
idp_patterns = []
##
def is_idp(row):
    for kw in negative_kw_idp:
        if isinstance(kw, str) and kw in row['tokenized_excerpt']: return False
        elif len(kw) == 2 and kw in row['bigram_excerpt']: return False
        elif len(kw) == 3 and kw in row['trigram_excerpt']: return False
        elif len(kw) == 4 and kw in row['fourgram_excerpt']: return False
        elif len(kw) == 5 and kw in row['fivegram_excerpt']: return False
        elif len(kw) == 6 and kw in row['sixgram_excerpt']: return False
        elif len(kw) == 7 and kw in row['sevengram_excerpt']: return False
    for p in idp_patterns:
        if re.search(p, row["excerpt_pp"]): return True
    for kw in kw_en_idp:
        if isinstance(kw, str) and kw in row['tokenized_excerpt']: return True
        elif len(kw) == 2 and kw in row['bigram_excerpt']: return True
        elif len(kw) == 3 and kw in row['trigram_excerpt']: return True
        elif len(kw) == 4 and kw in row['fourgram_excerpt']: return True
        elif len(kw) == 5 and kw in row['fivegram_excerpt']: return True
        elif len(kw) == 6 and kw in row['sixgram_excerpt']: return True
        elif len(kw) == 7 and kw in row['sevengram_excerpt']: return True
    return False
###
# list(
#     sorted(list(set(kw_en_idp)),
#            key=lambda x: x if isinstance(x, str) else x[0]))

In [23]:
#'returned', ('returned', 'from'), ('returned', 'to'), ('returned', 'people'), ('people', 'returned'),
#'return',
#'returns',
# ('back', 'to'), ('come', 'back'),
# 're-integrate', 're-integrates', 're-integrating',
#     're-integratings', 're-integration', 're-integrations',
# 'reintegrate', 'reintegrates', 'reintegrating',
#     'reintegratings', 'reintegration', 'reintegrations',
negative_kw_returnees = []
# list(
#     sorted(list(set(kw_en_returnees)),
#            key=lambda x: x if isinstance(x, str) else x[0]))
kw_en_returnees = [
    'deport', 'deportation', 'deportations', 'deported', 'deportee',
    'deportee/returnee', 'deportees', 'deportees/returnees',
    'deporting', 'deports', ('households', 'returned'),
    ('individuals', 'returned'), ('persons', 'returned'),
    ('recently', 'returned'), ('refugees', 'returned'), ('refugees', 'return'),
    ('refugees', 'returning'), 'repatriate', 'repatriated', 'repatriates',
    'repatriating', 'repatriatings', 'repatriation', 'repatriations',
    ('reportedly', 'returned'), ('return', 'community'), ('return', 'home'),
    ('return', 'communities'), ('return', 'from'), ('return', 'program'),
    ('return', 'migrants'), ('returning', 'migrants'), ('returned', 'home'),
    ('returned', 'persons'), ('returned', 'communities'), ('returned', 'from'),
    ('returned', 'populations'), ('returned', 'to', 'their'),
    ('returned', 'population'), ('returned', 'community'), 'returnee',
    'returnee/deportee', 'returnees', 'returnees/deportees',
    ('returning', 'community'), ('returning', 'home'), ('returning',
                                                        'persons'),
    ('returning', 'communities'), ('returning', 'from'),
    ('returning', 'populations'), ('returning', 'population'), 'returnings',
    ('returns', 'from'), ('returns', 'to'), ('spontaneous', 'return'),
    ('spontaneous', 'returns'), ('travelled', 'back'), ('traveled', 'back'),
    ('traveling', 'back'), ('travelling', 'back'), ('voluntarily', 'returns'),
    ('voluntarily', 'returning'), ('voluntarily', 'return'),
    ('voluntarily', 'returned'), ('voluntary', 'return'),
    ('voluntary', 'returns'), ('voluntary', 'repatriation')
]


# list(
#     sorted(list(set(kw_en_returnees)),
#            key=lambda x: x if isinstance(x, str) else x[0]))
##########
# @labeling_function()
def lf_gt(row):
    return returnees in row['affected_groups_level_3']


# @labeling_function()
def lf_neg_1(row):
    if (re.search(
            r"fear|fearful|fearing|fears", row['excerpt_pp']
    ) and re.search(
            r"expel|expelled|expeled|expulsion|expulsions|deport|deportation|deportations|deported|repatriate|repatriated|repatriation",
            row['excerpt_pp'])):
        return 0
    return -1


# @labeling_function()
def lf_neg_2(row):
    if ('not', 'expel') in row['bigram_excerpt'] or (
            'not', 'deport') in row['bigram_excerpt']:
        return 0
    return -1


# @labeling_function()
def lf_neg_3(row):
    if re.search(
            r"not (\w+ ){0,3}?\b(return|returned|expel|expeled|expelled|deport|deported)\b",
            row["excerpt_pp"]):
        return 0
    return -1


# @labeling_function()
def lf_neg_4(row):
    if (("prevent" in row['tokenized_excerpt']
         or "prevents" in row['tokenized_excerpt'])
            and (("from", "returning") in row['bigram_excerpt'] or
                 ("from", "deporting") in row['bigram_excerpt'] or
                 ("from", "expulsion") in row['bigram_excerpt'] or
                 ("from", "deportation") in row['bigram_excerpt'])):
        return 0
    return -1


# @labeling_function()
def lf_neg_5(row):
    if ('have', 'positive', 'returned') in row['trigram_excerpt']:
        return 0
    return -1


# Positive labeling functions
# @labeling_function()
def lf_pos_1(row):
    if ("population", "deported") in row["bigram_excerpt"]:
        return 1
    return -1


# @labeling_function()
def lf_pos_2(row):
    for lf_neg in [lf_neg_1, lf_neg_2, lf_neg_3, lf_neg_4, lf_neg_5]:
        if lf_neg(row) != -1:
            return 0
    if re.search(
            #r"(be|been|were|are|have|has|venezuelan|venezuelans|syrian|syrians|rohingya|rohingyas|rwandan|rwandans|people|persons|refugee|refugees|migrant|migrants) (\w+ ){0,3}?\b(returned|deported|expeled|expelled|repatriated)",
            r"(be|been|were|are|have|has|venezuelan|venezuelans|syrian|syrians|rohingya|rohingyas|rwandan|rwandans|people|persons|refugee|refugees|migrant|migrants) (\w+ ){0,3}?\b(returned|repatriated)",
            row["excerpt_pp"]):
        return 1
    return -1


# @labeling_function()
def lf_pos_3(row):
    if ('chile', 'expelled') in row["bigram_excerpt"]:
        return 1
    if ('chile', 'expeled') in row["bigram_excerpt"]:
        return 1
    if ('chile', 'deported') in row["bigram_excerpt"]:
        return 1
    if ('chile', 'will', 'expel') in row["trigram_excerpt"]:
        return 1
    if ('chile', 'will', 'deport') in row["trigram_excerpt"]:
        return 1
    if ('chile', 'will', 'repatriate') in row["trigram_excerpt"]:
        return 1
    return -1


# @labeling_function()
def lf_pos_4(row):
    if ('deportations', 'of', 'detainees') in row['trigram_excerpt']:
        return 1
    if ('deportation', 'of', 'detainees') in row['trigram_excerpt']:
        return 1
    if ('deportation', 'of', 'detainee') in row['trigram_excerpt']:
        return 1
    return -1


# @labeling_function()
def lf_pos_5(row):
    if ('voluntary', 'repatriation') in row['bigram_excerpt']:
        return 1
    if ('voluntary', 'repatriations') in row['bigram_excerpt']:
        return 1
    return -1


# @labeling_function()
def lf_pos_6(row):
    if ('returned', 'people') in row['bigram_excerpt']:
        return 1
    return -1


# @labeling_function()
def lf_pos_7(row):
    if len({
            'deportee', 'deportee/returnee', 'deportees',
            'deportees/returnees', 'returnee', 'returnee/deportee',
            'returnees', 'returnees/deportees'
    } & row['tokenized_excerpt']):
        return 1
    return -1


# @labeling_function()
def lf_pos_8(row):
    if len({
        ('returning', 'migrants'),
        ('returned', 'migrants'),
        ('repatriated', 'migrants'),
        ('repatriating', 'migrants'),
        ('returning', 'communities'),
        ('returned', 'communities'),
        ('repatriated', 'communities'),
        ('repatriating', 'communities'),
        ('returning', 'populations'),
        ('returned', 'populations'),
        ('repatriated', 'populations'),
        ('repatriating', 'populations'),
        ('returning', 'population'),
        ('returned', 'population'),
        ('repatriated', 'population'),
        ('repatriating', 'population'),
    } & row['bigram_excerpt']):
        return 1
    return -1


# @labeling_function()
def lf_pos_9(row):
    if len({
        ('voluntarily', 'returns'),
        ('voluntarily', 'returning'),
        ('voluntarily', 'return'),
        ('voluntarily', 'returned'),
        ('voluntary', 'return'),
        ('voluntary', 'returns'),
        ('voluntary', 'repatriation'),
        ('forced', 'return'),
        ('forced', 'returns'),
        ('forced', 'repatriation'),
        ('forced', 'deportation'),
        ('forced', 'deportations'),
    }
           & row['bigram_excerpt']):
        return 1
    return -1


# @labeling_function()
def lf_pos_10(row):
    if len({
        ('spontaneous', 'return'),
        ('spontaneous', 'returns'),
    } & row['bigram_excerpt']):
        return 1
    return -1


# @labeling_function()
def lf_pos_11(row):
    if len({('travelled', 'back'), ('traveled', 'back'), ('traveling', 'back'),
            ('travelling', 'back')} & row['bigram_excerpt']):
        return 1
    return -1


####
def is_returnee(row):
    gt = lf_gt(row)
    if gt: return gt
    for lf in [
            #lf_neg_1, lf_neg_2, lf_neg_3, lf_neg_4, lf_neg_5,
            lf_pos_1,
            lf_pos_2,
            lf_pos_3,
            lf_pos_4,
            lf_pos_5,
            lf_pos_6,
            lf_pos_7,
            lf_pos_8,
            lf_pos_9,
            lf_pos_10,
    ]:
        lf_ret = lf(row)
        if lf_ret != -1:
            return lf_ret
    return gt

In [24]:
kw_en_asylum = [
    'asylum',
    'asylum-',
    'asylums',
    'asylum-seekers',
    'asylum-seeker',
    'asylumseekers',
    'asylumseeker',
    ('seeking', 'refuge'),
    ('refugee', 'applicants'),
    ('refugee', 'applicant'),
    ('refugee', 'applicantion'),
    ('refugee', 'applicantions'),
    ('refuge', 'seeker'),
    ('refuge', 'seekers'),
    ('refuge', 'seeking'),
    ('refuge', 'seekings'),
]
negative_kw_asylum = []
##
asylum_patterns = []
##
def is_asylum(row):
    for kw in negative_kw_asylum:
        if isinstance(kw, str) and kw in row['tokenized_excerpt']: return False
        elif len(kw) == 2 and kw in row['bigram_excerpt']: return False
        elif len(kw) == 3 and kw in row['trigram_excerpt']: return False
        elif len(kw) == 4 and kw in row['fourgram_excerpt']: return False
        elif len(kw) == 5 and kw in row['fivegram_excerpt']: return False
        elif len(kw) == 6 and kw in row['sixgram_excerpt']: return False
        elif len(kw) == 7 and kw in row['sevengram_excerpt']: return False
    for p in asylum_patterns:
        if re.search(p, row["excerpt_pp"]): return True
    for kw in kw_en_asylum:
        if isinstance(kw, str) and kw in row['tokenized_excerpt']: return True
        elif len(kw) == 2 and kw in row['bigram_excerpt']: return True
        elif len(kw) == 3 and kw in row['trigram_excerpt']: return True
        elif len(kw) == 4 and kw in row['fourgram_excerpt']: return True
        elif len(kw) == 5 and kw in row['fivegram_excerpt']: return True
        elif len(kw) == 6 and kw in row['sixgram_excerpt']: return True
        elif len(kw) == 7 and kw in row['sevengram_excerpt']: return True
    return False
##
# list(
#     sorted(list(set(kw_en_asylum)),
#            key=lambda x: x if isinstance(x, str) else x[0]))

In [25]:
kw_en_stateless = [
    'statelessness', ('lack','documentation'), ('lack','documentations'),
    ('lack','of','documentation'), ('lack','of','documentations'), ('civil', 'documentation'),
    ('civil', 'documentations'),
    ('not','have','documentation'), ('not','have','documentations'), ('nt','have','documentation'),
    ('nt','have','documentations'),
    ('not','having','any', 'documents'), ('birth','registration'),
    ('birth','registrations'), ('birth', 'declarations'), ('birth', 'declaration'),
    ('lack', 'birth', 'certificates'), ('lack', 'birth', 'certificate'), 
    ('loss', 'birth', 'certificates'), ('loss', 'birth', 'certificate'), 
    ('stateless', 'persons'), ('stateless', 'person'), 
    ('stateless', 'community'), ('stateless', 'people'), 
    ('stateless', 'migrants'), ('stateless', 'migrant'), 
    ('stateless', 'immigrants'), ('stateless', 'immigrant'), 
    ('stateless', 'refugees'), ('stateless', 'refugee'), 
    ('stateless', 'generation'), ('stateless', 'generations'), 
    ('not', 'have', 'a', 'nationality'),
    ('children', 's', 'registration'), ('children', 'registration'), ('children', 'registrations'), 
    ('child', 'registration'), ('child', 'registrations'), ('newborn', 'registration'), ('newborn', 'registrations'), 
    ('issuance', 'of', 'nationality', 'certificates'),
    ('identified', 'as', 'stateless'), 
    ('not', 'been', 'declared', 'at', 'birth'),
    ('not', 'declared', 'at', 'birth'), ('national', 'identity', 'cards'),
]
negative_kw_stateless = []
##
stateless_patterns = []
##
def is_stateless(row):
    for kw in negative_kw_stateless:
        if isinstance(kw, str) and kw in row['tokenized_excerpt']: return False
        elif len(kw) == 2 and kw in row['bigram_excerpt']: return False
        elif len(kw) == 3 and kw in row['trigram_excerpt']: return False
        elif len(kw) == 4 and kw in row['fourgram_excerpt']: return False
        elif len(kw) == 5 and kw in row['fivegram_excerpt']: return False
        elif len(kw) == 6 and kw in row['sixgram_excerpt']: return False
        elif len(kw) == 7 and kw in row['sevengram_excerpt']: return False
    for p in stateless_patterns:
        if re.search(p, row["excerpt_pp"]): return True
    for kw in kw_en_stateless:
        if isinstance(kw, str) and kw in row['tokenized_excerpt']: return True
        elif len(kw) == 2 and kw in row['bigram_excerpt']: return True
        elif len(kw) == 3 and kw in row['trigram_excerpt']: return True
        elif len(kw) == 4 and kw in row['fourgram_excerpt']: return True
        elif len(kw) == 5 and kw in row['fivegram_excerpt']: return True
        elif len(kw) == 6 and kw in row['sixgram_excerpt']: return True
        elif len(kw) == 7 and kw in row['sevengram_excerpt']: return True
    return False

In [26]:
cls_name = stateless
n = 1
kwe_en.get_kws(cls_name, n)

[('idp', 2.984596324889756),
 ('found', 2.924540871325797),
 ('site', 2.912798074300048),
 ('settlements', 2.758592815795093),
 ('reported', 2.550478344447243),
 ('households', 2.453274177510887),
 ('yukpas', 2.4518498623141647),
 ('genre', 2.244141427702453),
 ('pbs', 2.184494509417986),
 ('was', 1.980024315882124),
 ('data', 1.9136469410710621),
 ('statelessness', 1.7666636725093057),
 ('january', 1.69366923006434),
 ('arrived', 1.6852308090957535),
 ('sites', 1.6800889191778658),
 ('bahamas', 1.654563942918802),
 ('hurricane', 1.654563942918802),
 ('cornered', 1.654563942918802),
 ('perijá', 1.654563942918802),
 ('dictatorship', 1.6278120398049762),
 ('hours', 1.6170437775584954),
 ('no', 1.6113020770104027),
 ('province', 1.6021281595711867),
 ('a', 1.5920322156062376),
 ('were', 1.4848687275905774),
 ('year', 1.4716356501118697),
 ('the', 1.4215777355307069),
 ('about', 1.4206079335281983),
 ('kananga', 1.3990514878759372),
 ('paramilitary', 1.3990514878759372),
 ('after', 1.38971

In [27]:
migrants = 'Migrants'
refugees = 'Refugees'
host = 'Host'
idp = 'IDP'
returnees = 'Returnees'
asylum_seekers = 'Asylum Seekers'
non_host = 'Non Host'
stateless = 'Stateless'
##
displaced = 'Displaced'
non_displaced = 'Non Displaced'

In [28]:
def affected_groups(row):
    ret = []
    if is_asylum(row):
        ret.append(asylum_seekers)
    if is_idp(row):
        ret.append(idp)
    if is_migrant(row):
        ret.append(migrants)
    if is_refugee(row):
        ret.append(refugees)
    if is_returnee(row):
        ret.append(returnees)
    if is_host(row):
        ret.append(host)
    return ret

In [29]:
def preprocess(doc):
    if doc != doc:
        return ""
    doc = doc.lower()
    # remove preceeding dates
    #doc = re.sub("^\[.+\]", " ", doc).strip()
    #doc = re.sub("^\(.+\)", " ", doc).strip()
    # spaces btw numbers and words
    doc = re.sub('(\d+(\W\d+)?)', r' \1 ', doc).strip()
    doc = re.sub("[‐‑–—―─_]", "-", doc)
    doc = re.sub("(\w)\- (\w)", r"\1\2", doc)
    doc = re.sub(
        "[" + re.escape(
            '_@^~.()[],"“’…<❖‐»—─|•&{≥➢\ue0e4\uf0d8\uf0fc●°#\u200b>`?�€■!‘%;̧\'›«”:≤―\uf0b7$}*´=‑▪\xad❑·–'
        ) + "]", " ", doc)
    #remove some puncs
    doc = re.sub('\s+', " ", doc).strip()
    return doc
##
def preprocess_and_tokenize(doc, n=1):
    doc = preprocess(doc)
    # tokenize
    words = word_tokenize(doc)
    if n == 1:
        return set(words)
    return set(ngrams(words, n))

In [30]:
# preprocess
df_train_en["excerpt_pp"] = df_train_en["excerpt"].progress_apply(preprocess)
# tokenize and cacl ngrams
df_train_en["tokenized_excerpt"] = df_train_en["excerpt"].progress_apply(preprocess_and_tokenize)
df_train_en["bigram_excerpt"] = df_train_en["excerpt"].progress_apply(lambda x: preprocess_and_tokenize(x, 2))
df_train_en["trigram_excerpt"] = df_train_en["excerpt"].progress_apply(lambda x: preprocess_and_tokenize(x, 3))
df_train_en["fourgram_excerpt"] = df_train_en["excerpt"].progress_apply(lambda x: preprocess_and_tokenize(x, 4))
df_train_en["fivegram_excerpt"] = df_train_en["excerpt"].progress_apply(lambda x: preprocess_and_tokenize(x, 5))
df_train_en["sixgram_excerpt"] = df_train_en["excerpt"].progress_apply(lambda x: preprocess_and_tokenize(x, 6))
df_train_en["sevengram_excerpt"] = df_train_en["excerpt"].progress_apply(lambda x: preprocess_and_tokenize(x, 7))

  0%|          | 0/126323 [00:00<?, ?it/s]

  0%|          | 0/126323 [00:00<?, ?it/s]

  0%|          | 0/126323 [00:00<?, ?it/s]

  0%|          | 0/126323 [00:00<?, ?it/s]

  0%|          | 0/126323 [00:00<?, ?it/s]

  0%|          | 0/126323 [00:00<?, ?it/s]

  0%|          | 0/126323 [00:00<?, ?it/s]

  0%|          | 0/126323 [00:00<?, ?it/s]

In [31]:
df_train_en.columns

Index(['entry_id', 'excerpt', 'affected_groups_level_0',
       'affected_groups_level_1', 'affected_groups_level_2',
       'affected_groups_level_3', 'lang', 'translation_en', 'translation_fr',
       'translation_es', 'excerpt_pp', 'tokenized_excerpt', 'bigram_excerpt',
       'trigram_excerpt', 'fourgram_excerpt', 'fivegram_excerpt',
       'sixgram_excerpt', 'sevengram_excerpt'],
      dtype='object')

In [32]:
df_train_en["affected_groups_level_3_kw"] = df_train_en.progress_apply(affected_groups, axis=1)

  0%|          | 0/126323 [00:00<?, ?it/s]

In [35]:
# preprocess
df_val_en["excerpt_pp"] = df_val_en["excerpt"].progress_apply(preprocess)
# tokenize and cacl ngrams
df_val_en["tokenized_excerpt"] = df_val_en["excerpt"].progress_apply(preprocess_and_tokenize)
df_val_en["bigram_excerpt"] = df_val_en["excerpt"].progress_apply(lambda x: preprocess_and_tokenize(x, 2))
df_val_en["trigram_excerpt"] = df_val_en["excerpt"].progress_apply(lambda x: preprocess_and_tokenize(x, 3))
df_val_en["fourgram_excerpt"] = df_val_en["excerpt"].progress_apply(lambda x: preprocess_and_tokenize(x, 4))
df_val_en["fivegram_excerpt"] = df_val_en["excerpt"].progress_apply(lambda x: preprocess_and_tokenize(x, 5))
df_val_en["sixgram_excerpt"] = df_val_en["excerpt"].progress_apply(lambda x: preprocess_and_tokenize(x, 6))
df_val_en["sevengram_excerpt"] = df_val_en["excerpt"].progress_apply(lambda x: preprocess_and_tokenize(x, 7))

  0%|          | 0/14425 [00:00<?, ?it/s]

  0%|          | 0/14425 [00:00<?, ?it/s]

  0%|          | 0/14425 [00:00<?, ?it/s]

  0%|          | 0/14425 [00:00<?, ?it/s]

  0%|          | 0/14425 [00:00<?, ?it/s]

  0%|          | 0/14425 [00:00<?, ?it/s]

  0%|          | 0/14425 [00:00<?, ?it/s]

  0%|          | 0/14425 [00:00<?, ?it/s]

In [36]:
df_val_en["affected_groups_level_3_kw"] = df_val_en.progress_apply(affected_groups, axis=1)

  0%|          | 0/14425 [00:00<?, ?it/s]

In [37]:
# preprocess
df_test_en["excerpt_pp"] = df_test_en["excerpt"].progress_apply(preprocess)
# tokenize and cacl ngrams
df_test_en["tokenized_excerpt"] = df_test_en["excerpt"].progress_apply(preprocess_and_tokenize)
df_test_en["bigram_excerpt"] = df_test_en["excerpt"].progress_apply(lambda x: preprocess_and_tokenize(x, 2))
df_test_en["trigram_excerpt"] = df_test_en["excerpt"].progress_apply(lambda x: preprocess_and_tokenize(x, 3))
df_test_en["fourgram_excerpt"] = df_test_en["excerpt"].progress_apply(lambda x: preprocess_and_tokenize(x, 4))
df_test_en["fivegram_excerpt"] = df_test_en["excerpt"].progress_apply(lambda x: preprocess_and_tokenize(x, 5))
df_test_en["sixgram_excerpt"] = df_test_en["excerpt"].progress_apply(lambda x: preprocess_and_tokenize(x, 6))
df_test_en["sevengram_excerpt"] = df_test_en["excerpt"].progress_apply(lambda x: preprocess_and_tokenize(x, 7))

  0%|          | 0/17200 [00:00<?, ?it/s]

  0%|          | 0/17200 [00:00<?, ?it/s]

  0%|          | 0/17200 [00:00<?, ?it/s]

  0%|          | 0/17200 [00:00<?, ?it/s]

  0%|          | 0/17200 [00:00<?, ?it/s]

  0%|          | 0/17200 [00:00<?, ?it/s]

  0%|          | 0/17200 [00:00<?, ?it/s]

  0%|          | 0/17200 [00:00<?, ?it/s]

In [38]:
df_test_en["affected_groups_level_3_kw"] = df_test_en.progress_apply(affected_groups, axis=1)

  0%|          | 0/17200 [00:00<?, ?it/s]