In [1]:
import re
import string
from ast import literal_eval
from operator import itemgetter
from collections import Counter, defaultdict, OrderedDict

import fasttext
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer

from tqdm.notebook import tqdm

import spacy

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

  return torch._C._cuda_getDeviceCount() > 0
2021-11-17 11:04:29.153029: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [2]:
# Enable tqdm's pandas integration (adds the `progress_apply` family to pandas objects).
tqdm.pandas()

In [3]:
# Load the train / validation / test splits, keeping only the columns that the
# rest of the notebook reads.
_SPLIT_COLUMNS = [
    'entry_id', 'excerpt', 'gender', 'lang',
    "translation_en", "translation_fr", "translation_es"
]
df_train, df_val, df_test = [
    pd.read_csv(csv_path, usecols=_SPLIT_COLUMNS)
    for csv_path in (
        "../data/train_v0.7.1.csv",
        "../data/val_v0.7.1.csv",
        "../data/test_v0.7.1.csv",
    )
]

In [4]:
def _parse_gender_labels(raw):
    """Return the sorted, de-duplicated list of labels encoded in `raw`.

    `raw` is normally the string form of a Python list as read from the CSV
    (e.g. "['Male', 'Female']"). Values that are not strings are treated as an
    already-parsed collection, so re-running this cell does not fail on
    `literal_eval` receiving a list.
    """
    labels = literal_eval(raw) if isinstance(raw, str) else raw
    return sorted(set(labels))


# Turn the stringified label lists of every split into real, canonical lists.
col = "gender"
for df in [df_train, df_val, df_test]:
    df[col] = df[col].apply(_parse_gender_labels)

In [5]:
def _with_excerpt_from(frame, lang_code, translation_col):
    """Return a copy of `frame` whose `excerpt` is taken from `translation_col`
    on every row whose `lang` differs from `lang_code`.

    Rows already written in `lang_code` keep their original excerpt.
    """
    localized = frame.copy()
    other_language = localized["lang"].ne(lang_code)
    localized.loc[other_language, "excerpt"] = localized.loc[other_language,
                                                             translation_col]
    return localized


# One training frame per target language.
df_train_en = _with_excerpt_from(df_train, "en", "translation_en")
df_train_fr = _with_excerpt_from(df_train, "fr", "translation_fr")
df_train_es = _with_excerpt_from(df_train, "es", "translation_es")

In [6]:
# English rows that carry at least one gender tag.
_has_tags_en = df_train_en['gender'].apply(lambda tags: tags != [])
df_train_en_gender = df_train_en[_has_tags_en].copy()

_tags_en = df_train_en_gender['gender']
# Rows tagged with a given gender (possibly together with the other one).
df_train_en_gender_male = df_train_en_gender[_tags_en.apply(
    lambda tags: "Male" in tags)]
df_train_en_gender_female = df_train_en_gender[_tags_en.apply(
    lambda tags: "Female" in tags)]
# Rows tagged with exactly one gender.
df_train_en_gender_male_only = df_train_en_gender[_tags_en.apply(
    lambda tags: tags == ["Male"])]
df_train_en_gender_female_only = df_train_en_gender[_tags_en.apply(
    lambda tags: tags == ["Female"])]

In [7]:
# French rows that carry at least one gender tag.
_has_tags_fr = df_train_fr['gender'].apply(lambda tags: tags != [])
df_train_fr_gender = df_train_fr[_has_tags_fr].copy()

_tags_fr = df_train_fr_gender['gender']
# Rows tagged with a given gender (possibly together with the other one).
df_train_fr_gender_male = df_train_fr_gender[_tags_fr.apply(
    lambda tags: "Male" in tags)]
df_train_fr_gender_female = df_train_fr_gender[_tags_fr.apply(
    lambda tags: "Female" in tags)]
# Rows tagged with exactly one gender.
df_train_fr_gender_male_only = df_train_fr_gender[_tags_fr.apply(
    lambda tags: tags == ["Male"])]
df_train_fr_gender_female_only = df_train_fr_gender[_tags_fr.apply(
    lambda tags: tags == ["Female"])]

In [8]:
# Spanish rows that carry at least one gender tag.
_has_tags_es = df_train_es['gender'].apply(lambda tags: tags != [])
df_train_es_gender = df_train_es[_has_tags_es].copy()

_tags_es = df_train_es_gender['gender']
# Rows tagged with a given gender (possibly together with the other one).
df_train_es_gender_male = df_train_es_gender[_tags_es.apply(
    lambda tags: "Male" in tags)]
df_train_es_gender_female = df_train_es_gender[_tags_es.apply(
    lambda tags: "Female" in tags)]
# Rows tagged with exactly one gender.
df_train_es_gender_male_only = df_train_es_gender[_tags_es.apply(
    lambda tags: tags == ["Male"])]
df_train_es_gender_female_only = df_train_es_gender[_tags_es.apply(
    lambda tags: tags == ["Female"])]

In [9]:
class KeywordExtractor:
    """Rank the words that distinguish "male" documents from "female" ones.

    Three corpora are tokenized and counted: all documents, the male
    documents and the female documents. From those counts the constructor
    derives, for each class:

    * per-word likelihoods (count / class corpus size),
    * "Potts" scores (share of a word's likelihood that belongs to the class),
    * z-scores of a log-odds ratio in which the counts of the whole corpus
      are added to the class counts as a prior.

    `get_kw_with_scores` exposes the z-scores as a ranked list.
    """

    def __init__(self, docs_all, docs_male, docs_female):
        """Tokenize the three corpora and pre-compute every score table.

        Args:
            docs_all: iterable of document strings used as the prior corpus.
            docs_male: iterable of document strings for the male class.
            docs_female: iterable of document strings for the female class.
        """
        self.docs_all = docs_all
        self.docs_male = docs_male
        self.docs_female = docs_female
        # word -> raw occurrence count, per corpus
        self.word_to_freq_all = self.extract_word_counts(docs_all)
        self.word_to_freq_male = self.extract_word_counts(docs_male)
        self.word_to_freq_female = self.extract_word_counts(docs_female)
        # total number of counted tokens, per corpus
        self.corpus_size = sum(self.word_to_freq_all.values())
        self.male_corpus_size = sum(self.word_to_freq_male.values())
        self.female_corpus_size = sum(self.word_to_freq_female.values())
        # word -> count / corpus size, per class
        self.male_words_likelihoods = self.calc_likelihoods(
            self.word_to_freq_male, self.male_corpus_size)
        self.female_words_likelihoods = self.calc_likelihoods(
            self.word_to_freq_female, self.female_corpus_size)
        self.male_likelihoods_sum = sum(self.male_words_likelihoods.values())
        self.female_likelihoods_sum = sum(
            self.female_words_likelihoods.values())
        # word -> likelihood share of the class versus the other class
        self.male_potts_scores = self.calc_potts_scores(
            self.male_words_likelihoods, self.female_words_likelihoods)
        self.female_potts_scores = self.calc_potts_scores(
            self.female_words_likelihoods, self.male_words_likelihoods)
        # word -> z-score of the prior-modified log-odds ratio, per class
        self.z_score_of_the_log_odds_ratio_male = self.calc_prior_modified_log_odds_ratio(
            self.word_to_freq_male, self.male_corpus_size,
            self.word_to_freq_female, self.female_corpus_size,
            self.word_to_freq_all, self.corpus_size)
        self.z_score_of_the_log_odds_ratio_female = self.calc_prior_modified_log_odds_ratio(
            self.word_to_freq_female, self.female_corpus_size,
            self.word_to_freq_male, self.male_corpus_size,
            self.word_to_freq_all, self.corpus_size)

    def preprocess_and_tokenize(self, doc):
        """Return the lower-cased, purely alphabetic tokens of `doc`.

        Args:
            doc: the document text (a string).

        Returns:
            list of str tokens.
        """
        # Drop a leading "[...]" or "(...)" prefix (meant to strip preceding
        # dates).
        # NOTE(review): `.+` is greedy, so the match extends to the LAST
        # closing bracket of the first line - confirm this is intended.
        doc = re.sub(r"^\[.+\]", " ", doc).strip()
        doc = re.sub(r"^\(.+\)", " ", doc).strip()
        # Put spaces around numbers so they are split from adjacent letters.
        doc = re.sub(r'(\d+(\.\d+)?)', r' \1 ', doc).strip()
        tokens = word_tokenize(doc)
        # Lower-case and keep alphabetic tokens only (numbers and punctuation
        # are dropped here).
        return [token.lower() for token in tokens if token.isalpha()]

    def calc_potts_scores(self, word_to_likelihood_main,
                          word_to_likelihood_other):
        """Return, for each word of the main class, the fraction
        `L_main / (L_main + L_other)` of its likelihood held by that class.

        Words missing from `word_to_likelihood_other` count as likelihood 0
        there, which yields a score of 1.0.
        """
        return {
            word: likelihood /
            (likelihood + word_to_likelihood_other.get(word, 0))
            for word, likelihood in word_to_likelihood_main.items()
        }

    def calc_likelihoods(self, word_to_freq, corpus_size):
        """Return a dict mapping each word to `count / corpus_size`."""
        return {
            word: count / corpus_size
            for word, count in word_to_freq.items()
        }

    def extract_word_counts(self, docs):
        """Count token occurrences over every document in `docs`.

        Returns:
            A `Counter` (word -> count). Looking up an unseen word yields 0
            and does not add the word to the mapping.
        """
        # NOTE: stopwords are not filtered out; every alphabetic token counts.
        word_to_freq = Counter()
        for doc in docs:
            word_to_freq.update(self.preprocess_and_tokenize(doc))
        return word_to_freq

    def calc_prior_modified_log_odds_ratio(self, word_to_freq_c1,
                                           corpus_size_c1,
                                           word_to_freq_c2,
                                           corpus_size_c2,
                                           word_to_freq_all, corpus_size_all):
        """Return z-scores of the prior-modified log-odds ratio of class c1
        against class c2.

        For every word of `word_to_freq_c1`, the whole-corpus count is added
        to each class count, the log-odds of the word is computed in each
        class, and their difference is divided by the square root of
        `1 / (count_c1 + prior) + 1 / (count_c2 + prior)`.

        Only words observed in c1 are scored. Missing words in
        `word_to_freq_c2` / `word_to_freq_all` count as 0, and the inputs are
        read with `.get` so they are never modified: indexing a
        `defaultdict` would insert zero-count keys and make a later call
        with the classes swapped score words that class never contained.

        Args:
            word_to_freq_c1: word -> count for the class being scored.
            corpus_size_c1: total token count of that class.
            word_to_freq_c2: word -> count for the contrasting class.
            corpus_size_c2: total token count of the contrasting class.
            word_to_freq_all: word -> count used as the prior.
            corpus_size_all: total token count of the prior corpus.

        Returns:
            dict mapping each word of `word_to_freq_c1` to its z-score
            (positive means the word leans towards c1).
        """
        z_score_of_the_log_odds_ratio_c1 = dict()
        for word, freq_c1 in word_to_freq_c1.items():
            # assumes every scored word has a non-zero count in
            # `word_to_freq_all`; otherwise `prior_freq + c2 count` can be 0
            # and the variance term divides by zero.
            prior_freq = word_to_freq_all.get(word, 0)
            ##
            numerator_1 = freq_c1 + prior_freq
            denomerator_1 = corpus_size_c1 + corpus_size_all - numerator_1
            ratio_1 = np.log(numerator_1 / denomerator_1)
            ##
            numerator_2 = word_to_freq_c2.get(word, 0) + prior_freq
            denomerator_2 = corpus_size_c2 + corpus_size_all - numerator_2
            ratio_2 = np.log(numerator_2 / denomerator_2)
            ##
            log_odds_ratio = ratio_1 - ratio_2
            variance = (1 / numerator_1) + (1 / numerator_2)
            z_score_of_the_log_odds_ratio_c1[word] = log_odds_ratio / np.sqrt(
                variance)
        return z_score_of_the_log_odds_ratio_c1

    def get_kw_with_scores(self, cls):
        """Return `(word, z_score)` pairs for `cls`, highest score first.

        Args:
            cls: either "male" or "female".

        Raises:
            ValueError: if `cls` is neither "male" nor "female".
        """
        if cls == "female":
            scores = self.z_score_of_the_log_odds_ratio_female
        elif cls == "male":
            scores = self.z_score_of_the_log_odds_ratio_male
        else:
            raise ValueError("`cls` can be `male` or `female`.")
        return sorted(scores.items(), key=itemgetter(1), reverse=True)

In [10]:
# English keyword scores: every gender-tagged excerpt forms the overall corpus,
# contrasted against the male-only and the female-only excerpts.
kwe_en = KeywordExtractor(
    docs_all=df_train_en_gender['excerpt'].tolist(),
    docs_male=df_train_en_gender_male_only['excerpt'].tolist(),
    docs_female=df_train_en_gender_female_only['excerpt'].tolist(),
)

In [11]:
# English words ranked by how strongly they lean towards male-only excerpts.
kwe_en.get_kw_with_scores(cls="male")

[('men', 14.58833508575718),
 ('boys', 12.021551154931165),
 ('male', 5.316495883498527),
 ('were', 4.013343917741883),
 ('years', 3.532682207467063),
 ('total', 3.2535728317270665),
 ('confirmed', 3.0364685809136738),
 ('man', 2.9962186049669675),
 ('between', 2.9333195723484553),
 ('travelers', 2.8863638916910226),
 ('him', 2.857388036175913),
 ('ic', 2.8520337274885184),
 ('cases', 2.6874962757827823),
 ('incidents', 2.629031660603854),
 ('males', 2.600855263476125),
 ('immigrants', 2.585914217652198),
 ('watch', 2.5422735213392564),
 ('older', 2.483993987743554),
 ('deaths', 2.4010862866190377),
 ('integrity', 2.3791338851484714),
 ('returned', 2.3733689740380544),
 ('mixed', 2.343941884223197),
 ('observed', 2.3435763055199104),
 ('commonly', 2.3262308718540563),
 ('feel', 2.2558684913221914),
 ('throwing', 2.24275768204825),
 ('diaz', 2.24275768204825),
 ('moph', 2.2339067591877395),
 ('ratio', 2.1895236716702104),
 ('thousand', 2.1666614204909944),
 ('reached', 2.128636928758038

In [12]:
# English words ranked by how strongly they lean towards female-only excerpts.
kwe_en.get_kw_with_scores(cls="female")

[('women', 6.921368387693348),
 ('violence', 4.880866277947747),
 ('sexual', 3.634348982908556),
 ('maternal', 2.6559968176474165),
 ('health', 2.5957058834837325),
 ('pregnant', 2.5103397133986625),
 ('reproductive', 2.4005181085170717),
 ('vulnerable', 2.4001381718532464),
 ('vbg', 2.3672390569114166),
 ('early', 2.1870825846530657),
 ('girls', 2.1605968471893986),
 ('access', 2.1096258901878318),
 ('mothers', 2.025287730707826),
 ('face', 2.0171870097325963),
 ('marriage', 2.007854694941978),
 ('healthcare', 1.9840758837802213),
 ('pregnancy', 1.9684303301825516),
 ('particularly', 1.9283136969589925),
 ('services', 1.8259524342900848),
 ('increased', 1.802406233657647),
 ('rape', 1.779014134088139),
 ('survival', 1.7678031504619933),
 ('survivors', 1.7503371617570234),
 ('limited', 1.7369622401545837),
 ('genital', 1.7308196818247017),
 ('domestic', 1.6956536031582223),
 ('gender', 1.6953784684541924),
 ('mutilation', 1.6277842417462738),
 ('especially', 1.610612324981553),
 ('risk

In [13]:
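# Abbreviations that appear in the keyword lists:
# sgbv = sexual and gender-based violence
# gbv  = gender-based violence (vbg in French and Spanish)
# fgm  = female genital mutilation
# srh  = sexual and reproductive health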

In [14]:
# English cue words for each gender.
male_kw_en = [
    "man", "men",
    "boy", "boys",
    "male", "males",
    "him", "his",
    "son", "sons",
    "father", "fathers",
]
female_kw_en = [
    # people and pronouns
    "woman", "women",
    "girl", "girls",
    "female", "females",
    "mother", "mothers",
    # pregnancy
    "pregnancy", "pregnancies", "pregnant", "pregnants",
    "she", "her", "hers",
    "reproductive", "maternal",
    "daughter", "daughters",
    "childbearing",
    # gender-based violence and related abbreviations
    "feminicide", "feminicides", "femicide", "femicides",
    "vbg", "gbv", "sgbv", "fgm", "srh",
    # reproductive health
    "lactating", "menstrual", "contraceptive",
    "fetus", "foetation", "foetus",
]

### Keyword Extraction for French Excerpts

In [15]:
# French keyword scores: every gender-tagged excerpt forms the overall corpus,
# contrasted against the male-only and the female-only excerpts.
kwe_fr = KeywordExtractor(
    docs_all=df_train_fr_gender['excerpt'].tolist(),
    docs_male=df_train_fr_gender_male_only['excerpt'].tolist(),
    docs_female=df_train_fr_gender_female_only['excerpt'].tolist(),
)

In [16]:
# French words ranked by how strongly they lean towards male-only excerpts.
kwe_fr.get_kw_with_scores(cls="male")

[('hommes', 14.677867975095092),
 ('garçons', 12.026520060695688),
 ('ans', 4.636063953049315),
 ('âgés', 3.8263158744842465),
 ('voyageurs', 3.711044008374122),
 ('étaient', 3.641083857348801),
 ('ic', 3.625525100907481),
 ('total', 3.134720280131219),
 ('masculin', 3.127130230015379),
 ('masculins', 3.0222570512929643),
 ('intégrité', 2.992431870661823),
 ('homme', 2.9901917729339584),
 ('cas', 2.715559416009255),
 ('couramment', 2.6784189453950593),
 ('incidents', 2.665287576226826),
 ('immigrants', 2.661398840453853),
 ('sentent', 2.607732638828974),
 ('règlement', 2.593012306278978),
 ('confirmés', 2.5495700535539396),
 ('école', 2.512134632416714),
 ('adultes', 2.4116388131537447),
 ('watch', 2.320142070099111),
 ('human', 2.307180472021151),
 ('rights', 2.307180472021151),
 ('ratio', 2.278458053034851),
 ('diaz', 2.2496935505119566),
 ('surreprésentation', 2.2496935505119566),
 ('lui', 2.24246620383884),
 ('moph', 2.2413039309245257),
 ('décès', 2.23924781741236),
 ('dont', 2.23

In [17]:
# French words ranked by how strongly they lean towards female-only excerpts.
kwe_fr.get_kw_with_scores(cls="female")

[('femmes', 6.22972952179731),
 ('violence', 4.315250982642952),
 ('santé', 2.8195582374130304),
 ('adolescentes', 2.5199176174768163),
 ('sexuelle', 2.507968804426181),
 ('enceintes', 2.408574790816952),
 ('vbg', 2.359109293707483),
 ('violences', 2.210921086022484),
 ('maternelle', 2.205863489398937),
 ('vulnérables', 2.2048829352071024),
 ('vénézuéliennes', 2.1105914956581135),
 ('précoce', 2.0557632643670485),
 ('mères', 1.9846992732383284),
 ('sexiste', 1.9790560534260542),
 ('services', 1.9694229488449222),
 ('grossesse', 1.9691206679521418),
 ('aux', 1.9542558854005385),
 ('sexuelles', 1.9485209348510533),
 ('mariage', 1.935550411977049),
 ('elles', 1.8355816457959577),
 ('particulier', 1.7406830678914358),
 ('compris', 1.725254862528668),
 ('filles', 1.7037008107609863),
 ('survie', 1.684469147427862),
 ('la', 1.6488257423522443),
 ('reproduction', 1.6485171046813003),
 ('défenseurs', 1.6228750057223482),
 ('gbv', 1.5356664700884153),
 ('paix', 1.5269552852215944),
 ('faso', 1.

In [18]:
# French cue words for each gender.
male_kw_fr = [
    "homme", "hommes", "garçon", "garçons", "masculin", "masculins", "lui",
]
# Every entry must be followed by a comma: adjacent string literals are
# silently concatenated ("elles" "fille" would become "ellesfille").
female_kw_fr = [
    "femme", "femmes", "vbg", "maternelle", "maternelles", "vénézuélienne",
    "vénézuéliennes", "mère", "mères", "sexiste", "grossesse", "grossesses",
    "elle", "elles", "fille", "filles", "reproduction", "reproductions",
    "gbv", "migrante", "migrantes", "survivante", "survivantes", "allaitante",
    "allaitantes", "handicapée", "handicapées", "srh", "fgm", "féminicide",
    "féminicides", "affectées", "réfugiée", "réfugiées", "sgbv", "mariée",
    "mariées", "féminine", "féminines", "violée", "violées", "sis", "menstruelle",
    "menstruelles", "péruvienne", "péruviennes",
]

### Keyword Extraction for Spanish Excerpts

In [19]:
# Spanish keyword scores: every gender-tagged excerpt forms the overall corpus,
# contrasted against the male-only and the female-only excerpts.
kwe_es = KeywordExtractor(
    docs_all=df_train_es_gender['excerpt'].tolist(),
    docs_male=df_train_es_gender_male_only['excerpt'].tolist(),
    docs_female=df_train_es_gender_female_only['excerpt'].tolist(),
)

In [20]:
# Spanish words ranked by how strongly they lean towards male-only excerpts.
kwe_es.get_kw_with_scores(cls="male")

[('hombres', 14.758041333243987),
 ('niños', 6.892206679891334),
 ('los', 5.20266830666417),
 ('años', 4.1002171777452645),
 ('viajeros', 3.674527977131148),
 ('masculinos', 3.229083680425738),
 ('fueron', 3.2161124508768446),
 ('confirmados', 3.138164082994006),
 ('u', 2.994331560067191),
 ('total', 2.9582955626538303),
 ('hombre', 2.8479823226356507),
 ('masculino', 2.7956938336683583),
 ('edad', 2.794404242571409),
 ('human', 2.76911556605391),
 ('rights', 2.76911556605391),
 ('watch', 2.76911556605391),
 ('venezolanos', 2.6642267650426628),
 ('informadas', 2.6567098877204876),
 ('casos', 2.6295834816189734),
 ('díaz', 2.596538329350968),
 ('incidentes', 2.5559088128845757),
 ('entre', 2.511001992337511),
 ('pruebas', 2.4889712755659),
 ('adultos', 2.423448354142819),
 ('el', 2.4217278449701434),
 ('inmigrantes', 2.4077869147713793),
 ('integridad', 2.3981546540846144),
 ('ic', 2.391567299928525),
 ('seguros', 2.3632455375227646),
 ('comúnmente', 2.3521742397493886),
 ('eran', 2.346

In [21]:
# Spanish words ranked by how strongly they lean towards female-only excerpts.
kwe_es.get_kw_with_scores(cls="female")

[('las', 6.910534972163364),
 ('mujeres', 6.4400441204776016),
 ('violencia', 4.779224657345703),
 ('sexual', 3.322557891684301),
 ('género', 2.9009366910842704),
 ('la', 2.5735327578577074),
 ('salud', 2.504564706689293),
 ('vbg', 2.3583653240268028),
 ('venezolanas', 2.350665156086779),
 ('materna', 2.346778097895923),
 ('embarazadas', 2.256815867301191),
 ('vulnerables', 2.1997324232119504),
 ('reproductiva', 2.1615947285361057),
 ('matrimonio', 2.105932309487845),
 ('madres', 1.9568574537454728),
 ('embarazo', 1.9385572894672158),
 ('riesgo', 1.8722605823670733),
 ('servicios', 1.8480666902862246),
 ('acceso', 1.8300208848322772),
 ('supervivencia', 1.8045088225416739),
 ('sobrevivientes', 1.7534948565993498),
 ('genital', 1.6845284916068597),
 ('mutilación', 1.6602873600428012),
 ('mujer', 1.614978903337496),
 ('enfrentan', 1.590578493954475),
 ('defensoras', 1.5900839669303457),
 ('algunas', 1.5640077600338183),
 ('gbv', 1.5558894069254374),
 ('crisis', 1.5523297192549883),
 ('es

In [22]:
# Spanish cue words for each gender.
male_kw_es = [
    "hombre", "hombres", "viajero", "viajeros", "masculino", "masculinos",
    "venezolano", "venezolanos", "macho", "machos", "testigo", "masculina",
    "niño", "niños",
]
# Each keyword is listed once ("embarazo"/"embarazos" used to appear twice).
# NOTE(review): "mujere", "mujers", "menstruals", "menstruale",
# "vulnerabilidade" and "prenatale" do not look like Spanish word forms -
# confirm whether they should be kept.
female_kw_es = [
    "mujere", "mujeres", "vbg", "venezolana", "venezolanas", "materna", "maternas",
    "embarazada", "embarazadas", "madre", "madres", "embarazo", "embarazos", "supervivencia",
    "supervivencias", "mujer", "mujers", "gbv", "fgm", "lactante", "lactantes", "lideresa",
    "lideresas", "hija", "hijas", "feminicidio", "feminicidios", "femenina", "femeninas",
    "anticonceptivo", "anticonceptivos", "srh", "refugiada", "refugiadas", "peruana", "peruanas",
    "niñas", "niña", "gestantes", "gestante", "menstrual", "menstruals",
    "sgbv", "menstruale", "menstruales", "vulnerabilidade", "vulnerabilidades", "desplazada",
    "desplazadas", "casada", "casadas", "prenatale", "prenatales", "femicidio", "femicidios",
]