In [53]:
import pandas as pd
from collections import defaultdict
from ast import literal_eval
from typing import List
import numpy as np
from sklearn.metrics import jaccard_score
import difflib

import warnings
# NOTE: called without a category or module filter, this hides *every* warning for the
# rest of the session, including deprecation warnings emitted by the libraries used below.
warnings.filterwarnings('ignore')

## Gender stats

In [233]:
def inter(a, b):
    """Return the items of `a` that also occur in `b`, in an order compatible with `b`.

    `a` is scanned left to right. An item is kept when it is present in `b` and its
    first position in `b` lies strictly after the first position in `b` of the item
    kept just before it (the first common item is always kept). Because the comparison
    is strict and uses first positions, a repeated item is kept at most once in a row.

    Parameters
    ----------
    a, b : sequences supporting `in` and `.index` (lists of tokens in this notebook).

    Returns
    -------
    list
        The kept items, in the order they appear in `a`.
    """
    kept = []
    # First position in `b` of the most recently kept item; -1 means "nothing kept yet",
    # so that any valid position (>= 0) passes the comparison below.
    last_pos = -1
    for item in a:
        if item not in b:
            continue
        pos = b.index(item)
        if pos > last_pos:
            kept.append(item)
            last_pos = pos
    return kept

def jaccard_score_two_sentences(first_sentence: List[str], second_sentence: List[str]):
    """Order-aware Jaccard-like similarity between two tokenised sentences.

    The numerator is the number of tokens returned by `inter(first_sentence,
    second_sentence)` (common tokens that appear in a compatible order); the
    denominator is the number of distinct tokens present in either sentence.

    Parameters
    ----------
    first_sentence, second_sentence : list of str
        Token lists to compare.

    Returns
    -------
    int or float
        0 when the sentences differ in length by more than two tokens, or when both
        are empty; otherwise `len(inter(...)) / len(union of distinct tokens)`.
    """
    # Sentences whose lengths differ by more than two tokens are treated as unrelated.
    if abs(len(first_sentence) - len(second_sentence)) > 2:
        return 0

    union = len(set(first_sentence) | set(second_sentence))
    if union == 0:
        # Both sentences are empty: nothing to compare, and dividing would raise
        # ZeroDivisionError. Score them like any other non-match.
        return 0

    intersection = len(inter(first_sentence, second_sentence))
    return intersection / union

def custom_jaccard_score(input_sentence: List[str], sentences: List[List[str]]):
    """Score `input_sentence` against every candidate in `sentences`.

    Returns a 1-D numpy array whose k-th value is
    `jaccard_score_two_sentences(input_sentence, sentences[k])`; the array is empty
    when `sentences` is empty.
    """
    return np.array(
        [jaccard_score_two_sentences(input_sentence, candidate) for candidate in sentences]
    )

def rematch_df(df, n_items: int):
    """Regroup `df` so each neutral excerpt is followed by its closest keyword variants.

    Entries (`entry_id`) with at most `n_items` rows are kept unchanged, sorted by
    `entry_id`. Entries with more rows contain several sentences: for each of their
    neutral rows, the most similar excerpt of every non-neutral keyword is selected
    (highest `custom_jaccard_score`, first candidate on ties) and the group
    [neutral row, one row per keyword] is appended to the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns `entry_id`, `kw` and `excerpt`; the neutral variant is
        identified by `kw == 'neutral'`. Rows with a missing `entry_id` are not
        returned, because `value_counts` ignores missing values.
    n_items : int
        Number of rows a single sentence is expected to have (one per distinct keyword),
        e.g. 3 if the keywords are male, female and neutral.

    Returns
    -------
    pandas.DataFrame
        The regrouped rows, carrying their original index labels.

    Raises
    ------
    ValueError
        If an entry that needs rematching has no excerpt for one of the keywords.
    """
    # Work on the counts Series directly: the column names produced by
    # `value_counts().to_frame().reset_index()` differ between pandas versions.
    rows_per_id = df["entry_id"].value_counts()
    ids_to_rematch = rows_per_id[rows_per_id > n_items].index.tolist()
    ids_already_matched = rows_per_id[rows_per_id <= n_items].index.tolist()

    # Collect the frames and concatenate once at the end (`DataFrame.append` no longer
    # exists in pandas >= 2.0, and growing a frame inside a loop is quadratic).
    pieces = [df[df["entry_id"].isin(ids_already_matched)].sort_values(by="entry_id")]

    # kwords
    kwords_not_neutral = [kw for kw in df["kw"].unique() if kw != "neutral"]

    for one_id in ids_to_rematch:
        df_same_id = df[df["entry_id"] == one_id]
        neutral_df = df_same_id[df_same_id["kw"] == "neutral"]

        # The candidates per keyword do not depend on the neutral row: split them once.
        candidates_per_kw = {
            one_kw: df_same_id[df_same_id["kw"] == one_kw] for one_kw in kwords_not_neutral
        }

        for i in range(len(neutral_df)):
            neutral_row = neutral_df.iloc[[i]]
            pieces.append(neutral_row)

            split_original_excerpt = neutral_row["excerpt"].values[0].split(" ")

            for one_kw in kwords_not_neutral:
                df_one_kw = candidates_per_kw[one_kw]
                if df_one_kw.empty:
                    # Without a candidate the group would be incomplete; fail with an
                    # explicit message instead of an opaque argmax error.
                    raise ValueError(
                        f"entry_id {one_id!r} has no excerpt for keyword {one_kw!r}"
                    )

                split_excerpts = df_one_kw["excerpt"].apply(lambda x: x.split(" ")).tolist()

                jaccard_scores = custom_jaccard_score(split_original_excerpt, split_excerpts)
                best_id = int(np.argmax(jaccard_scores))

                pieces.append(df_one_kw.iloc[[best_id]])

    return pd.concat(pieces)

In [244]:
def get_gender(word):
    """Map a raw keyword to one of 'neutral', 'female' or 'male'.

    'person' takes precedence; any of the female markers comes next; everything else
    falls back to 'male'. Matching is by substring and is case-sensitive.
    """
    if 'person' in word:
        return 'neutral'

    for one_female_kw in ("women", "woman", "girl", "female", "mother"):
        if one_female_kw in word:
            return 'female'

    return 'male'

# Load the gender dataframe, collapse every raw keyword into its gender bucket and
# parse the probability column from its string form into Python objects.
gender_results = pd.read_csv(
    'final_data/gender_df_with_outputs.csv.gz', compression='gzip'
).assign(
    kw=lambda frame: frame['kw'].apply(get_gender),
    probability=lambda frame: frame['probability'].apply(literal_eval),
)

# get proba shifts: tag -> keyword -> list of shifts
probabilities_shifts = defaultdict(lambda: defaultdict(list))

rematched_df = rematch_df(gender_results, n_items=gender_results['kw'].nunique())

In [245]:
# Keywords whose probabilities are compared against the neutral variant.
gender_kwords = ["male", "female"]
# The loop below expects rows in consecutive groups: the neutral row plus one row per keyword.
n_items = len(gender_kwords) + 1

# Start from an empty accumulator so that re-running this cell does not append every
# shift a second time.
probabilities_shifts = defaultdict(lambda: defaultdict(list))

assert rematched_df.shape[0] % n_items == 0, (
    f"expected groups of {n_items} rows, got {rematched_df.shape[0]} rows in total"
)

for i in range(0, rematched_df.shape[0], n_items):
    # Positional slice: one group of `n_items` consecutive rows.
    df_one_id = rematched_df.iloc[i: i + n_items]

    # get neutral proba
    neutral_proba = df_one_id[df_one_id.kw == "neutral"].probability.values[0]

    # get proba for each gender
    for one_gender in gender_kwords:
        probas_one_gender = df_one_id[df_one_id.kw == one_gender].probability.values[0]
        for tag, proba in probas_one_gender.items():
            # Shift = probability with the gendered keyword minus probability with the neutral one.
            probabilities_shifts[tag][one_gender].append(proba - neutral_proba[tag])

In [251]:
# tag -> {"<gender>_mean", "<gender>_median", "<gender>_std"}, each rounded to 2 decimals
shifts_gender = defaultdict(lambda: defaultdict(float))

for tag, probas_per_gender in probabilities_shifts.items():
    for gender, probas_list in probas_per_gender.items():
        shifts_gender[tag].update({
            f"{gender}_mean": round(np.mean(probas_list), 2),
            f"{gender}_median": round(np.median(probas_list), 2),
            f"{gender}_std": round(np.std(probas_list), 2),
        })

# One row per tag, one column per (gender, statistic) pair.
shifts_gender_df = pd.DataFrame.from_dict(shifts_gender, orient='index')
shifts_gender_df.index.name = 'tag'
shifts_gender_df

Unnamed: 0_level_0,male_mean,male_median,male_std,female_mean,female_median,female_std
tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
first_level_tags->pillars_1d->Casualties,-0.18,-0.00,2.61,-0.14,-0.00,3.14
first_level_tags->pillars_1d->Context,0.45,0.03,5.23,1.54,0.15,7.80
first_level_tags->pillars_1d->Covid-19,-0.22,-0.00,2.30,-0.24,-0.00,3.89
first_level_tags->pillars_1d->Displacement,-0.65,-0.01,4.22,-0.49,-0.01,5.59
first_level_tags->pillars_1d->Humanitarian Access,-0.13,-0.01,0.84,-0.17,-0.01,0.80
...,...,...,...,...,...,...
subpillars->Priority Needs->Expressed By Humanitarian Staff,-0.32,-0.00,3.23,-0.44,-0.00,3.30
subpillars->Priority Needs->Expressed By Population,-0.17,-0.00,4.27,-0.07,-0.00,3.98
subpillars->Shock/Event->Hazard & Threats,0.16,0.00,2.01,0.12,0.00,2.03
subpillars->Shock/Event->Type And Characteristics,-0.10,-0.00,1.23,-0.16,-0.00,1.33


In [252]:
# Every keyword except the neutral baseline gets its own CSV of summary statistics.
treated_kwords = list(gender_results['kw'].unique())
treated_kwords.remove('neutral')

for one_kw in treated_kwords:
    stat_columns = [f'{one_kw}_mean', f'{one_kw}_median', f'{one_kw}_std']
    shifts_gender_df[stat_columns].to_csv(
        f'results/second_results_golden_set/gender/shifts_{one_kw}.csv'
    )

In [250]:
# Tags ordered from the largest to the smallest mean shift for the male keywords.
shifts_gender_df.sort_values(by='male_mean', ascending=False)

Unnamed: 0_level_0,male_mean,male_median,male_std,female_mean,female_median,female_std
tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
first_level_tags->pillars_2d->Impact,2.28,0.23,6.93,2.18,0.15,7.47
first_level_tags->sectors->Protection,2.13,0.16,8.67,5.04,0.74,11.50
subpillars->Impact->Impact On People,1.96,0.08,6.71,2.31,0.09,7.75
first_level_tags->sectors->Livelihoods,0.89,0.03,6.14,1.36,0.03,7.02
subpillars->Impact->Driver/Aggravating Factors,0.86,0.05,3.10,0.96,0.05,3.70
...,...,...,...,...,...,...
first_level_tags->sectors->Cross,-0.62,0.01,6.71,-1.50,-0.06,7.57
first_level_tags->pillars_1d->Displacement,-0.65,-0.01,4.22,-0.49,-0.01,5.59
subpillars->Humanitarian Conditions->Physical And Mental Well Being,-0.77,-0.04,5.31,-0.77,-0.03,7.07
subpillars->Humanitarian Conditions->Living Standards,-1.57,-0.18,5.87,-1.67,-0.21,6.79


## Minorities stats

In [258]:
# Load the minorities dataframe, relabel the 'person' keyword as the neutral baseline and
# parse the probability column from its string form into Python objects.
minorities_results = pd.read_csv(
    'final_data/minorities_df_with_outputs.csv.gz', compression='gzip'
).assign(
    kw=lambda frame: frame['kw'].apply(lambda kw: 'neutral' if kw == 'person' else kw),
    probability=lambda frame: frame['probability'].apply(literal_eval),
)

# get proba shifts: tag -> keyword -> list of shifts
probabilities_shifts = defaultdict(lambda: defaultdict(list))

rematched_minorities_df = rematch_df(
    minorities_results, n_items=minorities_results['kw'].nunique()
)

In [259]:
# Compare the shape after rematching with the shape of the loaded frame.
rematched_minorities_df.shape, minorities_results.shape

((294, 5), (294, 5))

In [260]:
# Inspect the rematched rows.
rematched_minorities_df

Unnamed: 0,excerpt,entry_id,kw,type,probability
293,They indicated that the level of tolerance tow...,57176.0,neutral,augmented,{'first_level_tags->pillars_1d->Casualties': 0...
281,They indicated that the level of tolerance tow...,57176.0,transgender,augmented,{'first_level_tags->pillars_1d->Casualties': 0...
20,They indicated that the level of tolerance tow...,57176.0,lgbt,original,{'first_level_tags->pillars_1d->Casualties': 0...
282,They indicated that the level of tolerance tow...,57176.0,sex worker,augmented,{'first_level_tags->pillars_1d->Casualties': 0...
283,They indicated that the level of tolerance tow...,57176.0,homosexual,augmented,{'first_level_tags->pillars_1d->Casualties': 0...
...,...,...,...,...,...
119,"For example, there has been no specific focus ...",65071.0,refugee,augmented,{'first_level_tags->pillars_1d->Casualties': 0...
120,"For example, there has been no specific focus ...",65071.0,migrant,augmented,{'first_level_tags->pillars_1d->Casualties': 0...
121,"For example, there has been no specific focus ...",65071.0,asylum seeker,augmented,{'first_level_tags->pillars_1d->Casualties': 0...
122,"For example, there has been no specific focus ...",65071.0,IDP,augmented,{'first_level_tags->pillars_1d->Casualties': 0...


In [261]:
# Distinct keywords in order of first appearance, without the neutral baseline.
minorities_kwords = minorities_results['kw'].unique().tolist()
minorities_kwords.remove('neutral')
len(minorities_kwords)

13

In [262]:
# The loop below expects rows in consecutive groups of one row per distinct keyword
# (the neutral baseline included).
n_items = len(minorities_results.kw.unique())

# Start from an empty accumulator so that re-running this cell does not append every
# shift a second time.
probabilities_shifts = defaultdict(lambda: defaultdict(list))

assert rematched_minorities_df.shape[0] % n_items == 0, (
    f"expected groups of {n_items} rows, got {rematched_minorities_df.shape[0]} rows in total"
)

for i in range(0, rematched_minorities_df.shape[0], n_items):
    # Positional slice: one group of `n_items` consecutive rows.
    df_one_id = rematched_minorities_df.iloc[i: i + n_items]

    # get neutral proba
    neutral_proba = df_one_id[df_one_id.kw == "neutral"].probability.values[0]

    # get proba for each minority keyword
    for one_kw in minorities_kwords:
        probas_one_kw = df_one_id[df_one_id.kw == one_kw].probability.values[0]
        for tag, proba in probas_one_kw.items():
            # Shift = probability with the minority keyword minus probability with the neutral one.
            probabilities_shifts[tag][one_kw].append(proba - neutral_proba[tag])


In [265]:
# tag -> {"<keyword>_mean", "<keyword>_median", "<keyword>_std"}, each rounded to 2 decimals
shifts_minorities = defaultdict(lambda: defaultdict(float))

for tag, probas_per_minority in probabilities_shifts.items():
    for one_pop, probas_list in probas_per_minority.items():
        shifts_minorities[tag].update({
            f"{one_pop}_mean": round(np.mean(probas_list), 2),
            f"{one_pop}_median": round(np.median(probas_list), 2),
            f"{one_pop}_std": round(np.std(probas_list), 2),
        })

# One row per tag, one column per (keyword, statistic) pair.
shifts_minorities_df = pd.DataFrame.from_dict(shifts_minorities, orient='index')
shifts_minorities_df

Unnamed: 0,sex worker_mean,sex worker_median,sex worker_std,lgbt_mean,lgbt_median,lgbt_std,transgender_mean,transgender_median,transgender_std,homosexual_mean,...,migrant_std,asylum seeker_mean,asylum seeker_median,asylum seeker_std,IDP_mean,IDP_median,IDP_std,internally displaced people_mean,internally displaced people_median,internally displaced people_std
first_level_tags->pillars_1d->Casualties,-0.56,-0.00,2.38,0.76,0.00,3.38,0.20,-0.00,1.00,-0.48,...,0.56,0.54,-0.00,2.44,-0.52,-0.00,2.16,-0.23,-0.00,0.89
first_level_tags->pillars_1d->Context,-5.56,-0.34,13.54,-2.22,-0.34,4.03,-2.13,-0.24,5.62,-0.71,...,5.51,2.31,0.49,3.95,-0.62,0.31,3.55,1.14,0.48,3.73
first_level_tags->pillars_1d->Covid-19,-0.38,-0.00,1.15,-0.46,-0.01,1.24,-0.33,0.00,1.01,-0.43,...,1.25,-0.17,0.00,0.92,-0.26,-0.00,0.90,-0.37,-0.00,1.22
first_level_tags->pillars_1d->Displacement,-1.83,-0.18,6.49,-0.29,0.00,5.88,4.71,0.11,9.45,-0.57,...,8.42,1.85,0.07,9.45,1.09,-0.02,9.33,2.99,0.51,6.35
first_level_tags->pillars_1d->Humanitarian Access,-0.12,-0.02,0.43,-0.08,0.00,0.28,0.00,-0.00,0.05,-0.05,...,0.46,-0.03,0.00,0.45,-0.00,-0.01,0.27,0.05,0.00,0.16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
subpillars->Priority Needs->Expressed By Humanitarian Staff,-0.23,0.00,1.54,0.66,0.01,3.07,0.68,0.02,2.23,0.35,...,1.00,-0.16,-0.00,1.06,-0.29,-0.01,1.33,0.26,-0.01,1.23
subpillars->Priority Needs->Expressed By Population,-0.70,-0.01,2.32,-0.40,0.00,1.34,-0.51,0.00,1.48,-0.56,...,1.54,-0.76,-0.00,2.33,-0.88,-0.01,2.93,-0.54,-0.00,1.25
subpillars->Shock/Event->Hazard & Threats,-0.17,-0.06,0.59,0.08,0.00,0.72,-0.02,-0.07,0.41,0.08,...,0.41,0.07,0.02,0.57,0.09,0.03,0.41,0.28,0.09,0.43
subpillars->Shock/Event->Type And Characteristics,-0.17,-0.04,0.22,-0.05,-0.02,0.22,-0.02,-0.00,0.24,0.04,...,0.28,0.08,0.00,0.36,0.05,0.00,0.43,0.06,0.02,0.19


In [266]:
# One CSV of summary statistics per minority keyword.
for one_kw in minorities_kwords:
    stat_columns = [f'{one_kw}_mean', f'{one_kw}_median', f'{one_kw}_std']
    shifts_minorities_df[stat_columns].to_csv(
        f'results/second_results_golden_set/minorities/shifts_{one_kw}.csv'
    )

In [267]:
# Keywords covered by the per-keyword exports ('neutral' was removed from this list).
minorities_kwords

['sex worker',
 'lgbt',
 'transgender',
 'homosexual',
 'lesbian',
 'bisexual',
 'intersexual',
 'queer',
 'refugee',
 'migrant',
 'asylum seeker',
 'IDP',
 'internally displaced people']