In [1]:
import pickle
from ast import literal_eval
from collections import defaultdict, OrderedDict
import numpy as np
import operator
import os
import pandas as pd
from IPython.core.display import HTML
from typing import List
import boto3
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

In [4]:
# Load the stored model outputs. The context manager closes the file handle
# as soon as unpickling finishes (the original left it open).
# NOTE(review): the file has a .json extension but is read with pickle, and
# pickle.load can execute arbitrary code -- only load trusted files here.
with open("data/hum_results_v1.json", "rb") as objectRep:
    object1 = pickle.load(objectRep)

def flatten(t: List[List]) -> List:
    """Concatenate the items of every inner iterable of ``t`` into one flat list."""
    merged = []
    for inner in t:
        # extend() walks ``inner`` exactly like the nested comprehension did.
        merged.extend(inner)
    return merged

In [45]:
def get_relevant_labels(doc, min_abs_score: float = 0.3):
    """Aggregate per-token explainability scores and keep the strongest ones per label.

    Parameters
    ----------
    doc : iterable of str
        One entry per processed sentence. Each entry is either the literal
        string ``"error"`` (skipped) or a Python-literal string that
        ``literal_eval`` turns into a mapping whose
        ``"interpretability_results"`` value is a sequence; its first element
        maps ``label -> token/score pairs`` (anything ``dict()`` accepts).
    min_abs_score : float, default 0.3
        A token is kept for a label only when the absolute value of its mean
        score is strictly greater than this threshold.

    Returns
    -------
    defaultdict
        ``label -> {"+": {token: mean_score}, "-": {token: mean_score}}``.
        Both inner dicts are ordered by decreasing mean score; ``"+"`` holds
        the strictly positive scores and ``"-"`` the strictly negative ones.
    """
    # label -> token -> every score observed for that (label, token) pair
    scores_per_label = defaultdict(lambda: defaultdict(list))

    for returns_one_sentence in doc:
        if returns_one_sentence == "error":
            continue
        explainability_values = literal_eval(returns_one_sentence)[
            "interpretability_results"
        ][0]
        for one_label, dict_items_probas in explainability_values.items():
            for token, value in dict(dict_items_probas).items():
                scores_per_label[one_label][token].append(value)

    # Build the result in a separate mapping instead of overwriting the
    # accumulator's entries (with a different shape) while iterating over it.
    # The container type is kept identical to what callers received before.
    labels_items = defaultdict(lambda: defaultdict(list))
    for one_label, scores_per_token in scores_per_label.items():
        mean_scores = {
            token: np.mean(values) for token, values in scores_per_token.items()
        }
        kept_items = sorted(
            (
                (token, score)
                for token, score in mean_scores.items()
                if abs(score) > min_abs_score
            ),
            key=operator.itemgetter(1),
            reverse=True,
        )
        labels_items[one_label] = {
            "+": {token: score for token, score in kept_items if score > 0},
            "-": {token: score for token, score in kept_items if score < 0},
        }

    return labels_items


# Aggregate the loaded explainability outputs, then rebuild the mapping as a
# plain dict whose labels are in sorted order.
dict_labels = get_relevant_labels(object1)
dict_labels = {label: dict_labels[label] for label in sorted(dict_labels)}

#dict_labels

In [50]:
# Lower-case substrings looked up inside lower-cased tokens by filter_tokens
# (`one_kw in token.lower()`), so a stem such as "disabl" matches any token
# that contains it.
# Because matching is by substring, "male" also matches tokens containing
# "female".
people_kw = [
    "black",
    "white",
    "male",
    "woman",
    "women",
    "girl",
    "boy",
    "homosexual",
    "gay",
    "lgbt",
    # NOTE(review): possibly meant "ramadan" -- as written it cannot match
    # a token spelled "ramadan"; confirm the intended spelling.
    "ramedan",
    "moham",
    "muham",
    "michel",
    "afro",
    "disabl",
]

# Same substring semantics as people_kw. Some entries overlap as substrings:
# "niger" also matches "nigeria", "mali" also matches "somalia".
countries_kw = [
    "congo",
    "lebanon",
    "syria",
    "peru",
    "niger",
    "venezue",
    "sudan",
    "turkey",
    "brazil",
    "brasil",
    "burundi",
    "nicaragua",
    "africa",
    "bangladesh",
    "argentina",
    "uruguay",
    "iran",
    "birmingham",
    "salvador",
    "somalia",
    "bogot",
    "australia",
    "myanmar",
    "guatemala",
    "pakistan",
    "china",
    "ethiopia",
    "honduras",
    "swed",
    "damasc",
    "serbia",
    "ecuador",
    "nigeria",
    "mali",
    # NOTE(review): "rohinga" is not a substring of "rohingya"; confirm
    # whether "rohing" / "rohingya" was intended.
    "rohinga",
    "bosni",
    "ukraine",
    "russia"
]

# De-duplicate both lists; going through set() does not preserve the order
# in which the keywords were written above.
countries_kw = list(set(countries_kw)) 
people_kw = list(set(people_kw))


def filter_tokens(dict_labels, kwords, return_token_values: bool):
    """Keep, per label, only the tokens that contain one of ``kwords``.

    ``dict_labels`` maps ``label -> {sign: {token: score}}``. Labels whose name
    contains ``'secondary_tags'`` or ``"Cross"`` are ignored. A token is kept
    when any keyword is a substring of its lower-cased form.

    With ``return_token_values`` true each sign maps to ``{token: score rounded
    to 2 decimals}``; otherwise it maps to the list of kept tokens. Signs with
    no kept token, and labels with no kept sign, are left out.

    Returns a ``defaultdict(dict)`` keyed by label.
    """

    def _mentions_keyword(token):
        lowered = token.lower()
        for one_kw in kwords:
            if one_kw in lowered:
                return True
        return False

    kept_by_label = defaultdict(dict)

    for label, items_dict in dict_labels.items():
        if 'secondary_tags' in label or "Cross" in label:
            continue

        kept_by_sign = {}
        for pos_or_neg, keywords in items_dict.items():
            hits = [
                (token, proba)
                for token, proba in keywords.items()
                if _mentions_keyword(token)
            ]
            if not hits:
                continue
            if return_token_values:
                kept_by_sign[pos_or_neg] = {
                    token: round(proba, 2) for token, proba in hits
                }
            else:
                kept_by_sign[pos_or_neg] = [token for token, _ in hits]

        if kept_by_sign:
            kept_by_label[label] = kept_by_sign

    return kept_by_label

# People-related tokens (with rounded scores) and the labels they appear under.
people_filtered = filter_tokens(dict_labels, people_kw, return_token_values=True)
people_tags = [*people_filtered]

# Same selection for the country/location keywords.
countries_filtered = filter_tokens(dict_labels, countries_kw, return_token_values=True)
countries_tags = [*countries_filtered]


In [51]:
people_filtered

defaultdict(dict,
            {'first_level_tags->pillars_1d->Context': {'-': {'▁homosexual': -0.33}},
             'first_level_tags->pillars_1d->Displacement': {'+': {'▁girls': 0.3}},
             'first_level_tags->pillars_2d->At Risk': {'-': {'▁Boys': -0.47,
               '▁homosexual': -0.53}},
             'first_level_tags->sectors->Health': {'-': {'▁Male': -0.39}},
             'first_level_tags->sectors->Livelihoods': {'+': {'▁Female': 0.31}},
             'first_level_tags->sectors->Protection': {'+': {'▁girl': 0.46,
               '▁LGBT': 0.39,
               '▁homosexual': 0.33,
               '▁gay': 0.31}},
             'subpillars->At Risk->Risk And Vulnerabilities': {'-': {'▁Boys': -0.38,
               '▁homosexual': -0.53}},
             'subpillars->Capacities & Response->International Response': {'+': {'▁boys': 0.3,
               '▁male': 0.3}},
             'subpillars->Capacities & Response->National Response': {'+': {'▁male': 0.43},
              '-': {'▁Miche

In [9]:
countries_filtered

defaultdict(dict,
            {'first_level_tags->pillars_1d->Casualties': {'+': ['▁Congo',
               '▁Lebanon',
               '▁Syria'],
              '-': ['▁Nicaragua']},
             'first_level_tags->pillars_1d->Shock/Event': {'+': ['▁Uruguay'],
              '-': ['▁Somalia']},
             'first_level_tags->pillars_2d->Capacities & Response': {'+': ['▁Swedish'],
              '-': ['▁Burundi']},
             'subpillars->Capacities & Response->International Response': {'+': ['▁Swedish'],
              '-': ['▁Burundi']},
             'subpillars->Humanitarian Conditions->Physical And Mental Well Being': {'+': ['▁Lebanon',
               '▁Bogotá'],
              '-': ['▁Congo', '▁Salvador']},
             'subpillars->Impact->Driver/Aggravating Factors': {'+': ['▁Salvador',
               '▁Bogotá'],
              '-': ['▁Australia']},
             'subpillars->Shock/Event->Underlying/Aggravating Factors': {'+': ['▁Lebanon',
               '▁Sudan',
               '▁Uru

## Work on the test DataFrame

In [2]:
# Folder holding the v0.7.1 framework data, relative to this notebook.
DATA_PATH = os.path.join("..", "..", "..", "..", "data", "frameworks_data", "data_v0.7.1")

test_csv_path = os.path.join(DATA_PATH, "new_columns_test_v0.7.1.csv.gz")
test_df = pd.read_csv(test_csv_path, compression="gzip").drop_duplicates()
# 'target' is stored as text; parse it back with literal_eval.
test_df['target'] = test_df['target'].apply(literal_eval)

# Keep English rows that have at least one target, then drop the language column.
is_english = test_df.lang == 'en'
has_target = test_df.target.apply(lambda x: len(x) > 0)
test_df = test_df[is_english & has_target].drop(columns=['lang'])
test_df.shape

(16648, 5)

### 1. Countries

In [12]:
# label -> {"+": [...], "-": [...]} country names used by the substitution
# experiment in the following cells. Names are matched against the raw excerpt
# text with a case-sensitive `in` test, so they are written as they appear in
# excerpts (e.g. "El Salvador"), not as tokenizer tokens.
# NOTE(review): presumably a hand-picked subset of `countries_filtered`;
# confirm how the labels and countries were chosen.
countries_per_label = {
    "first_level_tags->pillars_1d->Shock/Event": {"+": ["Uruguay"], "-": ["Somalia"]},

    "subpillars->Impact->Driver/Aggravating Factors": {
        "+": ["El Salvador", "Bogotá"],
        "-": ["Australia"],
    },
    "subpillars->Shock/Event->Underlying/Aggravating Factors": {
        "+": ["Lebanon", "Sudan"],
        "-": ["Syria"],
    },
}


In [13]:
# For every tag, collect the test excerpts that carry the tag and mention at
# least one of the countries listed for it. Frames are gathered in a list and
# concatenated once: DataFrame.append was removed in pandas 2.0.
frames_per_tag = []

for tag, countries_dict in countries_per_label.items():
    countries = flatten(list(countries_dict.values()))
    carries_tag = test_df.target.apply(lambda x: tag in x)
    mentions_country = test_df.excerpt.apply(
        lambda x: any(country in x for country in countries)
    )
    # .copy(): the assignments below must write to an independent frame, not
    # to a slice of test_df.
    df_one_tag = test_df[carries_tag & mentions_country].copy()
    # Neutralise the demonym so that "Australia" is not matched inside it.
    df_one_tag['excerpt'] = df_one_tag['excerpt'].apply(lambda x: x.replace('Australians', 'people'))
    df_one_tag['relevant_kw'] = df_one_tag['excerpt'].apply(
        lambda x: [word for word in x.split() if any(country in word for country in countries)]
    )
    df_one_tag['tag'] = tag
    # Stored as text and parsed back after concatenation, so that every row
    # ends up with its own dict.
    df_one_tag['countries'] = str(countries_dict)

    frames_per_tag.append(df_one_tag)

df_countries_tot = pd.concat(frames_per_tag).drop(columns=['analysis_framework_id', 'target'])
# NOTE(review): entries excluded by id; the reason is not recorded in the
# notebook -- presumably checked by hand, confirm.
df_countries_tot = df_countries_tot[
    ~df_countries_tot.entry_id.isin(
        [62108, 165998, 161773, 104642, 202004, 162539, 11473, 167216, 179464, 245854]
    )
].copy()
df_countries_tot['countries'] = df_countries_tot['countries'].apply(literal_eval)
df_countries_tot.shape

(29, 6)

In [53]:
# Build the counterfactual excerpts: every "+" country found in an excerpt is
# swapped for each "-" country of the same tag, then the other way round.
# Records are collected in a list and turned into a frame once, instead of
# appending one-row frames (DataFrame.append was removed in pandas 2.0).
augmented_columns = ['excerpt', 'entry_id', 'tag', 'excerpt_type', 'country']
augmented_records = []

for i, row in df_countries_tot.iterrows():
    countries_pos_neg_dict = row['countries']
    excerpt = row['excerpt']
    entry_id = row['entry_id']
    target = row['tag']
    positive_bias_countries = countries_pos_neg_dict['+']
    negative_bias_countries = countries_pos_neg_dict['-']

    # (countries to look for, countries to substitute in): "+" -> "-" first,
    # then "-" -> "+", the same order as the two original loops.
    swap_directions = (
        (positive_bias_countries, negative_bias_countries),
        (negative_bias_countries, positive_bias_countries),
    )
    for present_countries, substitute_countries in swap_directions:
        for present_kw in present_countries:
            if present_kw not in excerpt:
                continue
            for substitute_kw in substitute_countries:
                augmented_records.append(
                    {
                        'excerpt': excerpt.replace(present_kw, substitute_kw),
                        'entry_id': entry_id,
                        'tag': target,
                        'excerpt_type': "augmented",
                        'country': substitute_kw,
                    }
                )

changed_countries_df = pd.DataFrame(augmented_records, columns=augmented_columns).drop_duplicates()

In [16]:
changed_countries_df.shape

(33, 5)

In [52]:
def _first_listed_country(row):
    """Return the first country of ``row['countries']`` found in ``row['excerpt']``.

    Countries are tried in the order they are listed in the row's dict, so the
    result is deterministic (the previous ``list(set(...))[0]`` picked an
    arbitrary one when several countries matched).

    Raises ValueError when no listed country appears in the excerpt.
    """
    for country in flatten(list(row['countries'].values())):
        if country in row['excerpt']:
            return country
    raise ValueError(
        f"entry {row['entry_id']}: none of the listed countries appears in the excerpt"
    )


df_countries_tot['country'] = df_countries_tot.apply(_first_listed_country, axis=1)

# .assign returns a new frame, so 'excerpt_type' is not written onto a slice
# of df_countries_tot.
clean_original_df = df_countries_tot[['excerpt', 'entry_id', 'tag', 'country']].assign(
    excerpt_type='original'
)

# Originals and their counterfactuals side by side, grouped by tag and entry.
final_df = (
    pd.concat([clean_original_df, changed_countries_df])
    .sort_values(by=['tag', 'entry_id'])
    .reset_index(drop=True)
)
#display(HTML(final_df.to_html()))

## Generate predictions

In [88]:
def generate_predictions(df: pd.DataFrame):
    """Query the 'main-model-cpu' SageMaker endpoint once per row of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain an ``'excerpt'`` column. Rows are processed by position,
        whatever the index looks like. ``df`` itself is not modified.

    Returns
    -------
    list of str
        The decoded response body of each call, in row order. Bodies are
        returned as received; callers parse them afterwards.

    Any error raised by the endpoint call or by decoding propagates to the
    caller.
    """
    client = boto3.session.Session().client("sagemaker-runtime", region_name='us-east-1')

    # Options sent alongside every excerpt; identical for all rows.
    # NOTE(review): 'default_analyis' / 'analyis_framework_id' are kept exactly
    # as written -- presumably the endpoint expects these spellings; confirm
    # before "correcting" them.
    request_options = {
        'return_type': "default_analyis",
        'analyis_framework_id': 'all',
        # kw for interpretability
        'interpretability': False,
        # minimum ratio between proba and threshold to perform interpretability
        'ratio_interpreted_labels': 0.5,
        # predictions
        'return_prediction_labels': True,
        # kw for embeddings
        'output_backbone_embeddings': False,
        'pooling_type': "['cls', 'mean_pooling']",
        'finetuned_task': "['first_level_tags', 'secondary_tags', 'subpillars']",
        'embeddings_return_type': 'array',
    }

    all_outputs = []

    for i in tqdm(range(df.shape[0])):
        # iloc makes the positional slicing explicit; assign() builds a new
        # one-row frame instead of writing columns onto a slice of ``df``.
        inputs = df.iloc[i:i + 1][['excerpt']].assign(**request_options)

        backbone_inputs_json = inputs.to_json(orient="split")

        response = client.invoke_endpoint(
            EndpointName='main-model-cpu',
            Body=backbone_inputs_json,
            ContentType="application/json; format=pandas-split",
        )
        output = response["Body"].read().decode("ascii")

        all_outputs.append(output)

    return all_outputs

In [55]:
all_outputs = generate_predictions(final_df)

# For each row, read the value predicted for that row's own tag and scale it
# by the tag's threshold, expressed as a percentage.
output_predictions = []
for raw_output, tag_tmp in zip(all_outputs, final_df['tag']):
    eval_output = literal_eval(raw_output)
    scaled = (
        100
        * eval_output['raw_predictions'][0][tag_tmp]
        * eval_output['thresholds'][tag_tmp]
    )
    output_predictions.append(round(scaled, 3))

final_df['output_proba(%)'] = output_predictions
final_df

Unnamed: 0,excerpt,entry_id,tag,country,excerpt_type,output_proba(%)
0,“If an international force were required to re...,21973.0,first_level_tags->pillars_1d->Shock/Event,Uruguay,original,0.348
1,“If an international force were required to re...,21973.0,first_level_tags->pillars_1d->Shock/Event,Somalia,augmented,1.512
2,Somalia’s marginalised communities and interna...,203446.0,first_level_tags->pillars_1d->Shock/Event,Somalia,original,82.820
3,Uruguay’s marginalised communities and interna...,203446.0,first_level_tags->pillars_1d->Shock/Event,Uruguay,augmented,84.336
4,Acute food security crisis in Somalia is protr...,213369.0,first_level_tags->pillars_1d->Shock/Event,Somalia,original,1.031
...,...,...,...,...,...,...
57,"Humanitarian access, as you all know, has been...",220509.0,subpillars->Shock/Event->Underlying/Aggravatin...,Sudan,original,0.111
58,"Humanitarian access, as you all know, has been...",220509.0,subpillars->Shock/Event->Underlying/Aggravatin...,Syria,augmented,0.004
59,"[23 March 2021, Overall Syria] Among the hotsp...",291078.0,subpillars->Shock/Event->Underlying/Aggravatin...,Syria,original,0.125
60,"[23 March 2021, Overall Lebanon] Among the hot...",291078.0,subpillars->Shock/Event->Underlying/Aggravatin...,Lebanon,augmented,0.956


In [56]:
def probas_are_different(ratio):
    """Tell whether ``ratio`` is strictly above 2 or strictly below 0.5."""
    if ratio > 2:
        return True
    return ratio < 0.5

# Keep the entries for which at least one counterfactual moves the score
# enough for probas_are_different to flag it.
# Frames are collected in a list and concatenated once (DataFrame.append was
# removed in pandas 2.0).
flagged_frames = []

# Group on (tag, entry_id) rather than entry_id alone: an entry selected under
# several tags has one original score per tag, and each counterfactual must
# be compared with the original of the same tag. sort=False keeps the order in
# which the groups first appear in final_df.
for _, df_one_id in final_df.groupby(['tag', 'entry_id'], sort=False):
    df_one_id = df_one_id.copy()  # independent frame: a new column is added below
    original_proba = df_one_id[df_one_id.excerpt_type == 'original']['output_proba(%)'].values[0]
    df_one_id['ratio_output_to_original'] = df_one_id.apply(
        lambda x: round(x['output_proba(%)'] / original_proba, 2)
        if x['excerpt_type'] == 'augmented'
        else 1,
        axis=1,
    )

    if any(probas_are_different(ratio) for ratio in df_one_id['ratio_output_to_original']):
        flagged_frames.append(df_one_id)

shared_columns = ['entry_id', 'tag', 'excerpt_type', 'country', 'excerpt', 'output_proba(%)', 'ratio_output_to_original']
if flagged_frames:
    shared_df = pd.concat(flagged_frames)[shared_columns].copy()
else:
    # Nothing flagged: keep an empty frame with the expected columns.
    shared_df = pd.DataFrame(columns=shared_columns)

# Drop the leading component of the tag path (text before the first '->').
shared_df['tag'] = shared_df['tag'].apply(lambda x: '->'.join(x.split('->')[1:]))
shared_df.head(10)

Unnamed: 0,entry_id,tag,excerpt_type,country,excerpt,output_proba(%),ratio_output_to_original
0,21973.0,pillars_1d->Shock/Event,original,Uruguay,“If an international force were required to re...,0.348,1.0
1,21973.0,pillars_1d->Shock/Event,augmented,Somalia,“If an international force were required to re...,1.512,4.34
17,355398.0,Impact->Driver/Aggravating Factors,original,Bogotá,• On the occasion of the national arrest that ...,16.159,1.0
18,355398.0,Impact->Driver/Aggravating Factors,augmented,Australia,• On the occasion of the national arrest that ...,5.797,0.36
21,160363.0,Shock/Event->Underlying/Aggravating Factors,original,Sudan,"According to the survey, medicine availability...",1.036,1.0
22,160363.0,Shock/Event->Underlying/Aggravating Factors,augmented,Syria,"According to the survey, medicine availability...",0.089,0.09
23,160365.0,Shock/Event->Underlying/Aggravating Factors,original,Sudan,Kassala State has the highest levels of food i...,8.151,1.0
24,160365.0,Shock/Event->Underlying/Aggravating Factors,augmented,Syria,Kassala State has the highest levels of food i...,0.295,0.04
25,161520.0,Shock/Event->Underlying/Aggravating Factors,original,Syria,COVID-19 and economic deterioration compound e...,5.883,1.0
26,161520.0,Shock/Event->Underlying/Aggravating Factors,augmented,Lebanon,COVID-19 and economic deterioration compound e...,22.013,3.74


In [44]:
# Export the flagged country substitutions to the working directory.
# NOTE(review): index=None is falsy, so presumably the index is not written;
# index=False is the documented way to request that -- confirm and switch.
shared_df.to_csv('hum_locations_bias.csv', index=None)

### 2. Minorities

In [57]:
people_filtered

defaultdict(dict,
            {'first_level_tags->pillars_1d->Context': {'-': {'▁homosexual': -0.33}},
             'first_level_tags->pillars_1d->Displacement': {'+': {'▁girls': 0.3}},
             'first_level_tags->pillars_2d->At Risk': {'-': {'▁Boys': -0.47,
               '▁homosexual': -0.53}},
             'first_level_tags->sectors->Health': {'-': {'▁Male': -0.39}},
             'first_level_tags->sectors->Livelihoods': {'+': {'▁Female': 0.31}},
             'first_level_tags->sectors->Protection': {'+': {'▁girl': 0.46,
               '▁LGBT': 0.39,
               '▁homosexual': 0.33,
               '▁gay': 0.31}},
             'subpillars->At Risk->Risk And Vulnerabilities': {'-': {'▁Boys': -0.38,
               '▁homosexual': -0.53}},
             'subpillars->Capacities & Response->International Response': {'+': {'▁boys': 0.3,
               '▁male': 0.3}},
             'subpillars->Capacities & Response->National Response': {'+': {'▁male': 0.43},
              '-': {'▁Miche

In [92]:
lgbt_kw = ["homosex", "lgbt"]
not_treated_lgbt_kw = ["lesbian", "bisexual", "transgender", "intersex", "queer"]


def _is_treated_lgbt_token(token):
    """True when ``token`` contains an ``lgbt_kw`` stem and no ``not_treated_lgbt_kw``.

    The earlier nested ``any`` ran over every (not-treated, lgbt) pair, so a
    token was rejected only if it contained *all* the not-treated keywords at
    once, which made the exclusion a no-op.
    """
    lowered = token.lower()
    has_treated_kw = any(one_lgbt_kw in lowered for one_lgbt_kw in lgbt_kw)
    has_not_treated_kw = any(
        one_not_treated_kw in lowered for one_not_treated_kw in not_treated_lgbt_kw
    )
    return has_treated_kw and not has_not_treated_kw


# label -> every token kept for it in people_filtered ("+" and "-" merged)
tokens_per_tag = {
    tag: flatten(list(tokens_by_sign.values()))
    for tag, tokens_by_sign in people_filtered.items()
}
lgbt_tags = [
    tag
    for tag, kwords in tokens_per_tag.items()
    if any(_is_treated_lgbt_token(one_ppl_kw) for one_ppl_kw in kwords)
]

lgbt_tags

['first_level_tags->pillars_1d->Context',
 'first_level_tags->pillars_2d->At Risk',
 'first_level_tags->sectors->Protection',
 'subpillars->At Risk->Risk And Vulnerabilities',
 'subpillars->Context->Security & Stability',
 'subpillars->Humanitarian Conditions->Physical And Mental Well Being']

In [82]:
# Keep only the excerpts that mention NONE of the not-treated keywords.
# The earlier test, any(kw not in excerpt), was true as soon as a single
# keyword was missing, i.e. for practically every excerpt, so nothing was
# filtered out.
# NOTE(review): with the filter now effective the row counts may differ from
# the outputs saved in the notebook; re-run the cells below.
mentions_no_untreated_kw = test_df.excerpt.apply(
    lambda x: not any(
        one_not_treated_kw in x.lower() for one_not_treated_kw in not_treated_lgbt_kw
    )
)
# .drop returns a new frame, so the column assignments below do not write
# onto a slice of test_df.
tmp_df = test_df[mentions_no_untreated_kw].drop(
    columns=["analysis_framework_id", "project_id", "target"]
)
tmp_df['excerpt'] = tmp_df['excerpt'].apply(lambda x: x.replace(' +', '+'))
tmp_df['len_excerpt'] = tmp_df['excerpt'].apply(lambda x: len(x.split(' ')))
# Only excerpts shorter than 120 space-separated words are kept.
tmp_df = tmp_df[tmp_df.len_excerpt < 120].copy()
tmp_df['excerpt_type'] = 'original'

# Excerpts mentioning exactly one of the two treated stems.
homosex_df = tmp_df[
    tmp_df.excerpt.apply(lambda x: 'homosex' in x.lower() and 'lgbt' not in x.lower())
].copy()
homosex_df['token'] = 'homosexual'
# lgbt_df keeps 'len_excerpt': the augmentation cell copies this frame and the
# column is dropped later, after concatenation.
lgbt_df = tmp_df[
    tmp_df.excerpt.apply(lambda x: 'lgbt' in x.lower() and 'homosex' not in x.lower())
].copy()
lgbt_df['token'] = 'lgbt'

minorities_df = pd.concat([homosex_df, lgbt_df]).drop(columns=['len_excerpt'])
print(minorities_df.shape)
minorities_df.head()

(38, 5)


Unnamed: 0,excerpt,entry_id,len_excerpt,excerpt_type,token
5324,In Puerto Lleras the FGD reveals that illegal ...,165886.0,52,original,homosexual
3216,Travel restrictions have also meant that women...,456715.0,61,original,lgbt
4575,■ The Center for the Social Rights of Migrants...,148188.0,35,original,lgbt
5625,"Finally, the pandemic has disproportionally, a...",165180.0,66,original,lgbt
7687,"Additionally, there was a document released by...",294618.0,77,original,lgbt


In [86]:
def process_excerpt(row):
    """Rebuild ``row['excerpt']`` without the words that contain ``row['token']``.

    The excerpt is split on single spaces. A word whose lower-cased form
    contains the token is dropped; if that word also contains a ``'.'``, a
    lone ``'.'`` takes its place. All other words are kept unchanged and the
    result is joined back with single spaces.
    """
    needle = row['token']
    rebuilt = []
    for piece in row['excerpt'].split(' '):
        if needle not in piece.lower():
            rebuilt.append(piece)
        elif '.' in piece:
            rebuilt.append('.')
    return ' '.join(rebuilt)

# Counterfactual copies of the 'lgbt' excerpts with the keyword stripped out.
# assign() works on a copy of lgbt_df and overwrites the three columns there.
augmented_minorites_df = lgbt_df.assign(
    excerpt=lgbt_df.apply(process_excerpt, axis=1),
    excerpt_type='augmented',
    token='-',
)
augmented_minorites_df

Unnamed: 0,excerpt,entry_id,len_excerpt,excerpt_type,token
3216,Travel restrictions have also meant that women...,456715.0,61,augmented,-
4575,■ The Center for the Social Rights of Migrants...,148188.0,35,augmented,-
5625,"Finally, the pandemic has disproportionally, a...",165180.0,66,augmented,-
7687,"Additionally, there was a document released by...",294618.0,77,augmented,-
8745,Advances in Point 2 and in the meaningful part...,294617.0,72,augmented,-
10405,"3. Third, social cohesion between refugees and...",343844.0,58,augmented,-
11997,"In Cusco, partners held a session with local a...",65954.0,81,augmented,-
12753,Not much attention has been paid to the specif...,65071.0,95,augmented,-
12990,"The availability and quality of, as well as ac...",93391.0,52,augmented,-
17209,"For the people of the collective, it should be...",186844.0,44,augmented,-


In [95]:
# Originals followed by their keyword-stripped copies; the helper word-count
# column is dropped before sending the excerpts to the endpoint.
final_minorities_df = pd.concat([minorities_df, augmented_minorites_df]).drop(
    columns=['len_excerpt']
)
outputs = generate_predictions(final_minorities_df)

100%|██████████| 75/75 [01:45<00:00,  1.40s/it]


In [99]:
final_minorities_df.shape, len(outputs)

((75, 4), 75)

In [104]:
def probas_are_different(ratio):
    """Tell whether ``ratio`` is strictly above 2 or strictly below 0.5."""
    if ratio > 2:
        return True
    return ratio < 0.5


# `outputs` is a global that later cells reassign; make sure it still belongs
# to final_minorities_df before pairing responses with rows by position.
if len(outputs) != final_minorities_df.shape[0]:
    raise ValueError(
        "`outputs` does not match final_minorities_df: re-run generate_predictions"
    )

# Parse every endpoint response once instead of once per tag.
parsed_outputs = [literal_eval(raw_output) for raw_output in outputs]

# Frames are collected in a list and concatenated once (DataFrame.append was
# removed in pandas 2.0).
flagged_frames = []
for one_tag in [
    "first_level_tags->sectors->Protection",
    "subpillars->At Risk->Risk And Vulnerabilities",
    "subpillars->Context->Security & Stability",
    "subpillars->Humanitarian Conditions->Physical And Mental Well Being",
]:
    output_predictions = [
        round(
            100 * eval_output["raw_predictions"][0][one_tag] * eval_output["thresholds"][one_tag],
            3,
        )
        for eval_output in parsed_outputs
    ]
    results_one_tag = final_minorities_df.copy()
    results_one_tag["output_proba(%)"] = output_predictions
    results_one_tag["tag"] = one_tag

    # One group per entry, in order of first appearance: the original excerpt
    # and its augmented copy, when there is one.
    for _, df_one_id in results_one_tag.groupby("entry_id", sort=False):
        df_one_id = df_one_id.copy()  # independent frame: a new column is added below
        original_proba = df_one_id[df_one_id.excerpt_type == "original"][
            "output_proba(%)"
        ].values[0]
        df_one_id["ratio_output_to_original"] = df_one_id.apply(
            lambda x: round(x["output_proba(%)"] / original_proba, 2)
            if x["excerpt_type"] == "augmented"
            else 1,
            axis=1,
        )

        if any(
            probas_are_different(ratio)
            for ratio in df_one_id["ratio_output_to_original"]
        ):
            flagged_frames.append(df_one_id)

minorities_columns = [
    "entry_id",
    "tag",
    "excerpt_type",
    "token",
    "excerpt",
    "output_proba(%)",
    "ratio_output_to_original",
]
if flagged_frames:
    results_minorities = pd.concat(flagged_frames)
else:
    # Nothing flagged: keep an empty frame with the expected columns.
    results_minorities = pd.DataFrame(columns=minorities_columns)

# Drop the leading component of the tag path (text before the first '->').
results_minorities['tag'] = results_minorities['tag'].apply(lambda x: '->'.join(x.split('->')[1:]))

results_minorities = results_minorities[minorities_columns].sort_values(
    by=['tag', "entry_id", "excerpt_type"]
)


In [105]:
results_minorities

Unnamed: 0,entry_id,tag,excerpt_type,token,excerpt,output_proba(%),ratio_output_to_original
20539,61168.0,first_level_tags->sectors->Protection,augmented,-,"In particular, regarding the workplace, great ...",7.906,0.45
20539,61168.0,first_level_tags->sectors->Protection,original,lgbt,"In particular, regarding the workplace, great ...",17.399,1.00
17697,174356.0,first_level_tags->sectors->Protection,augmented,-,The protection group estimates that 36 percent...,4.149,0.35
17697,174356.0,first_level_tags->sectors->Protection,original,lgbt,The protection group estimates that 36 percent...,11.916,1.00
17209,186844.0,first_level_tags->sectors->Protection,augmented,-,"For the people of the collective, it should be...",0.974,0.15
...,...,...,...,...,...,...,...
19890,303466.0,subpillars->Humanitarian Conditions->Physical ...,original,lgbt,According to the interviews carried out and by...,0.241,1.00
19614,358954.0,subpillars->Humanitarian Conditions->Physical ...,augmented,-,Gender gaps in labor participation and unpaid ...,1.716,0.45
19614,358954.0,subpillars->Humanitarian Conditions->Physical ...,original,lgbt,Gender gaps in labor participation and unpaid ...,3.820,1.00
20440,388713.0,subpillars->Humanitarian Conditions->Physical ...,augmented,-,"In effect, the deepening of the work precariou...",0.274,0.38


In [106]:
# Export the flagged minority-keyword removals to the working directory.
# NOTE(review): index=None is falsy, so presumably the index is not written;
# index=False is the documented way to request that -- confirm and switch.
results_minorities.to_csv('hum_minorities_bias.csv', index=None)

### 3. Gender

In [127]:
people_filtered

defaultdict(dict,
            {'first_level_tags->pillars_1d->Context': {'-': {'▁homosexual': -0.33}},
             'first_level_tags->pillars_1d->Displacement': {'+': {'▁girls': 0.3}},
             'first_level_tags->pillars_2d->At Risk': {'-': {'▁Boys': -0.47,
               '▁homosexual': -0.53}},
             'first_level_tags->sectors->Health': {'-': {'▁Male': -0.39}},
             'first_level_tags->sectors->Livelihoods': {'+': {'▁Female': 0.31}},
             'first_level_tags->sectors->Protection': {'+': {'▁girl': 0.46,
               '▁LGBT': 0.39,
               '▁homosexual': 0.33,
               '▁gay': 0.31}},
             'subpillars->At Risk->Risk And Vulnerabilities': {'-': {'▁Boys': -0.38,
               '▁homosexual': -0.53}},
             'subpillars->Capacities & Response->International Response': {'+': {'▁boys': 0.3,
               '▁male': 0.3}},
             'subpillars->Capacities & Response->National Response': {'+': {'▁male': 0.43},
              '-': {'▁Miche

In [125]:
gender_kw = ["female", "woman", "women", "girl", "male", "men", "man", "boy"]
female_kw = ["female", "woman", "women", "girl"]
male_kw = ["male", "men", "man", "boy"]

# label -> every token kept for it in people_filtered ("+" and "-" merged)
tokens_by_tag = {
    tag: flatten(list(tokens_by_sign.values()))
    for tag, tokens_by_sign in people_filtered.items()
}

# A label is selected as soon as one of its tokens contains a female keyword.
# Only female_kw is consulted here; gender_kw and male_kw are defined above
# but not used in this cell.
gender_tags = []
for tag, kwords in tokens_by_tag.items():
    lowered_tokens = [one_ppl_kw.lower() for one_ppl_kw in kwords]
    if any(
        one_gender_kw in lowered
        for lowered in lowered_tokens
        for one_gender_kw in female_kw
    ):
        gender_tags.append(tag)
gender_tags

['first_level_tags->pillars_1d->Displacement',
 'first_level_tags->sectors->Livelihoods',
 'first_level_tags->sectors->Protection',
 'subpillars->Context->Demography',
 'subpillars->Context->Security & Stability',
 'subpillars->Humanitarian Conditions->Physical And Mental Well Being',
 'subpillars->Impact->Driver/Aggravating Factors',
 'subpillars->Priority Interventions->Expressed By Population']

In [122]:
# Female keywords found (as substrings) in each lower-cased excerpt.
matched_female_kw = test_df['excerpt'].apply(
    lambda x: [one_gender_kw for one_gender_kw in female_kw if one_gender_kw in x.lower()]
)
# Keep the excerpts that match exactly one female keyword and record it.
has_single_kw = matched_female_kw.apply(len) == 1

original_gender_df = (
    test_df[has_single_kw]
    .assign(
        gender=matched_female_kw[has_single_kw].apply(lambda x: x[0]),
        excerpt_type='original',
    )
    .drop(columns=['analysis_framework_id', 'project_id'])
)
print(original_gender_df.shape)
original_gender_df.head()

(1351, 5)


Unnamed: 0,excerpt,entry_id,target,gender,excerpt_type
0,"During the reporting week, IOM provided medica...",16851.0,"[first_level_tags->sectors->Health, first_leve...",women,original
1,Primary and secondary net enrollment rates are...,489433.0,"[first_level_tags->sectors->Education, first_l...",girl,original
149,"Under international law, government authoritie...",488046.0,"[first_level_tags->pillars_1d->Context, subpil...",women,original
155,"Kasapa Central Prison, built in 1958, has a ca...",488045.0,"[first_level_tags->sectors->Protection, first_...",women,original
244,"On August 13, the UN humanitarian coordinator ...",490016.0,"[first_level_tags->pillars_1d->Displacement, s...",women,original


In [124]:
gender_mapping = {'female': 'male', 'woman': 'man', 'women': 'men', 'girl': 'boy'}

# One counterfactual frame per female keyword, concatenated once at the end
# (DataFrame.append was removed in pandas 2.0).
augmented_frames = []
for kw_female, kw_male in gender_mapping.items():
    # .copy(): the assignments below must not write onto a slice of
    # original_gender_df.
    df_one_kw = original_gender_df[original_gender_df.gender == kw_female].copy()
    # Swap the lower-case and the Capitalized spelling; other casings
    # (e.g. all caps) are left untouched.
    df_one_kw['excerpt'] = df_one_kw['excerpt'].apply(
        lambda x: x.replace(kw_female, kw_male).replace(
            kw_female.capitalize(), kw_male.capitalize()
        )
    )
    df_one_kw['gender'] = kw_male
    df_one_kw['excerpt_type'] = 'augmented'
    augmented_frames.append(df_one_kw)

augmented_gender_df = pd.concat(augmented_frames)

augmented_gender_df

Unnamed: 0,excerpt,entry_id,target,gender,excerpt_type
330,"[26th- 29th August 2021, Cox's Bazar] Between ...",490044.0,"[first_level_tags->pillars_1d->Covid-19, first...",male,augmented
450,"[October 24, NES, # of cases] The Kurdish prox...",183647.0,"[first_level_tags->pillars_1d->Covid-19, subpi...",male,augmented
577,"[4th October 2020, Bangladesh] Refugee:Accordi...",206386.0,"[first_level_tags->sectors->Food Security, fir...",male,augmented
632,"[23 – 29 August 2021, Nigeria] Cholera: Of the...",489120.0,"[first_level_tags->sectors->Health, first_leve...",male,augmented
843,"[August 2021, Cox's Bazar] Rohingya Community:...",486355.0,"[first_level_tags->sectors->Education, first_l...",male,augmented
...,...,...,...,...,...
23131,This caused a displacement of 3000 people who ...,213972.0,"[first_level_tags->sectors->Protection, first_...",boy,augmented
23293,"Food prices have flamed in the city of Buea, w...",294235.0,[first_level_tags->sectors->Food Security],boy,augmented
23298,The report of 22 April produced by the team de...,209276.0,"[first_level_tags->sectors->Shelter, first_lev...",boy,augmented
23392,"Of 7,215 children in school on the two axes of...",224225.0,[first_level_tags->sectors->Education],boy,augmented


In [126]:
# Originals first, then their gender-swapped copies; both are scored together.
gender_frames = [original_gender_df, augmented_gender_df]
final_gender_df = pd.concat(gender_frames)
outputs = generate_predictions(final_gender_df)

100%|██████████| 2702/2702 [20:43<00:00,  2.17it/s]  


In [135]:
def probas_are_different(ratio):
    """Tell whether ``ratio`` is below 100 and either above 10 or below 0.1.

    All comparisons are strict; ratios of 100 or more are never flagged.
    """
    if ratio >= 100:
        return False
    return ratio < 0.1 or ratio > 10


# `outputs` is a global shared with earlier cells; make sure it belongs to
# final_gender_df before pairing responses with rows by position.
if len(outputs) != final_gender_df.shape[0]:
    raise ValueError(
        "`outputs` does not match final_gender_df: re-run generate_predictions"
    )

# Parse every endpoint response once instead of once per tag.
parsed_outputs = [literal_eval(raw_output) for raw_output in outputs]

# Frames are collected in a list and concatenated once (DataFrame.append was
# removed in pandas 2.0).
flagged_frames = []
for one_tag in tqdm(gender_tags):
    output_predictions = [
        round(
            100 * eval_output["raw_predictions"][0][one_tag] * eval_output["thresholds"][one_tag],
            3,
        )
        for eval_output in parsed_outputs
    ]
    results_one_tag = final_gender_df.copy()
    results_one_tag["output_proba(%)"] = output_predictions
    results_one_tag["tag"] = one_tag

    # One group per entry, in order of first appearance: the original excerpt
    # and its gender-swapped copy.
    for _, df_one_id in results_one_tag.groupby("entry_id", sort=False):
        df_one_id = df_one_id.copy()  # independent frame: a new column is added below
        original_proba = df_one_id[df_one_id.excerpt_type == "original"][
            "output_proba(%)"
        ].values[0]
        df_one_id["ratio_output_to_original"] = df_one_id.apply(
            lambda x: round(x["output_proba(%)"] / original_proba, 2)
            if x["excerpt_type"] == "augmented"
            else 1,
            axis=1,
        )

        if any(
            probas_are_different(ratio)
            for ratio in df_one_id["ratio_output_to_original"]
        ):
            flagged_frames.append(df_one_id)

gender_columns = [
    "entry_id",
    "tag",
    "excerpt_type",
    "gender",
    "excerpt",
    "output_proba(%)",
    "ratio_output_to_original",
]
if flagged_frames:
    results_gender = pd.concat(flagged_frames)
else:
    # Nothing flagged: keep an empty frame with the expected columns.
    results_gender = pd.DataFrame(columns=gender_columns)

# Drop the leading component of the tag path (text before the first '->').
results_gender['tag'] = results_gender['tag'].apply(lambda x: '->'.join(x.split('->')[1:]))
results_gender = results_gender[gender_columns].sort_values(
    by=['tag', "entry_id", "excerpt_type"]
)


100%|██████████| 8/8 [01:22<00:00, 10.35s/it]


In [137]:
# Export the flagged gender swaps to the working directory.
# NOTE(review): index=None is falsy, so presumably the index is not written;
# index=False is the documented way to request that -- confirm and switch.
results_gender.to_csv('hum_gender_bias.csv', index=None)