In [168]:
import glob
import pandas as pd
import re
import json
import nltk
import numpy as np
import random

In [176]:
# Tagged lists of Activity/ object images from kayburns' Github Repo "Women Snowboard"
# https://github.com/kayburns/women-snowboard

activity_list_paths = glob.glob("./data/list/intersection_*")
activity_image_ids = dict()
for path in activity_list_paths:
    with open(path, 'r') as f:
        p = re.compile(r'(?<=_)(.*)(?=_)')
        activity = p.findall(path)[0]
        im_ids= f.read().split('\n')
        im_ids = [int(i) for i in im_ids if i != '']
        activity_image_ids[activity] = im_ids

print(activity_image_ids.keys()), activity_image_ids

dict_keys(['food', 'baseball', 'umbrella', 'kitchen', 'tie', 'snowboard', 'skateboard', 'table', 'cell', 'motorcycle'])


(None,
 {'food': [19456,
   468993,
   401411,
   281601,
   179720,
   239625,
   563723,
   111629,
   61967,
   146961,
   410627,
   537620,
   152598,
   468505,
   409627,
   262686,
   370209,
   19491,
   132644,
   192039,
   284764,
   466986,
   213255,
   430125,
   540209,
   298547,
   260150,
   272440,
   482362,
   201141,
   95297,
   330818,
   193547,
   539717,
   63047,
   276552,
   31817,
   498765,
   310797,
   334417,
   311378,
   377427,
   199764,
   100438,
   182362,
   299023,
   347228,
   451679,
   396385,
   137315,
   239717,
   95249,
   109673,
   369259,
   564332,
   208494,
   574069,
   115830,
   214648,
   69140,
   219771,
   42620,
   136042,
   392320,
   214720,
   562308,
   419974,
   51335,
   173704,
   360073,
   570515,
   465556,
   208871,
   326938,
   374430,
   410272,
   190115,
   246436,
   286523,
   334509,
   439982,
   132272,
   60596,
   575006,
   462006,
   401591,
   373945,
   574138,
   348447,
   128704,
   468

In [154]:
# Gender word lists from sueqian6's Github Repo "Reducing Gender Bias in Word-level Language Models"
# https://github.com/sueqian6/ACL2019-Reducing-Gender-Bias-in-Word-Level-Language-Models-Using-A-Gender-Equalizing-Loss-Function

gender_list_paths = glob.glob("./data/list/*_word_file.txt")
gender_list_paths.append('./data/list/neutral_occupations.txt')
gender_nouns_lookup = dict()
for path in gender_list_paths:
    with open(path, 'r') as f:
        if path == './data/list/neutral_occupations.txt':
            gender = 'neutral'
        else:
            p = re.compile(r'(?<=list/)(.*)(?=_word)')
            gender = p.findall(path)[0]
        nouns = f.read().split('\n')
        nouns = [n for n in nouns if n != '']
        gender_nouns_lookup[gender] = nouns

print(gender_nouns_lookup.keys())      

dict_keys(['female', 'male', 'neutral'])


In [155]:
# There are some inappropriate in the word lists, we are doing some hand crafting to avoid adding bias.
# Remove non-human words
words = ['cow', 'cows', 'hen', 'hens']
for word in words:
    gender_nouns_lookup['female'].remove(word)
words = ['bull', 'bulls', 'lion', 'lions', 'governor']
for word in words:
    gender_nouns_lookup['male'].remove(word)
    
# Add gender-neutral words
words = ['surfer', 'child', 'kid', 'kids', 'children', 'passenger', 'passengers',\
         'governor', 'someone', 'pedestrian', 'pedestrians']
for word in words:
    gender_nouns_lookup['neutral'].append(word)

In [157]:
annotations_path = './data/annotations/'
datatype = 'train'
'''
captions_dict (dict)- key: image_id, value: list of captions

im_gender_summary (dict of dict)- key: image_id, value: dict()
keys in dict: pred_gt- predicted ground truth label of the gender noun
              per_gt- % of annotations (out of 5 total) that agreed with the GT
              agreement_score- agreement score calculated using distance between 5 predictions, with 1 being the best
                               male = 1, female = -1, neutral = 0
                               e.g.0 annotations indicate [f, f, f, f, f], agreement_score = 1.00
                               e.g.1 annotations indicate [m, m, f, f, f], agreement_score = 0.00
                               e.g.2 annotation indicate [n, n, f, f, f], agreement_score = 0.50                                                        
              anno_gender- list of gender sentiment, e.g. ['male', 'female', 'neutral', 'female', 'female']
              anno_nouns- list of nouns used to describe human
              clean_gender- binary variable indicating if all notations used the same gender/ gender-neutral noun 
              clean_noun- binary variable indicating if all notations used the identical noun

not_human_im_ids(list)- list of image ids of images with >1 captions that do not mention humans.
Since the COCO dataset does not label whether human (or other objects) is the major subject 
matter of the image. This list helps us isolate images with human figures as the focus.
'''
captions_dict = dict()
im_gender_summary = dict()
not_human_im_ids = list() 

with open(f'{annotations_path}/captions_{datatype}2014.json') as f:
    captions_json = json.load(f)
    
    for i in range(len(captions_json['annotations'])):
        image_id = captions_json['annotations'][i]['image_id']
        caption = captions_json['annotations'][i]['caption']
        tokens = nltk.word_tokenize(caption)
        c_female = 0 # count of gender nouns and gender-neutral nouns
        c_male = 0
        c_neutral = 0
        noun = []
        
        # Evaluate annotator's noun used to describe humans
        for t in tokens:
            t = t.lower()
            if t in gender_nouns_lookup['female']:
                c_female += 1
                noun.append(t)
            elif t in gender_nouns_lookup['male']:
                c_male += 1
                noun.append(t)
            elif t in gender_nouns_lookup['neutral']:
                c_neutral += 1
                noun.append(t)
        
        # Only include image for training if more than one caption of the image include one human
        if c_female + c_male + c_neutral == 1:
            # Assign gender sentiment to the caption
            if c_female > 0:
                gender = 'female'
            elif c_male > 0:
                gender = 'male'
            else:
                gender = 'neutral'
                
            # Populate captions dict and image gender summary dict
            if image_id in captions_dict:
                captions_dict[image_id] += [caption]
                im_gender_summary[image_id]['anno_gender'].append(gender)
                im_gender_summary[image_id]['anno_noun'].append(noun[0])
            else:
                captions_dict[image_id] = [caption]
                im_gender_summary[image_id] = dict()
                im_gender_summary[image_id]['anno_gender'] = [gender]
                im_gender_summary[image_id]['anno_noun'] = [noun[0]]

        if i % 100000 == 0:
            print()
            print(f"Caption {i} processed, out of {len(captions_json['annotations'])} captions")
            print(f"No. of qualified images processed: {len(im_gender_summary)}")



Caption 0 processed, out of 414113 captions
No. of qualified images processed: 0

Caption 100000 processed, out of 414113 captions
No. of qualified images processed: 6452

Caption 200000 processed, out of 414113 captions
No. of qualified images processed: 13359

Caption 300000 processed, out of 414113 captions
No. of qualified images processed: 24080

Caption 400000 processed, out of 414113 captions
No. of qualified images processed: 34712


In [158]:
'''
im_gender_summary (dict of dict)- key: image_id, value: dict()
keys in dict: pred_gt- predicted ground truth label of the gender noun
              per_gt- % of annotations (out of 5 total) that agreed with the GT
              agreement_score- agreement score calculated using distance between 5 predictions, with 1 being the best
                               male = 1, female = -1, neutral = 0
                               e.g.0 annotations indicate [f, f, f, f, f], agreement_score = 1.00
                               e.g.1 annotations indicate [m, m, f, f, f], agreement_score = 0.00
                               e.g.2 annotation indicate [n, n, f, f, f], agreement_score = 0.50                                                        
              anno_gender- list of gender sentiment, e.g. ['male', 'female', 'neutral', 'female', 'female']
              anno_nouns- list of nouns used to describe human
            clean_gender- binary variable indicating if all notations used the same gender/ gender-neutral noun 
              clean_noun- binary variable indicating if all notations used the identical noun
'''
#values for agreement score calculations
score_cal_dict ={
    'male':1, 'female':-1, 'neutral':0
}

for image_id in im_gender_summary:
    error = 0
    
    # Delete images where <3 annotators mentioned the human figure
    # Because it is impossible to estimate the ground truth using only 1 or 2 captions 
    if len(im_gender_summary[image_id]['anno_gender']) < 3:
        not_human_im_ids.append(image_id)
    
    else:
        pred = im_gender_summary[image_id]['anno_gender']

        # Evaluate groundtruth guesses and agreement scores
        gt = max(set(pred), key = pred.count)
        for ind, p in enumerate(pred):
            for other_p in [x for i,x in enumerate(pred) if i != ind]:
                error += np.abs(score_cal_dict[p] - score_cal_dict[other_p])
        agreement_score = (24 - error) / 24

        # Populate dictionary
        im_gender_summary[image_id]['pred_gt'] = gt
        im_gender_summary[image_id]['per_gt'] = sum([1 for p in pred if p == gt]) / len(pred)
        im_gender_summary[image_id]['agreement_score'] = agreement_score
        if len(set(pred)) == 1:
            im_gender_summary[image_id]['clean_gender'] = 1
        else:
            im_gender_summary[image_id]['clean_gender'] = 0
        if len(set(im_gender_summary[image_id]['anno_noun'])) == 1:
            im_gender_summary[image_id]['clean_noun'] = 1
        else:
            im_gender_summary[image_id]['clean_noun'] = 0
            
for image_id in not_human_im_ids:
    try:
        del captions_dict[image_id]
        del im_gender_summary[image_id]
    except:
        pass

In [183]:
# Save list of image_ids of qualified images
# with open('./data/list/qualified_image_ids.csv', "w") as outfile:
#     for image_id in im_gender_summary.keys():
#         outfile.write(str(image_id))
#         outfile.write("\n")

In [194]:
def get_training_data(im_gender_summary, captions_dict, training_size, mode = 'random'):
    assert mode in ['random','balanced_mode', 'balanced_gender_only', 'balanced_clean', 'activity_balanced', 'activity_clean']
    assert isinstance(training_size, int)
    # Mode must be in random, balanced_mode, balanced_clean, activity_balanced, activity_clean
    
    random.seed(123)
    training_captions_dict = dict()
    
    if mode == 'random':
        training_captions_dict = dict(random.sample(captions_dict.items(), training_size))
        
    elif mode == 'balanced_mode':
        i = 0
        male_count = 0
        female_count = 0
        neutral_count = 0
        for image_id in im_gender_summary.keys():
            if i < training_size:
                if im_gender_summary[image_id]['pred_gt'] == 'male' and (male_count < training_size / 3):
                    training_captions_dict[image_id] = captions_dict[image_id]
                    male_count += 1
                    i += 1
                elif im_gender_summary[image_id]['pred_gt'] == 'female' and (female_count < training_size / 3):
                    training_captions_dict[image_id] = captions_dict[image_id]
                    female_count += 1
                    i += 1
                elif im_gender_summary[image_id]['pred_gt'] == 'neutral'and (neutral_count < training_size / 3):
                    training_captions_dict[image_id] = captions_dict[image_id]
                    neutral_count += 1
                    i += 1
                    
                if i % 100 == 0:
                    print(f"{i} captions are added")
    
    elif mode == 'balanced_clean':
        i = 0
        male_count = 0
        female_count = 0
        neutral_count = 0
        for image_id in im_gender_summary.keys():
            if i < training_size:
                if im_gender_summary[image_id]['pred_gt'] == 'male' and (male_count < training_size / 3):
                    training_captions_dict[image_id] = captions_dict[image_id]
                    male_count += 1
                    i += 1
                elif im_gender_summary[image_id]['pred_gt'] == 'female' and (female_count < training_size / 3):
                    training_captions_dict[image_id] = captions_dict[image_id]
                    female_count += 1
                    i += 1
                elif im_gender_summary[image_id]['pred_gt'] == 'neutral'and (neutral_count < training_size / 3):
                    training_captions_dict[image_id] = captions_dict[image_id]
                    neutral_count += 1
                    i += 1
                    
                if i % 100 == 0:
                    print(f"{i} captions are added")
    
    elif mode == 'activity_balanced':
        pass
    
    elif mode == 'activity_clean':
        pass
    
    #activity_image_ids
    return training_captions_dict
    

In [195]:
training = get_training_data(im_gender_summary, captions_dict, training_size = 100, mode = 'balanced_mode')
len(training)

100 captions are added


100