In [7]:
import json
import time
import numpy as np
import copy
import itertools
import os
from collections import defaultdict
import sys
from pprint import pprint
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [8]:
def read_data(input_file):
    ##Load the annotations file
    dataset=json.load(open(input_file, 'r'))

##    pprint(dataset)
##    print(dataset.keys())
    annots={}
    img_capts=defaultdict(list)
    if 'annotations' in dataset:
        for annot in dataset['annotations']:
            ##append annotation to image id
            if 'caption' in annot:
                img_capts[annot['image_id']].append(annot['caption'])
            annots[annot['id']]=annot

    imgs={}
    if 'images' in dataset:
        for img in dataset['images']:
            imgs[img['id']]=img['file_name']

    categories={}
    if 'categories' in dataset:
        for category in dataset['categories']:
            categories[category['id']]=category['supercategory']
##    print(categories)
    
    catg_imgs=defaultdict(list)
    if 'annotations' in dataset and 'categories' in dataset:
        for annot in dataset['annotations']:
            if annot['iscrowd']==0:
                catg_imgs[annot['category_id']].append(annot['image_id'])
##    print(len(catg_imgs))
    return img_capts, catg_imgs, imgs


In [9]:
##Balancing training data based on only gender word tags
def print_stats(img_capts=None,catg_imgs=None,sample_size=100):
    ##img_capts has image_ids as keys and the corresponding captions as vals
    ##catg_imgs has categories as keys and the corresponding image_ids as vals

    ##get only images with humans in it
    male_tags=['man','boy','gentleman','guy','male']
    female_tags=['woman','girl','lady','female']
    neutral_tags=['person','someone']
    train_data=defaultdict(list)
    num_imgs=0
    
##        human_img_capts[img_id]=img_capts[img_id]
    
    ##check if a gender word is in the caption
    print(len(catg_imgs[1]))
    for img_id in catg_imgs[1]:
        if sum(any(word in nltk.word_tokenize(caption) for word in male_tags) for caption in img_capts[img_id])>2:
            train_data['male'].append(img_id)
            num_imgs+=1
        if sum(any(word in nltk.word_tokenize(caption) for word in female_tags) for caption in img_capts[img_id])>2:
            train_data['female'].append(img_id)
            num_imgs+=1
        if sum(any(word in nltk.word_tokenize(caption) for word in neutral_tags) for caption in img_capts[img_id])>2:
            train_data['neutral'].append(img_id)
            num_imgs+=1
        if num_imgs>sample_size:
            break
    
    for key in train_data:
        print(key, len(train_data[key]))
        

In [10]:
##Balancing training data based on only gender word tags
def split_train_data(img_capts=None,catg_imgs=None,sample_size=100):
    ##img_capts has image_ids as keys and the corresponding captions as vals
    ##catg_imgs has categories as keys and the corresponding image_ids as vals

    ##get only images with humans in it
    male_tags=['man','boy','gentleman','guy','male']
    female_tags=['woman','girl','lady','female']
    neutral_tags=['person','someone']
    train_data=defaultdict(list)
    num_imgs=0
    
##        human_img_capts[img_id]=img_capts[img_id]
    
    ##check if a gender word is in the caption
    for img_id in catg_imgs[1]:
        if sum(any(word in nltk.word_tokenize(caption) for word in male_tags) for caption in img_capts[img_id])>2:
            train_data['male'].append(img_id)
            num_imgs+=1
##            print(num_imgs)
        if num_imgs>=int(sample_size/3.):
            break

    for img_id in catg_imgs[1]:
        if sum(any(word in nltk.word_tokenize(caption) for word in female_tags) for caption in img_capts[img_id])>2:
            train_data['female'].append(img_id)
            num_imgs+=1
##            print(num_imgs)
        if num_imgs>=int(2.*sample_size/3.):
            break

    for img_id in catg_imgs[1]:
        if sum(any(word in nltk.word_tokenize(caption) for word in neutral_tags) for caption in img_capts[img_id])>2:
            train_data['neutral'].append(img_id)
            num_imgs+=1
##            print(num_imgs)
        if num_imgs==sample_size:
            break
    
    for key in train_data:
        print(key, len(train_data[key]))
        
    return train_data

In [11]:
def split_data_context(img_capts=None,catg_imgs=None):
    ##POS tag each caption
    ##work only with cat_imgs[1]
    pass

In [12]:
img_capts,_,imgs=read_data('captions_val2014.json')
_,catg_imgs,_=read_data('instances_val2014.json')
print_stats(img_capts,catg_imgs,sample_size=1000)
train_data=split_train_data(img_capts,catg_imgs,sample_size=1000)

86348
female 354
male 591
neutral 56
male 333
female 333
neutral 334


In [23]:
##To get data in the required format
def get_data(img_capts=None,imgs=None,train_data=None):
    captions={}
    for gender in train_data:
        for img_id in train_data[gender]:
            captions[imgs[img_id]]=img_capts[img_id]
    all_captions = []
    all_captions.extend(itertools.chain(*(value for key,value in captions.items())))
    return captions, all_captions

In [24]:
captions,all_captions = get_data(img_capts,imgs,train_data)
# print(captions)
print(all_captions)

['A local hero rides through the streets of town on his motorcycle with stuffed animals.', 'A man on his motorcycle with a teddy bear attached.', 'A man rides a motorcycle that is decorated with three teddy bears.', 'The man is on a motorcycle with three stuffed animals fastened on at the headlight and each front tire.', 'A man riding on a motorcycle with three stuffed teddy bears attached to the motorcycle.', 'A man kneeling down in front of a refrigerator.', 'A person kneels down to peer into a refrigerator', 'A man is searching for food in a refrigerator.', 'A chef is opening and looking into the fridge.', 'A man wearing an apron peering into the bottom of an open fridge', 'A person on a red and black moped on a sidewalk. ', 'An Asian man riding a motor scooter on a street', 'A person on a scooter passing a blue garage.', 'a man on a scooter at the top of some steps', 'A man riding a motor scooter on a wet road.', 'a man wearing a helmet sits on a motorcycle', 'a man sitting on a mo

In [15]:
gender_nouns = ['man','woman','person','boy','girl','men','women','someone',
                'lady','boys','girls','gentleman','ladies','gentlemen','person','people',
                'child','pedestrian','guy','male','female',
               'passenger']
gender_pronouns = ['his','he','hers','her''she','its','it']
verb_tags = ['VBG','VBD','VBN','VB']
adj_tags = ['JJ','JJR','JJS']
noun_tags = ['NN','NNS','NNP','NNPS']

In [16]:
def get_tag_dicts(get_tags):
    gender_tags_dict = defaultdict(list)
    pronoun_tags_dict = defaultdict(list)
    for caption in all_captions:
        # verbs : VBG, VBD, VBN
        tokens = nltk.word_tokenize(caption)

        if any(nn in tokens for nn in gender_nouns) or any(pn in tokens for pn in gender_pronouns):
            tags = nltk.pos_tag(tokens)

            only_tags = [tag[1] for tag in tags]
            if any(get_tag in only_tags for get_tag in get_tags):
                only_verbs = [tag[0] for tag in tags if tag[1] in get_tags]
                if set(gender_nouns).intersection(tokens):   

                    for nn in set(gender_nouns).intersection(tokens):

                        gender_tags_dict[nn].extend(only_verbs)

                if set(gender_pronouns).intersection(tokens):
                    for pn in set(gender_pronouns).intersection(tokens):
    #                     print(pn, pronoun_tags_dict[pn])
                        pronoun_tags_dict[pn].extend(only_verbs)
    
    return gender_tags_dict, pronoun_tags_dict

In [25]:
gender_dict, pronoun_dict = get_tag_dicts(verb_tags)
gender_count,pronoun_count = {},{}
print (gender_dict)
print (pronoun_dict)

defaultdict(<class 'list'>, {'man': ['attached', 'decorated', 'fastened', 'riding', 'attached', 'kneeling', 'searching', 'wearing', 'peering', 'riding', 'riding', 'wearing', 'sitting', 'wearing', 'wearing', 'decorated', 'themed', 'riding', 'riding', 'sitting', 'sitting', 'pedaling', 'riding', 'picking', 'riding', 'getting', 'hit', 'receiving', 'riding', 'carrying', 'carrying', 'watching', 'sitting', 'standing', 'walking', 'holding', 'seen', 'walking', 'sitting', 'sitting', 'wearing', 'wearing', 'sitting', 'standing', 'pose', 'posing', 'parked', 'laying', 'giving', 'sitting', 'holding', 'holding', 'smiling', 'standing', 'holding', 'holding', 'smiling', 'wearing', 'posing', 'riding', 'playing', 'playing', 'playing', 'playing', 'riding', 'cast', 'including', 'moped', 'standing', 'sitting', 'riding', 'standing', 'standing', 'standing', 'standing', 'walking', 'pulling', 'riding', 'holding', 'parked', 'riding', 'riding', 'standing', 'boarding', 'walking', 'skating', 'smiling', 'parked', 'sit

In [28]:
def top_n_items(nn='woman',pos='verb',count=0):
    global gender_count, pronoun_count
    corresp_tags = {'adj': adj_tags, 'verb':verb_tags, 'noun':noun_tags}
    gender_dict, pronoun_dict = get_tag_dicts(corresp_tags[pos])  
    gender_count,pronoun_count = {},{}
    
    for (key1, value1) in gender_dict.items():
        gender_count[key1] = set([(i,value1.count(i)) for i in value1 if i not in gender_nouns])
    for (key2, value2) in pronoun_dict.items():   
        pronoun_count[key2] = set([(i,value2.count(i)) for i in value2 if i not in gender_nouns])     
    
    dictionary = gender_count if nn in gender_nouns else pronoun_count
    if count==0:
        return
    print(sorted(dict(dictionary[nn]), key = dict(dictionary[nn]).get, reverse=True)[:count])

In [29]:
top_n_items('woman','verb',15)

['standing', 'holding', 'sitting', 'riding', 'looking', 'taking', 'walking', 'wearing', 'smiling', 'preparing', 'cooking', 'using', 'putting', 'carrying', 'posing']


In [30]:
top_n_items('man','noun',15)

['motorcycle', 'bike', 'dog', 'bicycle', 'street', 'kitchen', 'front', 'back', 'top', 'food', 'car', 'umbrella', 'computer', 'cat', 'dogs']


In [32]:
def counts():
    global all_count,pro_all_count
    all_count = {}
    pro_all_count = {}    
    for item1 in gender_count:
        all_count[item1] = {each[0]:each[1] for each in gender_count[item1]}
    for item2 in pronoun_count:
        pro_all_count[item2]= {each[0]:each[1] for each in pronoun_count[item2]}

def bias(gender1 = 'man',gender2 = 'woman'):
    global bias_dict1,bias_dict2,data
    # data : gender nouns or pronouns : all_count or pro_all_count
#     data = all_count.copy() if gender in gender_nouns else pro_all_count.copy()
    
    data = all_count.copy()
    counts()
    dictionary = set([a for key,values in data.items() for a in values if key==gender1 or key==gender2])    
#     print(data)
    bias_dict1, bias_dict2 = {}, {} 
    for word in dictionary:
        gender1_count = data[gender1].get(word,0)
        gender2_count = data[gender2].get(word,0)
        if gender1_count!=0 and gender2_count!=0: bias_dict1[word] = gender1_count/(gender1_count+gender2_count)
        if gender1_count!=0 and gender2_count!=0: bias_dict2[word] = gender2_count/(gender1_count+gender2_count)



In [33]:
bias()
gender1 = 'man'
gender2 = 'woman'
# print(bias_dict1)
for i in sorted(bias_dict1, key=bias_dict1.get,reverse=True):
    if bias_dict1[i]>0.0:
        print(i,round(bias_dict1[i],4),data[gender1][i])