In [None]:
import tensorflow_datasets as tfds
import os
import pandas as pd
import re
import json

dataset_dir = '../datasets/'



# Downloading and extracting captions

For MS COCO, TensorFlow Datasets is used.  
For Open Images V7, the JSON and CSV files are downloaded by start.sh.  

## MS Coco

First, the dataset is downloaded via TensorFlow Datasets.  
After the download, a first filter is applied: all images with fewer than 5 people are filtered out.  
The captions of the remaining images are then stored in a DataFrame.

In [None]:
# Only labels, captions and ids are read below, so the image payload is left undecoded.
decoders = {
    'image': tfds.decode.SkipDecoding(),
    'image/filename': tfds.decode.SkipDecoding(),
}

# Minimum number of 'person' annotations an image needs for its captions to be kept.
min_persons = 5

# coco_captions provides the captions that are used as prompts; it is stored in the 'coco' folder
datasets = [('coco_captions', 'coco')]
person_id = 0 # this value is found in the official MS Coco documentation

# Rows are collected in a plain list and turned into a DataFrame once,
# instead of growing the DataFrame one row at a time.
coco_rows = []
for dataset, folder in datasets:
    # Named `splits` so the builtin `set` stays usable in the rest of the notebook.
    splits = tfds.load(dataset, data_dir=os.path.join(dataset_dir, folder), download=True, decoders=decoders)

    # keep only images that show at least `min_persons` people
    for part_set in splits:
        for element in splits[part_set]:
            labels = element['objects']['label'].numpy()
            if (labels == person_id).sum() < min_persons:
                continue

            image_id = element['image/id'].numpy()
            if image_id is None:
                continue

            for caption in element['captions']['text']:
                text = caption.numpy().decode("utf-8")
                if len(text) == 0:
                    continue
                # captions are flattened to a single line so every CSV row holds one prompt
                coco_rows.append((image_id, ''.join(text.splitlines()), 'coco'))

    # DataFrame that stores all filtered prompts collected so far
    coco_df = pd.DataFrame(coco_rows, columns=['image_id', 'caption', 'dataset'])

    coco_result_dir = os.path.join(dataset_dir, 'coco/result_csv')
    os.makedirs(coco_result_dir, exist_ok=True)
    coco_df.to_csv(os.path.join(coco_result_dir, 'person_captions.csv'), index=False)

## Open Images V7

First, all images that contain people are identified.

In [None]:
def contains_person(text: str) -> bool:
    """Tell whether ``text`` contains the standalone word 'person' (case-insensitive).

    The word must be delimited by word boundaries, so 'Salesperson' or
    'personal' do not count.
    """
    word_pattern = r'\b%s\b' % (re.escape('person'))
    lowered = text.lower()
    if re.search(word_pattern, lowered) is None:
        return False
    return True


def get_imgid_person_from_csv(csv: str, person_labels: list[str]):
    """Return the ImageID of every row in a label CSV whose LabelName is a person label.

    Args:
        csv: Path to a CSV file that has at least the columns 'ImageID' and 'LabelName'.
        person_labels: Label names to keep.

    Returns:
        A pandas Series named 'ImageID' holding one entry per matching row
        (duplicates are not removed).
    """
    # Only the two columns that are actually used are parsed; the label files can be
    # very large and the remaining columns would only cost memory.
    csv_df = pd.read_csv(csv, usecols=['ImageID', 'LabelName'])
    is_person = csv_df['LabelName'].isin(person_labels)
    return csv_df.loc[is_person, 'ImageID']


In [None]:
# Table with the columns 'LabelName' and 'DisplayName' that describes every class
class_desc_dir = os.path.join(dataset_dir, 'open-images/csv/oidv7-class-descriptions.csv')

# csv files that contain the labels
label_csv = ['open-images/csv/oidv7-test-annotations-human-imagelabels.csv',
                'open-images/csv/oidv7-test-annotations-machine-imagelabels.csv',
                'open-images/csv/oidv7-train-annotations-human-imagelabels.csv',
                'open-images/csv/oidv7-train-annotations-machine-imagelabels.csv',
                'open-images/csv/oidv7-val-annotations-human-imagelabels.csv',
                'open-images/csv/oidv7-val-annotations-machine-imagelabels.csv' ]


# getting label names for classes that are people
class_df = pd.read_csv(class_desc_dir)

# a class counts as a person class when its display name contains the word 'person'
is_person_class = class_df['DisplayName'].map(contains_person).astype(bool)
person_labels = class_df.loc[is_person_class, 'LabelName'].tolist()

# go through the label csv files and extract the image ids of images containing people
id_frames = []
for label_path in label_csv:
    print(f"extracting image ids from: {label_path}")
    image_ids = get_imgid_person_from_csv(os.path.join(dataset_dir, label_path), person_labels)
    # The helper returns a Series; it is converted to a one-column frame so that every
    # piece passed to concat has the same 'ImageID' column regardless of pandas version.
    id_frames.append(image_ids.to_frame(name='ImageID'))

# this DataFrame contains all ImageIDs for images containing at least one person;
# drop_duplicates makes sure that the ids are unique
people_ids = pd.concat(id_frames, ignore_index=True).drop_duplicates(subset=['ImageID'])

Now all images that don't include at least one person are removed.

In [None]:
def clean_json(path: str, people_ids: pd.DataFrame):
    """Load a JSON-lines caption file and keep only captions of images with people.

    Args:
        path: Path to a JSON-lines file whose records have 'image_id' and 'caption'.
        people_ids: DataFrame with an 'ImageID' column listing the images to keep.

    Returns:
        DataFrame with the columns 'image_id', 'caption' and 'dataset', where
        'dataset' is always 'open_images'.
    """
    json_df = pd.read_json(path_or_buf=path, lines=True)

    # filter for images with people in it
    has_person = json_df['image_id'].isin(people_ids['ImageID'])

    # Only take caption and image id. `.assign` adds the column on a new frame, which
    # avoids the SettingWithCopyWarning raised by assigning to a sliced frame.
    return json_df.loc[has_person, ['image_id', 'caption']].assign(dataset='open_images')


def clean_json_large(path: str, people_ids: pd.DataFrame):
    """Stream a JSON-lines caption file and keep only captions of images with people.

    The file is read line by line, so it never has to fit into memory as a whole.

    Args:
        path: Path to a JSON-lines file whose records have 'image_id' and 'caption'.
        people_ids: DataFrame with an 'ImageID' column listing the images to keep.

    Returns:
        DataFrame with the columns 'image_id', 'caption' and 'dataset', where
        'dataset' is always 'open_images'.
    """
    # `value in series` tests the index labels, not the values, so the ids are
    # materialised once into a hash-based collection for correct, fast lookups.
    wanted_ids = frozenset(people_ids['ImageID'])

    rows = []
    # the context manager closes the file even if a line fails to parse
    with open(path, encoding='utf-8') as fs:
        for line in fs:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            js = json.loads(line)
            if js['image_id'] in wanted_ids:
                rows.append((js['image_id'], js['caption'], 'open_images'))

    return pd.DataFrame(rows, columns=['image_id', 'caption', 'dataset'])


In [None]:
json_dir = os.path.join(dataset_dir, 'open-images/json')
open_images_result_dir = os.path.join(dataset_dir, 'open-images/result_csv')
os.makedirs(open_images_result_dir, exist_ok=True)

captions = pd.DataFrame(columns=['image_id', 'caption', 'dataset'])

j = 0
GiB =  2 ** 30

# NOTE(review): `skip` is not read anywhere in this cell; kept in case another cell uses it.
skip = True

# The listing is sorted because os.listdir returns entries in arbitrary order and the
# resume check below identifies a file only by its position `j` in the listing.
for json_file in sorted(os.listdir(json_dir)):
    json_path = os.path.join(json_dir, json_file)
    result_path = os.path.join(open_images_result_dir, f'all_people_captions_{j}.csv')

    # resume support: a file whose result already exists is not processed again
    if os.path.exists(result_path):
        j += 1
        continue
    print(f'reading file {json_file}')
    # check if file exceeds two GiB; such files are streamed instead of loaded at once
    file_size = os.path.getsize(json_path)
    if file_size > 2 * GiB:
        print(f'processing large file: {json_file}, {file_size} Bytes')
        captions = clean_json_large(json_path, people_ids)
    else:
        captions = clean_json(json_path, people_ids)

    # make sure that ids are unique
    captions = captions.drop_duplicates(subset=['image_id'])

    # write to file
    captions.to_csv(result_path, index=False)

    # release the processed captions before the next file is loaded
    captions = pd.DataFrame(columns=['image_id', 'caption', 'dataset'])
    j += 1

## Filter extracted captions
First, it is checked that the caption includes a plural word describing a group of people.   
Afterwards, gendered terms are replaced with 'person' or 'persons'.

In [None]:
# Plural nouns that denote groups of people, grouped by domain.
# The filtering cell keeps a caption only if it contains at least one of these
# entries as a whole word (matched case-insensitively).
person_plural_words = [
    # Healthcare
    "doctors", "surgeons", "nurses", "therapists", "paramedics", "dentists",
    "optometrists", "pharmacists", "pediatricians", "psychiatrists", "radiologists",
    "anesthesiologists", "pathologists", "endocrinologists", "oncologists", "cardiologists",
    "neurologists", "dermatologists", "general practitioners", "midwives", "physicians",
    "emergency responders", "clinicians", "geneticists", "urologists", "gastroenterologists",
    "nephrologists", "ophthalmologists", "immunologists", "allergists",
    
    # Education
    "teachers", "professors", "lecturers", "instructors", "educators", "mentors",
    "counselors", "principals", "deans", "tutors", "coaches", "trainers", "academics",
    "scholars", "researchers", "administrators",
    
    # Science, Engineering, and Technology
    "engineers", "scientists", "programmers", "developers", "designers", "analysts",
    "mathematicians", "statisticians", "physicists", "chemists", "biologists", "geologists",
    "astronomers", "economists", "sociologists", "anthropologists", "psychologists",
    "philosophers", "historians", "librarians", "geographers", "computer scientists",
    "IT specialists", "cybersecurity experts", "data scientists", "system administrators",
    
    # Business and Finance
    "accountants", "managers", "executives", "directors", "entrepreneurs", "investors",
    "bankers", "brokers", "salespeople", "marketers", "consultants", "advisors",
    "strategists", "negotiators", "coordinators", "planners", "auditors", "traders",
    "risk managers",
    
    # Law, Government, and Public Service
    "lawyers", "judges", "attorneys", "policemen", "policewomen", "firefighters",
    "soldiers", "marines", "officers", "agents", "diplomats", "ambassadors", "envoys",
    "mediators", "arbitrators", "councillors", "mayors", "representatives", "senators",
    "congressmen", "assemblymen", "ministers", "bureaucrats", "public servants",
    "legislators", "politicians", "deputies", "commissioners",
    
    # Arts and Entertainment
    "musicians", "singers", "dancers", "actors", "actresses", "painters", "sculptors",
    "writers", "poets", "novelists", "journalists", "broadcasters", "reporters",
    "photographers", "videographers", "models", "stylists", "illustrators", "animators",
    "composers", "orchestrators", "producers", "performers", "entertainers", "storytellers",
    "comedians", "magicians", "acrobats", "improvisers", "mimes", "puppeteers",
    
    # Trades and Manual Labor
    "carpenters", "electricians", "plumbers", "mechanics", "drivers", "pilots", "sailors",
    "chefs", "bakers", "butchers", "tailors", "seamstresses", "dressmakers", "gardeners",
    "farmers", "fishers", "hunters", "blacksmiths", "welders", "masons", "bricklayers",
    "roofers", "installers", "packers", "movers", "cleaners", "laborers", "technicians",
    "operators", "cashiers", "clerks", "servicemen", "waiters", "waitresses", "bartenders",
    "baristas", "cheerleaders",
    
    # Sports and Recreation
    "athletes", "runners", "swimmers", "cyclists", "skaters", "golfers", "tennis players",
    "footballers", "basketball players", "baseball players", "hockey players", "boxers",
    "wrestlers", "martial artists", "soccer players", "cricketers", "fencers", "surfers",
    "skiers", "snowboarders", "rowers", "kayakers", "climbers", "fishermen",
    
    # Miscellaneous and Other Groups
    "volunteers", "activists", "campaigners", "participants", "contributors",
    "collaborators", "organizers", "innovators", "thinkers", "visionaries", "pioneers",
    "trendsetters", "enthusiasts", "collectors", "critics", "reviewers", "bloggers",
    "vloggers", "podcasters", "gamers", "streamers", "followers", "subscribers",
    "examiners", "observers", "spectators", "bystanders", "residents", "citizens",
    "inhabitants", "locals", "immigrants", "refugees", "delegates", "evangelists",
    "architects", "adventurers", "explorers", "pilgrims", "campers", "travelers",
    "tourists", "visitors", "commuters", "shoppers", "vendors", "merchants",
    "distributors", "consumers", "farmhands", "contractors", "freelancers", "interns",
    "apprentices", "trainees", "seniors", "elders", "youths", "teens", "children",
    "infants", "adults", "men", "women", "couples", "siblings", "parents", "grandparents",
    "cousins", "relatives", "classmates", "roommates", "colleagues", "associates",
    "partners", "comrades", "confederates", "allies", "supporters", "backers", "fans",
    "members", "cadets", "rookies", "veterans", "champions", "contenders", "finalists",
    "competitors", "medalists", "record-holders", "brainiacs", "geeks", "nerds",
    "whizzes", "techies", "creatives", "ideators", "facilitators", "moderators",
    "supervisors", "evaluators", "assessors"
]

In [None]:
result_dirs = ['open-images/result_csv', 'coco/result_csv']
final_dir = '../results'

# load every extracted caption file from both datasets
caption_frames = []
for sub_dir in result_dirs:
    sub_dir_path = os.path.join(dataset_dir, sub_dir)
    for csv_name in os.listdir(sub_dir_path):
        if 'captions' not in csv_name:
            continue
        caption_frames.append(pd.read_csv(os.path.join(sub_dir_path, csv_name)))

if caption_frames:
    captions = pd.concat(caption_frames, ignore_index=True)
else:
    captions = pd.DataFrame(columns=['image_id', 'caption', 'dataset'])

# Keep captions that contain at least one plural person word. All keywords are combined
# into a single pattern so that a caption matching several keywords is kept exactly once.
plural_pattern = r'\b(?:' + '|'.join(re.escape(keyword) for keyword in person_plural_words) + r')\b'
# na=False treats missing captions as "no match" instead of poisoning the boolean mask
has_plural = captions['caption'].str.contains(plural_pattern, case=False, na=False)
filtered = captions[has_plural]

# Drop captions that speak about exactly two or three people. The numbers are matched as
# whole words so that words such as 'artwork' or 'network' do not remove a caption.
small_group = filtered['caption'].str.contains(r'\b(?:two|three)\b', case=False, na=False)
filtered = filtered[~small_group].copy()

# Replace gendered terms: singular forms become 'person', plural forms become 'persons'.
# Matching is case-insensitive, so 'WOMAN'/'MAN' are treated as singular as well.
filtered['caption'] = (
    filtered['caption']
    .str.replace(r'\b(?:wo)?man\b', 'person', case=False, regex=True)
    .str.replace(r'\b(?:wo)?men\b', 'persons', case=False, regex=True)
)

print(filtered)
os.makedirs(final_dir, exist_ok=True)
filtered.to_csv(os.path.join(final_dir, "filtered_captions.csv"), index=False)

In [None]:
final_csv_path = '../results/filtered_captions.csv'
sample_dir = '../results/batches'
df = pd.read_csv(final_csv_path)
num_sample = 10   # captions per batch file
total = 8000      # captions to distribute over all batch files
seed = 42         # fixed seed so that re-running the cell yields the same batches
num_needed_samples = total // num_sample

# fail before anything is written instead of crashing after some batches exist
num_drawn = num_needed_samples * num_sample
if len(df) < num_drawn:
    raise ValueError(f'need {num_drawn} captions for the batches but only {len(df)} are available')

os.makedirs(sample_dir, exist_ok=True)

# Shuffling once and cutting consecutive slices gives disjoint random batches,
# i.e. no caption ends up in more than one batch.
shuffled = df.sample(frac=1, random_state=seed)
for i in range(num_needed_samples):
    sample = shuffled.iloc[i * num_sample:(i + 1) * num_sample]

    sample.to_csv(os.path.join(sample_dir, f'{i}.csv'), index=False)

# df keeps only the captions that were not assigned to any batch
df = df.drop(shuffled.index[:num_drawn])