```
Copyright 2021 Twitter, Inc.
SPDX-License-Identifier: Apache-2.0
```

# Gender Gaze Analysis

* This notebook prepares a dataset for gender gaze analysis. 
* It selects up to `MAX_FOUND` images for each gender value in the dataset.
* The selected images' saliency maps are stored in the folder `./gender_gaze/annotations/{GENDER}` with the same name as the image. 
* Each image's salient segment regions are saved in a separate file whose name carries the suffix `_regions`.
* Once the images are generated, you can inspect the saliency map images to assess whether the most salient point is on the face, and use the `_regions` files to check whether any non-face area is detected as a salient region.

In [None]:
import logging
import shlex
import subprocess
import sys
from collections import namedtuple
from pathlib import Path

import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.collections import PatchCollection
from matplotlib.patches import Rectangle

# Configure the root logger so that only records at ERROR level or above are
# emitted; this keeps lower-severity log output out of the notebook cells.
logging.basicConfig(level=logging.ERROR)

In [None]:
import platform

BIN_MAPS = {"Darwin": "mac", "Linux": "linux"}

HOME_DIR = Path("../").expanduser()

try:
    import google.colab
    ! pip install pandas scikit-learn scikit-image statsmodels requests dash
    ! [[ -d image-crop-analysis ]] || git clone https://github.com/twitter-research/image-crop-analysis.git
    HOME_DIR = Path("./image-crop-analysis").expanduser()
    IN_COLAB = True
except:
    IN_COLAB = False

sys.path.append(str(HOME_DIR / "src"))
bin_dir = HOME_DIR / Path("./bin")
bin_path = bin_dir / BIN_MAPS[platform.system()] / "candidate_crops"
model_path = bin_dir / "fastgaze.vxm"
data_dir = HOME_DIR / Path("./data/")
data_dir.exists()

In [None]:
# Load the tab-separated dataset index from the data directory, then show
# its first rows as the cell output.
dataset_file = data_dir / "dataset.tsv"
df = pd.read_csv(dataset_file, sep="\t")
df.head()

In [None]:
from crop_api import parse_output, ImageSaliencyModel, is_symmetric, reservoir_sampling
from image_manipulation import get_image_saliency_map, process_image

In [None]:
# Instantiate the saliency model from the crop binary path and the model
# file path computed earlier in the notebook.
model = ImageSaliencyModel(
    crop_model_path=model_path,
    crop_binary_path=bin_path,
)

In [None]:
%%time
# Maximum number of images to collect for each gender value.
MAX_FOUND = 100
# A segmented region counts as "big" only when its area exceeds this value.
MIN_REGION_AREA = 1000
# An image needs at least this many big regions to be selected.
MIN_BIG_REGIONS = 2
# Minimum shape[0] / shape[1] ratio; smaller ratios are not "significantly tall".
MIN_ASPECT_RATIO = 1.25
# Only files with these (case-insensitive) extensions are considered.
JPEG_SUFFIXES = (".jpg", ".jpeg")

# dropna(): a missing gender can never satisfy the equality filter below, so
# iterating over it would only create an empty "nan" annotation folder.
for gender in df.sex_or_gender.dropna().unique():
    annotation_dir = data_dir / Path(f"./gender_gaze/annotations/{gender}")
    annotation_dir.mkdir(parents=True, exist_ok=True)
    found = 0
    # Fixed-seed shuffle: the same rows are visited in the same order on every run.
    shuffled = df[df.sex_or_gender == gender].sample(frac=1, random_state=42)
    for local_path in shuffled.local_path:
        # Check the quota first so the remaining rows are not scanned needlessly.
        if found >= MAX_FOUND:
            break
        if not local_path.lower().endswith(JPEG_SUFFIXES):
            continue
        img_path = data_dir / Path(f"./images/{local_path}")
        if (annotation_dir / img_path.name).exists():
            # Output from an earlier run is already on disk: count it and move on.
            found += 1
            continue
        try:
            img, _overlay, regions, _threshold = get_image_saliency_map(img_path, model)
        except TypeError as e:
            # Report the image that could not be processed and keep going.
            print(img_path, e)
            continue
        aspect_ratio = img.shape[0] / img.shape[1]
        n_regions = sum(1 for r in regions if r.area > MIN_REGION_AREA)
        print(img_path.name, aspect_ratio, n_regions)
        if n_regions < MIN_BIG_REGIONS or aspect_ratio < MIN_ASPECT_RATIO:
            # Only select an image when it has at least MIN_BIG_REGIONS big
            # regions (area > MIN_REGION_AREA) and is significantly tall.
            continue
        found += 1
        # plt.savefig writes whatever figure is current, so process_image is
        # expected to leave the regions figure as the current one.
        process_image(img_path, model)
        plt.savefig(
            annotation_dir / f"{img_path.stem}_regions{img_path.suffix}",
            bbox_inches="tight",
        )
        plt.close("all")
        # Second output: the crop plot, saved under the image's own file name.
        model.plot_img_crops(img_path, aspectRatios=[1], topK=1)
        plt.savefig(annotation_dir / img_path.name, bbox_inches="tight")
        # Close every figure each iteration so they do not accumulate in memory.
        plt.close("all")