In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os

# When a sibling "notebooks" directory is visible from the current working
# directory, move one level up so that the relative paths used below
# (e.g. "./config.json") resolve from the parent directory.
if os.path.isdir("../notebooks/"): os.chdir("..")

In [3]:
import json
import random
import itertools

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gensim.models as gm
from tqdm import tqdm

from badseeds import utils, seedbank, metrics

In [4]:
# Path to the JSON config file; the cells below read the "models" and "seeds"
# sections from it to locate the embeddings and the seed file.
# Change if necessary.
CONFIG_PATH = "./config.json"

In [15]:
# Read the project configuration. The encoding is pinned to UTF-8 so that the
# file parses the same way regardless of the platform's default locale
# encoding (JSON files are UTF-8 by convention).
with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    config = json.load(f)

In [6]:
# Replicability: pin both the standard-library RNG and NumPy's legacy global
# RNG to the same fixed seed before any sampling happens.
random.seed(42); np.random.seed(42)

## Models and Data

### Models

In [16]:
# Directory holding the embedding files: the models root from the config
# joined with the sub-path stored under the "wiki_subpath" -> "10" key.
embeddings_dir = os.path.join(
    *(config["models"][key] for key in ("dir_path",)),
    config["models"]["wiki_subpath"]["10"],
)

In [17]:
# Load every ".kv" KeyedVectors file found in the embeddings directory.
# os.listdir returns entries in arbitrary order, so the names are sorted:
# the seeded random.choices(models, ...) call further down only reproduces the
# same draw when the order of `models` is itself deterministic.
kv_files = sorted(
    name for name in os.listdir(embeddings_dir) if name.endswith(".kv")
)
models = [
    gm.KeyedVectors.load(os.path.join(embeddings_dir, name))
    for name in tqdm(kv_files)
]

100%|██████████████████████████████████████████████████| 40/40 [00:09<00:00,  4.32it/s]


### Data

#### Gathered Seed Sets

In [18]:
# Load the gathered seed sets. The path is built with os.path.join so it is
# correct whether or not config["seeds"]["dir_path"] ends with a separator
# (plain string concatenation silently produced "<dir>seeds.json" otherwise).
seeds = seedbank.seedbanking(
    os.path.join(config["seeds"]["dir_path"], "seeds.json"), index=False
)

In [19]:
# The "Seeds" column holds the word list of each gathered seed set
# (one entry per row of `seeds`).
gathered_seeds = seeds["Seeds"]

In [20]:
# "Seeds ID" values of the two seed-set pairs that are inspected in the
# cells below, keyed by a short label for the pair.
highlighted_ids = dict(
    names=["white_names-Knoche_et_al_2019", "black_names-Knoche_et_al_2019"],
    roles=["caucasian_roles-Manzini_et_al_2019", "black_roles-Manzini_et_al_2019"],
)

In [21]:
# Show the rows for the highlighted "names" seed sets.
seeds.loc[seeds["Seeds ID"].isin(highlighted_ids["names"])]

Unnamed: 0,Category,Seeds,Source / Justification,Source Categories,Used in Paper,Link,Seeds ID
137,white names,"[adam, chip, harry, josh, roger, alan, frank, ...",We combined and extended sets which were previ...,"borrowed-from-social-sciences, prior-work",Identifying Biases in Politically Biased Wikis...,https://github.com/MKnoche/wiki_bias_embedding,white_names-Knoche_et_al_2019
138,black names,"[alonzo, jamel, lerone, percell, theo, alphons...",We combined and extended sets which were previ...,"borrowed-from-social-sciences, prior-work",Identifying Biases in Politically Biased Wikis...,https://github.com/MKnoche/wiki_bias_embedding,black_names-Knoche_et_al_2019


In [22]:
# Show the rows for the highlighted "roles" seed sets.
seeds.loc[seeds["Seeds ID"].isin(highlighted_ids["roles"])]

Unnamed: 0,Category,Seeds,Source / Justification,Source Categories,Used in Paper,Link,Seeds ID
48,black roles,"[slave, musician, runner, criminal, homeless]",For race we consulted a number of different so...,borrowed-from-social-sciences,Black is to Criminal as Caucasian is to Police...,https://github.com/TManzini/DebiasMulticlassWo...,black_roles-Manzini_et_al_2019
49,caucasian roles,"[manager, executive, redneck, hillbilly, leade...",For race we consulted a number of different so...,borrowed-from-social-sciences,Black is to Criminal as Caucasian is to Police...,https://github.com/TManzini/DebiasMulticlassWo...,caucasian_roles-Manzini_et_al_2019


#### Generated Seed Sets

In [23]:
# Build 50 generated seed sets, each from a model drawn at random (with
# replacement) from the loaded models.
# NOTE(review): n=24 presumably yields sets of 25 words (a starting word plus
# 24 others) -- confirm against utils.generate_seed_set.
generated_seeds = list(
    map(
        lambda sampled_model: utils.generate_seed_set(sampled_model, n=24),
        tqdm(random.choices(models, k=50)),
    )
)

100%|██████████████████████████████████████████████████| 50/50 [00:06<00:00,  7.86it/s]


## Processing

### Embedding

In [24]:
# Embed every gathered seed set against the loaded models, in the same order
# as `gathered_seeds`, using the "average" query strategy.
gathered_seeds_embeddings = list(
    map(
        lambda words: utils.get_embeddings(words, models, query_strat="average"),
        gathered_seeds,
    )
)

In [25]:
# Embed every generated seed set the same way as the gathered ones
# ("average" query strategy over the loaded models).
generated_seeds_embeddings = list(
    map(
        lambda words: utils.get_embeddings(words, models, query_strat="average"),
        generated_seeds,
    )
)

### Pairing

#### Gathered Seed Sets

Here we manually group gathered seed sets, based on whether they originate from the same paper/dataset and whether the pairing between the seed sets makes sense (human judgement).

In [26]:
# Inspect one seed-set row by position; change the number to browse other rows
# when deciding on the manual groupings below.
seeds.iloc[0]

Category                                                           pleasant
Seeds                     [caress, freedom, health, love, peace, cheer, ...
Source / Justification    Implicit Association Test (Greenwald et al., 1...
Source Categories                             borrowed-from-social-sciences
Used in Paper             Semantics derived automatically from language ...
Link                      https://dataverse.harvard.edu/dataset.xhtml?pe...
Seeds ID                                       pleasant-Caliskan_et_al_2017
Name: 0, dtype: object

In [27]:
# Manually curated groups of row positions in `seeds` (found by browsing with
# the seeds.iloc[num] cell above, for each num). Each inner list is one group
# of seed sets that belong together; a later cell expands every group into all
# of its 2-element combinations.
group_indices = [
    [0, 1],
    [2, 3],
    [4, 5],
    [6, 7],
    [8, 9],
    [10, 11],
    [12, 13],
    [14, 15],
    [18, 19],
    [20, 21],
    [22, 23],
    # need to skip a few here
    [28, 29],
    [30, 31],
    [32, 33],
    [34, 35],
    # skip "equalize", "gender specific"
    [40, 41],
    [42, 43],
    # here we have black, white and asian triplets
    [45, 46, 47],
    [48, 49, 50],
    # and jew, muslim, christian triplets
    [52, 53, 54],
    # and jewish, muslim, christian triplets
    [55, 56, 57],
    # skip some tests, now back to pairs
    [60, 61],
    # skip some weird ones
    [66, 67],
    # black, asian, white, hispanic, russian, chinese names
    [68, 69, 70, 71, 72, 73],
    # back to pairs
    [78, 79],
    [80, 81],
    # skip a few unpaired ones
    [88, 89],
    [90, 91],
    [92, 93],
    [94, 95],
    [96, 97],
    [98, 99],
    [100, 101],
    [102, 103],
    [104, 105],
    [106, 107],
    [108, 109],
    # career vs violence doesn't seem an appropriate pair
    [112, 113],
    # white collar job, blue collar job, domestic work, occupation quadruplet
    [115, 116, 117, 118],
    [119, 120],
    [121, 122],
    [123, 124],
    # male/female singular/plural: listed as four explicit pairs rather than
    # one group, so only these combinations are produced
    [126, 127],
    [126, 128],
    [128, 129],
    [127, 129],
    # back to normal pairs
    [131, 132],
    [133, 134],
    [135, 136],
    [137, 138],
    # christianity, islam, atheism
    [139, 140, 141],
    [142, 143],
    [144, 145],
    [146, 147],
    [148, 149],
    [150, 151, 152],
    [153, 154],
    [155, 156],
    [157, 158],
]

In [28]:
# Expand every group into all of its 2-element combinations and flatten the
# result into a single list of [i, j] position pairs.
pair_indices = list(
    map(
        list,
        itertools.chain.from_iterable(
            itertools.combinations(group, 2) for group in group_indices
        ),
    )
)

In [29]:
# Translate each pair of row positions into the matching pair of "Seeds ID"
# values.
pair_ids = [seeds["Seeds ID"].iloc[pair].tolist() for pair in pair_indices]

In [30]:
# Two-column table of the pairings: one row per pair, holding the "Seeds ID"
# of each member in columns ID_A and ID_B.
pair_df = pd.DataFrame.from_records(pair_ids, columns=["ID_A", "ID_B"])

In [33]:
# Persist the pairings as CSV (without the index column) in the current
# working directory.
pair_df.to_csv("seed_set_pairings.csv", index=False)

In [None]:
# Rebuild the list of [ID_A, ID_B] pairs from the pairing table. This
# overwrites the `pair_ids` computed earlier with the table's contents.
pair_ids = [list(x) for x in pair_df.to_records(index=False)]

In [None]:
# Resolve each seed-set ID to its row *position* in `seeds`.
# Positions (not index labels) are what is needed, because the embeddings list
# built from seeds["Seeds"] is indexed positionally. Looking IDs up one by one
# also keeps each pair in (ID_A, ID_B) order and always yields exactly two
# entries per pair, which a boolean `isin` filter does not guarantee.
seed_pos_by_id = {
    seed_id: pos for pos, seed_id in enumerate(seeds["Seeds ID"])
}
pair_idxs = [
    [seed_pos_by_id[id_a], seed_pos_by_id[id_b]] for (id_a, id_b) in pair_ids
]

In [None]:
# For every index pair, collect the two corresponding gathered seed-set
# embeddings as a 2-element list.
gathered_emb_pairs = [
    [gathered_seeds_embeddings[first], gathered_seeds_embeddings[second]]
    for first, second in pair_idxs
]

#### Generated Seed Sets

In [None]:
# Generated seed sets have no curated grouping, so every unordered pair of
# generated embeddings is used.
generated_emb_pairs = list(itertools.combinations(generated_seeds_embeddings, 2))

### PCA Explained Variance

#### Gathered Seed Sets

In [None]:
# Run the project's PCA helper on each gathered embedding pair.
# NOTE(review): the third argument (10) is presumably the number of principal
# components -- confirm against metrics.do_pca_embeddings.
gathered_pca_models = [
    metrics.do_pca_embeddings(emb_a, emb_b, 10)
    for emb_a, emb_b in gathered_emb_pairs
]

In [None]:
# Explained-variance ratio of the first component for each gathered pair;
# pairs for which the PCA helper returned None are recorded as NaN.
gathered_exp_var = [
    np.nan if pca is None else pca.explained_variance_ratio_[0]
    for pca in gathered_pca_models
]

#### Generated Seed Sets

In [None]:
# Run the project's PCA helper on each generated embedding pair.
# NOTE(review): the third argument (10) is presumably the number of principal
# components -- confirm against metrics.do_pca_embeddings.
generated_pca_models = [
    metrics.do_pca_embeddings(emb_a, emb_b, 10)
    for emb_a, emb_b in generated_emb_pairs
]

In [None]:
# Explained-variance ratio of the first component for each generated pair;
# pairs for which the PCA helper returned None are recorded as NaN.
generated_exp_var = [
    np.nan if pca is None else pca.explained_variance_ratio_[0]
    for pca in generated_pca_models
]

### Set Similarity

In [None]:
# Similarity score for each gathered embedding pair.
# NOTE(review): the meaning of the third positional argument (False) is not
# visible here -- see metrics.set_similarity.
gathered_set_sim = [
    metrics.set_similarity(emb_a, emb_b, False)
    for emb_a, emb_b in gathered_emb_pairs
]

In [None]:
# Similarity score for each generated embedding pair.
# NOTE(review): the meaning of the third positional argument (False) is not
# visible here -- see metrics.set_similarity.
generated_set_sim = [
    metrics.set_similarity(emb_a, emb_b, False)
    for emb_a, emb_b in generated_emb_pairs
]

### Additional Processing

In [None]:
# Linear (degree-1) fit of explained variance against set similarity for the
# generated seed data.
# generated_exp_var holds NaN for pairs whose PCA was unavailable, and a single
# NaN makes np.polyfit fail or return NaN coefficients, so the fit is
# restricted to points where both coordinates are finite.
gen_fit_x = np.asarray(generated_set_sim, dtype=float)
gen_fit_y = np.asarray(generated_exp_var, dtype=float)
gen_fit_mask = np.isfinite(gen_fit_x) & np.isfinite(gen_fit_y)

gen_coef = np.polyfit(gen_fit_x[gen_fit_mask], gen_fit_y[gen_fit_mask], 1)
# Callable polynomial so the fitted line can be evaluated at arbitrary x.
gen_poly1d_fn = np.poly1d(gen_coef)

## Plotting

In [None]:
# Scatter of first-component explained variance against set similarity for
# generated and gathered seed-set pairs, with the linear fit of the generated
# data overlaid. Each series is drawn in its own call so it can carry its own
# legend label; colours, sizes and draw order match the original figure.
fig, ax = plt.subplots(1, 1)

fig.set_size_inches(w=6.50127, h=5)

# Generated pairs (blue markers).
ax.plot(
    generated_set_sim,
    generated_exp_var,
    "o",
    color="#B8CCE1",
    markersize=10,
    markerfacecolor="#B8CCE1",
    markeredgecolor="white",
    label="Generated seed set pairs",
)
# Linear fit of the generated pairs (blue line).
ax.plot(
    generated_set_sim,
    gen_poly1d_fn(generated_set_sim),
    color="#B8CCE1",
    linewidth=3,
    label="Linear fit (generated)",
)
# Gathered pairs (red markers), drawn last so they sit on top.
ax.plot(
    gathered_set_sim,
    gathered_exp_var,
    "o",
    markersize=10,
    markerfacecolor="#F1B7B0",
    markeredgecolor="white",
    label="Gathered seed set pairs",
)

# A figure must be readable on its own: name both axes and the series.
ax.set_xlabel("Set similarity")
ax.set_ylabel("Explained variance ratio (first principal component)")
ax.legend()

plt.show()