# Table 4: Coherence and Seed Set Semantic Similarity

## Imports and Constants

In [1]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are reloaded before each cell runs and edits to local packages take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os, string, random
import numpy as np

# If a `notebooks` directory exists next to the current working directory
# (e.g. when the kernel was started inside it), switch to the sibling
# `badseeds` directory. Relative paths used later in this notebook, such as
# "../config.json" and "./seed_set_pairings.csv", are then resolved from there.
if os.path.isdir("../notebooks/"):
    os.chdir("../badseeds/")

In [4]:
import json
import pandas as pd
import gensim.models as gm
from tqdm import tqdm

from badseeds import seedbank, tab_a2
from badseeds.utils import generate_seed_set

In [3]:
# constants
# SEED is handed to both numpy's and Python's random number generators before
# the generated seed sets are sampled.
SEED = 42
# Location of the JSON config file, relative to the working directory.
CONFIG_PATH = "../config.json"

## Loading Files

In [4]:
# Parse the JSON config, which holds the directory paths for the seeds and the
# embedding models. Point CONFIG_PATH elsewhere if the file has moved.
with open(CONFIG_PATH) as config_file:
    config = json.load(config_file)

In [5]:
# load our gathered seeds
# Build the path with os.path.join so it is correct whether or not the
# configured directory ends with a path separator.
seeds = seedbank.seedbanking(os.path.join(config["seeds"]["dir_path"], "seeds.json"))

In [6]:
# load embeddings
# Every *.kv file in the configured model directory is loaded as a gensim
# KeyedVectors object.
models = []
models_dir = os.path.join(
    config["models"]["dir_path"], config["models"]["nyt_subpath"]["10"]
)
# os.listdir returns entries in arbitrary order. Sorting pins the order of
# `models`, so the seeded random.choices(models, ...) call later in the
# notebook picks the same models on every machine.
for file in sorted(os.listdir(models_dir)):
    if file.endswith(".kv"):
        models.append(gm.KeyedVectors.load(os.path.join(models_dir, file)))

# Fail early: every later cell iterates over `models`.
if len(models) == 0:
    raise ValueError("No embeddings found in directory.")

## Generating Table

#### Gathered Seeds

In [7]:
# Build one table row set per embedding model for the gathered seeds.
# pairing_method="file" is used together with the pairings CSV; presumably the
# seed-set pairs are read from that file (confirm in tab_a2).
all_coherence = []
for model in tqdm(models, unit="model"):
    all_coherence.append(
        tab_a2.build_row_table_a2(
            model, seeds, pairing_method="file", pair_path="./seed_set_pairings.csv"
        )
    )

# Collapse the per-model results into a single aggregate.
coh_avg_gath = tab_a2.agg_coherence(all_coherence)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:32<00:00,  1.65s/model]


#### Generated Seeds

For generated seeds, we first have to generate _nice enough_ seed sets (i.e., seed sets whose words contain only printable ASCII characters). The random number generators are initialised with the `SEED` constant defined in the first section.

In [8]:
# Characters a generated word may contain. string.printable covers ASCII
# letters, digits, punctuation and whitespace, so words such as "foot-8" pass.
check = string.printable

# random.choices below draws from Python's `random`; numpy is seeded as well,
# presumably because generate_seed_set samples through numpy (confirm).
np.random.seed(SEED)
random.seed(SEED)

# Pick 50 models (with replacement). For each one, keep generating seed sets
# until every character of every word is in `check`.
sampled = []
for model in random.choices(models, k=50):
    while True:
        s = generate_seed_set(model)
        if all(c in check for w in s for c in w):
            sampled.append(s)
            break
g_seeds = pd.DataFrame(data=pd.Series(sampled), columns=["Seeds"])

# check for duplicates in seeds
# Each seed set is compared by its string form, so two sets count as
# duplicates only when they list the same words in the same order. `.any()`
# inspects the values; `in` on a Series would test the index labels instead.
if g_seeds["Seeds"].map(str).duplicated().any():
    raise ValueError("Duplicate seeds found.")

# uncomment below to visualize seeds
# g_seeds

Next, we generate the table in a similar manner to the gathered seeds.

In [9]:
# Label for the generated-seed results (not referenced elsewhere in the
# visible cells).
prefix = "Generated"

# Build one table row set per embedding model for the generated seeds, this
# time with pairing_method="all" and no pairing file.
all_coherence = []
for model in tqdm(models, unit="model"):
    all_coherence.append(
        tab_a2.build_row_table_a2(model, g_seeds, pairing_method="all")
    )

# Collapse the per-model results into a single aggregate.
coh_avg_gen = tab_a2.agg_coherence(all_coherence)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [08:17<00:00, 24.86s/model]


## Displaying Table

In [10]:
# Prepare both aggregates for display. The cleaned tables replace the raw
# aggregates under the same names, so re-running this cell passes
# already-cleaned tables back into clean_tab_a2.
# NOTE(review): the gathered `seeds` object is passed for the generated table
# as well; confirm whether `g_seeds` was intended there.
coh_avg_gath = tab_a2.clean_tab_a2(coh_avg_gath, seeds, "Gathered")
coh_avg_gen = tab_a2.clean_tab_a2(coh_avg_gen, seeds, "Generated")

# Temporarily limit pandas to 9 rows and 30 characters per column while printing.
display_options = ("display.max_rows", 9, "display.max_colwidth", 30)
with pd.option_context(*display_options):
    for table in (coh_avg_gath, coh_avg_gen):
        print(table)

    Coherence                 Gathered Set A                 Gathered Set B
2       0.999  CAREER: executive, managem...  FAMILY: home, parents, chi...
3       0.999  CAREER: executive, managem...  FAMILY: home, parents, chi...
15      0.968  MALE: brother, father, unc...  FEMALE: sister, mother, au...
32      0.942  TERRORISM: terror, terrori...  OCCUPATIONS: banker, carpe...
..        ...                            ...                            ...
29      0.100  NAMES HISPANIC: ruiz, alva...  NAMES WHITE: harris, nelso...
16      0.093  MALE NAMES: john, paul, mi...  FEMALE NAMES: amy, joan, l...
26      0.053  NAMES BLACK: harris, robin...  NAMES WHITE: harris, nelso...
21      0.026  NAMES ASIAN: cho, wong, ta...  NAMES CHINESE: chung, liu,...

[34 rows x 3 columns]
     Coherence                Generated Set A                Generated Set B
603      1.000  know, believe, think, gues...  governor, mayor, legislatu...
460      1.000  foot-8, foot-7, foot-3, fo...  rousteing, atkin