In [93]:
import os
import time

# Wall-clock reference point; the last timing cell subtracts it from the
# current time to report how long the whole notebook took.
start = time.time()

# If a sibling "notebooks" directory is visible from the parent, step up one
# level so later relative paths are resolved from the parent directory.
_parent_has_notebooks = os.path.isdir("../notebooks/")
if _parent_has_notebooks:
    os.chdir("..")

In [94]:
import json
import random
import itertools
%load_ext memory_profiler

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gensim.models as gm
from tqdm import tqdm
from gensim.models import KeyedVectors
import seaborn as sns

from badseeds import replicate_fig2
from badseeds import utils, seedbank, metrics


The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


In [95]:
# Path to the JSON config file that holds the dataset paths; later cells read
# config["models"] and config["seeds"] from it. Resolved relative to the
# current working directory, so change it if your layout differs.
CONFIG_PATH = "./config.json"

In [96]:
# Parse the JSON config into a plain dict. The encoding is pinned to UTF-8 so
# the file is decoded the same way on every platform instead of depending on
# the locale default.
with open(CONFIG_PATH, "r", encoding="utf-8") as config_file:
    config = json.load(config_file)

In [97]:
# Pin both global random number generators (Python's and NumPy's) to the same
# fixed seed so that repeated runs draw identical random values.
random.seed(42)
np.random.seed(42)

## Models and Data

### Models

> load in models

In [None]:
%%memit
# datasets[i] collects every model found in the i-th configured model folder.
datasets = []

# Keys under config["models"]; each maps to a sub-path (entry "0") that is
# joined onto config["models"]["dir_path"].
filenames = [
    "goodreads_r_subpath",
    "goodreads_hb_subpath",
]

for subpath_key in filenames:
    model_dir = os.path.join(
        config["models"]["dir_path"], config["models"][subpath_key]["0"]
    )

    models = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the model
    # order (and anything computed from it) identical across runs and machines.
    for entry in sorted(os.listdir(model_dir)):
        model_path = os.path.join(model_dir, entry)

        # Skip sub-directories and ".npy" files (presumably the array files
        # gensim writes next to each saved model). The suffix is tested on the
        # file name itself, so a ".npy" elsewhere in the directory path cannot
        # filter out every model by accident.
        if os.path.isfile(model_path) and not entry.endswith(".npy"):
            models.append(KeyedVectors.load(model_path))

    datasets.append(models)

# Sanity check: number of models loaded for the first dataset.
print(len(datasets[0]))

20


## Seeds

In [None]:
%%memit
# Load the seed bank. os.path.join is used instead of string concatenation so
# the path is correct whether or not config["seeds"]["dir_path"] ends with a
# path separator.
# NOTE(review): index=True presumably indexes the table by seed-set name, which
# the .loc lookups below rely on; confirm against seedbank.seedbanking.
seeds = seedbank.seedbanking(
    os.path.join(config["seeds"]["dir_path"], "seeds.json"), index=True
)


# Seed sets to evaluate, identified by their row label in the seed bank.
seed_sets = [
    "female-Kozlowski_et_al_2019",
    "female_1-Caliskan_et_al_2017",
    "definitional_female-Bolukbasi_et_al_2016",
    "female_singular-Hoyle_et_al_2019",
    "female_definition_words_2-Zhao_et_al_2018",
    "female_stereotype_words-Zhao_et_al_2018",
]

# Alternative selection (male seed sets); swap with the list above to use it.
# seed_sets = [
#     "male_1-Caliskan_et_al_2017",
#     "male_2-Caliskan_et_al_2017",
#     "male_pairs-Garg_et_al_2018",
#     "male_definition_words_2-Zhao_et_al_2018",
#     "male_stereotype_words-Zhao_et_al_2018",
#     "male_2-Rudinger_et_al_2017",
# ]


# For every selected seed set, take its "Seeds" entry and lower-case each word.
extracted_seeds = [
    [word.lower() for word in seeds.loc[seed_set]["Seeds"]]
    for seed_set in seed_sets
]

#### Frequency of the target word used for cosine similarity

In [None]:
# Look up the "count" attribute stored for the word "unpleasant" in the first
# model of the first dataset; the value is shown as the cell output.
first_model = datasets[0][0]
first_model.get_vecattr("unpleasant", "count")

## Function call 

In [None]:
%%memit
# Run the figure-2 computation on the lower-cased seed sets and loaded models.
# Later cells index the result by dataset (similarity[0], similarity[1]) and
# pair each element of such an entry with one seed set.
# NOTE(review): the exact return type is defined in replicate_fig2.figure_2,
# which is not visible here; confirm before relying on it.
similarity = replicate_fig2.figure_2(extracted_seeds, datasets)

In [None]:
%%memit
# Dump the raw results: for each entry of `similarity`, print every seed set
# followed by the values paired with it.
for per_dataset in similarity:
    for seed_words, scores in zip(extracted_seeds, per_dataset):
        print(seed_words, "\n")
        print(scores, "\n \n")

## Visualization

In [None]:
# Build a long-format frame: one row per individual similarity value, tagged
# with the seed set and dataset it belongs to.
# NOTE(review): the loading cell reads "goodreads_r_subpath" before
# "goodreads_hb_subpath", yet index 0 is labelled "history and biography" here.
# Confirm the order in which replicate_fig2.figure_2 returns its results.
dataset_labels = ["history and biography", "romance"]
frames = [
    pd.DataFrame(
        # One label per seed set. Sizing the label list by len(seeds) (the
        # whole seed bank) only worked because zip() stops at the shortest
        # input.
        zip(similarity[idx], seed_sets, [label] * len(seed_sets)),
        columns=["cosine similarity", "seed set", "dataset"],
    )
    for idx, label in enumerate(dataset_labels)
]

# explode() turns each list of similarities into one row per value; the index
# is reset because concat + explode leave duplicated index labels behind.
df = pd.concat(frames).explode("cosine similarity").reset_index(drop=True)
df["cosine similarity"] = df["cosine similarity"].astype("float")

# Creating plot: box plot per seed set and dataset, with the individual values
# overlaid as jittered points. Both layers are drawn on the same explicit axes.
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots()
sns.boxplot(
    x="cosine similarity",
    y="seed set",
    hue="dataset",
    data=df,
    palette="Accent",
    ax=ax,
)
sns.stripplot(
    x="cosine similarity",
    y="seed set",
    hue="dataset",
    data=df,
    jitter=True,
    palette="Accent",
    dodge=True,
    linewidth=1,
    edgecolor="gray",
    ax=ax,
)

# Rebuild the legend with one entry per dataset. Matplotlib 3.7 renamed
# Legend.legendHandles to legend_handles and later removed the old name, so
# try the new attribute first and fall back for older versions.
legend = ax.get_legend()
handles = getattr(legend, "legend_handles", None)
if handles is None:
    handles = legend.legendHandles
ax.legend(handles[: len(dataset_labels)], dataset_labels)
ax.set_xlabel("cosine similarity to unpleasantness")


# show plot
plt.show()


In [None]:
# Report the seconds elapsed since `start` was recorded in the first cell.
elapsed = time.time() - start
print("Time taken is: ", elapsed)

In [None]:
# Save the figure as a PDF (e.g. for inclusion in a LaTeX document).
# plt.rc("pgf", texsystem="pdflatex")
# Create the output folder first; savefig raises FileNotFoundError otherwise.
os.makedirs("images", exist_ok=True)
# NOTE(review): the active seed_sets list contains the female seed sets while
# this file name says "male"; confirm the intended name before publishing.
fig.savefig("images/fig_2_male.pdf", bbox_inches='tight', dpi = 600)