# Embedding evaluation

This notebook contains the code to evaluate the different embedding models generated in the `embedding-generation.ipynb` notebook.

Paper reference: _Spliethöver, Keiff, Wachsmuth (2022): "No Word Embedding Model Is Perfect: Evaluating the Representation Accuracy for Social Bias in the Media", EMNLP 2022, Abu Dhabi._

Code & Data reference: https://github.com/webis-de/EMNLP-22

## Data preparation and loading

Please run the following two cells before evaluating any of the embedding models. They load the most common packages and set commonly used variables, and they are required by all evaluation cells below.

In [None]:
import json
import logging
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import seaborn as sns
import sqlite3
import sys

from os import chdir, getcwd, listdir, path
from collections import Counter
from gensim.models import KeyedVectors, Word2Vec
from tqdm.notebook import tqdm
from wefe.metrics import WEAT
from wefe.word_embedding_model import WordEmbeddingModel
from wefe.query import Query

# Absolute path of the project's `src` directory, resolved relative to the
# current working directory of the notebook kernel.
PARENT_DIR = path.abspath("../src")
# Make the project packages below importable from the notebook.
sys.path.append(PARENT_DIR)
from embedding_bias.config import (
    CRAWL_FIRST_YEAR, CRAWL_LAST_YEAR, SENTENCE_ENDING_TOKEN, NEWS_ARTICLE_DB_NAME)
from embedding_bias.preprocessing import preprocess_text
# NOTE(review): wildcard import. Helpers that are called but never defined in this
# notebook (benchmark_models, evaluate_models, get_test_vocabulary,
# get_embedding_dict_from_pytorch, dict_to_word2vec_file) presumably come from
# here -- confirm and consider importing them explicitly.
from embedding_bias.util import *

# Register tqdm's `progress_apply` on pandas objects (used when flattening tokens).
tqdm.pandas()

In [None]:
# Project data directory: a sibling of the `src` directory that PARENT_DIR points to.
# PARENT_DIR is a plain string (returned by `path.abspath`) and therefore has no
# `.parent` attribute; its parent directory is taken with `path.dirname` instead.
DATA_DIR = path.join(path.dirname(PARENT_DIR), "data")
DB_PATH = path.join(DATA_DIR, "raw", NEWS_ARTICLE_DB_NAME)
ALLSIDES_RANKING_PATH = path.join(DATA_DIR, "raw", "allsides-ranking.csv")
OUTLET_CONFIG_PATH = path.join(DATA_DIR, "raw", "outlet-config.json")
WORD_SETS_PATH = path.join(DATA_DIR, "raw", "word-sets.json")

# Target sqlite database
# (sqlite3 creates an empty database file if DB_PATH does not exist yet)
target_db_connection = sqlite3.connect(DB_PATH)

# Outlet config file
outlet_config = pd.read_json(OUTLET_CONFIG_PATH)
# Alias of the full config (no copy, no filtering)
outlet_selection = outlet_config

# Word sets
with open(WORD_SETS_PATH, "r") as f:
    word_sets = json.load(f)

# Allsides ranking
allsides_ranking = pd.read_csv(ALLSIDES_RANKING_PATH)

# Evaluation metrics
METRICS = [WEAT()]

# Groups of political orientations: the three coarse groups used throughout the
# notebook, each mapped to the orientation labels it aggregates.
orientation_groups = {
    "left": ["Lean Left", "Left"],
    "center": ["Center"],
    "right": ["Lean Right", "Right"]}

## Word similarity benchmarks

In [None]:
from embedding_evaluation.evaluate import Evaluation
from embedding_evaluation.load_embedding import load_embedding_textfile

In [None]:
# Directory with the word-similarity benchmark data shipped with the
# `embedding_evaluation` package inside the project's `src` directory.
SIM_EVAL_DATA_DIR = path.join(
    PARENT_DIR, "embedding_evaluation", "embedding_evaluation", "data")
# NOTE(review): this variable is set after `embedding_evaluation` was imported in
# the previous cell -- confirm the library reads it lazily and not at import time.
os.environ["EMBEDDING_EVALUATION_DATA_PATH"] = SIM_EVAL_DATA_DIR

# Output directory for the per-model benchmark result files. It is created up
# front because later cells list it and write JSON files into it, which fails on
# a first run if the directory is missing.
BENCHMARK_RESULTS_DIR = path.join(DATA_DIR, "processed", "embedding-benchmark-results")
os.makedirs(BENCHMARK_RESULTS_DIR, exist_ok=True)

### Representation Degeneration error analysis test generation
_Generates the test files needed for the representation degeneration error analysis. Needs to be run only the first time. After that, the test files are saved to disk._

In [None]:
# Number of least common tokens to return
LEAST_COMMON = 100

# Everything below lives in the "processed" part of the data directory
PROCESSED_DIR = path.join(DATA_DIR, "processed")

# Pickled cache of the tokenized articles
W2V_ARTICLE_PREPROCESS_CACHE = path.join(PROCESSED_DIR, "nato", "articles-preproc-cache.pkl")

# Directory holding the embedding model similarity evaluation files, and the
# names of the entries currently stored in it
SIM_EVAL_DIR = path.join(PROCESSED_DIR, "embedding-benchmark-results")
MODEL_SIM_EVAL_RESULTS = listdir(SIM_EVAL_DIR)

In [None]:
def _read_similarity_pairs(file_path, separator):
    """Read a word-similarity benchmark file.

    The first line of the file is always skipped and blank lines are ignored.
    Every remaining line is split on ``separator``.

    Args:
        file_path: Path of the benchmark file.
        separator: Field separator used in the file.

    Returns:
        A tuple ``(pairs, tokens)``. ``pairs`` is the list of split lines and
        ``tokens`` the flat list of all fields except the last one of each line.
    """
    with open(file_path, "r", encoding="utf-8") as benchmark_file:
        data_lines = benchmark_file.read().splitlines()[1:]
    # Blank lines would otherwise become one-element "pairs" and break later indexing
    pairs = [line.split(separator) for line in data_lines if line.strip()]
    tokens = [token for pair in pairs for token in pair[:-1]]
    return pairs, tokens


# Data loading
# NOTE: pickle.load can execute arbitrary code -- only load a cache file that was
# produced by this project.
with open(W2V_ARTICLE_PREPROCESS_CACHE, "rb") as f:
    articles = pickle.load(f)

# Retrieve evaluation tokens from disk
# NOTE(review): the first line is skipped for both files; confirm that the MEN file
# really starts with a header line, otherwise its first pair is dropped.
men_eval_file = path.join(SIM_EVAL_DATA_DIR, "men", "MEN_dataset_natural_form_full")
men_pairs, men_tokens = _read_similarity_pairs(men_eval_file, " ")

ws353_eval_file = path.join(SIM_EVAL_DATA_DIR, "wordsim", "combined.csv")
ws353_pairs, ws353_tokens = _read_similarity_pairs(ws353_eval_file, ",")

In [None]:
# Flattening article tokens, as there is no need for sentence structure in this case.
# The column is only computed once; re-running the cell reuses it.
if "tokens_flat" not in articles.columns:
    print("Flattening article tokens.")
    articles["tokens_flat"] = articles.text_prep.progress_apply(
        lambda sentences: [token for sentence in sentences for token in sentence])

# Benchmark vocabularies as sets: the membership test below runs once per distinct
# corpus token, which is slow against a list.
men_token_set = set(men_tokens)
ws353_token_set = set(ws353_tokens)

# Count tokens in each orientation to retrieve the least common ones
orientation_least_common = {}
print("Retrieving least common tokens.")
for orientation, grouping in orientation_groups.items():
    orientation_articles = articles[articles.orientation.isin(grouping)].tokens_flat

    # Count article by article instead of first building one huge flat token list
    token_counter = Counter()
    for article in tqdm(orientation_articles):
        token_counter.update(article)

    # Ascending by frequency; the sort is stable, so tokens with equal counts stay
    # in first-seen order.
    least_common = sorted(token_counter.items(), key=lambda item: item[1])

    # Per benchmark: the LEAST_COMMON rarest corpus tokens that occur in that benchmark
    orientation_least_common[orientation] = {
        "men": [token for token, _ in least_common if token in men_token_set][:LEAST_COMMON],
        "ws353": [token for token, _ in least_common if token in ws353_token_set][:LEAST_COMMON]}

In [None]:
# For every orientation and benchmark, keep those benchmark pairs in which at
# least one of the two words is among the rare tokens. Any benchmark name other
# than "men" is looked up in the WS353 pairs.
orientation_rare_pairs = {}
for orientation, tests in orientation_least_common.items():
    orientation_rare_pairs[orientation] = {
        test: [
            (pair[0], pair[1], pair[2])
            for pair in (men_pairs if test == "men" else ws353_pairs)
            if pair[0] in rare_tokens or pair[1] in rare_tokens]
        for test, rare_tokens in tests.items()}

In [None]:
# Export rare tokens into a file format that the evaluation library understands
for orientation, tests in orientation_rare_pairs.items():
    for test, pairs in tests.items():
        with open(f"{SIM_EVAL_DATA_DIR}/{test}-rare/{orientation}.test", "w") as f:
            for pair in pairs:
                f.write(f"{pair[0]} {pair[1]} {float(pair[2])}\n")

### word2vec models (Static embeddings)

In [None]:
# Name of this model family. The benchmark cell below uses MODEL_NAME for its
# results file name; without this definition a fresh kernel raises a NameError
# there, and a stale kernel silently overwrites another family's results file.
# NOTE(review): the name follows the models directory, like the other sections --
# confirm this is the intended results file name.
MODEL_NAME = "nato-w2v"
MODELS_PATH = path.join(DATA_DIR, "models", MODEL_NAME)
MODEL_LIST = [
    "left.model",
    "center.model",
    "right.model"]

# Load previously trained models from disk; only the word vectors (`.wv`) are kept
loaded_models = {}
for i, model in enumerate(MODEL_LIST):
    print("=" * 10, f"{i+1}/{len(MODEL_LIST)} Loading model", model)
    model_file_name = path.join(MODELS_PATH, model)
    loaded_models[model] = Word2Vec.load(model_file_name).wv

In [None]:
# Run the similarity benchmarks on every loaded model and keep both the full
# results and the summary.
full_results, benchmark_results = benchmark_models(
    loaded_models, return_full_json=True)

# Persist the full results as <MODEL_NAME>.json in the benchmark results directory
model_results_file = path.join(BENCHMARK_RESULTS_DIR, f"{MODEL_NAME}.json")
with open(model_results_file, mode="w") as f:
    json.dump(full_results, f, indent=4)

print(benchmark_results)

### Decontextualized embeddings (Decontext)

In [None]:
MODEL_NAME = "contextualized2static"
MODELS_PATH = path.join(DATA_DIR, "models", MODEL_NAME)
# One model file per political orientation group
MODELS = ["left.model", "center.model", "right.model"]

# Read every model from its text word2vec file, keyed by file name
loaded_models = {}
for i, model in enumerate(MODELS):
    print("=" * 10, f"{i+1}/{len(MODELS)} Loading model", model)
    model_file_name = path.join(MODELS_PATH, model)
    vectors = KeyedVectors.load_word2vec_format(model_file_name, binary=False)
    loaded_models[model] = vectors

In [None]:
# Run the similarity benchmarks on every loaded model and keep both the full
# results and the summary.
full_results, benchmark_results = benchmark_models(
    loaded_models, return_full_json=True)

# Persist the full results as <MODEL_NAME>.json in the benchmark results directory
model_results_file = path.join(BENCHMARK_RESULTS_DIR, f"{MODEL_NAME}.json")
with open(model_results_file, mode="w") as f:
    json.dump(full_results, f, indent=4)

print(benchmark_results)

### Frequency agnostic embeddings (FreqAgn)

In [None]:
# Remember the notebook's working directory; a later cell changes back to it
normal_base_dir = getcwd()

# Switch into the FRAGE language-model directory so that the FRAGE modules
# imported in the next cell are resolved from there.
FRAGE_PATH = path.join(PARENT_DIR, "Frequency-Agnostic")
frage_lm_dir = path.join(FRAGE_PATH, "lm")
chdir(frage_lm_dir)

In [None]:
import hashlib
import torch

# FRAGE imports
import model
import data as data

In [None]:
# Load trained FRAGE model and corpus for each orientation
MODEL_NAME = "frage-lstm"
# (trained model checkpoint, corpus file in AWD-LSTM format) per orientation
model_files = [
    ("left-frage-v0--b600.pt", "left/frage-corpus.7899431f957ce95000ec90d10e1fa2d0.data"),
    ("center-frage-v1--b600.pt", "center/frage-corpus.f1ec93b6d6495c8ade1ce55d0f0c99e7.data"),
    ("right-frage-v0--b600.pt", "right/frage-corpus.b381924fee77a9f10329361df32cf68d.data")]

loaded_models = {}
# The loop variables are deliberately not called `model`, so that the FRAGE
# `model` module imported in the previous cell is not rebound to a tuple.
for i, (model_file, corpus_file) in enumerate(model_files):
    print("=" * 10, f"{i+1}/{len(model_files)} Loading model", model_file)
    model_path = path.join(DATA_DIR, "models", MODEL_NAME, model_file)
    # Cache of the extracted embedding, stored next to the checkpoint
    kv_model_path = f"{model_path}.kv"

    if not path.exists(kv_model_path):
        print("No cache found. Retrieving embedding from trained model.")
        # NOTE: torch.load unpickles arbitrary objects -- only load trusted checkpoints
        with open(model_path, "rb") as f:
            pt_model, criterion, optimizer, epoch = torch.load(f)

        model_corpus_name = path.join(DATA_DIR, "processed", "corpus-awd-lstm-format", corpus_file)
        model_corpus = torch.load(model_corpus_name)
        model_dictionary = model_corpus.dictionary.word2idx

        print("Retrieving token/vector dict.")
        embedding_dict = get_embedding_dict_from_pytorch(pt_model, model_dictionary)
        # presumably writes the cache file and returns the loaded vectors -- TODO confirm
        loaded_models[model_file] = dict_to_word2vec_file(embedding_dict, kv_model_path)
    else:
        print("Output file already exists. Loading directly instead.")
        loaded_models[model_file] = KeyedVectors.load_word2vec_format(kv_model_path)

In [None]:
# Run the similarity benchmarks on every loaded model and keep both the full
# results and the summary.
full_results, benchmark_results = benchmark_models(
    loaded_models, return_full_json=True)

# Persist the full results as <MODEL_NAME>.json in the benchmark results directory
model_results_file = path.join(BENCHMARK_RESULTS_DIR, f"{MODEL_NAME}.json")
with open(model_results_file, mode="w") as f:
    json.dump(full_results, f, indent=4)

print(benchmark_results)

In [None]:
# Restore the working directory that was saved before switching into the FRAGE
# directory. Requires the FRAGE setup cell of this section to have been run.
chdir(normal_base_dir)

### Fine-tuned BERT models

In [None]:
from flair.data import Sentence
from flair.embeddings import TransformerWordEmbeddings

In [None]:
# Load all models as KeyedVector instances
MODEL_NAME = "bert-finetuned"
MODELS_PATH = path.join(DATA_DIR, "models", MODEL_NAME)
# (model directory, checkpoint number); "-1" marks the BERT base model
MODEL_LIST = [
    ("left", "195500"),
    ("center", "32500"),
    ("right", "66500"),
    ("bert-base", "-1")]

# Tokens to embed: word sets plus the similarity benchmark vocabulary
vocabulary = get_test_vocabulary(word_sets=word_sets, similarity_eval_data_path=SIM_EVAL_DATA_DIR)
vocab_list = list(vocabulary)
loaded_models = {}
for i, model in enumerate(MODEL_LIST):
    # Derive the KeyedVector file name and the location of the transformer weights.
    # Fine-tuned models carry their checkpoint number in both.
    if model[1] != "-1":
        model_name = f"{model[0]}-c{model[1]}.model"
        pt_model_path = path.join(MODELS_PATH, model[0], f"checkpoint-{model[1]}")
    else:
        model_name = f"{model[0]}.model"
        pt_model_path = path.join(MODELS_PATH, model[0])

    kv_model_path = path.join(MODELS_PATH, model_name)
    print("=" * 10, f"{i+1}/{len(MODEL_LIST)} Loading model", model_name)

    # The KeyedVector text file acts as a cache: it is generated once and
    # afterwards only loaded.
    if not path.exists(kv_model_path):
        print(f"{model_name} doesn't exist as KeyedVector instance. Generating first...")
        # Fine-tuned models come from disk, the base model from its hub name
        if model[0] != "bert-base":
            embeddings = TransformerWordEmbeddings(
                model=pt_model_path,
                name=model_name)
        else:
            embeddings = TransformerWordEmbeddings("bert-base-uncased")

        # Embed every vocabulary entry on its own and keep the first token's vector
        embedding_dict = {}
        for token in vocabulary:
            sent = Sentence(token)
            embeddings.embed(sent)
            embedding_dict[token] = sent[0].embedding

        # Text word2vec format: "<count> <dimensions>" header, then one line per token
        with open(kv_model_path, "w") as f:
            f.write(f"{len(embedding_dict)} {len(embedding_dict[vocab_list[0]])}\n")
            for token in vocabulary:
                token_vector = " ".join(map(str, embedding_dict[token].tolist()))
                f.write(f"{token} {token_vector}\n")

    model_key = f"{model[0]}-{model[1]}"
    loaded_models[model_key] = KeyedVectors.load_word2vec_format(kv_model_path)

In [None]:
# Run the similarity benchmarks on every loaded model and keep both the full
# results and the summary.
full_results, benchmark_results = benchmark_models(
    loaded_models, return_full_json=True)

# Persist the full results as <MODEL_NAME>.json in the benchmark results directory
model_results_file = path.join(BENCHMARK_RESULTS_DIR, f"{MODEL_NAME}.json")
with open(model_results_file, mode="w") as f:
    json.dump(full_results, f, indent=4)

print(benchmark_results)

## Social Bias evaluation

### word2vec models (Static embeddings)

In [None]:
MODELS_PATH = path.join(DATA_DIR, "models", "nato-w2v")
# One trained word2vec model per political orientation group
MODEL_LIST = ["left.model", "center.model", "right.model"]

# Read every model from disk and keep only its word vectors, keyed by file name
loaded_models = {}
for i, model in enumerate(MODEL_LIST):
    print("=" * 10, f"{i+1}/{len(MODEL_LIST)} Loading model", model)
    model_file_name = path.join(MODELS_PATH, model)
    trained_model = Word2Vec.load(model_file_name)
    loaded_models[model] = trained_model.wv

In [None]:
# Evaluate all loaded models using specified metrics
# The plotting cell below reads the columns `metric`, `bias_type`, `result` and
# `model` from the returned frame.
# NOTE(review): `threshold` is passed through to the project helper; presumably it
# is the share of missing words tolerated per word set -- confirm in `evaluate_models`.
evaluation_results = evaluate_models(
    models=loaded_models,
    word_sets=word_sets,
    metrics=METRICS,
    threshold=0.4)

In [None]:
# Swarm plot (a strip plot without overlapping points) of the WEAT scores:
# one column per bias type on the x-axis, one colour per model.
sns.set_theme()

is_weat = evaluation_results.metric == "WEAT"
weat_results = evaluation_results[is_weat]
g = sns.catplot(
    data=weat_results,
    kind="swarm",
    x="bias_type",
    y="result",
    hue="model",
    height=8,
    s=20)
# Fixed y-range so that the plots of the different model families are comparable
g.set(ylim=(-2.0, 2.0))
plt.title("WEAT")
plt.show()

print("WEAT\n", weat_results)

### Decontextualized embeddings (Decontext)

In [None]:
MODELS_PATH = path.join(DATA_DIR, "models", "contextualized2static")
# One model file per political orientation group
MODELS = ["left.model", "center.model", "right.model"]

# Read every model from its text word2vec file, keyed by file name
loaded_models = {}
for i, model in enumerate(MODELS):
    print("=" * 10, f"{i+1}/{len(MODELS)} Loading model", model)
    model_file_name = path.join(MODELS_PATH, model)
    vectors = KeyedVectors.load_word2vec_format(model_file_name, binary=False)
    loaded_models[model] = vectors

In [None]:
# Evaluate models
# The plotting cell below reads the columns `metric`, `bias_type`, `result` and
# `model` from the returned frame.
# NOTE(review): `threshold` is passed through to the project helper; presumably it
# is the share of missing words tolerated per word set -- confirm in `evaluate_models`.
evaluation_results = evaluate_models(
    models=loaded_models,
    word_sets=word_sets,
    metrics=METRICS,
    threshold=0.4)

In [None]:
# Swarm plot (a strip plot without overlapping points) of the WEAT scores:
# one column per bias type on the x-axis, one colour per model.
sns.set_theme()

is_weat = evaluation_results.metric == "WEAT"
weat_results = evaluation_results[is_weat]
g = sns.catplot(
    data=weat_results,
    kind="swarm",
    x="bias_type",
    y="result",
    hue="model",
    height=8,
    s=20)
# Fixed y-range so that the plots of the different model families are comparable
g.set(ylim=(-2.0, 2.0))
plt.title("WEAT")
plt.show()

print("WEAT\n", weat_results)

### Frequency agnostic embeddings (FreqAgn)

In [None]:
# Remember the notebook's working directory; a later cell changes back to it
normal_base_dir = getcwd()

# Switch into the FRAGE language-model directory so that the FRAGE modules
# imported in the next cell are resolved from there.
FRAGE_PATH = path.join(PARENT_DIR, "Frequency-Agnostic")
frage_lm_dir = path.join(FRAGE_PATH, "lm")
chdir(frage_lm_dir)

In [None]:
import hashlib
import torch

import model
import data as data

In [None]:
# Load trained FRAGE model and corpus
MODEL_NAME = "frage-lstm"
# (trained model checkpoint, corpus file in AWD-LSTM format) per orientation
model_files = [
    ("left-frage-v0--b600.pt", "left/frage-corpus.7899431f957ce95000ec90d10e1fa2d0.data"),
    ("center-frage-v1--b600.pt", "center/frage-corpus.f1ec93b6d6495c8ade1ce55d0f0c99e7.data"),
    ("right-frage-v0--b600.pt", "right/frage-corpus.b381924fee77a9f10329361df32cf68d.data")]

loaded_models = {}
# The loop variables are deliberately not called `model`, so that the FRAGE
# `model` module imported in the previous cell is not rebound to a tuple.
for i, (model_file, corpus_file) in enumerate(model_files):
    print("=" * 10, f"{i+1}/{len(model_files)} Loading model", model_file)
    model_path = path.join(DATA_DIR, "models", MODEL_NAME, model_file)
    # Cache of the extracted embedding, stored next to the checkpoint
    kv_model_path = f"{model_path}.kv"

    if not path.exists(kv_model_path):
        print("No cache found. Retrieving embedding from trained model.")
        # NOTE: torch.load unpickles arbitrary objects -- only load trusted checkpoints
        with open(model_path, "rb") as f:
            pt_model, criterion, optimizer, epoch = torch.load(f)

        model_corpus_name = path.join(DATA_DIR, "processed", "corpus-awd-lstm-format", corpus_file)
        model_corpus = torch.load(model_corpus_name)
        model_dictionary = model_corpus.dictionary.word2idx

        print("Retrieving token/vector dict.")
        embedding_dict = get_embedding_dict_from_pytorch(pt_model, model_dictionary)
        # presumably writes the cache file and returns the loaded vectors -- TODO confirm
        loaded_models[model_file] = dict_to_word2vec_file(embedding_dict, kv_model_path)
    else:
        print("Output file already exists. Loading directly instead.")
        loaded_models[model_file] = KeyedVectors.load_word2vec_format(kv_model_path)

In [None]:
# Evaluate models
# The plotting cell below reads the columns `metric`, `bias_type`, `result` and
# `model` from the returned frame.
# NOTE(review): this section uses threshold=0.5, while the word2vec, Decontext and
# BERT sections use 0.4 -- confirm the difference is intentional.
evaluation_results = evaluate_models(
    models=loaded_models,
    word_sets=word_sets,
    metrics=METRICS,
    threshold=0.5)

In [None]:
# Swarm plot (a strip plot without overlapping points) of the WEAT scores:
# one column per bias type on the x-axis, one colour per model.
sns.set_theme()

is_weat = evaluation_results.metric == "WEAT"
weat_results = evaluation_results[is_weat]
g = sns.catplot(
    data=weat_results,
    kind="swarm",
    x="bias_type",
    y="result",
    hue="model",
    height=8,
    s=20)
# Fixed y-range so that the plots of the different model families are comparable
g.set(ylim=(-2.0, 2.0))
plt.title("WEAT")
plt.show()

print("WEAT\n", weat_results)

In [None]:
# Restore the working directory that was saved before switching into the FRAGE
# directory. Requires the FRAGE setup cell of this section to have been run.
chdir(normal_base_dir)

### Fine-tuned BERT models

In [None]:
from flair.data import Sentence
from flair.embeddings import TransformerWordEmbeddings

In [None]:
# Load all models as KeyedVector instances
MODELS_PATH = path.join(DATA_DIR, "models", "bert-finetuned")

# (model directory, checkpoint number); "-1" marks the BERT base model
MODEL_LIST = [
    ("left", "195500"),
    ("center", "32500"),
    ("right", "66500"),
    ("bert-base", "-1")]

# Tokens to embed: only the word sets are needed for the bias evaluation.
# NOTE(review): the KeyedVector files written below use the same paths as the
# ones of the benchmark section, which embeds a larger vocabulary. Whichever
# section runs first decides the cached vocabulary -- confirm this is intended.
vocabulary = get_test_vocabulary(word_sets=word_sets)
vocab_list = list(vocabulary)
loaded_models = {}
for i, (model_id, checkpoint) in enumerate(MODEL_LIST):
    # Derive the KeyedVector file name and the location of the transformer weights.
    # Fine-tuned models carry their checkpoint number in both.
    if checkpoint == "-1":
        model_name = f"{model_id}.model"
        pt_model_path = path.join(MODELS_PATH, model_id)
    else:
        model_name = f"{model_id}-c{checkpoint}.model"
        pt_model_path = path.join(MODELS_PATH, model_id, f"checkpoint-{checkpoint}")

    kv_model_path = path.join(MODELS_PATH, model_name)
    print("=" * 10, f"{i+1}/{len(MODEL_LIST)} Loading model", model_name)

    # The KeyedVector text file acts as a cache: generated once, then only loaded
    if not path.exists(kv_model_path):
        print(f"{model_name} doesn't exist as KeyedVector instance. Generating first...")
        # Load BERT model from file using flair library
        if model_id == "bert-base":
            embeddings = TransformerWordEmbeddings("bert-base-uncased")
        else:
            embeddings = TransformerWordEmbeddings(
                model=pt_model_path,
                name=model_name)

        # Embed every vocabulary entry on its own and keep the first token's vector
        embedding_dict = {}
        for token in vocabulary:
            sent = Sentence(token)
            embeddings.embed(sent)
            embedding_dict[token] = sent[0].embedding

        # Text word2vec format: "<count> <dimensions>" header, then one line per token
        with open(kv_model_path, "w") as f:
            f.write(f"{len(embedding_dict)} {len(embedding_dict[vocab_list[0]])}\n")
            for token in vocabulary:
                token_vector = " ".join([str(d) for d in embedding_dict[token].tolist()])
                f.write(f"{token} {token_vector}\n")
        print("Generation done.")

    # Use a string key, as the benchmark section does. The key ends up as the
    # model name in the evaluation results, where a tuple is at best an awkward
    # plot label and may be rejected by the evaluation helper.
    model_key = f"{model_id}-{checkpoint}"
    loaded_models[model_key] = KeyedVectors.load_word2vec_format(kv_model_path)

In [None]:
# Evaluate models
# The plotting cell below reads the columns `metric`, `bias_type`, `result` and
# `model` from the returned frame.
# NOTE(review): `threshold` is passed through to the project helper; presumably it
# is the share of missing words tolerated per word set -- confirm in `evaluate_models`.
evaluation_results = evaluate_models(
    models=loaded_models,
    word_sets=word_sets,
    metrics=METRICS,
    threshold=0.4)

In [None]:
# Swarm plot (a strip plot without overlapping points) of the WEAT scores:
# one column per bias type on the x-axis, one colour per model.
sns.set_theme()

is_weat = evaluation_results.metric == "WEAT"
weat_results = evaluation_results[is_weat]
g = sns.catplot(
    data=weat_results,
    kind="swarm",
    x="bias_type",
    y="result",
    hue="model",
    height=8,
    s=20)
# Fixed y-range so that the plots of the different model families are comparable
g.set(ylim=(-2.0, 2.0))
plt.title("WEAT")
plt.show()

print("WEAT\n", weat_results)

## Social bias evaluation over time

In [None]:
FIG_PATH = path.join(DATA_DIR, "processed", "temporal-analysis")
MODELS_PATH = path.join(DATA_DIR, "models", "contextualized2static", "temporal")
# Years 2010 to 2021 as strings, matching the model file names
YEARS = [str(year) for year in range(2010, 2022)]
ORIENTATIONS = ["left", "center", "right"]

# Load one model per orientation and year: loaded_models[orientation][year]
loaded_models = {}
for i, orientation in enumerate(ORIENTATIONS):
    print("=" * 20, f"{i+1}/{len(ORIENTATIONS)} Loading models for orientation {orientation}")
    yearly_models = loaded_models.setdefault(orientation, {})
    for j, year in enumerate(YEARS):
        print("=" * 10, f"{j+1}/{len(YEARS)} {year}")
        model_file_name = path.join(MODELS_PATH, f"{orientation}-{year}.model")
        yearly_models[year] = KeyedVectors.load_word2vec_format(model_file_name, binary=False)

In [None]:
# Evaluate models
# Collect one record per (orientation, year, bias type) and build the frame once
# at the end, instead of growing a DataFrame row by row.
evaluation_records = []
for orientation, models in loaded_models.items():
    # `models` maps a year string to that year's model, so the `model` column of
    # the returned frame holds the year.
    results = evaluate_models(
        models=models,
        word_sets=word_sets,
        metrics=METRICS,
        threshold=0.7)

    for model_year, bias_type, result in zip(results["model"], results["bias_type"], results["result"]):
        # The year is stored as a number so it can be plotted and fitted later
        evaluation_records.append((orientation, float(model_year), bias_type, result))

evaluation_results = pd.DataFrame(
    evaluation_records, columns=["orientation", "year", "bias_type", "weat_score"])

In [None]:
# The figures are saved below; make sure the target directory exists on a first run
os.makedirs(FIG_PATH, exist_ok=True)

# Tick positions and their labels come from the same list. Deriving the labels
# from the years present in the data mislabels the ticks (or raises an error) as
# soon as a year is missing for a bias type.
year_ticks = list(range(2010, 2022))

# One figure per bias type: WEAT score per year, one line per orientation
for bias in evaluation_results.bias_type.unique():
    result_subset = evaluation_results[evaluation_results.bias_type == bias]
    sns.set_theme()
    g = sns.lineplot(
        data=result_subset,
        hue="orientation",
        x="year",
        y="weat_score",
        err_style="bars",
        ci=0)
    g.set(ylim=(0, 0.5), xlim=(2009, 2022), xticks=year_ticks)
    g.set_xticklabels(year_ticks, rotation=45)
    plt.title(f"{bias.capitalize()} bias development over time")

    # Trendline: linear fit over the scores of all orientations together
    z = np.polyfit(result_subset.year, result_subset.weat_score, 1)
    p = np.poly1d(z)
    plt.plot(result_subset.year, p(result_subset.year), color="purple", linewidth=3)

    # Save before showing the figure
    file_name = f"{bias}-weat.png"
    plt.savefig(path.join(FIG_PATH, file_name), dpi=300)

    plt.show()