In [None]:
import numpy as np
import pickle as pkl
import matplotlib.pyplot as plt

from os import path
from utils import read_json_file
from scipy.stats import norm

In [None]:
def cosine_similarity(v1: np.ndarray, v2: np.ndarray) -> float:
    """
    Return the cosine of the angle between two vectors.

    The result is the dot product of ``v1`` and ``v2`` divided by the product
    of their Euclidean norms. Zero-norm inputs are not guarded against; the
    division is left to NumPy.
    """
    inner_product = np.dot(v1, v2)
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    return inner_product / norm_product

def compute_association(w: np.ndarray, A: np.ndarray) -> float:
    """
    Return the mean cosine similarity between vector ``w`` and each row of ``A``.
    """
    similarities = []
    for attribute in A:
        similarities.append(cosine_similarity(w, attribute))
    return np.mean(similarities)

def compute_attribute_association(X: np.ndarray, A: np.ndarray) -> np.ndarray:
    """
    Return one association score per row of ``X``.

    Each score is ``compute_association(x, A)``, i.e. the mean cosine
    similarity of that row to the rows of ``A``.
    """
    scores = []
    for target in X:
        scores.append(compute_association(target, A))
    return np.array(scores)

def compute_attribute_association_L2(A: np.ndarray, T: np.ndarray) -> np.ndarray:
    """
    Return one association score per row of ``A``.

    Each score is ``compute_association(a, T)``: the mean cosine similarity of
    that attribute row to the rows of the target set ``T``.
    """
    per_attribute_scores = [compute_association(attribute, T) for attribute in A]
    return np.array(per_attribute_scores)

def compute_joint_std(X_Associations: np.ndarray, Y_Associations: np.ndarray) -> float:
    """
    Return the sample standard deviation (``ddof=1``) of both inputs pooled together.
    """
    pooled = np.concatenate((X_Associations, Y_Associations))
    return np.std(pooled, ddof=1)

def compute_p_value(X_Diff: np.ndarray, Y_Diff: np.ndarray, permutations: int=1000, rng: "np.random.Generator | None" = None) -> float:
    """
    Estimate a one-sided p-value for ``sum(X_Diff) - sum(Y_Diff)``.

    The two samples are pooled and shuffled ``permutations`` times. Each
    shuffle is split at ``len(X_Diff)`` and the difference between the sums of
    the two parts is recorded. A normal distribution is fitted to those
    differences (mean and ``ddof=1`` standard deviation), and the returned
    value is its upper-tail probability at the observed test statistic.

    Parameters
    ----------
    X_Diff, Y_Diff : 1-D arrays of per-item association scores.
    permutations : number of random shuffles used for the empirical distribution.
    rng : optional ``np.random.Generator``. When omitted, the global
        ``np.random`` state is used (so ``np.random.seed`` still controls the
        result); pass a seeded generator for an isolated, reproducible run.

    Returns
    -------
    float : upper-tail probability of the observed statistic.
    """
    n_x = len(X_Diff)
    test_statistic = np.sum(X_Diff) - np.sum(Y_Diff)

    # The pooled sample is the same for every permutation, so build it once.
    pooled = np.concatenate([X_Diff, Y_Diff])

    # Both the np.random module and Generator objects expose the same
    # choice(a, size=..., replace=...) call used here.
    sampler = np.random if rng is None else rng
    empirical_distribution = np.array([
        sampler.choice(pooled, size=len(pooled), replace=False)
        for _ in range(permutations)
    ])
    empirical_differences = (
        np.sum(empirical_distribution[:, :n_x], axis=1)
        - np.sum(empirical_distribution[:, n_x:], axis=1)
    )

    # Use the survival function rather than 1 - cdf: for strongly significant
    # statistics cdf rounds to exactly 1.0 and the subtraction would return 0.0.
    return norm.sf(
        test_statistic,
        loc=np.mean(empirical_differences),
        scale=np.std(empirical_differences, ddof=1),
    )

def level_1(X: np.ndarray, Y: np.ndarray, A: np.ndarray, B: np.ndarray, permutations: int=1000) -> tuple:
    """
    Compare target sets ``X`` and ``Y`` on their differential association with ``A`` versus ``B``.

    For each target row, the differential association is its association with
    ``A`` minus its association with ``B`` (both from
    ``compute_attribute_association``).

    Returns
    -------
    tuple : ``(effect_size, p_value)`` where ``effect_size`` is the difference
        between the mean differential associations of ``X`` and ``Y`` divided
        by their pooled standard deviation (``compute_joint_std``), and
        ``p_value`` comes from ``compute_p_value`` using ``permutations``
        shuffles.
    """
    X_Differential_Associations = (
        compute_attribute_association(X, A) - compute_attribute_association(X, B)
    )
    Y_Differential_Associations = (
        compute_attribute_association(Y, A) - compute_attribute_association(Y, B)
    )

    p_value = compute_p_value(X_Differential_Associations, Y_Differential_Associations, permutations=permutations)

    mean_difference = np.mean(X_Differential_Associations) - np.mean(Y_Differential_Associations)
    effect_size = mean_difference / compute_joint_std(X_Differential_Associations, Y_Differential_Associations)

    return effect_size, p_value

def level_2(T: np.ndarray, A: np.ndarray, B: np.ndarray, permutations: int=1000) -> tuple:
    """
    Compare attribute sets ``A`` and ``B`` on their association with a single target set ``T``.

    Each attribute row is scored by ``compute_attribute_association_L2``
    against ``T``.

    Returns
    -------
    tuple : ``(effect_size, p_value)`` where ``effect_size`` is the difference
        between the mean scores of ``A`` and ``B`` divided by their pooled
        standard deviation (``compute_joint_std``), and ``p_value`` comes from
        ``compute_p_value`` using ``permutations`` shuffles.
    """
    A_Associations_T = compute_attribute_association_L2(A, T)
    B_Associations_T = compute_attribute_association_L2(B, T)

    p_value = compute_p_value(A_Associations_T, B_Associations_T, permutations=permutations)

    mean_difference = np.mean(A_Associations_T) - np.mean(B_Associations_T)
    effect_size = mean_difference / compute_joint_std(A_Associations_T, B_Associations_T)

    return effect_size, p_value

def level_3(T: np.ndarray, A: np.ndarray) -> tuple:
    """
    Summarise the cosine similarities between every row of ``T`` and every row of ``A``.

    Returns
    -------
    tuple : ``(mean, std)`` of all pairwise cosine similarities, with the
        standard deviation computed using ``ddof=1``.
    """
    T_Associations_A = [cosine_similarity(t, a) for t in T for a in A]
    return np.mean(T_Associations_A), np.std(T_Associations_A, ddof=1)

def ML_EAT(A: np.ndarray, B: np.ndarray, X: np.ndarray, Y: np.ndarray, permutations: int=1000) -> dict:
    """
    Run all three levels of the test and collect the results in one dictionary.

    Level 1 compares ``X`` and ``Y`` jointly against ``A``/``B``; level 2 is
    run once for ``X`` and once for ``Y``; level 3 is run for each
    (target set, attribute set) pair. Keys are prefixed ``L1_``, ``L2_`` and
    ``L3_`` accordingly.
    """
    summary = {}

    # Level 1, then level 2 for X and for Y (the stochastic steps run in this order).
    summary['L1_effect_size'], summary['L1_p_value'] = level_1(X, Y, A, B, permutations=permutations)
    summary['L2_effect_size_X'], summary['L2_p_value_X'] = level_2(X, A, B, permutations=permutations)
    summary['L2_effect_size_Y'], summary['L2_p_value_Y'] = level_2(Y, A, B, permutations=permutations)

    # Level 3: mean and standard deviation for every target/attribute pairing.
    summary['L3_mean_AX'], summary['L3_std_AX'] = level_3(X, A)
    summary['L3_mean_BX'], summary['L3_std_BX'] = level_3(X, B)
    summary['L3_mean_AY'], summary['L3_std_AY'] = level_3(Y, A)
    summary['L3_mean_BY'], summary['L3_std_BY'] = level_3(Y, B)

    return summary

In [None]:
def get_np_embeddings(target_words: list,
                      vocab_dict: dict,
                      embeddings: np.ndarray) -> np.ndarray:
    """
    Look up the embedding of each target word and stack the results.

    Every word is mapped to an index through ``vocab_dict`` and that index is
    used to select an entry of ``embeddings``. The selected entries are
    returned as a new array, in the order of ``target_words``.
    """
    selected = []
    for word in target_words:
        selected.append(embeddings[vocab_dict[word]])

    return np.array(selected)


In [None]:
JSON_PATH = './eat_stimuli/text_stimuli'
stimuli = read_json_file(path.join(JSON_PATH, 'weat_7.json'))

# Some stimuli have zero-norm vectors in the early 1800s embeddings, so they
# are swapped for substitute words here, before any embeddings are looked up.
stimuli['X'][0] = 'mathematics'  # formerly math
stimuli['X'][3] = 'calculation'  # formerly calculus
stimuli['Y'][-3] = 'music'  # formerly symphony

In [None]:
# Download historical embeddings from https://nlp.stanford.edu/projects/histwords/
#
# SECURITY NOTE: np.load(..., allow_pickle=True) and pkl.load can both execute
# arbitrary code while unpickling. Only run this cell on files obtained from a
# source you trust.
#
# NOTE(review): the p-values rely on random permutations drawn from NumPy's
# global RNG and no seed is set here, so they vary slightly between runs.
# Call np.random.seed(...) before this cell if reproducible values are needed.

years = list(range(1810, 1991, 10))
level1, level2x, level2y = [], [], []
l1p, l2xp, l2yp = [], [], []

for year in years:

    print(f'Year: {year}')
    embeddings = np.load(f'./sgns/{year}-w.npy', allow_pickle=True)
    # Use a context manager so the vocabulary file handle is closed after each
    # decade instead of being left open for the garbage collector.
    with open(f'./sgns/{year}-vocab.pkl', 'rb') as vocab_file:
        vocab = pkl.load(vocab_file)
    # Map each word to its position in the vocabulary list.
    vocab_dict = {word: i for i, word in enumerate(vocab)}

    A = get_np_embeddings(stimuli['A'], vocab_dict, embeddings)
    B = get_np_embeddings(stimuli['B'], vocab_dict, embeddings)
    X = get_np_embeddings(stimuli['X'], vocab_dict, embeddings)
    Y = get_np_embeddings(stimuli['Y'], vocab_dict, embeddings)

    results = ML_EAT(A, B, X, Y)

    # Effect sizes per decade (level 1, level 2 for X, level 2 for Y).
    level1.append(results['L1_effect_size'])
    level2x.append(results['L2_effect_size_X'])
    level2y.append(results['L2_effect_size_Y'])

    # Matching p-values per decade.
    l1p.append(results['L1_p_value'])
    l2xp.append(results['L2_p_value_X'])
    l2yp.append(results['L2_p_value_Y'])

In [None]:
# Plot the per-decade effect sizes for level 1 and both level 2 tests
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(years, level1, label='Level 1', marker='o')
ax.plot(years, level2x, label='Level 2 X', marker='o')
ax.plot(years, level2y, label='Level 2 Y', marker='o')
ax.set_xlabel('Year')
ax.set_ylabel('Effect Size')
ax.legend()
plt.show()

In [None]:
# LaTeX plots: one line of "(index, effect size)" coordinate pairs per series
print(
    *(
        ' '.join(f'({index}, {effect})' for index, effect in enumerate(series))
        for series in (level1, level2x, level2y)
    ),
    sep='\n',
)

In [None]:
# Observe statistical significance to determine EAT patterns
# (one line each: level 1, level 2 for X, level 2 for Y)
print(l1p, l2xp, l2yp, sep='\n')