In [None]:
import torch
import torchtext

from os import path, mkdir
from scipy.stats import norm

In [None]:
def sc_weat(x, A, B):
  """
  Calculate the SC-WEAT effect size for a single word.

  The effect size is the difference between the mean cosine similarity of x
  with the rows of A and the mean cosine similarity of x with the rows of B,
  divided by the sample standard deviation of all of those similarities
  pooled together. It is positive when x is, on average, more similar to A.

  Args:
    x: Embedding of the target word, a 1-D tensor of length d (a (d, 1)
       column vector is accepted as well).
    A: Tensor of shape (n_A, d), one attribute embedding per row.
    B: Tensor of shape (n_B, d), one attribute embedding per row.

  Returns:
    A 0-dim tensor holding the effect size. The value is NaN when x or any
    attribute row has zero norm, because the normalisation divides by it.
  """

  A_normed = A / A.norm(p=2, dim=1, keepdim=True)
  B_normed = B / B.norm(p=2, dim=1, keepdim=True)
  x_normed = x / x.norm(p=2, keepdim=True)

  A_sims = torch.matmul(A_normed, x_normed)
  B_sims = torch.matmul(B_normed, x_normed)

  # For a 1-D word vector the similarities are 1-D as well, so they have no
  # dim=1 to concatenate along. Pool the flattened values instead; this also
  # works when A and B contain a different number of words.
  all_sims = torch.cat((A_sims.flatten(), B_sims.flatten()))

  A_mean = A_sims.mean()
  B_mean = B_sims.mean()
  joint_std = torch.std(all_sims)

  effect_size = (A_mean - B_mean) / joint_std

  return effect_size

def get_similarities(x, A, B):
  """
  Return the raw cosine similarities for a word with two attribute groups.

  Args:
    x: Embedding of the word.
    A: Matrix whose rows are the embeddings of the first attribute group.
    B: Matrix whose rows are the embeddings of the second attribute group.

  Returns:
    A pair (A_sims, B_sims): the cosine similarity of x with every row of A
    and with every row of B, respectively.
  """

  # Scale the word vector to unit length once; both groups reuse it.
  unit_x = x / x.norm(p=2, keepdim=True)

  def _cosine_with_word(group):
    # Unit-length rows turn the matrix product into cosine similarities.
    unit_rows = group / group.norm(p=2, dim=1, keepdim=True)
    return torch.matmul(unit_rows, unit_x)

  return _cosine_with_word(A), _cosine_with_word(B)

def full_vocab_sc_weat(embedding, A, B):
  """
  Calculate the SC-WEAT effect size for each word in the embedding vocabulary.

  Args:
    embedding: Matrix with one vocabulary vector per row.
    A: Matrix with one vector per row for the first attribute group.
    B: Matrix with one vector per row for the second attribute group.

  Returns:
    A 1-D tensor with one effect size per row of embedding: the mean cosine
    similarity to A minus the mean cosine similarity to B, divided by the
    standard deviation of that row's similarities to A and B taken together.
  """

  def _unit_rows(matrix):
    # Divide every row by its own L2 norm.
    return matrix / matrix.norm(p=2, dim=1, keepdim=True)

  unit_A = _unit_rows(A)
  unit_B = _unit_rows(B)
  unit_vocab = _unit_rows(embedding)

  # All three matrices must share the same embedding dimensionality.
  assert unit_A.shape[1] == unit_B.shape[1] == unit_vocab.shape[1]

  sims_to_A = torch.matmul(unit_vocab, unit_A.T)
  sims_to_B = torch.matmul(unit_vocab, unit_B.T)

  # Per-word spread over the similarities to both groups pooled together.
  pooled_stds = torch.cat((sims_to_A, sims_to_B), dim=1).std(dim=1)

  return (sims_to_A.mean(dim=1) - sims_to_B.mean(dim=1)) / pooled_stds

def single_term_association(embedding, x):
    """
    Calculate cosine similarity of a single word with each word in the embedding vocabulary.

    Args:
        embedding: Matrix with one vocabulary vector per row.
        x: Vector of the query word.

    Returns:
        A 1-D tensor holding, for every row of embedding, its cosine similarity with x.
    """

    # Bring the query and every vocabulary row to unit length, then take dot products.
    unit_query = x / x.norm(p=2)
    row_norms = embedding.norm(p=2, dim=1, keepdim=True)

    return torch.matmul(embedding / row_norms, unit_query)

def full_vocab_associations(embedding, A):
  """
  Calculate the mean and standard deviation of the cosine similarities between each word in the embedding vocabulary and each word in A.

  Args:
    embedding: Matrix with one vocabulary vector per row.
    A: Matrix with one vector per row for the word group.

  Returns:
    A pair (means, stds) of 1-D tensors with one entry per row of embedding.
  """

  unit_group = A / A.norm(p=2, dim=1, keepdim=True)
  unit_vocab = embedding / embedding.norm(p=2, dim=1, keepdim=True)

  # Both matrices must share the same embedding dimensionality.
  assert unit_group.shape[1] == unit_vocab.shape[1]

  # Entry (i, j) is the cosine similarity of vocabulary row i with group row j.
  sims = torch.matmul(unit_vocab, unit_group.T)

  return sims.mean(dim=1), sims.std(dim=1)

# Function to compute p-value of EAT
def compute_p_value(associations1: torch.tensor,
                    associations2: torch.tensor,
                    permutations:int=1000
                    ) -> float:
    """
    Computes the one-tailed p-value of an EAT for the given sets of associations with two attribute groups.

    The observed statistic is sum(associations1) - sum(associations2). The
    null distribution is built by randomly re-splitting the pooled
    associations `permutations` times and is then approximated by a normal
    distribution with the same mean and sample standard deviation. The value
    returned is the smaller of the two tail probabilities of the observed
    statistic under that normal, so it never exceeds 0.5.

    The result depends on torch's global random state.
    """

    n_first = len(associations1)

    # Observed difference between the summed associations of the two groups
    observed = associations1.sum() - associations2.sum()

    # Pool both groups so that group membership can be shuffled
    pooled = torch.cat([associations1, associations2])
    n_pooled = len(pooled)

    # One random re-ordering of the pooled values per permutation, stacked to (permutations, n_pooled)
    shuffles = []
    for _ in range(permutations):
        shuffles.append(pooled[torch.randperm(n_pooled)])
    shuffled = torch.stack(shuffles)

    # The first n_first columns play the role of group 1, the remaining ones of group 2
    null_differences = shuffled[:, :n_first].sum(dim=1) - shuffled[:, n_first:].sum(dim=1)

    # Parameters of the normal approximation to the permutation distribution
    null_mean = null_differences.mean()
    null_std = null_differences.std(correction=1)

    # Probability mass at or below the observed statistic; the p-value is the smaller tail
    lower_tail = norm.cdf(observed, loc=null_mean, scale=null_std)

    return min(lower_tail, 1 - lower_tail)

In [None]:
# Define word stimuli in English and Nepali

# Single seed word per language code; looked up with the `language` argument of
# compute_and_write_embedding_results for the single-term association.
teenager_words = {
    'en': 'teenager',
    'ne': 'युवा'
}

# English stimuli: eight words for each of the four age groups.
ages_english = {
    'teenagers': ['teenager', 'teenagers', 'teen', 'teens', 'teenage', 'teenaged', 'adolescent', 'adolescence'],
    'adults' : ['adult', 'adults', 'adulthood', 'middle-age', 'middle-aged', 'grownup', 'grown-up', 'grownups'],
    'elderly': ['elder', 'elders', 'elderly', 'aged', 'aging', 'older', 'old-age', 'retiree'],
    'children': ['child', 'children', 'childlike', 'childhood', 'kid', 'kids',  'schoolchild', 'schoolchildren']
}

# Nepali stimuli, same four group keys as ages_english, eight entries per group.
# NOTE(review): the 'adults' list contains 'परिपक्व' twice, so that group has only
# seven distinct words and the repeated word is weighted double in every mean and
# standard deviation - confirm whether a different eighth word was intended.
ages_nepali = {
    'teenagers': ['किशोर', 'किशोरी', 'किशोरावस्था', 'कन्या', 'जवान', 'तरुण', 'ल्यासे', 'किशोरहरु'],
    'adults': ['वयस्क', 'वयस्कहरू', 'अधबैंसे', 'पौढ', 'परिपक्व', 'परिपक्व', 'हुर्केको', 'बडा'],
    'elderly': ['बुढा', 'बुढाहरू', 'बुढ्यौली', 'वृद्ध', 'ज्येष्ठ', 'जेठो', 'बुढेसकाल', 'अभिभावक'],
    'children': ['बालक', 'बालकहरू', 'बालिका', 'बालबालिका', 'बच्चा', 'पाठी', 'छात्रा', 'बाल्यकाल']
}

In [None]:
def compute_and_write_embedding_results(embedding, embedding_name, write_dir, language='en'):
    """
    Calculate associations and effect sizes for a given embedding and write results to text files.

    Files written into write_dir, each prefixed with embedding_name:
      * _teenager_sims.csv: "word,similarity" for the (up to) 1000 vocabulary
        words most similar to the single seed word of the chosen language.
      * _missing_words.txt: stimulus words absent from the vocabulary, one per
        line; only written when at least one word is missing.
      * _teenager_means.csv: "word,mean,std" for the (up to) 1000 vocabulary
        words with the highest mean cosine similarity to the teenager stimuli.
      * _teenager_effect_sizes.csv: "word,adult,elderly,child" SC-WEAT effect
        sizes (teenagers vs. each other group) for the first (up to) 1000
        vocabulary words, in vocabulary order, whose three effect sizes are
        all >= 0.8 and whose three permutation p-values are all <= .05.

    Args:
      embedding: Vectors object exposing `vectors`, `itos`, `stoi` and
        `get_vecs_by_tokens` (the notebook passes torchtext.vocab.Vectors).
      embedding_name: Prefix used for the output file names.
      write_dir: Existing directory that receives the output files.
      language: 'ne' selects the Nepali stimuli; the English age groups are
        used otherwise. Matching is case-insensitive.

    Raises:
      KeyError: if `language` has no entry in teenager_words.
      ValueError: if none of the words of a stimulus group is in the vocabulary.
    """

    # Normalise the language code once so both stimulus lookups agree on it
    language = language.lower()
    teenager_word = teenager_words[language]
    age_dict = ages_nepali if language == 'ne' else ages_english
    group_names = ('teenagers', 'adults', 'elderly', 'children')

    # NOTE(review): every CSV below is written unquoted; a vocabulary token that
    # itself contains ',' would yield a malformed row - confirm downstream readers.

    # Calculate the cosine similarity of the word "teenager" with each word in the embedding vocabulary
    teenager_vec = embedding.get_vecs_by_tokens(teenager_word)
    teenager_associations = single_term_association(embedding.vectors, teenager_vec)

    # topk raises when asked for more entries than exist, so clamp to the vocabulary size
    top_k = min(1000, teenager_associations.shape[0])
    top_sims, top_indices = torch.topk(teenager_associations, top_k)

    # Create a text file of the most similar words to the word "teenager" and their cosine similarities
    with open(path.join(write_dir, f'{embedding_name}_teenager_sims.csv'), 'w', encoding='utf-8') as f:
        f.writelines(
            f'{embedding.itos[index]},{similarity}\n'
            for index, similarity in zip(top_indices.tolist(), top_sims.tolist())
        )

    # Check that all words are in the embedding vocabulary (dictionary lookup
    # instead of scanning the whole itos list for every word)
    missing_words = [
        word
        for name in group_names
        for word in age_dict[name]
        if word not in embedding.stoi
    ]

    if missing_words:
        with open(path.join(write_dir, f'{embedding_name}_missing_words.txt'), 'w', encoding='utf-8') as f:
            f.write('\n'.join(missing_words))

    # Load the embedding vectors for each set of word stimuli. Words that are not
    # in the vocabulary are left out: torchtext hands back an all-zero vector for
    # them, which normalises to NaN and would turn every mean, standard deviation
    # and effect size computed below into NaN.
    stimulus_vecs = {}
    for name in group_names:
        known_words = [word for word in age_dict[name] if word in embedding.stoi]
        if not known_words:
            raise ValueError(f'None of the "{name}" stimulus words are in the {embedding_name} vocabulary')
        stimulus_vecs[name] = embedding.get_vecs_by_tokens(known_words)

    teenager_vecs = stimulus_vecs['teenagers']
    comparison_vecs = [stimulus_vecs[name] for name in ('adults', 'elderly', 'children')]

    # Get mean and standard deviation of cosine similarities between each word in the embedding vocabulary and each word in the teenager set
    teenager_means, teenager_stds = full_vocab_associations(embedding.vectors, teenager_vecs)
    top_means, top_mean_indices = torch.topk(teenager_means, top_k)
    top_stds = teenager_stds[top_mean_indices]

    # Create a dictionary of the top words and their mean cosine similarities and standard deviations
    teenager_mean_dict = {
        embedding.itos[index]: (mean, std)
        for index, mean, std in zip(top_mean_indices.tolist(), top_means.tolist(), top_stds.tolist())
    }

    # Write the dictionary to a text file
    with open(path.join(write_dir, f'{embedding_name}_teenager_means.csv'), 'w', encoding='utf-8') as f:
        f.writelines(f'{word},{mean},{std}\n' for word, (mean, std) in teenager_mean_dict.items())

    # Calculate the SC-WEAT effect size for each word in the embedding vocabulary
    # for each comparison; columns are (adults, elderly, children)
    intersected_associations = torch.stack(
        [full_vocab_sc_weat(embedding.vectors, teenager_vecs, other_vecs) for other_vecs in comparison_vecs],
        dim=1,
    )

    # Take only large effect sizes (effect size >= 0.8) demonstrating association with the teenager set for each set of word stimuli
    large_associations = torch.all((intersected_associations >= 0.8), dim=1)

    # Create a dictionary of the first 1000 qualifying words and their effect sizes.
    # Indices come back in ascending order, i.e. vocabulary order (the original
    # notes that words are ordered by frequency).
    association_dict = {}

    for index in torch.nonzero(large_associations).flatten().tolist():

        if len(association_dict) >= 1000:
            break

        # Index the vector matrix directly: embedding[index] would be a *token*
        # lookup, and an integer is never a token, so it yields the zero vector.
        target_embedding = embedding.vectors[index]

        # Compute p-values for WEATs - done only for large associations to prevent wasting resources
        p_values = [
            compute_p_value(*get_similarities(target_embedding, teenager_vecs, other_vecs))
            for other_vecs in comparison_vecs
        ]

        # Exclude any word that is not statistically significant in all three
        # comparisons; written so that a NaN p-value also counts as not significant
        if not all(p <= .05 for p in p_values):
            continue

        # Add significant words to the dictionary
        association_dict[embedding.itos[index]] = tuple(intersected_associations[index].tolist())

    # Write the dictionary to a text file
    with open(path.join(write_dir, f'{embedding_name}_teenager_effect_sizes.csv'), 'w', encoding='utf-8') as f:
        f.writelines(
            f'{word},{adult},{elderly},{child}\n'
            for word, (adult, elderly, child) in association_dict.items()
        )

In [None]:
# Directory that receives every result file written by this notebook
WRITE_DIR = './swe_results/'

# Make sure the output directory is there before anything is written into it
if not path.exists(WRITE_DIR):
    mkdir(WRITE_DIR)

In [None]:
# Load the vectors stored in 'glove.840B.300d.txt', using ./.vector_cache as the cache directory.
# NOTE: `vectors` is rebound by every loading cell below, so each compute cell
# processes whichever embedding was loaded most recently - run the cells in order.
vectors = torchtext.vocab.Vectors(name='glove.840B.300d.txt', cache='./.vector_cache')

In [None]:
# Compute and write the results for the GloVe 840B embedding
compute_and_write_embedding_results(vectors, 'glove_840B', WRITE_DIR, language='en')

In [None]:
# Load the vectors stored in 'crawl-300d-2M.vec' (replaces the previous `vectors`)
vectors = torchtext.vocab.Vectors(name='crawl-300d-2M.vec', cache='./.vector_cache')

In [None]:
# Compute and write the results for the FT 300D 2M embedding
compute_and_write_embedding_results(vectors, 'ft_2m', WRITE_DIR, language='en')

In [None]:
# Load the vectors stored in 'nepali_glove_vectors.txt' (replaces the previous `vectors`)
vectors = torchtext.vocab.Vectors(name='nepali_glove_vectors.txt', cache='./.vector_cache')

In [None]:
# Compute and write the results with the Nepali stimuli; output files are prefixed 'glove_ne'
compute_and_write_embedding_results(vectors, 'glove_ne', WRITE_DIR, language='ne')

In [None]:
# Load the vectors stored in 'cc.ne.300.vec' (replaces the previous `vectors`)
vectors = torchtext.vocab.Vectors(name='cc.ne.300.vec', cache='./.vector_cache')

In [None]:
# Compute and write the results with the Nepali stimuli; output files are prefixed 'ft_ne_cc'
compute_and_write_embedding_results(vectors, 'ft_ne_cc', WRITE_DIR, language='ne')