In [None]:
# Install the packages listed in requirements.txt (shell escape run from the notebook)
! pip install -r requirements.txt

In [None]:
import torchtext
import pandas as pd

from os import path, listdir
from transformers import AutoModel, AutoTokenizer, AutoProcessor
from PIL import Image
from gensim.models.keyedvectors import KeyedVectors

from mleat import ML_EAT
from utils import read_json_file, get_torchtext_vectors, get_lm_embeddings, get_clip_text_embs, get_clip_image_embs

In [None]:
# GloVe ML-EAT measurements
# -------------------------
# Directory holding the WEAT stimulus JSON files, and the output directory
JSON_PATH = './eat_stimuli/text_stimuli'
DF_PATH = './'

# Number of permutations handed to ML_EAT
PERMUTATIONS = 10_000

# GloVe vectors selected with name="840B", dim=300
embedding = torchtext.vocab.GloVe(name="840B", dim=300)

# One single-row dataframe per WEAT is collected here
result_dfs = []

# Stimulus files are numbered weat_1.json through weat_10.json
for weat_number in range(1, 11):
    stimuli_file = path.join(JSON_PATH, f'weat_{weat_number}.json')
    stimuli = read_json_file(stimuli_file)

    # Look up vectors for the four stimulus sets, in the order A, B, X, Y
    set_vectors = {
        set_name: get_torchtext_vectors(embedding, stimuli[set_name])
        for set_name in ('A', 'B', 'X', 'Y')
    }

    ml_eat_results = ML_EAT(permutations=PERMUTATIONS, **set_vectors)

    # The row is labelled with the test name stored in the stimulus file
    row_label = stimuli['eat_name']
    result_dfs.append(pd.DataFrame(ml_eat_results, index=[row_label]))

# Stack the per-test rows and save them
glove_df = pd.concat(result_dfs)
glove_df.to_csv(path.join(DF_PATH, 'glove_results.csv'))

In [None]:
# Google News (word2vec) ML-EAT measurements
# ------------------------------------------
# Path constants: directory of WEAT stimulus JSON files and output directory
JSON_PATH = './eat_stimuli/text_stimuli'
DF_PATH = './'

# ML-EAT constants
PERMUTATIONS = 10_000

# Google News Vectors link -- intentionally blank; fill it in before running.
# NOTE(review): the value is passed as `name=` to torchtext.vocab.Vectors;
# confirm whether a local file path or a URL is expected here.
GOOGLE_VECS_LINK = ''

# Fail fast with an actionable message instead of an opaque loader error
if not GOOGLE_VECS_LINK:
    raise ValueError(
        'GOOGLE_VECS_LINK is empty: set it to the Google News vectors '
        'before running this cell.'
    )

# Load embedding
embedding = torchtext.vocab.Vectors(name=GOOGLE_VECS_LINK)

# Store results in a list of dataframes (one single-row frame per WEAT)
result_dfs = []

# Run ML-EAT on each set of WEAT stimuli (weat_1.json ... weat_10.json)
for i in range(1, 11):
    stimuli = read_json_file(path.join(JSON_PATH, f'weat_{i}.json'))

    # Apply the word2vec-specific stimulus substitutions.
    # NOTE(review): presumably these words are missing from the word2vec
    # vocabulary -- confirm against the vectors file.
    # The comprehension variable is named `word` so it does not shadow `i`.
    if i == 2:
        stimuli['Y'] = [word for word in stimuli['Y'] if word != 'axe']
        stimuli['X'] = [word for word in stimuli['X'] if word != 'banjo']

    if i == 5:
        stimuli['X'] += ['Jay', 'Kristen']
        stimuli['Y'] += ['Tremayne', 'Latonya']

    if i == 9:
        # Exact comparison: the previous `not in ('short-term')` tested
        # against a plain string (no trailing comma, so not a tuple) and was
        # therefore a substring check, not a membership check.
        stimuli['A'] = [word for word in stimuli['A'] if word != 'short-term']
        stimuli['A'] += ['short']

    # Get word embeddings
    A = get_torchtext_vectors(embedding, stimuli['A'])
    B = get_torchtext_vectors(embedding, stimuli['B'])
    X = get_torchtext_vectors(embedding, stimuli['X'])
    Y = get_torchtext_vectors(embedding, stimuli['Y'])

    ml_eat_results = ML_EAT(
        A=A,
        B=B,
        X=X,
        Y=Y,
        permutations=PERMUTATIONS,
    )

    # Label the row with the test name stored in the stimulus file
    df = pd.DataFrame(ml_eat_results, index=[stimuli['eat_name']])
    result_dfs.append(df)

# Concatenate dataframes and write to CSV.
# Stored under its own name so the GloVe results held in `glove_df` are not
# overwritten when this cell runs.
google_news_df = pd.concat(result_dfs)
google_news_df.to_csv(path.join(DF_PATH, 'google_news_results.csv'))

In [None]:
# GPT-2 ML-EAT measurements
# --------------------------------
# Hugging Face model identifiers to evaluate
LANGUAGE_MODELS = [
    'gpt2',
    'gpt2-medium',
    'gpt2-large',
    'gpt2-xl',
]

PERMUTATIONS = 10_000

# Prompt for semantically neutral sentence
PROMPT = 'This is'

# Define vowels for a/an adjustment in prompts
VOWELS = ['a', 'e', 'i', 'o', 'u']


def vowel_adjust(x):
    """Return the prompt for ``x`` preceded by the article 'a' or 'an'.

    'an' is chosen when the first letter of ``x`` is a vowel. The check is
    case-insensitive, and an empty string falls back to 'a' instead of
    raising IndexError.
    """
    article = 'an' if x[:1].lower() in VOWELS else 'a'
    return f'{PROMPT} {article} {x}'


def prompt_adjust(x):
    """Return the prompt for ``x`` with no article (for words that take none)."""
    return f'{PROMPT} {x}'


# Dictionary to map prompt adjustment codes to functions
adjustment_dict = {'0': prompt_adjust, '1': vowel_adjust}

# Prompt adjustments for each test, in WEAT order (weat_1 ... weat_10).
# Each entry has two characters: the first selects the function applied to
# the attribute sets (A, B), the second the one applied to the target sets
# (X, Y).
PROMPT_ADJUSTMENTS = [
    '01',
    '01',
    '00',
    '00',
    '00',
    '00',
    '00',
    '00',
    '00',
    '00',
]

# Define path constants - weat_1 corresponds to pleasant/unpleasant, flowers/insects
WEAT_PATH = './eat_stimuli/text_stimuli/'
DF_PATH = './'

# Store results in a list of dataframes
result_dfs = []

# Run ML-EAT on each set of WEAT stimuli, for each model
for model_name in LANGUAGE_MODELS:

    # Load model and tokenizer
    model = AutoModel.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Stimulus files are 1-indexed, matching the order of PROMPT_ADJUSTMENTS
    for weat_number, adjustment_codes in enumerate(PROMPT_ADJUSTMENTS, start=1):

        # Load WEAT stimuli
        text_stim = read_json_file(path.join(WEAT_PATH, f'weat_{weat_number}.json'))

        # Attribute (A, B) and target (X, Y) word sets
        A, B, X, Y = text_stim['A'], text_stim['B'], text_stim['X'], text_stim['Y']

        # Get prompt adjustment functions
        attribute_adjust = adjustment_dict[adjustment_codes[0]]
        target_adjust = adjustment_dict[adjustment_codes[1]]

        # Create prompts for each word in the sets
        A_Prompts = [attribute_adjust(a) for a in A]
        B_Prompts = [attribute_adjust(b) for b in B]
        X_Prompts = [target_adjust(x) for x in X]
        Y_Prompts = [target_adjust(y) for y in Y]

        # Get embeddings for each set of prompts
        A_Embs = get_lm_embeddings(model, tokenizer, A_Prompts)
        B_Embs = get_lm_embeddings(model, tokenizer, B_Prompts)
        X_Embs = get_lm_embeddings(model, tokenizer, X_Prompts)
        Y_Embs = get_lm_embeddings(model, tokenizer, Y_Prompts)

        # Run ML-EAT
        ml_eat_results = ML_EAT(
            A=A_Embs,
            B=B_Embs,
            X=X_Embs,
            Y=Y_Embs,
            permutations=PERMUTATIONS,
        )

        # Keep track of model and test
        ml_eat_results['model_test'] = f'{model_name}_{text_stim["eat_name"]}'
        ml_eat_results['model'] = model_name
        ml_eat_results['test'] = text_stim['eat_name']

        # Convert results to a single-row dataframe and append to list
        df = pd.DataFrame(ml_eat_results, index=[0])
        result_dfs.append(df)

# Concatenate dataframes; every per-test frame has index 0, so renumber rows
lm_df = pd.concat(result_dfs, ignore_index=True)

# Write to CSV keyed by 'model_test'. `to_csv(index=...)` is a boolean flag,
# so the previous `index='model_test'` only wrote the all-zero default index;
# the column has to be promoted to the index explicitly.
lm_df.set_index('model_test').to_csv(path.join(DF_PATH, 'gpt2_results.csv'))

In [None]:
# CLIP ML-EAT measurements
# ----------------------------
# Set models to be tested
CLIP_MODELS = [
    'openai/clip-vit-base-patch32',
    'openai/clip-vit-base-patch16',
    'openai/clip-vit-large-patch14',
    'openai/clip-vit-large-patch14-336',
]

# Define constants for ML-EAT
PERMUTATIONS = 10_000

# Define path constants.
# TEXT_PATHS, IMAGE_PATHS_X, IMAGE_PATHS_Y and TEST_NAMES are index-aligned:
# entry k of each list belongs to the same test.
TEXT_PATHS = [
    './eat_stimuli/text_stimuli/weat_1.json',
    './eat_stimuli/text_stimuli/weat_1.json',
    './eat_stimuli/text_stimuli/weat_1.json',
    './eat_stimuli/text_stimuli/weat_1.json',
    './eat_stimuli/text_stimuli/weat_3.json',
    './eat_stimuli/text_stimuli/weat_1.json',
    './eat_stimuli/text_stimuli/weat_1.json',
    './eat_stimuli/text_stimuli/weat_1.json',
    './eat_stimuli/text_stimuli/weat_1.json',
    './eat_stimuli/text_stimuli/weat_6.json',
    './eat_stimuli/text_stimuli/weat_8.json',
]

IMAGE_PATHS_X = [
    './eat_stimuli/image_stimuli/flower-insect/flower/',
    './eat_stimuli/image_stimuli/age/young/',
    './eat_stimuli/image_stimuli/arab-muslim/other-people/',
    './eat_stimuli/image_stimuli/disabled/abled/',
    './eat_stimuli/image_stimuli/race/european-american/',
    './eat_stimuli/image_stimuli/religion/christianity/',
    './eat_stimuli/image_stimuli/sexuality/straight/',
    './eat_stimuli/image_stimuli/skin-tone/light/',
    './eat_stimuli/image_stimuli/weight/thin/',
    './eat_stimuli/image_stimuli/gender/male/',
    './eat_stimuli/image_stimuli/gender/science/',
]

IMAGE_PATHS_Y = [
    './eat_stimuli/image_stimuli/flower-insect/insect/',
    './eat_stimuli/image_stimuli/age/old/',
    './eat_stimuli/image_stimuli/arab-muslim/arab-muslim/',
    './eat_stimuli/image_stimuli/disabled/disabled/',
    './eat_stimuli/image_stimuli/race/african-american/',
    './eat_stimuli/image_stimuli/religion/judaism/',
    './eat_stimuli/image_stimuli/sexuality/gay/',
    './eat_stimuli/image_stimuli/skin-tone/dark/',
    './eat_stimuli/image_stimuli/weight/fat/',
    './eat_stimuli/image_stimuli/gender/female/',
    './eat_stimuli/image_stimuli/gender/liberal-arts/',
]

TEST_NAMES = [
    'flower-insect',
    'age',
    'arab-muslim',
    'disabled',
    'race',
    'religion',
    'sexuality',
    'skin-tone',
    'weight',
    'gender',
    'science-arts',
]

DF_PATH = './'

# zip() below would silently truncate to the shortest list, so check alignment
assert len(TEXT_PATHS) == len(IMAGE_PATHS_X) == len(IMAGE_PATHS_Y) == len(TEST_NAMES), \
    'TEXT_PATHS, IMAGE_PATHS_X, IMAGE_PATHS_Y and TEST_NAMES must have equal length'

# Store results in a list of dataframes
result_dfs = []

# Run ML-EAT on each test, for each model
for model_name in CLIP_MODELS:

    # Load model, tokenizer, and processor
    model = AutoModel.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    processor = AutoProcessor.from_pretrained(model_name)

    for text_path, image_dir_x, image_dir_y, test_name in zip(
        TEXT_PATHS, IMAGE_PATHS_X, IMAGE_PATHS_Y, TEST_NAMES
    ):

        # Load WEAT stimuli
        text_stim = read_json_file(text_path)

        # Attribute word sets come from the text stimuli
        A, B = text_stim['A'], text_stim['B']

        # Load the image stimuli for both target groups
        image_sets = []
        for image_dir in (image_dir_x, image_dir_y):
            images = []
            # Sorted so the image order does not depend on the filesystem
            for file_name in sorted(listdir(image_dir)):
                # Skip hidden files (e.g. .DS_Store), which are not images
                if file_name.startswith('.'):
                    continue
                # Image.open is lazy and keeps the file open; copy() loads the
                # pixel data so the handle can be closed right away
                with Image.open(path.join(image_dir, file_name)) as image_file:
                    images.append(image_file.copy())
            image_sets.append(images)
        X, Y = image_sets

        # Get embeddings for each set of stimuli
        A_Embs = get_clip_text_embs(model, tokenizer, A)
        B_Embs = get_clip_text_embs(model, tokenizer, B)
        X_Embs = get_clip_image_embs(model, processor, X)
        Y_Embs = get_clip_image_embs(model, processor, Y)

        # Run ML-EAT
        ml_eat_results = ML_EAT(
            A=A_Embs,
            B=B_Embs,
            X=X_Embs,
            Y=Y_Embs,
            permutations=PERMUTATIONS,
        )

        # Add model name and test name to results
        ml_eat_results['model_test'] = f'{model_name}_{test_name}'
        ml_eat_results['model'] = model_name
        ml_eat_results['test'] = test_name

        # Convert results to a single-row dataframe and append to list
        df = pd.DataFrame(ml_eat_results, index=[0])
        result_dfs.append(df)

# Concatenate dataframes; every per-test frame has index 0, so renumber rows
clip_df = pd.concat(result_dfs, ignore_index=True)

# Write to CSV keyed by 'model_test'. `to_csv(index=...)` is a boolean flag,
# so the previous `index='model_test'` only wrote the all-zero default index.
clip_df.set_index('model_test').to_csv(path.join(DF_PATH, 'clip_results.csv'))

In [None]:
# CLIP ML-EAT measurements with prompted attribute text
# -----------------------------------------------------
# Set models to be tested
CLIP_MODELS = [
    'openai/clip-vit-base-patch32',
    'openai/clip-vit-base-patch16',
    'openai/clip-vit-large-patch14',
    'openai/clip-vit-large-patch14-336',
]

# Define constants for ML-EAT
PERMUTATIONS = 10_000

# Define path constants.
# TEXT_PATHS, IMAGE_PATHS_X, IMAGE_PATHS_Y and TEST_NAMES are index-aligned:
# entry k of each list belongs to the same test.
TEXT_PATHS = [
    './eat_stimuli/text_stimuli/weat_1.json',
    './eat_stimuli/text_stimuli/weat_1.json',
    './eat_stimuli/text_stimuli/weat_1.json',
    './eat_stimuli/text_stimuli/weat_1.json',
    './eat_stimuli/text_stimuli/weat_3.json',
    './eat_stimuli/text_stimuli/weat_1.json',
    './eat_stimuli/text_stimuli/weat_1.json',
    './eat_stimuli/text_stimuli/weat_1.json',
    './eat_stimuli/text_stimuli/weat_1.json',
    './eat_stimuli/text_stimuli/weat_6.json',
    './eat_stimuli/text_stimuli/weat_8.json',
]

IMAGE_PATHS_X = [
    './eat_stimuli/image_stimuli/flower-insect/flower/',
    './eat_stimuli/image_stimuli/age/young/',
    './eat_stimuli/image_stimuli/arab-muslim/other-people/',
    './eat_stimuli/image_stimuli/disabled/abled/',
    './eat_stimuli/image_stimuli/race/european-american/',
    './eat_stimuli/image_stimuli/religion/christianity/',
    './eat_stimuli/image_stimuli/sexuality/straight/',
    './eat_stimuli/image_stimuli/skin-tone/light/',
    './eat_stimuli/image_stimuli/weight/thin/',
    './eat_stimuli/image_stimuli/gender/male/',
    './eat_stimuli/image_stimuli/gender/science/',
]

IMAGE_PATHS_Y = [
    './eat_stimuli/image_stimuli/flower-insect/insect/',
    './eat_stimuli/image_stimuli/age/old/',
    './eat_stimuli/image_stimuli/arab-muslim/arab-muslim/',
    './eat_stimuli/image_stimuli/disabled/disabled/',
    './eat_stimuli/image_stimuli/race/african-american/',
    './eat_stimuli/image_stimuli/religion/judaism/',
    './eat_stimuli/image_stimuli/sexuality/gay/',
    './eat_stimuli/image_stimuli/skin-tone/dark/',
    './eat_stimuli/image_stimuli/weight/fat/',
    './eat_stimuli/image_stimuli/gender/female/',
    './eat_stimuli/image_stimuli/gender/liberal-arts/',
]

TEST_NAMES = [
    'flower-insect',
    'age',
    'arab-muslim',
    'disabled',
    'race',
    'religion',
    'sexuality',
    'skin-tone',
    'weight',
    'gender',
    'science-arts',
]

DF_PATH = './'

# zip() below would silently truncate to the shortest list, so check alignment
assert len(TEXT_PATHS) == len(IMAGE_PATHS_X) == len(IMAGE_PATHS_Y) == len(TEST_NAMES), \
    'TEXT_PATHS, IMAGE_PATHS_X, IMAGE_PATHS_Y and TEST_NAMES must have equal length'

# Store results in a list of dataframes
result_dfs = []

# Run ML-EAT on each test, for each model
for model_name in CLIP_MODELS:

    # Load model, tokenizer, and processor
    model = AutoModel.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    processor = AutoProcessor.from_pretrained(model_name)

    for text_path, image_dir_x, image_dir_y, test_name in zip(
        TEXT_PATHS, IMAGE_PATHS_X, IMAGE_PATHS_Y, TEST_NAMES
    ):

        # Load WEAT stimuli
        text_stim = read_json_file(text_path)

        # Attribute word sets come from the text stimuli
        A, B = text_stim['A'], text_stim['B']

        # Wrap each attribute word in a fixed prompt template
        A = [f'a picture that brings to mind {a}' for a in A]
        B = [f'a picture that brings to mind {b}' for b in B]

        # Load the image stimuli for both target groups
        image_sets = []
        for image_dir in (image_dir_x, image_dir_y):
            images = []
            # Sorted so the image order does not depend on the filesystem
            for file_name in sorted(listdir(image_dir)):
                # Skip hidden files (e.g. .DS_Store), which are not images
                if file_name.startswith('.'):
                    continue
                # Image.open is lazy and keeps the file open; copy() loads the
                # pixel data so the handle can be closed right away
                with Image.open(path.join(image_dir, file_name)) as image_file:
                    images.append(image_file.copy())
            image_sets.append(images)
        X, Y = image_sets

        # Get embeddings for each set of stimuli
        A_Embs = get_clip_text_embs(model, tokenizer, A)
        B_Embs = get_clip_text_embs(model, tokenizer, B)
        X_Embs = get_clip_image_embs(model, processor, X)
        Y_Embs = get_clip_image_embs(model, processor, Y)

        # Run ML-EAT
        ml_eat_results = ML_EAT(
            A=A_Embs,
            B=B_Embs,
            X=X_Embs,
            Y=Y_Embs,
            permutations=PERMUTATIONS,
        )

        # Report the raw ML-EAT output for this model/test pair
        print(f'{model_name}_{test_name}')
        for key, value in ml_eat_results.items():
            print(f'{key}: {value}')

        # Add model name and test name to results
        ml_eat_results['model_test'] = f'{model_name}_{test_name}'
        ml_eat_results['model'] = model_name
        ml_eat_results['test'] = test_name

        # Convert results to a single-row dataframe and append to list
        df = pd.DataFrame(ml_eat_results, index=[0])
        result_dfs.append(df)

# Concatenate dataframes; every per-test frame has index 0, so renumber rows
clip_df = pd.concat(result_dfs, ignore_index=True)

# Write to CSV keyed by 'model_test'. `to_csv(index=...)` is a boolean flag,
# so the previous `index='model_test'` only wrote the all-zero default index.
clip_df.set_index('model_test').to_csv(path.join(DF_PATH, 'clip_results_prompts.csv'))