In [None]:
import pandas as pd
import pickle as pkl
import numpy as np
import seaborn as sns

from matplotlib import pyplot as plt
from os import listdir, path

from utils import create_attribute_dict, create_model_association_df, create_model_human_similarity_dict, get_model_names

In [None]:
# Set path constants
# All paths are relative, so they resolve against the notebook's working directory
OMI_PATH = 'omi/attribute_means.csv'  # CSV loaded below as the OMI attribute rating table
HUMAN_IRR_PATH = 'peterson_irr/human_irr.csv'  # CSV loaded below as the human inter-rater reliability table

ATTRIBUTE_PATH = 'prompts'  # directory containing attributes.txt and attributes_opposites.txt
MODEL_IMPRESSIONS_PATH = 'first_impression_similarities'  # directory scanned below for per-model .pkl files

In [None]:
# Load the OMI attribute rating table; the first CSV column becomes the row index
omi_ratings = pd.read_csv(OMI_PATH, index_col=0)

# The column labels of the rating table are the OMI attribute names (34 are expected)
omi_attributes = list(omi_ratings.columns)

# Preview the first 5 rows of the rating table
omi_ratings.head()

In [None]:
# Prompt files for the two poles of every attribute
positive_prompts_file = path.join(ATTRIBUTE_PATH, 'attributes.txt')
opposite_prompts_file = path.join(ATTRIBUTE_PATH, 'attributes_opposites.txt')

# Attribute -> prompt for its positive pole
attribute_dict = create_attribute_dict(positive_prompts_file)

# Attribute -> prompt for the opposing pole of the same attribute
opposite_dict = create_attribute_dict(opposite_prompts_file)

In [None]:
# Get a list of the models with similarity data saved as pickles.
# listdir() returns entries in arbitrary order, so the file names are sorted to make the
# order of models_dict (and of every dataframe built by iterating over it) reproducible
# across machines and filesystems.
model_pickles = sorted(file_name for file_name in listdir(MODEL_IMPRESSIONS_PATH) if file_name.split('.')[-1] == 'pkl')

# Get a list of the model names from the pickle file names
# (everything before the '_first_impression_similarities.pkl' suffix)
model_names = [file_name.split('_first_impression_similarities.pkl')[0] for file_name in model_pickles]

# Create a dictionary mapping each model name to its pickle file name;
# the similarity data itself is loaded from these files later
models_dict = dict(zip(model_names, model_pickles))

In [None]:
# Accumulators holding one single-row dataframe per model:
# model-human correlations and their statistical significances
model_human_dfs, model_human_sig_dfs = [], []

# Process every model's pickle in turn
for model_name, model_file in models_dict.items():

    # Load the model's similarity data from disk
    # NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source
    with open(path.join(MODEL_IMPRESSIONS_PATH, model_file), 'rb') as f:
        model_similarity_dict = pkl.load(f)

    # Tabulate the similarity data
    model_similarity_df = pd.DataFrame(model_similarity_dict)

    # Per-attribute association scores, requested with the 'difference' baseline
    # (positive-prompt similarity minus opposite-prompt similarity, per the original notebook notes)
    model_association_df = create_model_association_df(
        model_similarity_df,
        attribute_dict,
        opposite_dict,
        baseline='difference',
    )

    # Per-attribute correlation with the OMI ratings and its significance
    # (Spearman's r, per the original notebook notes)
    model_human_correlations, model_human_sigs = create_model_human_similarity_dict(
        model_association_df,
        attribute_dict,
        omi_ratings,
    )

    # Store each result as a one-row dataframe indexed by the model name
    model_human_dfs.append(pd.DataFrame(model_human_correlations, index=[model_name]))
    model_human_sig_dfs.append(pd.DataFrame(model_human_sigs, index=[model_name]))

# Stack the per-model rows into single dataframes with model names as indices
model_human_df = pd.concat(model_human_dfs)
model_human_sig_df = pd.concat(model_human_sig_dfs)

In [None]:
# Read in human inter-rater reliability data provided in the body of Peterson et al. (2022)
# The first CSV column becomes the row index; presumably it holds the attribute names so that
# this table aligns with the per-attribute model means when they are concatenated later — TODO confirm
human_irr = pd.read_csv(HUMAN_IRR_PATH, index_col=0)

In [None]:
# Split the model-human correlations by model family: select the rows whose model names
# are returned by get_model_names for that family, as an independent copy
openai_df, scaling_df, faceclip_df = (
    model_human_df.loc[get_model_names(MODEL_IMPRESSIONS_PATH, family_key)].copy()
    for family_key in ('openai', 'scaling', 'faceclip')
)

In [None]:
# Names of all Scaling-family models; the subsets below are chosen by substring of the model name
scaling_model_names = scaling_df.index.tolist()

# Scaling models whose name contains '2B'
scaling_2b_df = model_human_df.loc[[name for name in scaling_model_names if '2B' in name]].copy()

# Scaling models whose name contains '400M'
scaling_400m_df = model_human_df.loc[[name for name in scaling_model_names if '400M' in name]].copy()

# Scaling models whose name contains '80M'
scaling_80m_df = model_human_df.loc[[name for name in scaling_model_names if '80M' in name]].copy()

# '2B' Scaling models, leaving out those whose name contains 'g-14' or 'H-14'
scaling_2b_sub_df = model_human_df.loc[
    [name for name in scaling_model_names if '2B' in name and not ('g-14' in name or 'H-14' in name)]
].copy()

In [None]:
# Mean model-human similarity of every attribute (column) across the Scaling-2B models
scaling_2b_attribute_means = scaling_2b_df.mean(axis=0)

# Attribute names ordered from the highest of those means to the lowest
sorted_attributes = scaling_2b_attribute_means.sort_values(ascending=False).index.tolist()

In [None]:
# Labels of the model families (and Scaling subsets) that are summarised, in reporting order
FAMILY_LABELS = (
    'OpenAI',
    'Scaling',
    'Scaling 2B',
    'Scaling 400M',
    'Scaling 80M',
    'Scaling 2B Sub',
    'FaceCLIP',
)

# Family label -> list of mean model-human similarities (one entry per attribute, filled in later)
family_mean_dict = {label: [] for label in FAMILY_LABELS}

# Family label -> list of standard errors of those means (one entry per attribute, filled in later)
family_error_dict = {label: [] for label in FAMILY_LABELS}

In [None]:
# Compute mean model-human similarity and standard error of the mean model-human similarity for each attribute in each model family
# Each family label is paired with the dataframe holding that family's model-human correlations
family_frames = {
    'OpenAI': openai_df,
    'Scaling': scaling_df,
    'Scaling 2B': scaling_2b_df,
    'Scaling 400M': scaling_400m_df,
    'Scaling 80M': scaling_80m_df,
    'Scaling 2B Sub': scaling_2b_sub_df,
    'FaceCLIP': faceclip_df,
}

# The lists are assigned (not appended to) so that re-running this cell is idempotent:
# appending would make every list grow by one copy of the results on each re-run.
# Entries follow the order of sorted_attributes.
for family, frame in family_frames.items():
    family_mean_dict[family] = [frame[attribute].mean() for attribute in sorted_attributes]
    family_error_dict[family] = [frame[attribute].sem() for attribute in sorted_attributes]

In [None]:
# Create a tab-separated table string (for use as a data table in the LaTeX document) of the mean
# model-human similarity and standard error of the mean model-human similarity for each attribute
# in each model family

# Header: 'Attribute', then a value column and an '_Error' column per family
# (spaces in family labels become underscores, e.g. 'Scaling 2B' -> 'Scaling_2B')
header_cells = ['Attribute']
for family in family_mean_dict:
    column_name = family.replace(' ', '_')
    header_cells += [column_name, f'{column_name}_Error']

table_rows = ['\t'.join(header_cells)]

# One row per attribute, sorted from highest mean model-human similarity to lowest in the Scaling-2B models
for idx, attribute in enumerate(sorted_attributes):
    row_cells = [f'{attribute}']

    # Mean and standard error of the mean for this attribute in each model family
    for family in family_mean_dict:
        row_cells += [f'{family_mean_dict[family][idx]}', f'{family_error_dict[family][idx]}']

    # Joining the cells (rather than appending '\t' after each one) avoids a trailing tab,
    # so every data row has exactly as many fields as the header
    table_rows.append('\t'.join(row_cells))

# Every line, including the last, is terminated by a newline
tex_string = '\n'.join(table_rows) + '\n'

# Print the string
print(tex_string)

In [None]:
# Mean correlation coefficient of every attribute (column) across the models of each family
openai_mean, scaling_mean, faceclip_mean = (
    family_df.mean(axis=0) for family_df in (openai_df, scaling_df, faceclip_df)
)

# Place the three family means next to the human inter-rater reliability, aligned on the attribute index
combined_mean_df = pd.concat([openai_mean, scaling_mean, faceclip_mean, human_irr], axis=1)
combined_mean_df.columns = ['OpenAI', 'Scaling', 'FaceCLIP', 'Human']

# Divide the human reliability by 100 (presumably stored as a percentage — TODO confirm)
combined_mean_df['Human'] = combined_mean_df['Human'].div(100)

# Write the table as tab-separated text (two decimals, header row, no index column)
# for use in creating a TikZ scatterplot
combined_mean_df.to_csv(
    'combined_mean_df.csv',
    sep='\t',
    float_format='%.2f',
    header=True,
    index=False,
)

In [None]:
# Create a seaborn scatterplot of the mean correlation coefficient for each attribute plotted against the human inter-rater reliability for each attribute
# Approximates the TikZ scatterplot created in the paper

# Apply the seaborn theme BEFORE drawing: set_context/set_style change matplotlib defaults that are
# read when the axes are created, so calling them after plotting would leave this figure unstyled
# on a fresh kernel
sns.set_context('paper')
sns.set_style('white')

# Colour used for each model family's points and regression line
family_colors = {'OpenAI': 'blue', 'Scaling': 'red', 'FaceCLIP': 'green'}

# Points: one marker per attribute and family
for family, color in family_colors.items():
    sns.scatterplot(data=combined_mean_df, x='Human', y=family, color=color, label=family, alpha=0.5, s=20, marker='o', edgecolor='black')

# Dashed linear regression line per family, without points or confidence band
for family, color in family_colors.items():
    sns.regplot(data=combined_mean_df, x='Human', y=family, color=color, scatter=False, label=family, ci=None, line_kws={'linestyle': '--', 'color': color, 'linewidth': 1})

# Labels are set after regplot, which would otherwise leave the column names on the axes
plt.xlabel('Human Inter-Rater Reliability')
plt.ylabel('Model-Human Correlation')
plt.ylim(-.5,1)
plt.xlim(.2,1)
plt.legend()
plt.title('Model-Human Correlation vs. Human Inter-Rater Reliability')

# Format scatterplot
sns.despine()
plt.tight_layout()

# Show scatterplot
plt.show()

In [None]:
# Compute correlation matrix for mean model-human similarity for each attribute across all models, plus human inter-rater reliability
# Spearman (rank) correlation between every pair of columns of combined_mean_df
# (OpenAI, Scaling, FaceCLIP, Human), computed across its rows (the attributes)
corr_matrix = combined_mean_df.corr(method='spearman')

In [None]:
# Correlogram (heatmap) of the correlation matrix, approximating the TikZ correlogram created in the paper

# Mask the cells strictly above the diagonal so that only the lower triangle and the diagonal are drawn
upper_triangle_mask = np.triu(np.ones_like(corr_matrix, dtype=int), k=1)

# Heatmap appearance: annotated square cells on a Blues scale from .4 to 1 (centred on .7), no colour bar
heatmap_options = dict(
    annot=True,
    annot_kws={"size": 14, "weight": "bold"},
    cmap='Blues',
    vmin=.4,
    vmax=1,
    center=.7,
    square=True,
    linewidths=.5,
    cbar=False,
    cbar_kws={"shrink": .5},
)
sns.heatmap(corr_matrix, mask=upper_triangle_mask, **heatmap_options)

# Tick labels in 14pt Times New Roman; y labels rotated to run vertically
hfont = {'fontname':'Times New Roman', 'size': '14'}
plt.yticks(rotation=90, **hfont)
plt.xticks(rotation=0, **hfont)
plt.show()