In [None]:
import pandas as pd
import pickle as pkl
import numpy as np
import seaborn as sns

from matplotlib import pyplot as plt
from os import listdir, path

from utils import create_attribute_dict, create_model_association_df, create_model_human_similarity_dict
from utils import cohens_d, paired_ttest, round_and_format, round_and_format_cohens_d

In [None]:
# Locations of every input this notebook reads, relative to the notebook's working directory

# Human data: OMI attribute means and the human inter-rater reliability table
OMI_PATH = 'omi/attribute_means.csv'
HUMAN_IRR_PATH = 'peterson_irr/human_irr.csv'

# Model data: directory of prompt files and directory of pickled first-impression similarities
ATTRIBUTE_PATH = 'prompts'
MODEL_IMPRESSIONS_PATH = 'first_impression_similarities'

In [None]:
# Load the OMI attribute ratings, using the first CSV column as the row index
omi_ratings = pd.read_csv(OMI_PATH, index_col=0)

# Every remaining column is one OMI attribute (34 expected); keep their names in file order
omi_attributes = list(omi_ratings.columns)

# Preview the first 5 rows as the cell output
omi_ratings.head(5)

In [None]:
# Build the two prompt lookups with the same helper, one per prompt file:
#   attribute_dict - maps each attribute to its positive polar prompt
#   opposite_dict  - maps each attribute to the prompt for the opposing pole of that attribute
attribute_dict, opposite_dict = (
    create_attribute_dict(path.join(ATTRIBUTE_PATH, prompt_file))
    for prompt_file in ('attributes.txt', 'attributes_opposites.txt')
)

In [None]:
# Get the pickled similarity files for the scaling models: extension 'pkl', name starting with 'scaling_'.
# os.listdir returns entries in arbitrary, filesystem-dependent order, so the list is sorted to make the
# model order (and therefore the row order of every dataframe built from it) reproducible across machines.
# NOTE(review): later cells compare the per-dataset-size frames column against column; if paired_ttest
# pairs rows by position, this order decides which models are paired - confirm the sorted order lines up
# matching models across dataset sizes.
model_pickles = sorted(
    file_name for file_name in listdir(MODEL_IMPRESSIONS_PATH)
    if file_name.split('.')[-1] == 'pkl' and file_name.split('_')[0] == 'scaling'
)

# Get the model names by stripping the shared file-name suffix from each pickle file name
model_names = [file_name.split('_first_impression_similarities.pkl')[0] for file_name in model_pickles]

# Map each model name to the file that holds its similarity data (insertion order follows the sorted list)
models_dict = dict(zip(model_names, model_pickles))

In [None]:
# Collect one single-row dataframe per model: correlations in one list, significances in the other
model_human_dfs = []
model_human_sig_dfs = []

for model_name, model_file in models_dict.items():

    # Load the pickled similarity data for this model
    # (pickle can execute arbitrary code on load - only point this at files you trust)
    pickle_path = path.join(MODEL_IMPRESSIONS_PATH, model_file)
    with open(pickle_path, 'rb') as pickle_file:
        model_similarity_dict = pkl.load(pickle_file)

    # Tabulate the similarity data
    model_similarity_df = pd.DataFrame(model_similarity_dict)

    # Per attribute, the difference between each image's cosine similarity to the positive prompt and to the negative prompt
    model_association_df = create_model_association_df(
        model_similarity_df,
        attribute_dict,
        opposite_dict,
        baseline='difference',
    )

    # Per attribute, the Spearman's r between the model association and the OMI rating, plus its significance
    model_human_correlations, model_human_sigs = create_model_human_similarity_dict(
        model_association_df,
        attribute_dict,
        omi_ratings,
    )

    # Store each result as a one-row dataframe indexed by the model name
    model_human_dfs.append(pd.DataFrame(model_human_correlations, index=[model_name]))
    model_human_sig_dfs.append(pd.DataFrame(model_human_sigs, index=[model_name]))

# Stack the per-model rows into single dataframes with model names as indices
model_human_df = pd.concat(model_human_dfs)
model_human_sig_df = pd.concat(model_human_sig_dfs)

In [None]:
# Group the model names by the dataset-size tag in the name.
# The 2B group leaves out the 'Model-g-14' and 'Model-H-14' architectures.
models_2b = [
    model for model in model_human_df.index
    if 'Data-2B' in model and not any(excluded in model for excluded in ('Model-g-14', 'Model-H-14'))
]
models_400m = [model for model in model_human_df.index if 'Data-400M' in model]
models_80m = [model for model in model_human_df.index if 'Data-80M' in model]

# Show the group sizes as the cell output - each should be 9
tuple(len(group) for group in (models_2b, models_400m, models_80m))

In [None]:
# Slice the model-human correlations into one dataframe per dataset size (rows selected by model name)
df_2b, df_400m, df_80m = (
    model_human_df.loc[size_models]
    for size_models in (models_2b, models_400m, models_80m)
)

In [None]:
def _per_attribute(frame, statistic):
    """Return the Series method named `statistic` evaluated on each OMI attribute column of `frame`, in `omi_attributes` order."""
    return [getattr(frame[attribute], statistic)() for attribute in omi_attributes]


def _per_attribute_pair(compare, first, second):
    """Return `compare(first[attribute], second[attribute])` for each OMI attribute, in `omi_attributes` order."""
    return [compare(first[attribute], second[attribute]) for attribute in omi_attributes]


# Means, standard deviations, and maximums of each attribute, one list per dataset size
means_2b, means_400m, means_80m = (_per_attribute(frame, 'mean') for frame in (df_2b, df_400m, df_80m))
stds_2b, stds_400m, stds_80m = (_per_attribute(frame, 'std') for frame in (df_2b, df_400m, df_80m))
max_2b, max_400m, max_80m = (_per_attribute(frame, 'max') for frame in (df_2b, df_400m, df_80m))

# Cohen's d of each attribute for every pair of dataset sizes
d_2b_400m = _per_attribute_pair(cohens_d, df_2b, df_400m)
d_2b_80m = _per_attribute_pair(cohens_d, df_2b, df_80m)
d_400m_80m = _per_attribute_pair(cohens_d, df_400m, df_80m)

# Paired t-test p-value of each attribute for every pair of dataset sizes
t_2b_400m = [result.pvalue for result in _per_attribute_pair(paired_ttest, df_2b, df_400m)]
t_2b_80m = [result.pvalue for result in _per_attribute_pair(paired_ttest, df_2b, df_80m)]
t_400m_80m = [result.pvalue for result in _per_attribute_pair(paired_ttest, df_400m, df_80m)]

In [None]:
# Build one LaTeX table row per attribute: mean (std) and max for each dataset size, then Cohen's d for each pair of sizes
latex_strings, index_means = [], []

# Walk all per-attribute statistics in lockstep (every list is in omi_attributes order)
per_attribute_rows = zip(
    omi_attributes,
    means_2b, means_400m, means_80m,
    stds_2b, stds_400m, stds_80m,
    max_2b, max_400m, max_80m,
    d_2b_80m, d_400m_80m, d_2b_400m,
    t_2b_80m, t_400m_80m, t_2b_400m,
)

for (attribute,
     mean_2b, mean_400m, mean_80m,
     std_2b, std_400m, std_80m,
     top_2b, top_400m, top_80m,
     effect_2b_80m, effect_400m_80m, effect_2b_400m,
     pvalue_2b_80m, pvalue_400m_80m, pvalue_2b_400m) in per_attribute_rows:

    # Mean with the standard deviation in parentheses, for 2B, 400M, 80M
    mean_std_cells = (
        f"{round_and_format(mean_2b)} ({std_2b:.2f}) & "
        f"{round_and_format(mean_400m)} ({std_400m:.2f}) & "
        f"{round_and_format(mean_80m)} ({std_80m:.2f}) & "
    )

    # Maximum for 2B, 400M, 80M
    max_cells = (
        f"{round_and_format(top_2b)} & "
        f"{round_and_format(top_400m)} & "
        f"{round_and_format(top_80m)} & "
    )

    # Cohen's d (formatted together with its paired t-test p-value) for 2B-80M, 400M-80M, 2B-400M, then the row terminator
    effect_cells = (
        f"{round_and_format_cohens_d(effect_2b_80m, pvalue_2b_80m)} & "
        f"{round_and_format_cohens_d(effect_400m_80m, pvalue_400m_80m)} &"
        f"{round_and_format_cohens_d(effect_2b_400m, pvalue_2b_400m)}  \\\\"
    )

    # The row starts with the attribute name
    latex_strings.append(attribute + ' & ' + mean_std_cells + max_cells + effect_cells)

    # Remember the 2B mean of this attribute as the sort key for its row
    index_means.append(mean_2b)

# Order the rows from the largest 2B mean to the smallest
descending_by_2b_mean = np.argsort(index_means)[::-1]
latex_strings = [latex_strings[position] for position in descending_by_2b_mean]
table_string = '\n'.join(latex_strings)

# Print the raw LaTeX so it can be pasted into the table body
print(table_string)

In [None]:
# Load the human inter-rater reliability data provided in the body of Peterson et al. (2022),
# using the first CSV column as the row index
human_irr = pd.read_csv(HUMAN_IRR_PATH, index_col=0)

# Swap rows and columns, and divide every value by 100
transpose_irr = human_irr.T.div(100)

In [None]:
# Bar order for the plots: attribute names sorted by their 2B mean, smallest first
column_ranks = df_2b.mean().sort_values(ascending=True).index.to_list()

In [None]:
# Frames used to re-draw bars on top of the overlaid barplot, such that the largest mean is in the back
# and the smallest in the front

def _values_where_shorter(front_df, back_df):
    """Return `front_df` restricted to the attributes whose absolute mean is smaller than in `back_df`.

    Every other attribute in `column_ranks` is appended as an all-zero column, so the result
    still has one column per attribute.
    """
    shorter_columns = [
        column for column in column_ranks
        if abs(front_df[column].mean()) < abs(back_df[column].mean())
    ]
    overlay = front_df.copy()[shorter_columns]
    for column in column_ranks:
        if column not in overlay:
            overlay[column] = 0
    return overlay


# 2B attributes that are shorter than 400M attributes
df400_taller = _values_where_shorter(df_2b, df_400m)

# 400M attributes that are shorter than 80M attributes
df80_taller = _values_where_shorter(df_400m, df_80m)

# 2B attributes that are shorter than 80M attributes
df2b80_taller = _values_where_shorter(df_2b, df_80m)

In [None]:
# Barplot of model-human similarity by dataset size

# Column ranks are based on the mean of the 2B dataset
column_ranks = df_2b.mean().sort_values().index.tolist()

# 400M values for the attributes where the 400M bar is the shortest of the three, zero elsewhere.
# The overlay sequence below ends by re-drawing 2B wherever it is shorter than 80M; when 400M is
# shorter still (|400M| < |2B| < |80M|) that last 2B bar would completely cover the 400M bar.
# Drawing this frame last keeps the smallest bar in front for every ordering of the three sizes.
df400_shortest = df_400m[[
    column for column in column_ranks
    if abs(df_400m[column].mean()) < min(abs(df_2b[column].mean()), abs(df_80m[column].mean()))
]].copy()
for column in column_ranks:
    if column not in df400_shortest:
        df400_shortest[column] = 0

# Human IRR barplot - would come first if uncommented, because it is always the largest bar
#sns.barplot(data=transpose_irr, orient='v', errorbar=None, edgecolor='black', order=column_ranks, hue_order=column_ranks, color='lightyellow', dodge=False, label='Human IRR')

# Arguments shared by every layer
shared_bar_kwargs = dict(orient='v', errorbar=None, edgecolor='black', order=column_ranks, hue_order=column_ranks, dodge=False)

# Layers in drawing order (later layers are drawn on top); only the three full layers carry a legend label
bar_layers = [
    (df_2b, dict(color='salmon', hatch='xxx', label='2b')),
    (df_400m, dict(color='skyblue', hatch='---', label='400m')),
    (df400_taller, dict(color='salmon', hatch='xxx')),      # 2B back in front where 2B < 400M
    (df_80m, dict(color='lightgreen', hatch='///', label='80m')),
    (df80_taller, dict(color='skyblue', hatch='---')),      # 400M back in front where 400M < 80M
    (df2b80_taller, dict(color='salmon', hatch='xxx')),     # 2B back in front where 2B < 80M
    (df400_shortest, dict(color='skyblue', hatch='---')),   # 400M back in front where it is the shortest
]

# Overlay barplots of model-human similarity by dataset size
for layer_data, layer_kwargs in bar_layers:
    sns.barplot(data=layer_data, **shared_bar_kwargs, **layer_kwargs)

# Set plot parameters
plt.legend(title='Training Dataset Size')
plt.ylim(-.35,1)
plt.xlim(-1,34)

# Make it large
plt.gcf().set_size_inches(14, 8)

# Get the current axis out of the plot and set the font sizes and families for the legend, axes, and title
ax = plt.gca()
plt.setp(ax.get_legend().get_texts(), fontsize='16', fontname='Times New Roman')
plt.setp(ax.get_legend().get_title(), fontsize='16', fontname='Times New Roman')
plt.xticks(rotation=30, ha='right', fontname='Times New Roman', fontsize=15)
plt.yticks(fontname='Times New Roman', fontsize=14)
plt.xlabel('OMI Attribute', fontname='Times New Roman', fontsize=16)
plt.ylabel('Mean Model-Human Similarity (Spearman\'s r)', rotation=90, fontname='Times New Roman', fontsize=16)
plt.title('Model-Human Similarity by Training Dataset Size', fontname='Times New Roman', fontsize=20)

# Save the plot
# NOTE(review): the horizontal version of this figure saves to the same file name and overwrites this one on Run All
plt.savefig('model_human_similarity_dataset_size.png', dpi=300)

# Show the plot
plt.show()

In [None]:
# Same thing, just horizontal formatted

# Column ranks are based on the mean of the 2B dataset
column_ranks = df_2b.mean().sort_values().index.tolist()

# 400M values for the attributes where the 400M bar is the shortest of the three, zero elsewhere.
# The overlay sequence below ends by re-drawing 2B wherever it is shorter than 80M; when 400M is
# shorter still (|400M| < |2B| < |80M|) that last 2B bar would completely cover the 400M bar.
# Drawing this frame last keeps the smallest bar in front for every ordering of the three sizes.
df400_shortest = df_400m[[
    column for column in column_ranks
    if abs(df_400m[column].mean()) < min(abs(df_2b[column].mean()), abs(df_80m[column].mean()))
]].copy()
for column in column_ranks:
    if column not in df400_shortest:
        df400_shortest[column] = 0

# Human IRR barplot - would come first if uncommented, because it is always the largest bar
#sns.barplot(data=transpose_irr, orient='v', errorbar=None, edgecolor='black', order=column_ranks, hue_order=column_ranks, color='lightyellow', dodge=False, label='Human IRR')

# Arguments shared by every layer
shared_bar_kwargs = dict(orient='h', errorbar=None, edgecolor='black', order=column_ranks, hue_order=column_ranks, dodge=False)

# Layers in drawing order (later layers are drawn on top); only the three full layers carry a legend label
bar_layers = [
    (df_2b, dict(color='salmon', hatch='xxx', label='2b')),
    (df_400m, dict(color='skyblue', hatch='---', label='400m')),
    (df400_taller, dict(color='salmon', hatch='xxx')),      # 2B back in front where 2B < 400M
    (df_80m, dict(color='lightgreen', hatch='///', label='80m')),
    (df80_taller, dict(color='skyblue', hatch='---')),      # 400M back in front where 400M < 80M
    (df2b80_taller, dict(color='salmon', hatch='xxx')),     # 2B back in front where 2B < 80M
    (df400_shortest, dict(color='skyblue', hatch='---')),   # 400M back in front where it is the shortest
]

# Overlay barplots of model-human similarity by dataset size
for layer_data, layer_kwargs in bar_layers:
    sns.barplot(data=layer_data, **shared_bar_kwargs, **layer_kwargs)

# Set plot parameters
plt.legend(title='Training Dataset Size', loc='lower right')
plt.xlim(-.35,1)
plt.ylim(-1,34)

# Make it large
plt.gcf().set_size_inches(14, 8)

# Get the current axis out of the plot and set the font sizes and families for the legend, axes, and title
ax = plt.gca()
plt.setp(ax.get_legend().get_texts(), fontsize='16', fontname='Times New Roman')
plt.setp(ax.get_legend().get_title(), fontsize='16', fontname='Times New Roman')
plt.xticks(rotation=30, ha='right', fontname='Times New Roman', fontsize=13)
plt.yticks(fontname='Times New Roman', fontsize=14)
plt.ylabel('OMI Attribute', fontname='Times New Roman', fontsize=16)
plt.xlabel('Model-Human Similarity - Spearman\'s r', fontname='Times New Roman', fontsize=16)
plt.title('Model-Human Similarity by Training Dataset Size', fontname='Times New Roman', fontsize=20)

# Save the plot
# NOTE(review): this is the same file name the vertical barplot cell saves to, so on Run All this
# horizontal figure replaces the vertical one on disk - give one of them its own name if both are needed
plt.savefig('model_human_similarity_dataset_size.png', dpi=300)

# Show the plot
plt.show()