In [None]:
import pandas as pd
import pickle as pkl
import statsmodels.api as sm

from os import listdir, path
from utils import create_attribute_dict, create_model_association_df, create_model_human_similarity_dict

In [None]:
# Set path constants (all relative paths, resolved against the current working directory)
OMI_PATH = 'omi/attribute_means.csv'  # CSV of OMI attribute ratings
SCALING_INFO = 'scaling_info/trained_models_info.csv'  # CSV with per-model training-scale columns
ARCH_INFO = 'scaling_info/arch_info.csv'  # CSV with per-architecture parameter counts
HUMAN_IRR_PATH = 'peterson_irr/human_irr.csv'  # CSV of human inter-rater reliability per attribute

ATTRIBUTE_PATH = 'prompts'  # Directory holding attributes.txt and attributes_opposites.txt
MODEL_IMPRESSIONS_PATH = 'first_impression_similarities'  # Directory holding the per-model similarity pickles

In [None]:
# Load the OMI attribute ratings, using the first CSV column as the row index
omi_ratings = pd.read_csv(OMI_PATH, index_col=0)

# Each column of the ratings table is one OMI attribute; keep the names as a plain list
omi_attributes = omi_ratings.columns.tolist()

# Preview the first 5 rows of the ratings
omi_ratings.head(5)

In [None]:
# Locate the two prompt files inside the prompt directory
positive_prompts_file = path.join(ATTRIBUTE_PATH, 'attributes.txt')
opposite_prompts_file = path.join(ATTRIBUTE_PATH, 'attributes_opposites.txt')

# Dictionary mapping each attribute to its positive polar prompt
attribute_dict = create_attribute_dict(positive_prompts_file)

# Dictionary mapping each attribute to its opposite prompt (the prompt for the opposing pole of the attribute)
opposite_dict = create_attribute_dict(opposite_prompts_file)

In [None]:
# Get a list of the models with similarity data saved as pickles: files whose extension is
# 'pkl' and whose name starts with the 'scaling' prefix.
# os.listdir returns entries in arbitrary order, so the listing is sorted to keep the model
# order (and therefore the row order of every dataframe built from it) reproducible across runs
model_pickles = [i for i in sorted(listdir(MODEL_IMPRESSIONS_PATH)) if i.split('.')[-1] == 'pkl' and i.split('_')[0] == 'scaling']

# Exclude the g-14 and H-14 models from the analysis
model_pickles = [i for i in model_pickles if 'g-14' not in i and 'H-14' not in i]

# Get a list of the model names from the pickle file names by dropping the shared file suffix
model_names = [i.split('_first_impression_similarities.pkl')[0] for i in model_pickles]

# Create a dictionary mapping model names to the file names of their similarity pickles
models_dict = dict(zip(model_names, model_pickles))

In [None]:
# Accumulators for the per-model (single-row) correlation and significance dataframes
model_human_dfs = []
model_human_sig_dfs = []

# Process every model's similarity pickle in turn
for model_name, model_file in models_dict.items():

    # Load the model similarity data
    # NOTE(review): pickle.load can execute arbitrary code - only load pickles from a trusted source
    with open(path.join(MODEL_IMPRESSIONS_PATH, model_file), 'rb') as f:
        model_similarity_dict = pkl.load(f)

    # Tabulate the similarity data
    model_similarity_df = pd.DataFrame(model_similarity_dict)

    # Per attribute, the difference between each image's cosine similarity to the positive prompt and to the negative prompt
    model_association_df = create_model_association_df(
        model_similarity_df,
        attribute_dict,
        opposite_dict,
        baseline='difference',
    )

    # Per attribute, the Spearman's r correlation (and its significance) between the model association and the OMI rating
    model_human_correlations, model_human_sigs = create_model_human_similarity_dict(
        model_association_df,
        attribute_dict,
        omi_ratings,
    )

    # Store each result as a one-row dataframe indexed by the model name
    model_human_dfs.append(pd.DataFrame(model_human_correlations, index=[model_name]))
    model_human_sig_dfs.append(pd.DataFrame(model_human_sigs, index=[model_name]))

# Stack the per-model rows into single dataframes with model names as indices
model_human_df = pd.concat(model_human_dfs)
model_human_sig_df = pd.concat(model_human_sig_dfs)

In [None]:
# Read in data related to data scale and model architecture, plus the human inter-rater
# reliability table. In each file the first CSV column becomes the dataframe index
scaling_data = pd.read_csv(SCALING_INFO, index_col=0)
arch_data = pd.read_csv(ARCH_INFO, index_col=0)
human_irr = pd.read_csv(HUMAN_IRR_PATH, index_col=0)

In [None]:
# Map data size to number of unique examples (in millions).
# The keys are the labels looked up in the 'data' column of the scaling data
data_sizes = {'2B': 2320, '400M': 407, '80M': 80}

In [None]:
# Create dataframe of scaling data. Take an explicit copy: the column selection is derived
# from scaling_data, and adding columns to it without .copy() triggers pandas'
# SettingWithCopyWarning
scaling_regression_df = scaling_data[['samples_per_epoch', 'epochs', 'data']].copy()

# Transform the scaling data to be in terms of total samples
scaling_regression_df['total_samples'] = scaling_regression_df['samples_per_epoch'] * scaling_regression_df['epochs']

# Map data size to number of unique examples (labels that are not keys of data_sizes become NaN)
scaling_regression_df['data'] = scaling_regression_df['data'].map(data_sizes)

# Obtain the number of parameters for each model from the architecture data.
# The architecture key is the part of the model name before the first underscore,
# with 'Model' replaced by 'ViT'; it is derived once and reused for both lookups
scaling_regression_df['model'] = scaling_regression_df.index
arch_keys = scaling_regression_df['model'].apply(lambda x: x.split('_')[0].replace('Model', 'ViT'))
scaling_regression_df['image_mparams'] = arch_keys.apply(lambda key: arch_data.loc[key, 'image_mparams'])
scaling_regression_df['text_mparams'] = arch_keys.apply(lambda key: arch_data.loc[key, 'text_mparams'])

# Replace the model names with the model names without the pt checkpoint extension
scaling_regression_df.index = [i.replace('.pt','') for i in scaling_regression_df.index]

In [None]:
# Create a long-format dataframe with one row per (model, attribute) pair: the model-human
# correlation as dependent variable and data size, total samples, number of parameters, and
# attribute IRR as independent variables
intermediate_dataframes = []
attributes = model_human_df.columns.to_list()

# The human inter-rater reliability depends only on the attribute, so look it up once instead
# of once per model. `.item()` extracts the single value stored for the attribute; calling
# float() directly on a one-element Series is deprecated in recent pandas versions.
# The stored values are divided by 100 (presumably they are percentages - TODO confirm)
attribute_irr = [float(human_irr.loc[attribute].item())/100. for attribute in attributes]

# Columns copied from the scaling regression dataframe onto every row of a model
scaling_columns = ['data', 'total_samples', 'image_mparams', 'text_mparams']

# Iterate through each model
for model in model_human_df.index:

    # Get model name as it appears in the scaling regression dataframe
    # (everything from the first 'Model-' onwards)
    loc_model = 'Model-' + model.split('Model-')[1]

    # For each model, get the model-human correlation for each attribute
    model_series = model_human_df.loc[model].to_list()

    # Create a dataframe of that model's model-human correlations for the attributes
    model_df = pd.DataFrame(model_series, columns=['mh_similarity'], index=attributes)

    # Add the model's data size, total samples, and number of parameters to the dataframe
    # (each scalar is broadcast to every attribute row)
    for scaling_column in scaling_columns:
        model_df[scaling_column] = scaling_regression_df.loc[loc_model, scaling_column]

    # Add the human inter-rater reliability for all of the attributes to the dataframe
    model_df['human_irr'] = attribute_irr

    # Add the model name and attribute name as the index
    model_df.index = [f'{model}_{attribute}' for attribute in model_df.index]

    # Append the dataframe to a list
    intermediate_dataframes.append(model_df)

# Concatenate the dataframes into a single dataframe
final_regression_df = pd.concat(intermediate_dataframes, axis=0)
final_regression_df.head()

In [None]:
# Work on a copy so the unnormalized regression table stays available
normalized_df = final_regression_df.copy()

# Leave the model-human correlation and the human IRR untouched; every other column is rescaled
norm_columns = [column for column in normalized_df.columns if column not in ['human_irr', 'mh_similarity']]

# Divide each rescaled column by its own maximum value
column_maxima = normalized_df[norm_columns].max()
normalized_df[norm_columns] = normalized_df[norm_columns].div(column_maxima, axis='columns')

normalized_df.head()

In [None]:
# Multiple linear regression: predict model-human similarity from data size, total samples,
# number of parameters, and human inter-rater reliability
predictor_columns = ['data', 'total_samples', 'image_mparams', 'text_mparams', 'human_irr']

# Independent variables, with a constant term added
X = sm.add_constant(normalized_df[predictor_columns])

# Dependent variable: the model-human similarity
y = normalized_df['mh_similarity']

# Fit ordinary least squares and compute the fitted values on the training design matrix
model = sm.OLS(y, X).fit()
predictions = model.predict(X)

# Display the model summary
model.summary()