In [69]:
# Mount Google Drive at /content/drive; the CSV paths used further down live under
# /content/drive/MyDrive, so this has to run first on a fresh Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [70]:
# Imports
import torch
import pandas as pd
import numpy as np
import ast
from scipy.stats import zscore
from sklearn.metrics.pairwise import cosine_similarity


In [71]:
# Load the two input tables from the mounted Drive folder.
# groundTruthDF: ground-truth object embeddings (uses the 'object_name' and 'embedding' columns below).
groundTruthDF = pd.read_csv('/content/drive/MyDrive/Projects/zero/data_public/groundTruthObjectEmbeddings.csv')
# parTranscriptDF: participant descriptions per talk block, with an 'embedding' column stored as text.
parTranscriptDF = pd.read_csv('/content/drive/MyDrive/Projects/zero/data_public/participantDescriptionsByBlockEmbeddings.csv')

In [72]:
# Drop underscores from the ground-truth object names; they are later used as lookup keys
# against parTranscriptDF['object_name'], whose values appear without underscores.
# regex=False makes the literal-character replacement explicit, so the result does not
# depend on the pandas version's default for `regex`.
groundTruthDF['object_name'] = groundTruthDF['object_name'].str.replace('_', '', regex=False)
print(groundTruthDF['object_name'])

0              scissors
1           stethoscope
2           frenchpress
3              shoehorn
4           fishingreel
5       crankflashlight
6               rolodex
7            floppydisk
8           bulbplanter
9        threeholepunch
10          pocketradio
11            handmixer
12    bloodpressurecuff
Name: object_name, dtype: object


In [73]:
# Display the participant table as loaded (before the embedding strings are parsed).
parTranscriptDF

Unnamed: 0,subject_name,talk_block,object_name,description,embedding
0,tulip003,1,handmixer,A hand mixer is usually made out of metal and ...,"[-0.0862705186009407, -0.5986244678497314, 0.7..."
1,tulip003,2,handmixer,There are like other forms of hand mixer so th...,"[0.46811434626579285, -0.7116814851760864, 0.7..."
2,tulip003,1,fishingreel,This thing is usually made out of plastic and ...,"[0.4370953440666199, -0.20047757029533386, 0.3..."
3,tulip003,2,fishingreel,"This is a tool where a string is attached, so...","[0.5953291058540344, 0.023872539401054382, 0.2..."
4,tulip003,1,shoehorn,This object is usually made out of metal and i...,"[0.5472880601882935, 0.23555366694927216, -0.0..."
...,...,...,...,...,...
448,tulip024,2,stethoscope,it is this metal little circular thing and the...,"[0.3775777220726013, 0.07622368633747101, -0.9..."
449,tulip024,1,frenchpress,It's like a glass cylindrical pitcher and wit...,"[0.2045920342206955, -0.24838601052761078, -0...."
450,tulip024,2,frenchpress,It's a glass cylinder that works as a pitcher...,"[0.36618760228157043, -0.42792370915412903, 0...."
451,tulip024,1,threeholepunch,it is like it's its length is about the size o...,"[1.290147066116333, -0.5686261653900146, -0.44..."


In [74]:
# zscore the ground truth embeddings
# The embeddings arrive as text and are parsed with ast.literal_eval. Only values that are
# still strings are parsed, so re-running this cell is safe: literal_eval raises ValueError
# when handed an already-parsed list.
groundTruthDF['embedding'] = groundTruthDF['embedding'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)
# Stack the per-row vectors into one 2-D array and standardise along axis 0,
# i.e. each embedding dimension is z-scored across the rows of this frame.
embeddings_array = np.vstack(groundTruthDF['embedding'].values)
zscored_embeddings = zscore(embeddings_array, axis=0)
# list(...) splits the 2-D result back into one vector per row, in row order.
groundTruthDF['embedding_zscored'] = list(zscored_embeddings)



In [75]:
# zscore the participant transcript embeddings
# The embeddings arrive as text and are parsed with ast.literal_eval. Only values that are
# still strings are parsed, so re-running this cell is safe: literal_eval raises ValueError
# when handed an already-parsed list.
parTranscriptDF['embedding'] = parTranscriptDF['embedding'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)
# Stack the per-row vectors into one 2-D array and standardise along axis 0,
# i.e. each embedding dimension is z-scored across the rows of this frame.
embeddings_array = np.vstack(parTranscriptDF['embedding'].values)
zscored_embeddings = zscore(embeddings_array, axis=0)
# list(...) splits the 2-D result back into one vector per row, in row order.
parTranscriptDF['embedding_zscored'] = list(zscored_embeddings)


In [76]:
# Map each ground-truth object name to its z-scored embedding vector.
groundTruth_lookup = dict(
    zip(groundTruthDF['object_name'], groundTruthDF['embedding_zscored'])
)


In [77]:
def row_cosine_similarity(row, lookup=None):
    """Cosine similarity between a row's embedding and its object's ground-truth embedding.

    Intended for ``DataFrame.apply(row_cosine_similarity, axis=1)``.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide ``row['object_name']`` (the lookup key) and
        ``row['embedding_zscored']`` (the vector to compare).
    lookup : dict, optional
        Maps object name -> ground-truth vector. When omitted, the notebook-level
        ``groundTruth_lookup`` is used, so existing one-argument calls behave as before.

    Returns
    -------
    float
        The cosine similarity, or ``np.nan`` (after printing a diagnostic line) when
        either vector is missing, the shapes differ, either array is not int/float
        typed, or either array contains NaN.
    """
    if lookup is None:
        # Resolved at call time: the lookup cell only needs to run before the first call.
        lookup = groundTruth_lookup

    obj = row['object_name']
    emb = row['embedding_zscored']
    gt_emb = lookup.get(obj)

    # Check if either embedding is missing
    if gt_emb is None:
        print(f"Missing ground truth for object: {obj}")
        return np.nan
    if emb is None:
        # Without this guard a missing vector would be reported as a shape mismatch.
        print(f"Missing participant embedding for object: {obj}")
        return np.nan

    # Convert to arrays
    emb = np.array(emb)
    gt_emb = np.array(gt_emb)

    # Check shape and type
    if emb.shape != gt_emb.shape:
        print(f"Shape mismatch for object: {obj} | par shape: {emb.shape} | gt shape: {gt_emb.shape}")
        return np.nan

    # Only float ('f') and signed-integer ('i') arrays are accepted; this also keeps
    # np.isnan below from being called on string/object arrays.
    if emb.dtype.kind not in 'fi' or gt_emb.dtype.kind not in 'fi':
        print(f"Non-numeric types for object: {obj}")
        return np.nan

    # Check for NaNs (just to be safe)
    if np.isnan(emb).any() or np.isnan(gt_emb).any():
        print(f"NaNs found for object: {obj}")
        return np.nan

    # cosine_similarity works on 2-D inputs, so each vector becomes a single-row matrix
    # and the lone entry of the resulting 1x1 matrix is returned.
    return cosine_similarity(emb.reshape(1, -1), gt_emb.reshape(1, -1))[0, 0]



In [78]:
# Flag each ground-truth row whose z-scored vector contains at least one NaN
gt_nan_flags = groundTruthDF['embedding_zscored'].map(lambda vec: np.isnan(vec).any())
groundTruthDF['has_nan'] = gt_nan_flags

# Tally the flagged rows and report them against the total number of rows
# (despite its name, num_nan_par counts rows of groundTruthDF)
num_nan_par = groundTruthDF['has_nan'].sum()
print(f"Rows in groundTruthDF with NaNs: {num_nan_par} / {len(groundTruthDF)}")


Rows in groundTruthDF with NaNs: 0 / 13


In [79]:
# Score every participant row against the ground-truth vector of its object,
# then attach the scores as a new column.
similarity_per_row = parTranscriptDF.apply(row_cosine_similarity, axis=1)
parTranscriptDF['cosine_similarity_to_gt'] = similarity_per_row

In [None]:
# Display the participant table again, now including the cosine_similarity_to_gt column.
parTranscriptDF

In [81]:
# Write the scored table back to Drive.
# 'embedding_zscored' holds numpy arrays, which to_csv would write as their printed repr
# (whitespace-separated, rounded to print precision, and elided with '...' for long
# vectors). Converting to plain lists on a copy keeps full precision and gives the column
# the same bracketed, comma-separated text form as 'embedding'. The in-memory frame is
# left untouched.
parTranscriptDF.assign(
    embedding_zscored=parTranscriptDF['embedding_zscored'].apply(
        lambda vec: np.asarray(vec).tolist()
    )
).to_csv('/content/drive/MyDrive/Projects/zero/data_public/participantDescriptionsByBlockEmbeddings_CosSim.csv')