In [1]:
# Mount Google Drive (Colab only) so the CSV files under /content/drive/ can be read and written.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
# Imports
import torch
import pandas as pd
import numpy as np
import ast
from scipy.stats import zscore
from sklearn.metrics.pairwise import cosine_similarity


In [3]:
# Locations of the two input tables on the mounted Drive.
ground_truth_csv = '/content/drive/MyDrive/Projects/zero/data_public/groundTruthObjectEmbeddings.csv'
par_transcript_csv = '/content/drive/MyDrive/Projects/zero/data_public/participantDescriptionsByBlockEmbeddings.csv'

# Ground-truth object embeddings, and participant descriptions (one row per subject/block/object).
groundTruthDF = pd.read_csv(ground_truth_csv)
parTranscriptDF = pd.read_csv(par_transcript_csv)

In [4]:
# Strip underscores from the ground-truth object names; the participant table uses
# underscore-free names (e.g. 'handmixer'), and the two tables are later joined on this key.
cleaned_object_names = groundTruthDF['object_name'].str.replace('_', '')
groundTruthDF['object_name'] = cleaned_object_names
print(groundTruthDF['object_name'])

0              scissors
1           stethoscope
2           frenchpress
3              shoehorn
4           fishingreel
5       crankflashlight
6               rolodex
7            floppydisk
8           bulbplanter
9        threeholepunch
10          pocketradio
11            handmixer
12    bloodpressurecuff
Name: object_name, dtype: object


In [5]:
# Preview the participant-description table as loaded (pandas abbreviates the middle rows).
parTranscriptDF

Unnamed: 0,subject_name,block_number,object_name,description,total_tokens,num_chunks,embedding
0,tulip003,1,handmixer,A hand mixer is usually made out of metal and ...,65,1,"[-0.03598262742161751, -0.6424020528793335, 0...."
1,tulip003,2,handmixer,There are like other forms of hand mixer so th...,71,1,"[0.4782978594303131, -0.712038516998291, 0.714..."
2,tulip003,1,fishingreel,This thing is usually made out of plastic and ...,51,1,"[0.4370953440666199, -0.20047757029533386, 0.3..."
3,tulip003,2,fishingreel,"This is a tool where a string is attached, so...",82,2,"[0.8284783363342285, -0.06916505098342896, 0.4..."
4,tulip003,1,shoehorn,This object is usually made out of metal and i...,37,1,"[0.5472880601882935, 0.23555366694927216, -0.0..."
...,...,...,...,...,...,...,...
448,tulip024,2,stethoscope,it is this metal little circular thing and the...,82,2,"[0.37746819853782654, -0.22213833034038544, -0..."
449,tulip024,1,frenchpress,It's like a glass cylindrical pitcher and wit...,140,3,"[-0.1387619525194168, -0.43567609786987305, -0..."
450,tulip024,2,frenchpress,It's a glass cylinder that works as a pitcher...,106,2,"[0.3350757658481598, -0.48047691583633423, -0...."
451,tulip024,1,threeholepunch,it is like it's its length is about the size o...,142,3,"[0.8015009760856628, -0.9206170439720154, -0.3..."


In [6]:
# zscore the ground truth embeddings
# The CSV stores each embedding as a stringified list. Parse only values that are still
# strings so that re-running this cell does not fail: ast.literal_eval raises ValueError
# when handed an already-parsed list.
groundTruthDF['embedding'] = groundTruthDF['embedding'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)
# Stack the per-row vectors into one 2-D matrix (one row per object).
embeddings_array = np.vstack(groundTruthDF['embedding'].values)
# Standardize every embedding dimension across the ground-truth rows only (axis=0).
# A dimension with zero variance would come out as NaN.
zscored_embeddings = zscore(embeddings_array, axis=0)
# Store one standardized vector (1-D ndarray) per row.
groundTruthDF['embedding_zscored'] = list(zscored_embeddings)



In [7]:
# zscore the participant transcript embeddings
# The CSV stores each embedding as a stringified list. Parse only values that are still
# strings so that re-running this cell does not fail: ast.literal_eval raises ValueError
# when handed an already-parsed list.
parTranscriptDF['embedding'] = parTranscriptDF['embedding'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)
# Stack the per-row vectors into one 2-D matrix (one row per description).
embeddings_array = np.vstack(parTranscriptDF['embedding'].values)
# Standardize every embedding dimension across the participant rows only (axis=0).
# A dimension with zero variance would come out as NaN.
zscored_embeddings = zscore(embeddings_array, axis=0)
# Store one standardized vector (1-D ndarray) per row.
parTranscriptDF['embedding_zscored'] = list(zscored_embeddings)


In [8]:
# Map each object name to its z-scored ground-truth embedding for per-row lookup.
# If a name appeared more than once, the last row would win.
groundTruth_lookup = dict(
    zip(groundTruthDF['object_name'], groundTruthDF['embedding_zscored'])
)


In [9]:
def row_cosine_similarity(row):
    """Cosine similarity between one row's embedding and its object's ground truth.

    Reads ``row['object_name']`` and ``row['embedding_zscored']`` and looks the
    object up in the module-level ``groundTruth_lookup`` mapping.

    Returns the similarity as a scalar. When the ground truth is missing, the two
    vectors differ in shape, either vector is non-numeric, or either contains NaN,
    a diagnostic line is printed and ``np.nan`` is returned instead.
    """
    obj = row['object_name']
    emb = row['embedding_zscored']
    gt_emb = groundTruth_lookup.get(obj)

    # Work out which validation (if any) fails; the checks run in a fixed order
    # and only the first failure is reported.
    if gt_emb is None:
        problem = f"Missing ground truth for object: {obj}"
    else:
        emb, gt_emb = np.array(emb), np.array(gt_emb)
        if emb.shape != gt_emb.shape:
            problem = f"Shape mismatch for object: {obj} | par shape: {emb.shape} | gt shape: {gt_emb.shape}"
        elif not (emb.dtype.kind in 'fi' and gt_emb.dtype.kind in 'fi'):
            # Only float and signed-integer arrays are accepted.
            problem = f"Non-numeric types for object: {obj}"
        elif np.isnan(emb).any() or np.isnan(gt_emb).any():
            problem = f"NaNs found for object: {obj}"
        else:
            problem = None

    # Single exit for every failure: report it and return NaN.
    if problem is not None:
        print(problem)
        return np.nan

    # cosine_similarity expects 2-D inputs, so each vector becomes a 1 x d matrix
    # and the only entry of the 1 x 1 result is returned.
    return cosine_similarity(emb.reshape(1, -1), gt_emb.reshape(1, -1))[0, 0]



In [10]:
# Flag every ground-truth row whose z-scored embedding contains at least one NaN
nan_mask = groundTruthDF['embedding_zscored'].map(lambda vec: np.isnan(vec).any())
groundTruthDF['has_nan'] = nan_mask

# Report how many rows were flagged.
# NOTE: despite the "par" in its name, num_nan_par counts ground-truth rows here.
num_nan_par = groundTruthDF['has_nan'].sum()
print(f"Rows in groundTruthDF with NaNs: {num_nan_par} / {len(groundTruthDF)}")


Rows in groundTruthDF with NaNs: 0 / 13


In [11]:
# Score every participant description against the ground-truth embedding of the same
# object; rows that fail a check inside row_cosine_similarity receive NaN.
parTranscriptDF['cosine_similarity_to_gt'] = parTranscriptDF.apply(row_cosine_similarity, axis=1)

In [12]:
# Inspect the table with the added embedding_zscored and cosine_similarity_to_gt columns.
parTranscriptDF

Unnamed: 0,subject_name,block_number,object_name,description,total_tokens,num_chunks,embedding,embedding_zscored,cosine_similarity_to_gt
0,tulip003,1,handmixer,A hand mixer is usually made out of metal and ...,65,1,"[-0.03598262742161751, -0.6424020528793335, 0....","[-1.645684851692725, -1.4361910460559044, 1.98...",0.414138
1,tulip003,2,handmixer,There are like other forms of hand mixer so th...,71,1,"[0.4782978594303131, -0.712038516998291, 0.714...","[-0.2083565278061882, -1.6304728208737613, 2.0...",0.406287
2,tulip003,1,fishingreel,This thing is usually made out of plastic and ...,51,1,"[0.4370953440666199, -0.20047757029533386, 0.3...","[-0.3235106972947484, -0.2032468709292848, 1.2...",0.337080
3,tulip003,2,fishingreel,"This is a tool where a string is attached, so...",82,2,"[0.8284783363342285, -0.06916505098342896, 0.4...","[0.7703395938512678, 0.16310759049397716, 1.56...",0.391764
4,tulip003,1,shoehorn,This object is usually made out of metal and i...,37,1,"[0.5472880601882935, 0.23555366694927216, -0.0...","[-0.015540404785339983, 1.0132554848364526, 0....",0.321679
...,...,...,...,...,...,...,...,...,...
448,tulip024,2,stethoscope,it is this metal little circular thing and the...,82,2,"[0.37746819853782654, -0.22213833034038544, -0...","[-0.4901586400655473, -0.2636791596547357, -0....",0.377866
449,tulip024,1,frenchpress,It's like a glass cylindrical pitcher and wit...,140,3,"[-0.1387619525194168, -0.43567609786987305, -0...","[-1.9329359507521036, -0.8594373876342876, 0.1...",0.623709
450,tulip024,2,frenchpress,It's a glass cylinder that works as a pitcher...,106,2,"[0.3350757658481598, -0.48047691583633423, -0....","[-0.6086384303377665, -0.9844291220357801, 0.3...",0.417393
451,tulip024,1,threeholepunch,it is like it's its length is about the size o...,142,3,"[0.8015009760856628, -0.9206170439720154, -0.3...","[0.694942364167696, -2.212395049723942, -0.422...",0.377223


In [13]:
# Persist the scored table.
# Each 'embedding_zscored' cell is a NumPy array, and to_csv would write str(ndarray):
# space-separated, wrapped over several lines, and abbreviated with '...' for long
# vectors. That cannot be parsed back with ast.literal_eval. Converting to plain lists
# keeps the same stringified-list format as the 'embedding' column.
export_df = parTranscriptDF.assign(
    embedding_zscored=parTranscriptDF['embedding_zscored'].apply(
        lambda vec: np.asarray(vec).tolist()
    )
)
# index=False: the frame already carries the 'Unnamed: 0' column left by a previous
# save, so writing the index again would only add another redundant unnamed column.
export_df.to_csv(
    '/content/drive/MyDrive/Projects/zero/data_public/participantDescriptionsByBlockEmbeddings_CosSim.csv',
    index=False,
)