In [49]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [50]:
# Imports
import torch
import pandas as pd
import numpy as np
from transformers import CLIPTokenizer, CLIPTextModel
import re
import math

In [51]:
# Initialize tokenizer and model first so they can be used in functions
print("Loading CLIP model...")
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
model.to(device)
model.eval()


Loading CLIP model...


CLIPTextModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e

In [52]:
# This function removes "Um" and "Uh"

def remove_fillers(text):
    # Pattern matches standalone "um" or "uh" or "okay" (case insensitive), with optional trailing punctuation
    pattern = r'\b(um+|uh+|okay)\b[\.,;:!?"]*'

    # Replace with a single space, then strip extra whitespace
    cleaned = re.sub(pattern, '', text, flags=re.IGNORECASE)

    # Normalize spacing (e.g., multiple spaces → single space)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    return cleaned


In [53]:
# This function's aim is to break up long descriptions into evenly sized chunks that slightly overlap
def find_best_chunking(total_tokens, max_tokens=60, min_tokens=40, overlap_ratio=0.3, max_chunks=50):
    best_n = None
    best_chunk_size = None
    best_stride = None
    best_diff = float('inf')

    max_search = min(total_tokens, max_chunks)
    for n in range(1, max_search + 1):
        # Calculate chunk size for this number of chunks n
        chunk_size = total_tokens / (1 + (n - 1) * (1 - overlap_ratio))

        # Skip if chunk size is out of allowed range
        if chunk_size > max_tokens or chunk_size < min_tokens:
            continue

        # Compute stride from chunk size and overlap ratio
        stride = chunk_size * (1 - overlap_ratio)

        # Calculate total coverage (should be approx total_tokens)
        total_covered = chunk_size + stride * (n - 1)
        diff = abs(total_tokens - total_covered)  # Likely very close to 0

        # Track the chunk count with minimal coverage difference
        if diff < best_diff:
            best_diff = diff
            best_n = n
            best_chunk_size = chunk_size
            best_stride = stride

    if best_n is None:
        raise ValueError("No valid chunking found within constraints")

    return best_n, int(round(best_chunk_size)), int(round(best_stride))

In [54]:
# This function uses the optimal n chunks solved for above, to chunk the description
def chunk_evenly_balanced(text, tokenizer, max_tokens=60, min_tokens=40, overlap_ratio=0.3):
    tokens = tokenizer.encode(text)
    total_tokens = len(tokens)

    if total_tokens <= max_tokens:
        return [tokenizer.decode(tokens, skip_special_tokens=True)]

    n_chunks, chunk_size, stride = find_best_chunking(total_tokens, max_tokens, min_tokens, overlap_ratio)

    chunks = []
    start = 0
    for _ in range(n_chunks):
        end = min(int(start + chunk_size), total_tokens)
        chunk_tokens = tokens[int(start):end]
        chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
        chunks.append(chunk_text)
        if end == total_tokens:
            break
        start += stride

    return chunks


In [55]:
# Get embedding for each chunk of the description
def get_embeddings_for_chunks(chunks, tokenizer, model, device, max_chunks=8):
    embeddings = []
    for chunk in chunks[:max_chunks]:  # Process only up to max_chunks
        cleaned_chunk = remove_fillers(chunk)  # remove fillers

        with torch.no_grad():
            inputs = tokenizer(cleaned_chunk, padding=True, truncation=True, return_tensors="pt")
            inputs = {k: v.to(device) for k, v in inputs.items()}
            outputs = model(**inputs)
            # Mean pooling over token embeddings (dim=1)
            embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy()[0]
            embeddings.append(embedding)
    return embeddings

In [56]:
# Average embeddings for each chunk
def average_embeddings(embeddings):
    if not embeddings:
        return np.zeros(512)  # CLIP embedding size
    return np.mean(embeddings, axis=0)

In [57]:
# 1a. Load ground truth object descriptions
print("Loading CSV...")
df = pd.read_csv('/content/drive/MyDrive/Projects/zero/data_public/groundTruthObjectDescriptions.csv')
df.head()

Loading CSV...


Unnamed: 0,object_name,description,Unnamed: 2
0,scissors,Scissors are handheld cutting tools consisting...,
1,stethoscope,A medical instrument used by healthcare profes...,
2,french_press,A manual coffee brewing device invented in the...,
3,shoe_horn,A tool designed to aid in putting on shoes wit...,
4,fishing_reel,A mechanical device attached to a fishing rod ...,


In [58]:
# 1b. Get embeddings for the ground truth object descriptions file
results = []
for idx, row in df.iterrows():
    object_name = row['object_name']
    description = row['description']

    if pd.isna(description) or not description:
        continue

    print(f"Processing {object_name}...")

    # Truncate to max n tokens if necessary
    max_tokens = 400
    tokens = tokenizer.encode(description)
    if len(tokens) > max_tokens:
        tokens = tokens[:max_tokens]
        description = tokenizer.decode(tokens, skip_special_tokens=True)

    # Chunk the description using your balanced chunking function
    chunks = chunk_evenly_balanced(description, tokenizer, max_tokens=75, min_tokens=40, overlap_ratio=0.3)

    # VALIDATE CHUNK LENGTHS right after chunking
    for i, chunk in enumerate(chunks):
        token_count = len(tokenizer.encode(chunk))
        if token_count > 77:
            print(f"Warning: Chunk {i} for {object_name} exceeds token limit ({token_count} tokens)")
            print(f"Problematic chunk: {chunk[:100]}...")  # Show first 100 chars

    # Get embeddings for chunks
    chunk_embeddings = get_embeddings_for_chunks(chunks, tokenizer, model, device, max_chunks=10)

    # Average embeddings (assuming you have this function defined)
    avg_embedding = average_embeddings(chunk_embeddings)

    results.append({
        'object_name': object_name,
        'description': description,
        'embedding': avg_embedding.tolist()
    })



Token indices sequence length is longer than the specified maximum sequence length for this model (178 > 77). Running this sequence through the model will result in indexing errors


Processing scissors...
Processing stethoscope...
Processing french_press...
Processing shoe_horn...
Processing fishing_reel...
Processing crank_flashlight...
Processing rolodex...
Processing floppy_disk...
Processing bulb_planter...
Processing three_hole_punch...
Processing pocket_radio...
Processing hand_mixer...
Processing blood_pressure_cuff...


In [None]:
# (Optional) Print the first row of the results

results[0]

In [60]:
# 1c. Save results
output_df = pd.DataFrame(results)
output_df.to_csv('/content/drive/MyDrive/Projects/zero/data_public/groundTruthObjectEmbeddings.csv', index=False)
print(f"Saved {len(results)} embeddings to ground_truth_embeddings_chunked.csv")
output_df.head()

Saved 13 embeddings to ground_truth_embeddings_chunked.csv


Unnamed: 0,object_name,description,embedding
0,scissors,Scissors are handheld cutting tools consisting...,"[0.13763639330863953, -0.7288196086883545, -0...."
1,stethoscope,A medical instrument used by healthcare profes...,"[0.5690135955810547, 0.21134905517101288, -0.2..."
2,french_press,A manual coffee brewing device invented in the...,"[-0.2351902723312378, -0.1026310920715332, -0...."
3,shoe_horn,A tool designed to aid in putting on shoes wit...,"[0.8594603538513184, -0.11204422265291214, 0.2..."
4,fishing_reel,A mechanical device attached to a fishing rod ...,"[0.677139401435852, -0.45117059350013733, 0.43..."


In [61]:
# 2a. Load participant descriptions
print("Loading CSV...")
df = pd.read_csv('/content/drive/MyDrive/Projects/zero/data_public/participantDescriptionsByBlock.csv')
df.head()

Loading CSV...


Unnamed: 0,SubjectID,ObjectID,TalkBlock,Transcription
0,tulip003,handmixer,1,A hand mixer is usually made out of metal and ...
1,tulip003,handmixer,2,There are like other forms of hand mixer so th...
2,tulip003,fishingreel,1,This thing is usually made out of plastic and ...
3,tulip003,fishingreel,2,"This is a tool where a string is attached, so..."
4,tulip003,shoehorn,1,This object is usually made out of metal and i...


In [62]:
# 2b. get embeddings for the participant descriptions
results = []
for idx, row in df.iterrows():
    object_name = row['ObjectID']
    description = row['Transcription']

    if pd.isna(description) or not description:
        continue

    # Truncate to max n tokens if necessary
    max_tokens = 400
    tokens = tokenizer.encode(description)
    if len(tokens) > max_tokens:
        tokens = tokens[:max_tokens]
        description = tokenizer.decode(tokens, skip_special_tokens=True)

    # Tokenize whole description once
    total_tokens = len(tokens[:max_tokens])

    # Chunk the description using your balanced chunking function
    chunks = chunk_evenly_balanced(description, tokenizer, max_tokens=75, min_tokens=40, overlap_ratio=0.3)

    # VALIDATE CHUNK LENGTHS right after chunking
    for i, chunk in enumerate(chunks):
        token_count = len(tokenizer.encode(chunk))
        if token_count > 77:
            print(f"Warning: Chunk {i} for {object_name} exceeds token limit ({token_count} tokens)")
            print(f"Problematic chunk: {chunk[:100]}...")  # Show first 100 chars

    # Get embeddings for chunks
    chunk_embeddings = get_embeddings_for_chunks(chunks, tokenizer, model, device, max_chunks=10)

    # Average embeddings (assuming you have this function defined)
    avg_embedding = average_embeddings(chunk_embeddings)

    results.append({
        'subject_name': row['SubjectID'],
        'block_number': row['TalkBlock'],
        'object_name': object_name,
        'description': description,
        'total_tokens': total_tokens,          # <-- Save total token count here
        'num_chunks': len(chunks),              # <-- Save number of chunks here
        'embedding': avg_embedding.tolist()
    })


In [None]:
results[0]

In [64]:
# 2c. Save results
output_df = pd.DataFrame(results)
output_df.to_csv('/content/drive/MyDrive/Projects/zero/data_public/participantDescriptionsByBlockEmbeddings.csv', index=False)
output_df.head()

Unnamed: 0,subject_name,block_number,object_name,description,total_tokens,num_chunks,embedding
0,tulip003,1,handmixer,A hand mixer is usually made out of metal and ...,65,1,"[-0.03598262742161751, -0.6424020528793335, 0...."
1,tulip003,2,handmixer,There are like other forms of hand mixer so th...,71,1,"[0.4782978594303131, -0.712038516998291, 0.714..."
2,tulip003,1,fishingreel,This thing is usually made out of plastic and ...,51,1,"[0.4370953440666199, -0.20047757029533386, 0.3..."
3,tulip003,2,fishingreel,"This is a tool where a string is attached, so...",82,2,"[0.8284783363342285, -0.06916505098342896, 0.4..."
4,tulip003,1,shoehorn,This object is usually made out of metal and i...,37,1,"[0.5472880601882935, 0.23555366694927216, -0.0..."
