# LLM humor detection with a subspace-based metric

In [1]:
import os
from itertools import product
from pathlib import Path

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and the causal LM from the same checkpoint id.
MODEL_ID = "google/gemma-2-9b-it"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# bfloat16 weights placed on the first GPU; hidden states are requested here
# so that every forward pass returns them without a per-call flag.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="cuda:0",
    torch_dtype=torch.bfloat16,
    output_hidden_states=True,
)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.41it/s]


In [2]:
# Directory holding the stand-up dataset CSVs. The default is the original
# location; set the HUMOR_DATA_DIR environment variable to run the notebook
# on another machine without editing this cell.
DATA_DIR = Path(os.environ.get("HUMOR_DATA_DIR", "/home/ada/humor/data/stand_up_dataset"))

# `ground_truth` is read below through its "comedian" and "sentence" columns;
# `transcript` through its "comedian" and "transcript" columns.
ground_truth = pd.read_csv(DATA_DIR / "standup_data.csv")
transcript = pd.read_csv(DATA_DIR / "standup_transcripts.csv")

In [72]:
# Task wording shared by the plain prompt and every persona-conditioned prompt.
EXTRACT_TASK = (
    "Extract the key humorous lines and punchlines for this stand-up comedy transcript. "
    "Focus on the quotes highlighting the main comedic moments. List of quotes:"
)

# Humor preferences used for the "You are a person who enjoys ..." prompts, in prompt order.
HUMOR_STYLES = ("aggressive", "self-enhancing", "self-deprecating", "dark", "affiliative")

# Reader roles used for the "Pretend that you are a stand-up ..." prompts, in prompt order.
READER_ROLES = ("comedian", "comedy fan", "comedy critic")

# Prompt variants prepended to each transcript. Order matters only in that it
# fixes the iteration order of the instruction x transcript product below.
INSTRUCTIONS = [
    EXTRACT_TASK,
    # Two spaces before "List of quotes:" are part of the original prompt text.
    "The following is a stand-up comedy transcript. When performed in front of a live audience, which jokes do you think made the audience laugh?  List of quotes:",
    *(f"You are a person who enjoys {style} humor. {EXTRACT_TASK}" for style in HUMOR_STYLES),
    "The following is a stand-up comedy transcript. What are the funniest punchlines from the transcript. List of quotes:",
    "Below is a transcript from a stand-up comedy routine. Analyze the transcript and extract the quotes that are most likely to have made the audience laugh. List of quotes:",
    # NOTE(review): "preformed" looks like a typo for "performed"; kept as-is because
    # changing it alters the prompt (and would nearly duplicate the second entry) - confirm intent.
    "The following is a stand-up comedy transcript. When preformed in front of a live audience, which jokes do you think made the audience laugh? List of quotes:",
    *(
        f"Pretend that you are a stand-up {role} reading the following stand-up comedy transcript. "
        "Focus on the quotes highlighting the main comedic moments. List of quotes:"
        for role in READER_ROLES
    ),
]

# Restrict the experiment to the first 20 transcript rows and keep only the
# two fields the later cells read, as plain dicts.
first_transcripts = transcript.head(20)
TRANSCRIPTS = [
    {"comedian": name, "transcript": body}
    for name, body in zip(first_transcripts["comedian"], first_transcripts["transcript"])
]

# Comedians that actually appear in the selected transcripts; ground-truth
# rows for anyone else are ignored.
transcript_comedians = {entry["comedian"] for entry in TRANSCRIPTS}

# Collect every ground-truth sentence under its comedian. A dict keyed by
# comedian replaces the previous linear scan of the result list for each row;
# insertion order keeps comedians in order of first appearance, as before.
sentences_by_comedian = {}
for name, sentence in zip(ground_truth["comedian"], ground_truth["sentence"]):
    if name not in transcript_comedians:
        continue
    sentences_by_comedian.setdefault(name, []).append(sentence)

# Same structure as before: one {"comedian", "sentence": [...]} dict per comedian.
GROUND_TRUTHS = [
    {"comedian": name, "sentence": sentences}
    for name, sentences in sentences_by_comedian.items()
]

In [64]:
from itertools import product

# Ground-truth representations: for every (instruction, transcript) pair, run
# one forward pass over "instruction + transcript + ground-truth quotes" and
# keep the last-layer hidden states of the final 16 tokens, flattened.

# Ground-truth sentences per comedian, built once instead of rescanning
# GROUND_TRUTHS on every iteration.
gt_sentences_by_comedian = {truth["comedian"]: truth["sentence"] for truth in GROUND_TRUTHS}

all_representations = []
for inst, text in product(INSTRUCTIONS, TRANSCRIPTS):
    comedian_name = text["comedian"]
    # A KeyError here names the comedian that has no ground-truth sentences.
    gt_sentences = gt_sentences_by_comedian[comedian_name]
    sentence = f"{inst}\n{text['transcript']}\n" + "\n - ".join(gt_sentences)
    inputs = tokenizer(sentence, return_tensors="pt").to(model.device)
    with torch.inference_mode():
        outputs = model(**inputs)

    # Index the last layer directly. `torch.cat(outputs.hidden_states)` copied
    # every layer's states into one tensor just to read its last row, which is
    # the allocation that ran out of memory. With a single sequence per call
    # (batch size 1) this slice selects the same values.
    # `.clone()` matters: the flattened slice is otherwise a view that keeps
    # the full hidden-state tensor alive for as long as it sits in the list.
    rep = outputs.hidden_states[-1][0, -16:].flatten().clone()
    all_representations.append({"comedian": comedian_name, "rep": rep})

    # Release this pass's activations before the next forward pass allocates.
    del outputs, inputs

OutOfMemoryError: CUDA out of memory. Tried to allocate 636.00 MiB. GPU 0 has a total capacity of 23.58 GiB of which 125.12 MiB is free. Including non-PyTorch memory, this process has 23.44 GiB memory in use. Of the allocated memory 21.69 GiB is allocated by PyTorch, and 1.51 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [15]:
# NOTE(review): disabled draft of the ground-truth representation loop; nothing
# in this cell executes. Unlike the live loop it interpolates the whole `text`
# dict and joins GROUND_TRUTHS (a list of dicts) directly. Candidate for deletion.
# from itertools import product

# all_representations = []
# for inst, text in product(INSTRUCTIONS, TRANSCRIPTS):
#    sentence = f"{inst}\n{text}\n" + "\n -".join(GROUND_TRUTHS)
#    inputs = tokenizer(sentence, return_tensors="pt").to(model.device)
#    with torch.inference_mode():
#        outputs = model(**inputs)

#   representation = torch.cat(outputs.hidden_states)[-1, -16:].flatten()
#    all_representations.append(representation)

In [23]:
# Build one low-rank PCA subspace per comedian from the ground-truth
# representations. `all_representations` must still hold the ground-truth
# pass here: the generation cell further down reuses the same variable name.

# Each entry is a {"comedian", "rep"} dict, so group the vectors by comedian
# first. Unpacking an entry as `comedian, rep` would only yield the dict's
# keys, not its values.
gt_reps_by_comedian = {}
for entry in all_representations:
    gt_reps_by_comedian.setdefault(entry["comedian"], []).append(entry["rep"])

gt_reference_list = []
for comedian, reps in gt_reps_by_comedian.items():
    # One row per collected representation; float32 because the model emits bfloat16.
    gt_references = torch.stack(reps).float()
    # pca_lowrank requires q <= min(rows, cols); clamp so a comedian with
    # fewer than 10 representations does not raise.
    q = min(10, *gt_references.shape)
    # The last return value is V: one column per principal direction.
    *_, gt_reference_subspace = torch.pca_lowrank(gt_references, q=q)
    gt_reference_list.append({"comedian": comedian, "subspace": gt_reference_subspace})
    
# gt_reference_subspace.shape

torch.Size([57344, 3])

In [38]:
# NOTE(review): disabled draft of a batched variant of the generation loop;
# nothing in this cell executes. It assigns `representations` but appends
# `representation`, so it would not run as written. Candidate for deletion.
#sentences = [f"{inst}\n{text}\n" for inst, text in product(INSTRUCTIONS, TRANSCRIPTS)]

#inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True).to(model.device)
#with torch.inference_mode():
    #outputs = model(input_ids=model.generate(**inputs, max_new_tokens=128))


# representations = torch.cat(outputs.hidden_states)[-1, -16:].flatten()
# all_representations.append(representation)

In [63]:
# Model-answer representations: for every (instruction, transcript) pair, let
# the model generate up to 128 new tokens, then re-run a forward pass over
# prompt + generation and keep the last-layer hidden states of the final 16
# tokens, flattened.
# NOTE: this overwrites `all_representations` from the ground-truth pass, so
# the ground-truth subspaces must be computed before this cell runs.
all_representations = []
for inst, text in product(INSTRUCTIONS, TRANSCRIPTS):
    prompt = f"{inst}\n{text['transcript']}\n"
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)
    with torch.inference_mode():
        generated_ids = model.generate(**inputs, max_new_tokens=128)
        outputs = model(input_ids=generated_ids)

    # Index the last layer directly instead of concatenating every layer's
    # hidden states with torch.cat; with a single sequence per call (batch
    # size 1) the slice is identical. `.clone()` detaches the small slice from
    # the full hidden-state tensor, which the stored view would otherwise keep
    # alive for every iteration.
    rep = outputs.hidden_states[-1][0, -16:].flatten().clone()
    all_representations.append({"comedian": text["comedian"], "rep": rep})

In [None]:
# Build one low-rank PCA subspace per comedian from the model-answer
# representations currently held in `all_representations`.

# Each entry is a {"comedian", "rep"} dict, so group the vectors by comedian
# first. Unpacking an entry as `comedian, rep` would only yield the dict's
# keys, not its values.
model_reps_by_comedian = {}
for entry in all_representations:
    model_reps_by_comedian.setdefault(entry["comedian"], []).append(entry["rep"])

model_reference_answers = []
for comedian, reps in model_reps_by_comedian.items():
    # One row per collected representation; float32 because the model emits bfloat16.
    model_references = torch.stack(reps).float()
    # pca_lowrank requires q <= min(rows, cols); clamp so a comedian with
    # fewer than 10 representations does not raise.
    q = min(10, *model_references.shape)
    # The last return value is V: one column per principal direction.
    *_, model_reference_subspace = torch.pca_lowrank(model_references, q=q)
    model_reference_answers.append({"comedian": comedian, "subspace": model_reference_subspace})

In [69]:
# Subspace-overlap score between the model-answer subspace and the
# ground-truth subspace of the same comedian.

# First ground-truth subspace per comedian (same first-match rule as a linear
# search). A missing comedian raises KeyError with the comedian's name.
gt_subspace_by_comedian = {}
for entry in gt_reference_list:
    gt_subspace_by_comedian.setdefault(entry["comedian"], entry["subspace"])

metric = []
for model_entry in model_reference_answers:
    comedian = model_entry["comedian"]
    subspace = model_entry["subspace"]
    gt_subspace = gt_subspace_by_comedian[comedian].to(subspace.device)

    # Both bases are (feature_dim, q) matrices as returned by pca_lowrank, so
    # the cross-product needs a transpose: (q_model, feature_dim) @ (feature_dim, q_gt).
    A = subspace.T @ gt_subspace
    # NOTE(review): assumed intended metric - squared Frobenius norm of the
    # cross-product, normalised by the smaller subspace dimension. With
    # orthonormal bases this is the mean squared cosine of the principal
    # angles, in [0, 1]. The previous `A @ A / A.shape[0]` was a matrix
    # product and produced a matrix rather than a score. Confirm the definition.
    score = (A.square().sum() / min(A.shape)).item()
    metric.append({"comedian": comedian, "score": score})

6
8


In [35]:
# Debug check: shape of the last entry along dim 0 of the final hidden-state
# tensor. `outputs` is whatever the most recently executed forward-pass cell
# left behind, so the displayed shape depends on execution order.
outputs.hidden_states[-1][-1].shape

torch.Size([4, 1, 3584])