# Decoding

## Base Setup

In [1]:
import os
import sys
from pathlib import Path

import pandas as pd
import seaborn as sns
import torch
import transformers

# The parent directory must be on sys.path before the local `humor` package is imported.
sys.path.append("..")
from humor.bipartite_metric import bipartite_metric

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
from huggingface_hub import login

# SECURITY: never hardcode access tokens in a notebook. The token that used to
# be committed here must be considered leaked and should be revoked.
# The token is read from the HF_TOKEN environment variable; when it is unset,
# `token=None` makes `login` fall back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/ada/.cache/huggingface/token
Login successful


In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hugging Face Hub id of the checkpoint used by the decoding cells in this notebook.
MODEL_ID = "google/gemma-2b-it"

# Load the tokenizer that matches MODEL_ID.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Load the causal-LM weights. device_map="auto" leaves device placement to the
# library — NOTE(review): presumably this means the model is not guaranteed to
# sit on "cuda"; confirm that inputs are moved to the model's actual device.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto")

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.10s/it]


In [4]:
# Directory holding the project data. The default is the original absolute
# path, so behaviour is unchanged on the original machine; set the
# HUMOR_DATA_DIR environment variable to run the notebook from another checkout.
DATA_DIR = Path(os.environ.get("HUMOR_DATA_DIR", "/home/ada/humor/data"))

# One row per stand-up special; the cells below read the 'comedian' and
# 'transcript' columns.
transcripts = pd.read_csv(DATA_DIR / "stand_up_dataset" / "standup_transcripts.csv")

In [14]:
# Transcript text of the first row whose comedian is Anthony_Jeselnik.
jeselnik_transcript = (
    transcripts
    .loc[transcripts['comedian'].eq('Anthony_Jeselnik'), 'transcript']
    .values[0]
)

In [5]:
# Instruction appended after every transcript when building a prompt.
# Adjacent string literals are concatenated into a single string.
instruction = (
    "Extract the key humorous lines and punchlines for this stand-up comedy transcript. "
    "Focus on the quotes highlighting the main comedic moments. "
    "List of quotes:"
)

## Beam Search

In [30]:
# Practice prompt: the Jeselnik transcript wrapped in triple single quotes,
# a blank line, then the extraction instruction.
prac_prompt = "'''{}'''\n\n{}".format(jeselnik_transcript, instruction)
i = 0

In [33]:
# Beam search: for every transcript, generate the 3 best beams (num_beams=3)
# for the extraction prompt and print each completion.
for index, row in transcripts.iterrows():
    comedian = row['comedian']
    transcript = row['transcript']

    prompt = f"'''{transcript}'''\n\n{instruction}"
    # The model was loaded with device_map="auto", so send the inputs to the
    # device the model actually lives on instead of assuming "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Number of prompt tokens; generate() returns prompt + continuation.
    prompt_len = inputs["input_ids"].shape[1]
    beam_outputs = model.generate(
        **inputs,
        max_new_tokens=120,
        num_beams=3,
        num_return_sequences=3,
        early_stopping=True,
    )

    print(comedian)
    for beam_idx, beam_output in enumerate(beam_outputs):
        # Decode only the newly generated tokens. Removing the prompt with
        # str.replace on the decoded text is fragile (it silently does nothing
        # if decoding does not reproduce the prompt byte-for-byte), and the
        # original applied .strip() to the "i: text" string rather than to the
        # completion itself.
        completion = tokenizer.decode(
            beam_output[prompt_len:], skip_special_tokens=True
        ).strip()
        print(f"{beam_idx}: {completion}")
        print(100 * '-')


Anthony_Jeselnik
0: 

1. "When I was a kid, I used to fantasize about getting older, growing up and having money, and buying my mom nice things for her birthday."


2. "My mom actually should’ve been on one of the planes that crashed on 9/11."


3. "When I was a kid, like nine years old, I’d come home after school. Once in a while, I’d bring a friend over to play with me at my house. Once in a while, I’d bring a black friend over. And when I
----------------------------------------------------------------------------------------------------
1: 

1. "When I was a kid, I used to fantasize about getting older, growing up and having money, and buying my mom nice things for her birthday."


2. "My mom actually should’ve been on one of the planes that crashed on 9/11."


3. "When I was a kid, like nine years old, I’d come home after school. Once in a while, I’d bring a friend over to play with me at my house. Once in a while, I’d bring a black friend over."


4.
-----------------------------

## Top K

In [6]:
# Top-k sampling sweep: for every transcript, sample one completion for each
# k in 1..10 (k=1 always picks the single most likely token).
# Fixed seed so the sampled outputs are reproducible on Restart & Run All.
torch.manual_seed(0)

for index, row in transcripts.iterrows():
    comedian = row['comedian']
    transcript = row['transcript']

    prompt = f"'''{transcript}'''\n\n{instruction}"
    # The model was loaded with device_map="auto", so send the inputs to the
    # device the model actually lives on instead of assuming "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Number of prompt tokens; generate() returns prompt + continuation.
    prompt_len = inputs["input_ids"].shape[1]

    print(f"Comedian: {comedian}")

    for k in range(1, 11):
        print(f"\nTop-k Sampling with k={k}:\n" + 100 * '-')

        sample_output = model.generate(
            **inputs,
            max_new_tokens=120,
            do_sample=True,
            top_k=k,
            num_return_sequences=1,
        )

        for sample_idx, output in enumerate(sample_output):
            print(f"Sample {sample_idx}:")
            # Decode only the generated tokens rather than string-replacing the
            # prompt out of the decoded text, which fails silently when the
            # decoded prompt differs from the original string.
            print(tokenizer.decode(output[prompt_len:], skip_special_tokens=True).strip())
            print(100 * '-')


Comedian: Anthony_Jeselnik

Top-k Sampling with k=1:
----------------------------------------------------------------------------------------------------
Sample 0:
- "When I was a kid, I used to fantasize about getting older, growing up and having money, and buying my mom nice things for her birthday."


- "My mom actually should’ve been on one of the planes that crashed on 9/11."


- "When I was a kid, like nine years old, I’d come home after school. Once in a while, I’d bring a friend over to play with me at my house. Once in a while, I’d bring a black friend over. And when I did that,
----------------------------------------------------------------------------------------------------

Top-k Sampling with k=2:
----------------------------------------------------------------------------------------------------
Sample 0:
1. "When I was a kid, I used to fantasize about getting older, growing up and having money, and buying my mom nice things for her birthday."


2. "My mom actually shou

## Top P

p = 0, then p = 0.90 to 1.00 in steps of 0.01

In [7]:
# Top-p (nucleus) sampling sweep: for every transcript, sample one completion
# for p = 0 and for p = 0.90, 0.91, ..., 1.00. top_k=0 disables top-k filtering
# so only the nucleus threshold restricts the candidate tokens.
# Fixed seed so the sampled outputs are reproducible on Restart & Run All.
torch.manual_seed(0)

for index, row in transcripts.iterrows():
    comedian = row['comedian']
    transcript = row['transcript']

    prompt = f"'''{transcript}'''\n\n{instruction}"
    # The model was loaded with device_map="auto", so send the inputs to the
    # device the model actually lives on instead of assuming "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Number of prompt tokens; generate() returns prompt + continuation.
    prompt_len = inputs["input_ids"].shape[1]

    print(f"Comedian: {comedian}")

    for p in [0] + [i / 100 for i in range(90, 101)]:
        print(f"\nTop-p Sampling with p={p:.2f}:\n" + 100 * '-')

        sample_output = model.generate(
            **inputs,
            max_new_tokens=120,
            do_sample=True,
            top_p=p,
            top_k=0,
            num_return_sequences=1,
        )

        for sample_idx, output in enumerate(sample_output):
            print(f"Sample {sample_idx}:")
            # Decode only the generated tokens rather than string-replacing the
            # prompt out of the decoded text, which fails silently when the
            # decoded prompt differs from the original string.
            print(tokenizer.decode(output[prompt_len:], skip_special_tokens=True).strip())
            print(100 * '-')


Comedian: Anthony_Jeselnik

Top-p Sampling with p=0.00:
----------------------------------------------------------------------------------------------------
Sample 0:
- "When I was a kid, I used to fantasize about getting older, growing up and having money, and buying my mom nice things for her birthday."


- "My mom actually should’ve been on one of the planes that crashed on 9/11."


- "When I was a kid, like nine years old, I’d come home after school. Once in a while, I’d bring a friend over to play with me at my house. Once in a while, I’d bring a black friend over. And when I did that,
----------------------------------------------------------------------------------------------------

Top-p Sampling with p=0.90:
----------------------------------------------------------------------------------------------------
Sample 0:
1. "When I was a kid, I used to fantasize about getting older, growing up and having money, and buying my mom nice things for her birthday."
2. "My mom actually 

## Top P: p = i/10 for i = 1 to 10

In [7]:
# Coarse top-p (nucleus) sampling sweep: for every transcript, sample one
# completion for p = 0.1, 0.2, ..., 1.0. top_k=0 disables top-k filtering so
# only the nucleus threshold restricts the candidate tokens.
# Fixed seed so the sampled outputs are reproducible on Restart & Run All.
torch.manual_seed(0)

for index, row in transcripts.iterrows():
    comedian = row['comedian']
    transcript = row['transcript']

    prompt = f"'''{transcript}'''\n\n{instruction}"
    # The model was loaded with device_map="auto", so send the inputs to the
    # device the model actually lives on instead of assuming "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Number of prompt tokens; generate() returns prompt + continuation.
    prompt_len = inputs["input_ids"].shape[1]

    print(f"Comedian: {comedian}")

    for p in [i / 10 for i in range(1, 11)]:
        print("\nTop-p Sampling with p=", p)

        sample_output = model.generate(
            **inputs,
            max_new_tokens=120,
            do_sample=True,
            top_p=p,
            top_k=0,
            num_return_sequences=1,
        )

        for sample_idx, output in enumerate(sample_output):
            print(f"Sample {sample_idx}:")
            # Decode only the generated tokens rather than string-replacing the
            # prompt out of the decoded text, which fails silently when the
            # decoded prompt differs from the original string.
            print(tokenizer.decode(output[prompt_len:], skip_special_tokens=True).strip())

Comedian: Anthony_Jeselnik

Top-p Sampling with p= 0.1
Sample 0:
- "When I was a kid, I used to fantasize about getting older, growing up and having money, and buying my mom nice things for her birthday."


- "My mom actually should’ve been on one of the planes that crashed on 9/11."


- "When I was a kid, like nine years old, I’d come home after school. Once in a while, I’d bring a friend over to play with me at my house. Once in a while, I’d bring a black friend over. And when I did that,

Top-p Sampling with p= 0.2
Sample 0:
- "When I was a kid, I used to fantasize about getting older, growing up and having money, and buying my mom nice things for her birthday."


- "My mom actually should’ve been on one of the planes that crashed on 9/11."


- "When I was a kid, like nine years old, I’d come home after school. Once in a while, I’d bring a friend over to play with me at my house. Once in a while, I’d bring a black friend over. And when I did that,

Top-p Sampling with p= 0.3
Sample 