In [3]:
from transformers import T5ForConditionalGeneration, AutoTokenizer
import torch
import os

# Load the "google/ul2" checkpoint in bfloat16 (weights cached under the given cache_dir),
# move it to CUDA, then call parallelize(); the recorded output of this cell shows a
# device map covering three devices.
model = T5ForConditionalGeneration.from_pretrained(
    "google/ul2",
    cache_dir='/work/09127/tomyoung/ls6/LLM_cache/google-ul2/',
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
)
model = model.to("cuda")
model.parallelize()

# Matching tokenizer for the same checkpoint (no cache_dir is passed here).
tokenizer = AutoTokenizer.from_pretrained("google/ul2")

# Example prompt with five sentinel slots (<extra_id_0> .. <extra_id_4>).
# Built from adjacent string literals inside parentheses: a backslash continuation
# *inside* a string literal would keep the indentation of every continuation line
# as part of the prompt text.
input_string = (
    "[NLU] Mr. Dursley was the director of a firm called <extra_id_0>, "
    "which made <extra_id_1>. He was a big, solid man with a bald head. "
    "Mrs. Dursley was thin and <extra_id_2> of neck, which came in very useful as she spent "
    "so much of her time <extra_id_3>. The Dursleys had a small son "
    "called Dudley and <extra_id_4>"
)

# Count the elements of every parameter tensor and print the total model size.
n_params = sum(weights.numel() for weights in model.parameters())
print(n_params)


self.device_map
{0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 1: [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], 2: [22, 23, 24, 25, 26, 27, 28, 29, 30, 31]}
19459613696


In [60]:
import torch.nn as nn
# Cross-entropy that is summed (not averaged) over positions; positions whose label
# equals the tokenizer's pad id are ignored.
loss_fn_sum = nn.CrossEntropyLoss(
    reduction='sum',
    ignore_index=tokenizer.pad_token_id,
)

In [2]:
# Build one random addition problem "x + y = z" together with its masked prompt.
# x and y are drawn uniformly from the integers 0..10 (randint includes both ends),
# and z is their sum, so the equation is always valid.
import random
import numpy as np
import re

x = random.randint(0, 10)
y = random.randint(0, 10)
z = x + y

# Full equation as text, e.g. "3 + 4 = 7".
equation = f"{x} + {y} = {z}"
# Question form: the result is replaced by the <extra_id_0> sentinel and the prompt
# carries the same '[NLU] ' prefix as the other prompts in this notebook.
question = f"[NLU] {x} + {y} = <extra_id_0> ."

In [63]:
# Beam-search the single <extra_id_0> slot and print every returned candidate.
question = '[NLU] The most popular breed for a pet dog in Brazil is <extra_id_0>, while the most popular color is white.'
num_beams = 100

inputs = tokenizer(question, return_tensors="pt").input_ids.to("cuda")
outputs = model.generate(
    inputs,
    max_length=15,
    num_beams=num_beams,
    num_return_sequences=num_beams,  # return every beam, not just the best one
)
for sequence in outputs:
    print(tokenizer.decode(sequence))

<pad><extra_id_0> the Bichon Frise<extra_id_1> The most popular breed for a
<pad><extra_id_0> the Chihuahua<extra_id_1> The most popular breed
<pad><extra_id_0> the Labrador Retriever<extra_id_1> The most popular breed for 
<pad><extra_id_0> the Fila Brasileiro (Brazilian Fila
<pad><extra_id_0> the chihuahua<extra_id_1> The most popular
<pad><extra_id_0> the<extra_id_1> The most popular breed for a pet dog in Brazil
<pad><extra_id_0> the Fila Brasileiro<extra_id_1> The most popular breed for
<pad><extra_id_0> the Cocker Spaniel<extra_id_1> The most popular breed for a
<pad><extra_id_0> the Pumi<extra_id_1> The most popular breed for a pet
<pad><extra_id_0> the Chihuahua, followed by the
<pad><extra_id_0> the Chow Chow<extra_id_1> The most popular breed for a
<pad><extra_id_0> the Brazilian Fila Brasileiro<extra_id_1> The most popular breed
<pad><extra_id_0> the Cavalier King Charles Spaniel<extra_id_1> The most popular breed
<pad><extra_id_0> the Bichon frise<extra_id_1> The most popul

In [74]:
import math

# Score one fixed completion for the current `inputs` prompt.
target_string = '<extra_id_0> the Labrador Retriever<extra_id_1>'
labels_target = tokenizer(target_string, return_tensors="pt").input_ids.to("cuda")
# Drop the last token id (presumably the EOS the tokenizer appends -- TODO confirm).
labels_target = labels_target[:, :-1].contiguous()

# Inference only: no_grad keeps the forward pass from building an autograd graph.
with torch.no_grad():
    outputs = model(inputs, labels=labels_target)

# Summed cross-entropy over the scored positions is the negative log-probability of
# those target tokens; [1:] skips the leading <extra_id_0> sentinel.
log_p = -loss_fn_sum(outputs.logits[0][1:], labels_target[0][1:])
print(math.exp(log_p.item()))

0.0680508540250102


In [72]:
# Same prompt, but with the colour masked too (<extra_id_0> and <extra_id_1>);
# beam-search and print every returned candidate.
question = '[NLU] The most popular breed for a pet dog in Brazil is <extra_id_0>, while the most popular color is <extra_id_1>.'
num_beams = 100

inputs = tokenizer(question, return_tensors="pt").input_ids.to("cuda")
outputs = model.generate(
    inputs,
    max_length=10,
    num_beams=num_beams,
    num_return_sequences=num_beams,  # return every beam, not just the best one
)
for sequence in outputs:
    print(tokenizer.decode(sequence))

<pad><extra_id_0> the Chihuahua
<pad><extra_id_0> the Labrador Retriever<extra_id_1> white
<pad><extra_id_0> the Bichon Frise<extra_id_1> white<extra_id_2>
<pad><extra_id_0> the Labrador Retriever<extra_id_1> the
<pad><extra_id_0> the Labrador Retriever<extra_id_1> brown
<pad><extra_id_0> the Cavalier King Charles Spaniel
<pad><extra_id_0> the Labrador Retriever<extra_id_1> black
<pad><extra_id_0> the chihuahu
<pad><extra_id_0> Chihuahua<extra_id_1>
<pad><extra_id_0> the Brazilian Fila Brasileiro
<pad><extra_id_0> the Labrador Retriever<extra_id_1> 
<pad><extra_id_0> the Fila Brasileiro<extra_id_1>
<pad><extra_id_0> the Cocker Spaniel<extra_id_1> white<extra_id_2>
<pad><extra_id_0> the Miniature Schnauzer
<pad><extra_id_0> the Labrador Retriever<extra_id_1> White
<pad><extra_id_0> the Rottweiler<extra_id_1> white<extra_id_2>
<pad><extra_id_0> the Bichon frise<extra_id_1> white
<pad><extra_id_0> the American Staffordshire Terrier<extra_id_1> white
<pad><extra_id_0> the Labrador Retrieve

In [73]:
import math

# Score one fixed two-slot completion for the current `inputs` prompt.
target_string = '<extra_id_0> the Labrador Retriever<extra_id_1> white<extra_id_2>'
labels_target = tokenizer(target_string, return_tensors="pt").input_ids.to("cuda")
# Drop the last token id (presumably the EOS the tokenizer appends -- TODO confirm).
labels_target = labels_target[:, :-1].contiguous()

# Inference only: no_grad keeps the forward pass from building an autograd graph.
with torch.no_grad():
    outputs = model(inputs, labels=labels_target)

# Summed cross-entropy over the scored positions is the negative log-probability of
# those target tokens; [1:] skips the leading <extra_id_0> sentinel.
log_p = -loss_fn_sum(outputs.logits[0][1:], labels_target[0][1:])
print(math.exp(log_p.item()))

0.012588142242433998


In [1]:
# Fill the single <extra_id_0> slot (amount eaten) and print the returned sequence(s).
question = '[NLU] Tom bought 14 apples. He then ate <extra_id_0>. He now has 11 apples.'
num_beams = 1

inputs = tokenizer(question, return_tensors="pt").input_ids.to("cuda")
outputs = model.generate(
    inputs,
    max_length=3,
    num_beams=num_beams,
    num_return_sequences=num_beams,
)
for sequence in outputs:
    print(tokenizer.decode(sequence))

NameError: name 'tokenizer' is not defined

In [None]:
import math

# Score one fixed completion for the current `inputs` prompt.
target_string = '<extra_id_0> 7<extra_id_1>'
labels_target = tokenizer(target_string, return_tensors="pt").input_ids.to("cuda")
# Drop the last token id (presumably the EOS the tokenizer appends -- TODO confirm).
labels_target = labels_target[:, :-1].contiguous()

# Inference only: no_grad keeps the forward pass from building an autograd graph.
with torch.no_grad():
    outputs = model(inputs, labels=labels_target)

# Summed cross-entropy over the scored positions is the negative log-probability of
# those target tokens; [1:] skips the leading <extra_id_0> sentinel.
log_p = -loss_fn_sum(outputs.logits[0][1:], labels_target[0][1:])
print(math.exp(log_p.item()))

In [22]:
import math

# Score one fixed completion for the current `inputs` prompt.
target_string = '<extra_id_0> 7<extra_id_1>'
labels_target = tokenizer(target_string, return_tensors="pt").input_ids.to("cuda")
# Drop the last token id (presumably the EOS the tokenizer appends -- TODO confirm).
labels_target = labels_target[:, :-1].contiguous()

# Inference only: no_grad keeps the forward pass from building an autograd graph.
with torch.no_grad():
    outputs = model(inputs, labels=labels_target)

# Summed cross-entropy over the scored positions is the negative log-probability of
# those target tokens; [1:] skips the leading <extra_id_0> sentinel.
log_p = -loss_fn_sum(outputs.logits[0][1:], labels_target[0][1:])
print(math.exp(log_p.item()))

0.18354186323251906


In [14]:
# Both quantities masked (<extra_id_0> eaten, <extra_id_1> remaining);
# beam-search and print every returned candidate.
question = '[NLU] Tom bought 14 apples. He then ate <extra_id_0>. He now has <extra_id_1> apples.'
num_beams = 100

inputs = tokenizer(question, return_tensors="pt").input_ids.to("cuda")
outputs = model.generate(
    inputs,
    max_length=20,
    num_beams=num_beams,
    num_return_sequences=num_beams,  # return every beam, not just the best one
)
for sequence in outputs:
    print(tokenizer.decode(sequence))

<pad><extra_id_0> 6<extra_id_1> 8<extra_id_2> How many apples did Tom buy? Tom bought 14 apples. He then
<pad><extra_id_0> 6<extra_id_1> 10<extra_id_2> How many apples did Tom buy? Tom bought 14 apples. He then
<pad><extra_id_0> 5<extra_id_1> 9<extra_id_2> How many apples did Tom buy? Tom bought 14 apples. He then
<pad><extra_id_0> 3<extra_id_1> 11<extra_id_2> How many apples did Tom buy? Tom bought 14 apples. He then
<pad><extra_id_0> 4<extra_id_1> 10<extra_id_2> How many apples did Tom buy? Tom bought 14 apples. He then
<pad><extra_id_0> 6<extra_id_1> 8<extra_id_2> How many apples did he buy? Tom bought 14 apples. He
<pad><extra_id_0> 3<extra_id_1> 9<extra_id_2> How many apples did Tom buy? Tom bought 14 apples. He then
<pad><extra_id_0> 4<extra_id_1> 12<extra_id_2> How many apples did Tom buy? Tom bought 14 apples. He then
<pad><extra_id_0> 1<extra_id_1> 13<extra_id_2> How many apples did Tom buy? Tom bought 14 apples. He then
<pad><extra_id_0> 2<extra_id_1> 12<extra_id_2> How many 

In [16]:
import math

# Score one fixed two-slot completion for the current `inputs` prompt.
target_string = '<extra_id_0> 7<extra_id_1> 11<extra_id_2>'
labels_target = tokenizer(target_string, return_tensors="pt").input_ids.to("cuda")
# Drop the last token id (presumably the EOS the tokenizer appends -- TODO confirm).
labels_target = labels_target[:, :-1].contiguous()

# Inference only: no_grad keeps the forward pass from building an autograd graph.
with torch.no_grad():
    outputs = model(inputs, labels=labels_target)

# Summed cross-entropy over the scored positions is the negative log-probability of
# those target tokens; [1:] skips the leading <extra_id_0> sentinel.
log_p = -loss_fn_sum(outputs.logits[0][1:], labels_target[0][1:])
print(math.exp(log_p.item()))

0.0017576984932042732


In [48]:
# Fill the single <extra_id_0> slot (amount eaten) and print the ten returned beams.
question = '[NLU] Tom bought 14 apples. He then ate <extra_id_0>. He now has 11 apples.'
num_beams = 10

inputs = tokenizer(question, return_tensors="pt").input_ids.to("cuda")
outputs = model.generate(
    inputs,
    max_length=3,
    num_beams=num_beams,
    num_return_sequences=num_beams,  # return every beam, not just the best one
)
for sequence in outputs:
    print(tokenizer.decode(sequence))

<pad><extra_id_0> 7
<pad><extra_id_0> 5
<pad><extra_id_0> 3
<pad><extra_id_0> 9
<pad><extra_id_0> 8
<pad><extra_id_0> 6
<pad><extra_id_0> seven
<pad><extra_id_0> 4
<pad><extra_id_0> 2
<pad><extra_id_0> five


In [6]:
# Fill the <extra_id_0> slot (apples remaining) with two beams and keep the beam scores.
question = '[NLU] Tom bought 17 apples. He then ate 10 apples. He now has <extra_id_0> apples.'
inputs = tokenizer(question, return_tensors="pt").input_ids.to("cuda")
outputs = model.generate(
    inputs,
    max_length=3,
    num_beams=2,
    num_return_sequences=2,
    output_scores=True,
    # Without this flag generate() returns only the token-id tensor and the requested
    # scores are dropped; with it, `outputs` also carries `sequences_scores`.
    return_dict_in_generate=True,
)
# The generated token ids now live under `.sequences`, one row per returned beam.
for sequence in outputs.sequences:
    print(tokenizer.decode(sequence))

<pad><extra_id_0> 5
<pad><extra_id_0> 7


In [71]:
# Show the per-sequence scores of the most recent generate() call.
# NOTE(review): string indexing only works when `outputs` is the result object that
# generate() returns with return_dict_in_generate=True; a plain tensor of token ids
# has no 'sequences_scores' entry -- confirm the producing cell sets that flag.
outputs['sequences_scores']

tensor([-0.5469, -0.7031], device='cuda:0')

In [1]:
pwd

'/work/09127/tomyoung/ls6/inconsistencies_project'

In [1]:
import os
from datetime import datetime
import pandas as pd

# Notebooks and scripts in the current working directory.
file_list = [name for name in os.listdir() if name.endswith(('.ipynb', '.py'))]

# Last-modification time of each of those files, in the same order as file_list.
file_change_dates = [datetime.fromtimestamp(os.path.getmtime(name)) for name in file_list]

# One row per file, most recently modified first.
df = pd.DataFrame({'file': file_list, 'date': file_change_dates}).sort_values(
    by=['date'], ascending=False
)

# Persist the table (including its index column) next to the notebooks.
df.to_csv('code_note.csv')