In [4]:
import pandas as pd
import torch
from torchmetrics import Accuracy
import lightning.pytorch as pl
from tqdm import tqdm
import numpy as np
from torch.utils.data import DataLoader
from data import ResumeDataset, ResumePromptDataset
from transformers import (
    AutoModel, AutoTokenizer, 
    T5ForConditionalGeneration,
    AdamW,
    get_linear_schedule_with_warmup
)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
print(torch.cuda.is_available()) 

True


In [6]:

from utils import label_idx, idx_label, all_labels, n_classes
target_json_path = "../data/json/data_fine.json"

df = pd.read_json(target_json_path)
print(df.iloc[1])


print(label_idx, idx_label, all_labels)

buf_str    (January 31, 2021)
lbuf                43.235294
rbuf                57.215686
hbuf                        1
boldbuf                     0
italbuf                     0
stk_str    James Hye Suk Yoon
lstk                41.098039
rstk                59.392157
hstk                        1
boldstk                     1
italstk                     0
type                  discard
Name: 1, dtype: object
{'discard': 0, 'merge': 1, 'pop': 2, 'subordinate': 3} {0: 'discard', 1: 'merge', 2: 'pop', 3: 'subordinate'} ['discard', 'merge', 'pop', 'subordinate']


In [7]:
tokenizer_args = {
    'padding': 'max_length',
    'return_tensors': 'pt',
}

In [8]:
model_name = "t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [9]:
resume_dataset = ResumePromptDataset(tokenizer, target_json_path)
total_count = len(resume_dataset)
train_count = int(0.85 * total_count)
valid_count = int(0.1 * total_count)
test_count = total_count - train_count - valid_count

seed = torch.Generator().manual_seed(42)
train_dataset, valid_dataset, test_dataset = torch.utils.data.random_split(
    resume_dataset, (train_count, valid_count, test_count)
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [10]:
from experiment_textonly import T5FineTuner
model = T5FineTuner.load_from_checkpoint("/home/zxliu2/sp23/sean-liu-resume-dependency/model/output/epoch=2-step=12156.ckpt").eval()


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [11]:
valid_dataloader = DataLoader(valid_dataset, num_workers=4, batch_size=4)

In [12]:
batch = next(iter(valid_dataloader))
batch = {k: v.cuda() for k, v in batch.items()}
labels = batch['target_ids']


In [13]:
print(batch["source_ids"].shape)

torch.Size([4, 512])


In [14]:
all_labels = ['discard', 'merge', 'pop', 'subordinate']
action_ids = tokenizer(all_labels, max_length=2)
action_to_token_id = torch.LongTensor(action_ids["input_ids"])[:, 0]
print(action_to_token_id)

tensor([27324,  7986,  2783,   769])


In [20]:
import numpy
k = np.array([2, 1, 3, 4])
x = torch.Tensor([0, 1, 2, 3, 4, 5, 6]).to("cuda:0")
x[k]


RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [18]:
results = model._step(batch)

In [19]:
logits = results[0]
logits[k]

/opt/conda/conda-bld/pytorch_1682343967769/work/aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [226,0,0], thread: [0,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed.
/opt/conda/conda-bld/pytorch_1682343967769/work/aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [226,0,0], thread: [1,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed.
/opt/conda/conda-bld/pytorch_1682343967769/work/aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [226,0,0], thread: [2,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed.
/opt/conda/conda-bld/pytorch_1682343967769/work/aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [226,0,0], thread: [3,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed.
/opt/conda/conda-bld/pytorch_1682343967769/work/aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): blo

RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
preds = logits.argmax(axis=1)
preds.shape


In [None]:
print(f"preds = {preds}\nlabels={labels[:, 0]}")

In [None]:
accuracy = Accuracy('multiclass', num_classes = tokenizer.vocab_size).cuda()

In [None]:
preds = logits[:, :tokenizer.vocab_size].cuda()
print(preds.shape) 

In [None]:
acc = accuracy(preds, labels[:, 0])

In [None]:
print(acc)

In [6]:
trainer = pl.Trainer(devices = 1)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
trainer.validate(model)

In [None]:
trainer.test(model)

In [26]:
tokenizer = AutoTokenizer.from_pretrained("t5-large")


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


[[27324, 1], [7986, 1], [2783, 1], [769, 1]] 
 [[27324, 1], [7986, 1], [2783, 1], [769, 21122, 1]]


In [31]:
model._step(batch)

NameError: name 'batch' is not defined