In [1]:
import os
import argparse
import json
import math
import os
import random
from pprint import pformat

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import nltk
import datasets
import evaluate

import transformers
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoModelForCausalLM,
    AutoTokenizer,
    SchedulerType,
    get_scheduler,
    set_seed,
    DataCollatorForLanguageModeling,
)

from accelerate import Accelerator
from accelerate.utils import set_seed
from datasets import load_dataset

import wandb
from tqdm.auto import tqdm, trange
from loguru import logger

import scripts
from adapters.models.llama.adapter_model import LlamaAdapterModel
import peft_comparison
import peft_comparison.text2text_utils
import peft_comparison.mappings
from peft_comparison.collation import DataCollatorForSeq2SeqWithMetadata, DataCollatorForCausalLMWithMetadata

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Tokenization settings shared by the cells below ---

# Prompt side: text prepended to every source and the source token budget.
source_prefix = ""
max_source_length = 512

# Target side: token budget used when targets are tokenized separately.
max_target_length = 512

# Padding strategy read by preprocess_function. `truncation` is kept for
# reference; the visible preprocess_function passes truncation=True directly.
padding = "max_length"
truncation = True

# Forwarded to preprocess_function through fn_kwargs when the splits are mapped.
decoder_only = True

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
# Reuse the EOS token as the padding token. The call's return value is the
# cell output (0 in the recorded run).
tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})

0

In [3]:

# First we tokenize all the texts.
def preprocess_function(examples, is_eval=False, decoder_only=False):
    """Tokenize a batch of text-to-text examples.

    Reads the notebook-level ``tokenizer``, ``source_prefix``, ``padding``,
    ``max_source_length`` and ``max_target_length``.

    Args:
        examples: Batch mapping with "source_text" and "target_text" entries
            (parallel lists of strings).
        is_eval: When True, a "metadata" list is attached (one dict per
            example). In the decoder-only path the target is additionally
            left out of the tokenized sequence, so ``input_ids`` hold the
            prompt only.
        decoder_only: Selects the causal-LM layout (source and target in one
            sequence, no padding here) instead of the encoder-decoder layout
            (source and target tokenized separately).

    Returns:
        The tokenizer output with "labels" added and, for ``is_eval=True``,
        "metadata". Metadata entries carry "targets" (the raw target string)
        and, in the decoder-only path, "input_len" (number of prompt tokens
        actually stored in ``input_ids``).
    """
    inputs = [source_prefix + inp for inp in examples["source_text"]]
    targets = examples["target_text"]

    if not decoder_only:
        model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)
        labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True)
        if padding == "max_length":
            # Overwrite padded label positions with -100 (the value
            # conventionally skipped by the loss; confirm for the model in use).
            labels["input_ids"] = [
                [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
            ]
        model_inputs["labels"] = labels["input_ids"]
        if is_eval:
            model_inputs["metadata"] = [{"targets": t} for t in targets]
        return model_inputs

    if is_eval:
        # Evaluation: prompt only.
        model_inputs = tokenizer(inputs, max_length=max_source_length, padding=False, truncation=True)
    else:
        # Training: prompt and target encoded together as a sequence pair.
        model_inputs = tokenizer(inputs, targets, max_length=max_source_length, padding=False, truncation=True)

    # @NOTE: we can set labels to input_ids because the token shifting is taken care of in the modeling_llama file
    model_inputs["labels"] = model_inputs["input_ids"]
    if is_eval:
        # In eval mode input_ids already contain the prompt alone, so the
        # prompt length is read from them. The previous version re-tokenized
        # with truncation disabled, which reported a length larger than the
        # stored (truncated) sequence for prompts over max_source_length.
        model_inputs["metadata"] = [
            {"targets": target, "input_len": len(prompt_ids)}
            for target, prompt_ids in zip(targets, model_inputs["input_ids"])
        ]

    return model_inputs

In [4]:
# Load the COPA task from SuperGLUE and convert it to the text-to-text schema
# ("source_text" / "target_text" columns) used by preprocess_function.
# The dataset name is stated once, and the decoder-only switch comes from the
# notebook-level `decoder_only` flag, so this conversion cannot drift from the
# tokenization step that reads the same flag.
dataset_name = "copa"
raw_datasets = load_dataset("super_glue", dataset_name)
raw_datasets, postprocess_fn = peft_comparison.text2text_utils.dataset_to_text2text(
    raw_datasets,
    task_type="classification",
    dataset_name=dataset_name,
    decoder_only=decoder_only,
)
# Captured after conversion so the later .map(remove_columns=...) calls can
# drop every pre-tokenization column.
column_names = raw_datasets["train"].column_names

{'premise': 'My body cast a shadow over the grass.', 'choice1': 'The sun was rising.', 'choice2': 'The grass was cut.', 'question': 'cause', 'idx': 0, 'label': 0}
<class 'datasets.dataset_dict.DatasetDict'>
<class 'datasets.arrow_dataset.Dataset'>
<class 'dict'>
{'premise': 'My body cast a shadow over the grass.', 'choice1': 'The sun was rising.', 'choice2': 'The grass was cut.', 'question': 'cause', 'idx': 0, 'label': 0, 'source_text': 'COPA (Choice of Plausible Alternatives): Given a premise, a question (cause/effect) and two alternative choices, identify plausible answer from the alternative choices.  premise: My body cast a shadow over the grass. question: cause choice1: The sun was rising. choice2: The grass was cut.Select answer from: choice1,choice2. Answer:', 'target_text': 'choice1'}
<class 'datasets.dataset_dict.DatasetDict'>
<class 'datasets.arrow_dataset.Dataset'>
<class 'dict'>


In [5]:
# Show the first training example after conversion (displayed as the cell output).
raw_datasets["train"][0]

{'premise': 'My body cast a shadow over the grass.',
 'choice1': 'The sun was rising.',
 'choice2': 'The grass was cut.',
 'question': 'cause',
 'idx': 0,
 'label': 0,
 'source_text': 'COPA (Choice of Plausible Alternatives): Given a premise, a question (cause/effect) and two alternative choices, identify plausible answer from the alternative choices.  premise: My body cast a shadow over the grass. question: cause choice1: The sun was rising. choice2: The grass was cut.Select answer from: choice1,choice2. Answer:',
 'target_text': 'choice1'}

In [6]:
# Options identical for both splits: batched tokenization across 8 worker
# processes, dropping every original column.
shared_map_kwargs = {
    "batched": True,
    "num_proc": 8,
    "remove_columns": column_names,
}

# Validation split: is_eval=True makes preprocess_function attach metadata.
validation_split = raw_datasets["validation"]
eval_dataset = validation_split.map(
    preprocess_function,
    desc="Running tokenizer on val dataset  ",
    fn_kwargs={"is_eval": True, "decoder_only": decoder_only},
    **shared_map_kwargs,
)

# Training split: batch size is capped at 5000 and otherwise an eighth of the
# split (presumably mirroring num_proc; confirm before changing either).
train_split = raw_datasets["train"]
train_batch_size = min(5000, len(train_split) // 8)
train_dataset = train_split.map(
    preprocess_function,
    batch_size=train_batch_size,
    desc="Running tokenizer on train dataset",
    fn_kwargs={"decoder_only": decoder_only},
    **shared_map_kwargs,
)

In [7]:
# Same sentinel that preprocess_function writes over padded label positions in
# its encoder-decoder branch; not referenced again in the visible cells.
label_pad_token_id = -100

# Collator configuration: pad each batch, rounding lengths up to a multiple of 8.
collator_options = {
    "tokenizer": tokenizer,
    "padding": True,
    "pad_to_multiple_of": 8,
    "max_length": max_source_length,
}
data_collator = DataCollatorForCausalLMWithMetadata(**collator_options)

# Both loaders share the collator and a small batch size for inspection;
# only the training loader shuffles.
loader_options = {"collate_fn": data_collator, "batch_size": 2}
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_options)
eval_dataloader = DataLoader(eval_dataset, **loader_options)


In [8]:
# Peek at the first training batch: print its shape, then each decoded example
# with special tokens kept so BOS/EOS/padding are visible. `batch` stays bound
# afterwards for the inspection cells that follow.
for batch in train_dataloader:
    token_ids = batch["input_ids"]
    print(token_ids.shape)
    decoded_texts = tokenizer.batch_decode(token_ids, skip_special_tokens=False)
    for text in decoded_texts:
        print(text)
    break  # only the first batch is needed

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


torch.Size([2, 104])
<s> COPA (Choice of Plausible Alternatives): Given a premise, a question (cause/effect) and two alternative choices, identify plausible answer from the alternative choices.  premise: The man begged for forgiveness. question: effect choice1: The woman took pity on him. choice2: The woman joked around with him.Select answer from: choice1,choice2. Answer:<s> choice1</s></s></s></s></s></s></s></s></s></s></s></s>
<s> COPA (Choice of Plausible Alternatives): Given a premise, a question (cause/effect) and two alternative choices, identify plausible answer from the alternative choices.  premise: The security guard replayed the surveillance footage. question: cause choice1: The surveillance camera was out of focus. choice2: He noticed some suspicious activity.Select answer from: choice1,choice2. Answer:<s> choice2</s></s></s></s></s>




In [9]:
# Sum of the first example's attention mask; used as a slice offset in the next cell.
s_ = batch["attention_mask"][0].sum()

In [10]:
# Token ids of the first example from offset s_ onward (displayed as the cell output).
batch["input_ids"][0][s_:]

tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [None]:
# Peek at the first evaluation batch: print its shape, then each decoded
# example with special tokens kept so BOS/EOS/padding are visible.
for batch in eval_dataloader:
    token_ids = batch["input_ids"]
    print(token_ids.shape)
    decoded_texts = tokenizer.batch_decode(token_ids, skip_special_tokens=False)
    for text in decoded_texts:
        print(text)
    break  # only the first batch is needed