In [1]:
import os
import argparse
import json
import math
import os
import random
from pprint import pformat

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import nltk
import datasets
import evaluate

import transformers
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoModelForCausalLM,
    AutoTokenizer,
    SchedulerType,
    get_scheduler,
    set_seed,
)

from accelerate import Accelerator
from accelerate.utils import set_seed
from datasets import load_dataset

import wandb
from tqdm.auto import tqdm, trange
from loguru import logger

import scripts
from adapters.models.llama.adapter_model import LlamaAdapterModel
import peft_comparison
import peft_comparison.text2text_utils
import peft_comparison.mappings
#from peft_comparison.collation import DataCollatorForSeq2SeqWithMetadata, DataCollatorForCausalLMWithMetadata

from dataclasses import dataclass
from typing import Any, Optional, Union

import numpy as np

from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from transformers.utils import PaddingStrategy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Tokenizer-call settings shared by the preprocessing cells below.
padding = "max_length"
truncation = True

# Length budgets handed to the tokenizer as `max_length`.
max_source_length = 512
max_target_length = 8

# Prefix prepended to every source text, and the model-family switch
# (True -> decoder-only / causal LM preprocessing path).
source_prefix = ""
decoder_only = True

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
# Register "<pad>" as the padding token. Kept as the cell's last expression so
# the call's return value is displayed as the cell output.
tokenizer.add_special_tokens({"pad_token": "<pad>"})

0

In [3]:

# First we tokenize all the texts.
def preprocess_function(examples, is_eval=False, decoder_only=False):
    """Tokenize a batch of text-to-text examples.

    Relies on notebook-level settings defined in earlier cells: ``tokenizer``,
    ``source_prefix``, ``max_source_length``, ``max_target_length`` and
    ``padding``.

    Args:
        examples: Batch dict holding parallel lists under ``"source_text"``
            and ``"target_text"``.
        is_eval: When True, a ``"metadata"`` list is attached (one dict per
            example) and, on the decoder-only path, the target is NOT appended
            to the prompt.
        decoder_only: Selects the causal-LM path (prompt and target in a single
            sequence, no padding here) instead of the encoder-decoder path.

    Returns:
        The tokenizer output with ``"labels"`` added, plus ``"metadata"`` when
        ``is_eval`` is True. On the decoder-only eval path each metadata entry
        also carries ``"input_len"``, the number of prompt tokens after
        truncation.
    """
    targets = examples["target_text"]
    inputs = [source_prefix + inp for inp in examples["source_text"]]

    if not decoder_only:
        # Encoder-decoder (T5-style): sources and targets are tokenized separately.
        model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)
        labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True)
        if padding == "max_length":
            # Padded label positions are replaced by -100.
            labels["input_ids"] = [
                [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
            ]
        model_inputs["labels"] = labels["input_ids"]
        if is_eval:
            model_inputs["metadata"] = [{"targets": t} for t in targets]
        return model_inputs

    # Decoder-only path.
    # @NOTE: the way preprocessing and collation are written for llama:
    # - padding=False here, so the collator can see the true max length in the batch
    # - padding is applied at collation time (to a multiple of 8 above that max length)
    # - labels mirror input_ids; the original note says token shifting is handled
    #   in the llama modeling file
    if not is_eval:
        # Training: the model sees prompt and answer as one sequence.
        inputs = [i + " " + t for i, t in zip(inputs, targets)]
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=False, truncation=True)

    # Independent copies, so labels never alias the input_ids lists.
    model_inputs["labels"] = [list(ids) for ids in model_inputs["input_ids"]]

    if is_eval:
        # input_len is measured on the truncated prompt, so it always matches
        # the input_ids produced above (an untruncated re-tokenization could
        # report a length larger than the sequence itself).
        model_inputs["metadata"] = [
            {"targets": target, "input_len": len(ids)}
            for target, ids in zip(targets, model_inputs["input_ids"])
        ]

    return model_inputs


@dataclass
class DataCollatorForCausalLMWithMetadata:
    """Pad tokenized causal-LM features to a common length and batch them.

    Every sequence field of every feature is padded to the longest sequence in
    the batch (rounded up to ``pad_to_multiple_of`` when set) and stacked into
    a ``torch.LongTensor``. A ``"metadata"`` entry, when present on the first
    feature, is passed through untouched as a plain Python list.

    Supported sequence keys and their fill values:
        * ``input_ids``  -> ``tokenizer.pad_token_id`` (``eos_token_id`` if no pad token)
        * ``labels``     -> ``label_pad_token_id``
        * ``attention_mask`` / ``decoder_attention_mask`` -> 0

    Raises:
        ValueError: on an empty feature list, on an unsupported key, or when
            ``input_ids`` need a fill value and the tokenizer has neither a pad
            nor an eos token id.
    """

    tokenizer: PreTrainedTokenizerBase
    # NOTE: `padding`, `max_length` and `return_tensors` are accepted for
    # interface compatibility but are not consulted by `__call__`; the output
    # is always padded to the batch maximum and returned as LongTensors.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    label_pad_token_id: int = -100
    return_tensors: str = "pt"
    # "right" pads at the end; any other value pads at the start (left).
    padding_side: str = "left"

    def _fill_value(self, key):
        """Return the padding value used for the sequence field `key`."""
        if key == "input_ids":
            pad_id = self.tokenizer.pad_token_id
            if pad_id is None:
                # Padded positions carry attention_mask == 0, so eos is a
                # usable stand-in when the tokenizer defines no pad token.
                pad_id = getattr(self.tokenizer, "eos_token_id", None)
            if pad_id is None:
                raise ValueError("tokenizer has neither pad_token_id nor eos_token_id; cannot pad input_ids")
            return pad_id
        if key == "labels":
            return self.label_pad_token_id
        if key in ("attention_mask", "decoder_attention_mask"):
            return 0
        raise ValueError(f"Invalid key {key}")

    def __call__(self, features):
        if not features:
            raise ValueError("Cannot collate an empty list of features")

        # Target length = longest sequence across all fields, so the collator
        # also works for features that carry no "labels".
        target_length = max(
            (len(v) for feature in features for k, v in feature.items() if k != "metadata"),
            default=0,
        )
        if self.pad_to_multiple_of is not None:
            # Round up to the next multiple.
            target_length = (
                (target_length + self.pad_to_multiple_of - 1)
                // self.pad_to_multiple_of
                * self.pad_to_multiple_of
            )

        batch = {}
        for feature in features:
            for key, value in feature.items():
                if key == "metadata":
                    continue
                value = list(value)
                filler = [self._fill_value(key)] * (target_length - len(value))
                padded = value + filler if self.padding_side == "right" else filler + value
                batch.setdefault(key, []).append(padded)

        # Convert the padded rows to tensors; metadata stays a list of objects.
        batch = {key: torch.LongTensor(rows) for key, rows in batch.items()}
        if "metadata" in features[0]:
            batch["metadata"] = [feature["metadata"] for feature in features]

        return batch

In [4]:
# Load SuperGLUE/COPA and convert it to the text-to-text format: the conversion
# adds "source_text" / "target_text" columns and returns a post-processing function.
raw_datasets = load_dataset("super_glue", "copa")
raw_datasets, postprocess_fn = peft_comparison.text2text_utils.dataset_to_text2text(
    raw_datasets,
    task_type="classification",
    dataset_name="copa",
    # Follow the notebook-level switch so prompt formatting stays in sync with
    # the tokenization cell, which already uses the same variable.
    decoder_only=decoder_only,
)
# Remember the columns so `.map(..., remove_columns=...)` can drop them after tokenization.
column_names = raw_datasets["train"].column_names

In [5]:
# Sanity check: show the first training example, including the generated
# "source_text" / "target_text" fields.
raw_datasets["train"][0]

{'premise': 'My body cast a shadow over the grass.',
 'choice1': 'The sun was rising.',
 'choice2': 'The grass was cut.',
 'question': 'cause',
 'idx': 0,
 'label': 0,
 'source_text': 'Given a premise, a question (cause/effect) and two alternative choices, identify plausible answer from the alternative choices.  premise: My body cast a shadow over the grass. question: cause choice1: The sun was rising. choice2: The grass was cut. Select answer from: choice1,choice2. Answer:',
 'target_text': 'choice1'}

In [6]:
# Keyword arguments common to both tokenization passes.
_shared_map_args = {
    "batched": True,
    "num_proc": 8,
    "remove_columns": column_names,
}

# Validation split: eval mode attaches per-example metadata.
eval_dataset = raw_datasets["validation"].map(
    preprocess_function,
    desc="Running tokenizer on val dataset  ",
    fn_kwargs=dict(is_eval=True, decoder_only=decoder_only),
    **_shared_map_args,
)

# Train split: batch size is capped at 5000 and otherwise an eighth of the split.
_train_split = raw_datasets["train"]
train_dataset = _train_split.map(
    preprocess_function,
    batch_size=min(5000, len(_train_split) // 8),
    desc="Running tokenizer on train dataset",
    fn_kwargs=dict(decoder_only=decoder_only),
    **_shared_map_args,
)

In [7]:
# Fill value for padded label positions.
label_pad_token_id = -100
data_collator = DataCollatorForCausalLMWithMetadata(
    tokenizer=tokenizer,
    padding=True,
    pad_to_multiple_of=8,
    max_length=max_source_length,
    # Pass the value explicitly so editing `label_pad_token_id` above actually
    # takes effect instead of silently relying on the collator's default.
    label_pad_token_id=label_pad_token_id,
)

# Small batches: these loaders are only used to eyeball collated examples below.
train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=data_collator, batch_size=2)
eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=2)


In [10]:
# Peek at the first training batch only: tensor shape, available keys,
# number of padded positions per row, and the decoded inputs.
for batch in train_dataloader:
    print(batch["input_ids"].shape, batch.keys(), sep="\n")
    print((batch["attention_mask"] == 0).sum(dim=1))
    s_ = tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=False)
    for ex in s_:
        print(f"Source: {ex}")
    break

torch.Size([2, 88])
dict_keys(['input_ids', 'attention_mask', 'labels'])
tensor([ 5, 12])
Source: </s></s></s></s></s><s> Given a premise, a question (cause/effect) and two alternative choices, identify plausible answer from the alternative choices.  premise: The boy mimicked his older brother. question: cause choice1: The boy looked up to his older brother. choice2: The boy wrestled with his older brother. Select answer from: choice1,choice2. Answer: choice1
Source: </s></s></s></s></s></s></s></s></s></s></s></s><s> Given a premise, a question (cause/effect) and two alternative choices, identify plausible answer from the alternative choices.  premise: The service at the restaurant was slow. question: cause choice1: There were many empty tables. choice2: The restaurant was crowded. Select answer from: choice1,choice2. Answer: choice2


In [11]:
# Peek at the first evaluation batch only: tensor shape, available keys,
# number of padded positions per row, and the decoded prompts.
for batch in eval_dataloader:
    print(batch["input_ids"].shape, batch.keys(), sep="\n")
    print((batch["attention_mask"] == 0).sum(dim=1))
    s_ = tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=False)
    for ex in s_:
        print(f"Source: {ex}")
    break

torch.Size([2, 88])
dict_keys(['input_ids', 'attention_mask', 'labels', 'metadata'])
tensor([9, 7])
Source: </s></s></s></s></s></s></s></s></s><s> Given a premise, a question (cause/effect) and two alternative choices, identify plausible answer from the alternative choices.  premise: The man turned on the faucet. question: effect choice1: The toilet filled with water. choice2: Water flowed from the spout. Select answer from: choice1,choice2. Answer:
Source: </s></s></s></s></s></s></s><s> Given a premise, a question (cause/effect) and two alternative choices, identify plausible answer from the alternative choices.  premise: The girl found a bug in her cereal. question: effect choice1: She poured milk in the bowl. choice2: She lost her appetite. Select answer from: choice1,choice2. Answer:


In [None]:
# Token ids of the first example without its padding.
# `s_` holds decoded strings and cannot be used as a slice bound; the number of
# masked-out positions is used instead.
# NOTE(review): assumes left padding (the collator's default padding_side) — confirm intent.
num_left_pad = int((batch["attention_mask"][0] == 0).sum())
batch["input_ids"][0, num_left_pad:]

In [None]:
# Decode and print the first evaluation batch, special tokens included.
for batch in eval_dataloader:
    print(batch["input_ids"].shape)
    t_ = tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=False)
    print("\n".join(t_))
    break