In [7]:
#%pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116
#%pip install transformers datasets evaluate rouge-score py7zr
#%pip install nltk
#%pip install accelerate
#%pip install sentencepiece
#%pip install bitsandbytes
#%pip install peft

Collecting peft
  Downloading peft-0.3.0-py3-none-any.whl (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: peft
Successfully installed peft-0.3.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
import copy
from dataclasses import dataclass, field
import json
import logging
import pathlib
from typing import Dict, Optional, Sequence

import torch
import os
import sys

from typing import List
import transformers
from torch.utils.data import Dataset
from transformers import Trainer

In [2]:
# Filesystem locations used by the fine-tuning run.
model_path = "/home/jupyter/vicuna-7b"  # directory the tokenizer and base model are loaded from
data_path = "/home/jupyter/fine_tune.json"  # JSON file with the training conversations
#data_path = "/home/jupyter/FinBot/test data/finetunetest.json"
cache_dir = "/home/jupyter/data/transformers"  # NOTE(review): only referenced by commented-out code in this notebook
output_dir = "/home/jupyter/testvicuna"  # checkpoints and the final saved model go here

optimizer = "adamw_torch"  # NOTE(review): not read by train(), which hard-codes optim="adamw_torch"
max_seq_length = 1024  # intended per-example token cap
IGNORE_INDEX = -100  # label value written over positions that must not contribute to the loss
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"  # also appended after every assistant ("gpt") turn when formatting conversations
# BOS and UNK reuse the EOS string; in this notebook they are only referenced
# by commented-out code inside train().
DEFAULT_BOS_TOKEN = "</s>"
DEFAULT_UNK_TOKEN = "</s>"

In [3]:
def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: transformers.PreTrainedTokenizer,
    model: transformers.PreTrainedModel,
):
    """Register special tokens and grow the model's embedding tables to match.

    The tokens in ``special_tokens_dict`` are added to ``tokenizer`` and the
    model's token embeddings are resized to ``len(tokenizer)``. When at least
    one token was actually new, the freshly created rows of both the input and
    the output embedding matrices are overwritten with the mean of all
    pre-existing rows, so new tokens (e.g. a pad token the base model lacked)
    start from an "average" embedding.

    Note: this is the unoptimized variant; the resulting vocabulary size is
    not forced to be a multiple of 64.

    Args:
        special_tokens_dict: Mapping passed to ``tokenizer.add_special_tokens``.
        tokenizer: Tokenizer to extend (modified in place).
        model: Model whose embeddings are resized (modified in place).
    """
    added = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))

    if added <= 0:
        # Nothing new was registered, so there are no fresh rows to initialise.
        return

    tables = (
        model.get_input_embeddings().weight.data,
        model.get_output_embeddings().weight.data,
    )
    # Compute both means before writing anything, averaging only the old rows.
    old_row_means = [
        table[:-added].mean(dim=0, keepdim=True) for table in tables
    ]
    for table, row_mean in zip(tables, old_row_means):
        table[-added:] = row_mean




In [4]:
from transformers import TrainingArguments
from transformers import LlamaForCausalLM, LlamaTokenizer

# System prompt; preprocess() uses it as the header placed in front of every conversation.
system = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. """
# NOTE(review): `human` is not referenced anywhere else in the visible notebook.
human = ""


def _tokenize_fn(strings: Sequence[str],
                 tokenizer: transformers.PreTrainedTokenizer) -> Dict:
    """Convert each string into token ids with the given tokenizer.

    Every string is tokenized on its own, truncated to
    ``tokenizer.model_max_length``. Because this is a language-modeling
    setup, the labels are the inputs themselves: the ``labels`` entry is the
    very same list object as ``input_ids`` (and likewise for the two length
    lists), so callers must copy before mutating one of them.

    Args:
        strings: Texts to tokenize.
        tokenizer: Tokenizer used for the conversion.

    Returns:
        Dict with ``input_ids``/``labels`` (list of 1-D id tensors) and
        ``input_ids_lens``/``labels_lens`` (number of non-pad tokens per text).
    """
    # Texts are truncated to fit the model input size; a smarter strategy than
    # plain truncation may be needed in the future.
    encodings = []
    for text in strings:
        encodings.append(
            tokenizer(
                text,
                return_tensors="pt",
                padding="longest",
                max_length=tokenizer.model_max_length,
                truncation=True,
            )
        )

    token_ids = [encoding.input_ids[0] for encoding in encodings]
    token_counts = [
        encoding.input_ids.ne(tokenizer.pad_token_id).sum().item()
        for encoding in encodings
    ]
    return {
        "input_ids": token_ids,
        "labels": token_ids,
        "input_ids_lens": token_counts,
        "labels_lens": token_counts,
    }


def _add_speaker_and_signal(header, source, get_conversation=True):
    """Add speaker and start/end signal on each round.
       Note from Shilong: I changed this from version 0 of their model to version 1, which
       is what we tried.
    """
    conversation = header
    unknown_role = "UNKNOWN"  # use default unknown role
    roles = {
        "human": "USER",  # human role
        "gpt": "ASSISTANT",  # gpt role
    }
    seperator = {
        "human": " ",
        "gpt": DEFAULT_EOS_TOKEN
    }
    for sentence in source:
        sentence_from = sentence["from"].lower()
        sentence["value"] = (
            roles.get(sentence_from, unknown_role)
            + ": "
            + sentence["value"]
            + seperator[sentence_from]
        )
        if get_conversation:
            conversation += sentence["value"]
    return conversation

# Module-level tokenizer shared by _mask_targets() (for its debug decoding)
# and train(). model_max_length is passed explicitly so that the
# truncation=True / max_length=tokenizer.model_max_length call in
# _tokenize_fn() really caps examples at max_seq_length; otherwise the limit
# is whatever the checkpoint's tokenizer config provides, which may be
# effectively unbounded.
tokenizer = LlamaTokenizer.from_pretrained(
    model_path,
    model_max_length=max_seq_length,
)
if tokenizer.pad_token is None:
    # The checkpoint ships without a pad token: reuse id 0 for padding.
    # NOTE(review): id 0 is presumably the <unk> token for LLaMA vocabularies - confirm.
    tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"
   

def _mask_targets(target, tokenized_lens, speakers, header_len, s_ids):
    """
    Note from Shilong: This function just change the token id of the prompt to IGNORE so that
    they don't count in the loss function. TODO: again, need to verify if it is correct.
    :param target:
    :param tokenized_lens:
    :param speakers:
    :param header_len:
    :param s_ids:
    :return:
    """
        
    cur_idx = header_len
    tgt_len = target.shape[0]
    for tokenized_len, speaker, s_id in zip(tokenized_lens, speakers, s_ids):
        if cur_idx >= tgt_len:
            break
        elif cur_idx + tokenized_len < tgt_len:
            pass
            # Check whether the mask is applied to the correct position
            # if not torch.equal(target[cur_idx + 2:cur_idx + tokenized_len],
            #                    s_id[2:]):
            #     logging.warning("a sentence mismatches the corresponding piece "
            #                     "in the conversation")
            #     logging.warning("Part 1")
            #     logging.warning(tokenizer.decode(target[cur_idx + 0:cur_idx + tokenized_len]))
            #     logging.warning("Part 2")
            #     logging.warning(tokenizer.decode( s_id[1:]))
        if speaker == "human":
            logging.warning("Masked:")
            logging.warning(tokenizer.decode(target[cur_idx:cur_idx + tokenized_len]))
            target[cur_idx:cur_idx + tokenized_len] = IGNORE_INDEX
        cur_idx += tokenized_len


def preprocess(
    sources: Sequence[Sequence[Dict]],
    tokenizer: transformers.PreTrainedTokenizer,
) -> Dict:
    """
    Turn raw conversations into token ids and loss-masked labels.

    Given a list of sources, each of which is a list of turn dicts with
    "from" and "value" keys, this:
    1. Formats every conversation with _add_speaker_and_signal, using the
       module-level `system` prompt as header (speaker label in front of each
       turn, end signal after it);
    2. Tokenizes each concatenated conversation;
    3. Deep-copies the token ids as the targets and masks the human turns
       with IGNORE_INDEX via _mask_targets.

    Side effect: _add_speaker_and_signal rewrites each turn's "value" in
    place, and step 3 relies on that (it tokenizes the already formatted
    turns). Calling this twice on the same `sources` objects would therefore
    format the turns twice.

    Background: the public Vicuna training code had not been updated for
    their newest prompt format (the one used by the generate function in
    llama_vicuna.ipynb), so the format is filled in directly here and the
    conversation library the original code used was removed for simplicity.

    :param sources: conversations to preprocess (mutated in place, see above).
    :param tokenizer: tokenizer forwarded to _tokenize_fn.
    :return: dict with "input_ids" (list of id tensors) and "labels" (masked copies).
    """
    # Format every conversation: header + speaker-tagged turns.
    conversations = []
    header = system
    for source in sources:
        conversation = _add_speaker_and_signal(header, source)
        conversations.append(conversation)
    # Tokenize the full conversations; labels start out as a deep copy of the inputs.
    conversations_tokenized = _tokenize_fn(conversations, tokenizer)
    input_ids = conversations_tokenized["input_ids"]
    targets = copy.deepcopy(input_ids)
    # NOTE(review): the "- 1" offset on the header length is unverified - confirm against real tokenizer output.
    header_len = _tokenize_fn([header], tokenizer)["input_ids_lens"][0] - 1
    for target, source in zip(targets, sources):
        # Tokenize each (already formatted) turn on its own to estimate its span.
        tokenized_sentence = _tokenize_fn([s["value"] for s in source], tokenizer)
        tokenized_lens = tokenized_sentence["input_ids_lens"]

        # TODO: check that these lengths are correct. The v0 model's tokenization
        # had to deal with "###" but the v1.1 model doesn't, so the "+3"
        # adjustment below still needs to be verified.
        tokenized_lens = [l+3 for l in tokenized_lens]
        speakers = [sentence["from"] for sentence in source]
        ids = tokenized_sentence["input_ids"]
        _mask_targets(target, tokenized_lens, speakers, header_len, ids)

    return dict(input_ids=input_ids, labels=targets)


class SupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning.

    Subclasses ``torch.utils.data.Dataset``. The whole JSON file is loaded and
    preprocessed eagerly in ``__init__``; every item is a dict holding the
    ``input_ids`` and the loss-masked ``labels`` of one conversation.

    For an example of the expected data format see
    https://github.com/lm-sys/FastChat/blob/main/playground/data/dummy.json
    """

    def __init__(self, data_path: str,
                 tokenizer: transformers.PreTrainedTokenizer):
        """Load ``data_path`` and tokenize/mask all conversations.

        Args:
            data_path: Path to a JSON file containing a list of examples, each
                with a ``"conversations"`` entry.
            tokenizer: Tokenizer forwarded to ``preprocess``.
        """
        super(SupervisedDataset, self).__init__()
        logging.warning("Loading data...")

        # Use a context manager so the file handle is closed deterministically,
        # and read as UTF-8 (the standard JSON encoding) instead of relying on
        # the locale default.
        with open(data_path, "r", encoding="utf-8") as data_file:
            list_data_dict = json.load(data_file)

        logging.warning("Formatting inputs...")
        # The "conversations" entry of each example holds the actual turns.
        sources = [example["conversations"] for example in list_data_dict]

        data_dict = preprocess(sources, tokenizer)

        self.input_ids = data_dict["input_ids"]
        self.labels = data_dict["labels"]

    def __len__(self):
        """Return the number of conversations."""
        return len(self.input_ids)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        """Return the ``input_ids``/``labels`` pair of conversation ``i``."""
        return dict(input_ids=self.input_ids[i], labels=self.labels[i])

Using pad_token, but it is not set yet.


In [5]:
# Note from Shilong: Those two class/function are pretty much implementation of standard HuggingFace functionality
# that can be found via Google Search, i.e. what does datacollator do etc

@dataclass
class DataCollatorForSupervisedDataset(object):
    """Batch variable-length examples for supervised fine-tuning.

    Inputs are right-padded with the tokenizer's pad id and labels with
    IGNORE_INDEX; the attention mask marks every position whose input id
    differs from the pad id.
    """

    # Tokenizer that supplies the pad token id.
    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        """Pad ``instances`` to a common length and stack them into a batch."""
        pad = torch.nn.utils.rnn.pad_sequence

        id_seqs = [example["input_ids"] for example in instances]
        label_seqs = [example["labels"] for example in instances]

        padded_ids = pad(
            id_seqs,
            batch_first=True,
            padding_value=self.tokenizer.pad_token_id,
        )
        padded_labels = pad(
            label_seqs,
            batch_first=True,
            padding_value=IGNORE_INDEX,
        )
        return {
            "input_ids": padded_ids,
            "labels": padded_labels,
            "attention_mask": padded_ids.ne(self.tokenizer.pad_token_id),
        }


def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer,
                                data_path) -> Dict:
    """Build the training dataset and collator for supervised fine-tuning.

    Returns a dict whose keys match the Trainer keyword arguments
    ``train_dataset``, ``eval_dataset`` (always ``None`` here) and
    ``data_collator``, so it can be expanded with ``**`` at the call site.
    """
    return {
        "train_dataset": SupervisedDataset(tokenizer=tokenizer,
                                           data_path=data_path),
        "eval_dataset": None,
        "data_collator": DataCollatorForSupervisedDataset(tokenizer=tokenizer),
    }

In [6]:
### from transformers import TrainingArguments
from transformers import LlamaForCausalLM, LlamaTokenizer

from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
    set_peft_model_state_dict,
)

def safe_save_model_for_hf_trainer(trainer: transformers.Trainer,
                                   output_dir: str,
                                  per_device_train_batch_size: int = 8, 
                                  per_device_eval_batch_size: int = 8):
    """Collect the model's state dict and dump it to disk.

    The state dict is gathered unconditionally; only when
    ``trainer.args.should_save`` is set are the tensors copied to the CPU and
    written to ``output_dir`` through the trainer's own ``_save``.

    Args:
        trainer: Trainer whose model is saved.
        output_dir: Destination directory.
        per_device_train_batch_size: Unused; kept for interface compatibility.
        per_device_eval_batch_size: Unused; kept for interface compatibility.
    """
    full_state = trainer.model.state_dict()
    if not trainer.args.should_save:
        return

    host_state = {name: tensor.cpu() for name, tensor in full_state.items()}
    # Drop the reference to the original tensors before writing.
    del full_state
    trainer._save(output_dir, state_dict=host_state)  # noqa


def train(
    batch_size: int = 128,
    micro_batch_size: int = 4,
    num_epochs: int = 3,
    learning_rate: float = 3e-4,
    cutoff_len: int = 256,
    val_set_size: int = 2000,
    # lora hyperparams
    lora_r: int = 8,
    lora_alpha: int = 16,
    lora_dropout: float = 0.05,
    lora_target_modules: List[str] = [
        "q_proj",
        "v_proj",
    ],
    # llm hyperparams
    train_on_inputs: bool = True,  # if False, masks out inputs in loss
    add_eos_token: bool = False,
    group_by_length: bool = False,  # faster, but produces an odd training loss curve
):
    """Fine-tune the base model with LoRA adapters on top of 8-bit weights.

    Reads the module-level ``model_path``, ``data_path``, ``output_dir`` and
    ``tokenizer``. The model is loaded in 8-bit, prepared for int8 training,
    wrapped with a LoRA adapter and trained with the Hugging Face ``Trainer``.
    Training resumes from the latest checkpoint when ``output_dir`` already
    contains ``checkpoint-*`` directories. Afterwards the trainer state and
    the model state dict are saved to ``output_dir``.

    Args:
        batch_size: Effective batch size; together with ``micro_batch_size``
            (and the world size under DDP) it determines the number of
            gradient accumulation steps.
        micro_batch_size: Per-device batch size of a single forward/backward.
        num_epochs: Number of training epochs.
        learning_rate: Peak learning rate.
        cutoff_len: Currently unused.
        val_set_size: Evaluation is requested only when this is > 0 AND the
            data module actually provides an ``eval_dataset``.
        lora_r: LoRA rank.
        lora_alpha: LoRA alpha.
        lora_dropout: LoRA dropout probability.
        lora_target_modules: Names of the modules that receive LoRA adapters.
        train_on_inputs: Currently unused.
        add_eos_token: Currently unused.
        group_by_length: Forwarded to ``TrainingArguments``.
    """
    # Must be computed BEFORE the DDP branch below, which divides it by the
    # world size (it used to be read there before being assigned, raising
    # UnboundLocalError on every multi-process launch).
    gradient_accumulation_steps = batch_size // micro_batch_size

    device_map = "auto"
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    ddp = world_size != 1
    if ddp:
        # Under DDP each process owns exactly one device.
        device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
        gradient_accumulation_steps = gradient_accumulation_steps // world_size
    # Integer division can hit 0 for small batch sizes; accumulate at least once.
    gradient_accumulation_steps = max(1, gradient_accumulation_steps)

    model = LlamaForCausalLM.from_pretrained(
        model_path,
        load_in_8bit=True,
        torch_dtype=torch.float16,
        device_map=device_map,
    )

    # Uses the module-level tokenizer (pad token already configured there).
    data_module = make_supervised_data_module(tokenizer=tokenizer,
                                              data_path=data_path)

    model = prepare_model_for_int8_training(model)

    config = LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        # Copy so the shared default list can never be mutated downstream.
        target_modules=list(lora_target_modules),
        lora_dropout=lora_dropout,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, config)

    model.print_trainable_parameters()

    if not ddp and torch.cuda.device_count() > 1:
        # keeps Trainer from trying its own DataParallelism when more than 1 gpu is available
        model.is_parallelizable = True
        model.model_parallel = True

    # The data module currently always returns eval_dataset=None. Requesting
    # step-wise evaluation / best-model loading without an eval set makes the
    # Trainer fail, so only enable them when there is something to evaluate on.
    run_eval = val_set_size > 0 and data_module.get("eval_dataset") is not None

    # **data_module expands into train_dataset / eval_dataset / data_collator.
    # See the original implementation for further tunable arguments:
    # https://github.com/lm-sys/FastChat/blob/dc69abce16fcac6a1d7dab8a7b60cc06f9cf1bb2/fastchat/train/train.py#L281
    trainer = transformers.Trainer(
        model=model,
        **data_module,
        args=transformers.TrainingArguments(
            per_device_train_batch_size=micro_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            warmup_steps=100,
            num_train_epochs=num_epochs,
            learning_rate=learning_rate,
            fp16=True,
            logging_steps=10,
            optim="adamw_torch",
            evaluation_strategy="steps" if run_eval else "no",
            save_strategy="steps",
            eval_steps=200 if run_eval else None,
            save_steps=200,
            output_dir=output_dir,
            save_total_limit=3,
            load_best_model_at_end=run_eval,
            ddp_find_unused_parameters=False if ddp else None,
            group_by_length=group_by_length,
            report_to="none",
        ),
    )
    model.config.use_cache = False

    # Resume automatically when earlier checkpoints exist in the output directory.
    if list(pathlib.Path(output_dir).glob("checkpoint-*")):
        trainer.train(resume_from_checkpoint=True)
    else:
        trainer.train()
    trainer.save_state()
    safe_save_model_for_hf_trainer(trainer=trainer,
                                   output_dir=output_dir)


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /opt/conda/lib/python3.7/site-packages/bitsandbytes/libbitsandbytes_cuda110.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 110
CUDA SETUP: Loading binary /opt/conda/lib/python3.7/site-packages/bitsandbytes/libbitsandbytes_cuda110.so...


  warn(msg)


In [None]:
# Launch fine-tuning with the default hyperparameters declared on train().
train()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## In this article
Follow your favorite stocksCREATE FREE ACCOUNT
Florida Gov. Ron DeSantis responds to a question during a press conference at the headquarters of the former Reedy Creek Improvement District that a newly appointed board now calls the Central Florida Tourism Oversight District, in Lake Buena Vista, Florida, Monday, April 17, 2023.
Florida Gov. Ron DeSantis and Florida Gov. Ron DeSantis's allies are ramping up Florida Gov. Ron DeSantis and his allies's fight against Walt Disney Co., even as more of Florida Gov. Ron DeSantis's rivals criticize Florida Gov. Ron DeSantis for Florida Gov. Ron DeSantis's long battle with Walt Disney Co..
Florida Gov. Ron DeSantis ripped Walt Disney Co. repeatedly this week over Walt Disney Co.'s recent maneuvers to thwart Florida Gov. Ron DeSantis's efforts to seize some control of Walt Disney Co.'s Orlando parks and properties.
"We\'ll make sure that we keep Walt Disney Co. in Walt Disney Co.'s pen, one way or another," Florida Gov. Ron DeSa

In [None]:
# Sanity check: can PyTorch use CUDA in this kernel?
torch.cuda.is_available()

In [9]:
# Sanity check: number of CUDA devices visible to PyTorch.
torch.cuda.device_count()

2

In [10]:
# Sanity check: index of the currently selected CUDA device.
torch.cuda.current_device()

0

In [10]:
# Record the installed PyTorch version for reproducibility.
print(torch.__version__)

2.0.0


This is how Vicuna is tuned. All of the `--` parameters get passed to the train function. I have simplified it by
removing most of the arguments. However, it may be worth looking at the original implementation at
https://github.com/lm-sys/FastChat/blob/dc69abce16fcac6a1d7dab8a7b60cc06f9cf1bb2/fastchat/train/train.py#L281
to add the parameters back in later. Most importantly, it may be necessary to reduce the batch size to avoid
running out of memory.

torchrun --nproc_per_node=4 --master_port=20001 fastchat/train/train_mem.py \
    --model_name_or_path ~/model_weights/llama-7b  \
    --data_path playground/data/dummy.json \
    --bf16 True \
    --output_dir output \
    --num_train_epochs 3 \
    --per_device_train_batch_size 2 \
    --per_device_eval_batch_size 2 \
    --gradient_accumulation_steps 16 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 1200 \
    --save_total_limit 10 \
    --learning_rate 2e-5 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --fsdp "full_shard auto_wrap" \
    --fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' \
    --tf32 True \
    --model_max_length 2048 \
    --gradient_checkpointing True \
    --lazy_preprocess True