## Training

In [6]:
import os
os.environ["WANDB_DISABLED"] = "true"
import sys
import torch
from torch.nn.functional import pad
import transformers
from datasets import load_dataset
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
    set_peft_model_state_dict,
)
from transformers import LlamaForCausalLM, LlamaTokenizer, GenerationConfig


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda-11.6/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 116
CUDA SETUP: Loading binary /usr/local/lib/python3.9/dist-packages/bitsandbytes/libbitsandbytes_cuda116.so...


In [3]:
template = {
    "prompt_with_input": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. \n\n ### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n",
    "prompt_no_input": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n",
    "response_split": "### Response:"
}

In [4]:
def make_prompt(instruction, input = None, label = None):
    if input:
        result = template["prompt_with_input"].format(instruction = instruction, input = input)
    else:
        result = template["prompt_no_input"].format(instruction = instruction)
    
    if label:
        result = f"{result}{label}"
    
    return result

In [5]:
def fetch_response(output):
    return output.split(template["response_split"])[1].split("### Instruction")[0]

In [8]:
base_model = "decapoda-research/llama-7b-hf"
data_path = "yahma/alpaca-cleaned"
output_dir = "./lora_alpaca_2"
batch_size = 32
micro_batch_size = 2
num_epochs = 5
learning_rate = 1e-4
cutoff_len = 512
lora_r = 8
lora_alpha = 16
lora_dropout = 0.05
lora_target_modules = ["q_proj", "v_proj"]
train_on_inputs = True
group_by_length = True
gradient_accumulation_steps = batch_size // micro_batch_size
device_map = "auto"

In [None]:
model = LlamaForCausalLM.from_pretrained(
    base_model,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map=device_map
)
tokenizer = LlamaTokenizer.from_pretrained(base_model)
tokenizer.pad_token_id = (0)
tokenizer.padding_side = "left"

Downloading (…)lve/main/config.json:   0%|          | 0.00/427 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/33 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00002-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00003-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00004-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00005-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00006-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00007-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00008-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00009-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00010-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00011-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00012-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00013-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00014-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00015-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00016-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00017-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00018-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00019-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00020-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00021-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00022-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00023-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00024-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00025-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00026-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00027-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00028-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00029-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00030-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00031-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00032-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00033-of-00033.bin:   0%|          | 0.00/524M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.


In [26]:
def tokenize(prompt, add_eos_token=True):
    result = tokenizer(prompt, truncation = True, max_length=cutoff_len, padding = False, return_tensors=None)
    if (result["input_ids"][-1] != tokenizer.eos_token_id and len(result["input_ids"]) < cutoff_len and add_eos_token):
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)
    input_ids = pad(torch.Tensor(result.get("input_ids")).long(), pad=(0, cutoff_len - len(result.get("input_ids"))), mode="constant", value=0)
    attention_mask = pad(torch.Tensor(result.get("attention_mask")).long(), pad=(0, cutoff_len - len(result.get("attention_mask"))), mode="constant", value=0)
    result.update({"input_ids": input_ids, "attention_mask": attention_mask})
    result["labels"] = result["input_ids"]
    return result

In [27]:
def generate_tokenize_prompt(data_point):
    
    full_prompt = make_prompt(data_point["instruction"], data_point["input"], data_point["output"])
    
    tokenized_full_prompt = tokenize(full_prompt)
    
    if not train_on_inputs:
        user_prompt = make_prompt(data_point["instruction"], data_point["input"])
        tokenized_user_prompt = tokenize(user_prompt, add_eos_token=False)
        user_prompt_len = len(tokenized_user_prompt.get("input_ids"))
        
        tokenized_full_prompt["labels"] = [-100] * user_prompt_len + tokenized_full_prompt[user_prompt_len:]
        
    return tokenized_full_prompt

In [14]:
model = prepare_model_for_int8_training(model)

In [15]:
config = LoraConfig(
    r = lora_r,
    lora_alpha = lora_alpha,
    target_modules=lora_target_modules,
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, config)

In [18]:
import json

In [19]:
data = json.loads(open("./alpaca_data_cleaned.json").read())

In [20]:
len(data)

51724

In [22]:
# data[0]

In [23]:
model.print_trainable_parameters()

trainable params: 4194304 || all params: 6742609920 || trainable%: 0.06220594176090199


In [28]:
train_data = list(map(lambda row: generate_tokenize_prompt(row), data))

In [29]:
trainer = transformers.Trainer(
    model = model,
    train_dataset = train_data,
    eval_dataset=None,
    args = transformers.TrainingArguments(
        per_device_train_batch_size=micro_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_steps=10,
        num_train_epochs=num_epochs,
        learning_rate=learning_rate,
        fp16=True,
        logging_steps=5,
        optim="adamw_torch",
        save_strategy="steps",
        save_steps=200,
        output_dir=output_dir,
        save_total_limit=3,
        report_to=None,
        group_by_length=group_by_length
    ),
    data_collator=transformers.DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding="max_length", max_length=cutoff_len
    )
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [30]:
model.config.use_cache = False

In [31]:
old_state_dict = model.state_dict

In [32]:
model.state_dict = (
    lambda self, *_, **__: get_peft_model_state_dict(
        self, old_state_dict()
    )
).__get__(model, type(model))

In [None]:
trainer.train()

Step,Training Loss
5,13.0261
10,12.8512
15,13.1401
20,12.4462
25,10.3306
30,7.5917
35,5.9292
40,5.6324
45,5.025
50,4.0213


In [None]:
model.save_pretrained(output_dir)

## Generate Text

In [None]:
model.eval()

In [None]:
generation_config = GenerationConfig(
            temperature=0.6,
            top_p=0.8,
            top_k=100,
            num_beams=3
)

In [None]:
prompt = make_prompt("You are a model which writes code", "write hello world in python")
inputs = tokenizer(prompt, return_tensors="pt")
generate_params = {
            "input_ids": inputs.get("input_ids"),
            "generation_config": generation_config,
            "return_dict_in_generate": True,
            "output_scores": True,
            "max_new_tokens": 512,
        }

In [None]:
with torch.no_grad():
    generation_output = model.generate(
        input_ids=inputs.get("input_ids"),
        generation_config=generation_config,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=128,
    )
s = generation_output.sequences[0]
output = tokenizer.decode(s)
fetch_response(output)