In [None]:
# This loop appends to a list forever: left unguarded it never terminates on
# its own and keeps growing memory until the kernel is interrupted or killed,
# so no later cell can run under "Restart & Run All".
# NOTE(review): presumably the trick for deliberately crashing a Colab runtime
# to be offered more RAM -- confirm. It is now opt-in: set the flag to True
# only when that crash is actually wanted.
EXHAUST_MEMORY = False
a = []
while EXHAUST_MEMORY:
    a.append('1')

In [None]:
  !pip install transformers datasets peft accelerate bitsandbytes safetensors



In [None]:
import os, sys
import torch
import datasets
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
    GenerationConfig
)
from peft import PeftModel, LoraConfig, prepare_model_for_kbit_training, get_peft_model

In [None]:
### config ###
# Checkpoint to load and the placement strategy handed to `from_pretrained`.
model_id = "togethercomputer/LLaMA-2-7B-32K"
device_map = "auto"

# Default truncation length (in tokens) used when prompts are tokenized.
max_length = 512

# An effective batch of `batch_size` examples is reached by accumulating
# gradients over micro-batches of `micro_batch_size` examples each.
batch_size, micro_batch_size = 128, 32
gradient_accumulation_steps = batch_size // micro_batch_size

# Quantization settings for loading the model: 4-bit weights using the "nf4"
# quantization type, double quantization enabled, bfloat16 as compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# load model from huggingface
# The access token is read from the HF_TOKEN environment variable instead of
# being hardcoded: a token written into a notebook is exposed to everyone who
# can read the file (any token that was committed here should be revoked).
# When HF_TOKEN is unset, `None` is passed and the library falls back to its
# own credential lookup (e.g. a cached login).
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    use_cache=False,
    device_map=device_map,
    token=os.environ.get("HF_TOKEN"),
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# load tokenizer from huggingface
# The access token comes from the HF_TOKEN environment variable rather than a
# hardcoded literal, so the credential is never stored in the notebook. When
# the variable is unset, `None` lets the library use its own credential lookup.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ.get("HF_TOKEN"))
# Reuse the EOS token for padding and pad on the right-hand side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
### generate ###
# Quick sanity check: sample one completion from the model loaded above.
# NOTE(review): the prompt already starts with a literal `<s>`; if the
# tokenizer also prepends its BOS token the sequence begins with two of them
# -- confirm, and pass add_special_tokens=False to the tokenizer if so.
prompt = """<s>[INST] <<SYS>>
Write a response that appropriately completes the request.
<</SYS>>

What are some unique sports?[/INST]"""
# The tokenizer returns CPU tensors while the model was placed by
# device_map="auto", so move the encoded prompt to the model's device.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
generate_ids = model.generate(
    input_ids=inputs.input_ids,
    # Pass the mask and pad id explicitly: the pad token equals the EOS token
    # here, so they cannot be inferred reliably from the ids alone.
    attention_mask=inputs.attention_mask,
    pad_token_id=tokenizer.pad_token_id,
    do_sample=True,
    top_k=10,
    top_p=0.7,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_length=300,  # counts prompt tokens plus generated tokens
    repetition_penalty=1.1,
    )
res = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
print(res)

In [None]:
# Colab-only step: mount Google Drive at /content/drive. Later cells read the
# training data from, and write model files to, paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
### generate prompt based on template ###
# "prompt" is a str.format template with {context} and {instruction} fields;
# "response_split" is the marker that precedes the response part of a prompt.
prompt_template = {
    # Adjacent string literals are used instead of backslash continuations
    # inside one literal: a continued line keeps its leading indentation as
    # part of the string, which injected runs of four spaces before and after
    # {context} in every rendered prompt.
    "prompt": (
        "Below is an instruction that describes a task. "
        "{context}"
        "\n\n### Instruction:\n{instruction}\n\n### Response:\n"
    ),

    "response_split": "### Response:"
}

def generate_prompt(instruction, label=None, context="Write a response that appropriately completes the request.", prompt_template=prompt_template):
    """Render the text prompt for one example.

    Args:
        instruction: text substituted for the ``{instruction}`` field.
        label: optional response text; when truthy it is appended directly
            after the rendered prompt.
        context: text substituted for the ``{context}`` field.
        prompt_template: mapping whose ``"prompt"`` entry is the format string.

    Returns:
        The rendered prompt, followed by ``label`` when one was given.
    """
    rendered = prompt_template["prompt"].format(context=context, instruction=instruction)
    if not label:
        # No (or empty) label: return the prompt only.
        return rendered
    return f"{rendered}{label}"

In [None]:
# NOTE(review): `Tmax_length` is not referenced anywhere else in the visible
# notebook -- possibly a typo for `max_length`; confirm before relying on it.
Tmax_length = 256
# Load the pickled DataFrame from Drive and hold out 10% of it.
# SECURITY: this file is a pickle, and unpickling can execute arbitrary code
# -- only load a data.pkl that you created yourself or otherwise trust.
# The fixed seed makes the train/test split identical across kernel restarts.
dataset = datasets.load_dataset(
    "pandas",
    data_files="/content/drive/MyDrive/data.pkl",
    split="train",
).train_test_split(train_size=0.9, test_size=0.1, seed=42)
# Peek at a single held-out example.
dataset['test'][2]


In [None]:
def tokenize(tokenizer, prompt, max_length=max_length, add_eos_token=False):
    """Tokenize ``prompt`` and attach causal-LM labels.

    Args:
        tokenizer: tokenizer called on the prompt; it must expose
            ``eos_token_id`` when ``add_eos_token`` is used.
        prompt: text to encode.
        max_length: truncation limit in tokens.
        add_eos_token: when True, append the EOS token unless the sequence
            already ends with it or has already reached ``max_length``.
            (This flag used to be accepted but silently ignored.)

    Returns:
        The tokenizer output with an extra ``"labels"`` entry that is a copy
        of ``"input_ids"``.
    """
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=max_length,
        padding=False,
        return_tensors=None)

    input_ids = result["input_ids"]
    ends_with_eos = bool(input_ids) and input_ids[-1] == tokenizer.eos_token_id
    if add_eos_token and not ends_with_eos and len(input_ids) < max_length:
        input_ids.append(tokenizer.eos_token_id)
        # Keep the mask the same length as the ids.
        if "attention_mask" in result:
            result["attention_mask"].append(1)

    result["labels"] = result["input_ids"].copy()
    return result

def generate_and_tokenize_prompt(data_point):
    """Turn one ``{"instruction", "response"}`` record into a training example.

    The full text (prompt + response) is tokenized; the label positions that
    belong to the prompt are replaced with -100 (the label value ignored by
    the loss), so only the response tokens are learned.

    Args:
        data_point: mapping with ``"instruction"`` and ``"response"`` entries.

    Returns:
        The tokenized full example with masked ``"labels"``.
    """
    full_prompt = generate_prompt(
        data_point["instruction"],
        data_point["response"],
    )
    # The full example ends with EOS so the model can learn where a response
    # stops; without it every training sequence just ends mid-stream.
    tokenized_full_prompt = tokenize(tokenizer, full_prompt, add_eos_token=True)

    # The prompt-only encoding is needed just for its length, so no EOS here.
    user_prompt = generate_prompt(data_point["instruction"])
    tokenized_user_prompt = tokenize(tokenizer, user_prompt)
    # Clamp so labels can never become longer than input_ids.
    user_prompt_len = min(
        len(tokenized_user_prompt["input_ids"]),
        len(tokenized_full_prompt["input_ids"]),
    )

    mask_token = [-100] * user_prompt_len
    tokenized_full_prompt["labels"] = mask_token + tokenized_full_prompt["labels"][user_prompt_len:]
    return tokenized_full_prompt

# Raw text columns are dropped after mapping, leaving only the tokenized
# fields produced by generate_and_tokenize_prompt.
cols = ["instruction","response"]
# Seeded shuffles keep the example order reproducible across kernel restarts.
train_data = dataset["train"].shuffle(seed=42).map(generate_and_tokenize_prompt, remove_columns=cols)
val_data = dataset["test"].shuffle(seed=42).map(generate_and_tokenize_prompt, remove_columns=cols)

In [None]:
# Display one mapped training example to eyeball the tokenized fields.
train_data[1]


In [None]:
def print_number_of_trainable_model_parameters(model):
    """Print and return how many of ``model``'s parameters are trainable.

    Args:
        model: object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs; each parameter provides ``numel()``
            and ``requires_grad``.

    Returns:
        The summed ``numel()`` of all parameters with ``requires_grad`` set.
    """
    # Collect (element count, trainable?) once, then aggregate twice.
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_model_params = sum(count for count, _ in sizes)
    trainable_model_params = sum(count for count, needs_grad in sizes if needs_grad)
    print(f"trainable model parameters: {trainable_model_params}. All model parameters: {all_model_params} ")
    return trainable_model_params

# Trainable-parameter count of the model as loaded, kept so it can be compared
# with the count after the LoRA adapters are attached.
ori_p = print_number_of_trainable_model_parameters(model)

In [None]:
# LoRA config
# Ready the quantized model for k-bit training before adapters are attached.
# NOTE(review): this cell rebinds `model` twice, so running it a second time
# wraps an already-wrapped model -- re-run from the model-loading cell instead.
model = prepare_model_for_kbit_training(model)

# Rank-8 adapters (alpha 32, dropout 0.1) on the modules named `q_proj` and
# `v_proj`; bias terms are left out of the adaptation.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)
model = get_peft_model(model, peft_config)

### compare trainable parameters #
peft_p = print_number_of_trainable_model_parameters(model)
trainable_pct = round(peft_p / ori_p * 100, 2)
print(f"# Trainable Parameter \nBefore: {ori_p} \nAfter: {peft_p} \nPercentage: {trainable_pct}")

In [None]:
# One location for both the Trainer checkpoints and the final adapter weights.
output_dir = "/content/drive/MyDrive/ChatModel"

args = TrainingArguments(
    output_dir=output_dir,
    # Per the TrainingArguments documentation a positive `max_steps` overrides
    # `num_train_epochs`, so training stops after 200 optimizer steps.
    num_train_epochs=20,
    max_steps=200,
    # NOTE(review): fp16 mixed precision is requested here while the
    # quantization config uses bfloat16 as compute dtype -- confirm intended.
    fp16=True,
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="constant",
    # micro-batch x accumulation steps gives the effective batch size
    per_device_train_batch_size=micro_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=True,
    group_by_length=False,
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=3,
    disable_tqdm=False,
)

# Pads each batch dynamically, to a length that is a multiple of 8.
collator = DataCollatorForSeq2Seq(
    tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True)

# NOTE(review): no evaluation strategy is configured above, so whether
# `eval_dataset` is ever evaluated depends on the library default -- confirm.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=collator,
)

# silence the warnings. re-enable for inference!
model.config.use_cache = False
trainer.train()
model.save_pretrained(output_dir)

In [None]:
# Quantization settings for loading the inference model: 4-bit weights using
# the "nf4" quantization type, double quantization enabled, bfloat16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# model path and weight
# NOTE(review): this rebinds `model_id` to a different checkpoint than the one
# used in the training section -- confirm the adapter at `peft_path` was
# trained against this base model.
model_id = "meta-llama/Llama-2-7b-chat-hf"
peft_path = "thhwarrior/Llama2-Tukl"

# loading model
# The access token is read from the HF_TOKEN environment variable instead of a
# hardcoded literal, so the credential never lives in the notebook. When the
# variable is unset, `None` lets the library use its own credential lookup.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    use_cache=True,
    device_map="auto",
    token=os.environ.get("HF_TOKEN"),
)

# loading peft weight
model = PeftModel.from_pretrained(
    model,
    peft_path,
    torch_dtype=torch.float16,
)
# Switch to evaluation mode for inference.
model.eval()




In [None]:
# generation config
# Deterministic beam search. The sampling knobs (temperature / top_p / top_k)
# were dropped: they only take effect when do_sample=True, and combining them
# with do_sample=False makes the configuration self-contradictory.
generation_config = GenerationConfig(
    do_sample=False,
    num_beams=4, # beam search
)
# generating reply
with torch.no_grad():
    prompt = "best countries for tourists?"
    # Follow the model's own device instead of assuming 'cuda'.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    generation_output = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        generation_config=generation_config,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=64,
    )
    # sequences[0] is the first returned sequence.
    print('\nAnswer: ', tokenizer.decode(generation_output.sequences[0]))

In [None]:
# Build a text-generation pipeline around the already-loaded model.
# The result is bound to its own name: assigning it to `pipeline` would shadow
# the imported `pipeline` factory, and re-running this cell would then call
# the pipeline object instead of building a new one.
# NOTE(review): `torch_dtype` / `device_map` are passed together with a model
# object that is already loaded -- confirm they have any effect here.
text_generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Sample a single continuation for the prompt.
sequences = text_generator(
    'I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?\n',
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_length=200,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")

In [None]:
# Write the current model to Google Drive (the Drive mount must be active).
model.save_pretrained("/content/drive/MyDrive/Llama2-Tukl")

In [None]:
# Upload the model to the Hub repository "Llama2-Tukl".
# The access token is read from the HF_TOKEN environment variable instead of a
# hardcoded literal; a token stored in a notebook is exposed to every reader
# and should be revoked. When the variable is unset, `None` lets the library
# use its own credential lookup.
model.push_to_hub("Llama2-Tukl", token=os.environ.get("HF_TOKEN"))