In [None]:
def load_adapter(falcon, lora_apply_dir=None, lora_config=None, ddp=None):
    """Wrap ``falcon`` with a LoRA adapter, either new or loaded from disk.

    Args:
        falcon: Base model to wrap.
        lora_apply_dir: Directory of a saved adapter. When ``None`` a new
            adapter is created from ``lora_config`` instead.
        lora_config: PEFT configuration, used only when ``lora_apply_dir``
            is ``None``.
        ddp: When truthy, the device map ``{'': 0}`` is always used, without
            checking how many GPUs are visible.

    Returns:
        The PEFT-wrapped model.

    Raises:
        ValueError: If both ``lora_apply_dir`` and ``lora_config`` are
            ``None``, since there is then nothing to build the adapter from.
    """
    if lora_apply_dir is None:
        # Fail early with a clear message instead of passing None into peft.
        if lora_config is None:
            raise ValueError(
                "load_adapter needs either lora_apply_dir or lora_config")
        return get_peft_model(falcon, lora_config)

    # Only let the loader place the adapter automatically when we are not
    # under DDP and more than one GPU is visible.
    if not ddp and torch.cuda.device_count() > 1:
        device_map = "auto"
    else:
        device_map = {'': 0}

    print('Device map for lora:', device_map)

    model = PeftModel.from_pretrained(
        falcon, lora_apply_dir, device_map=device_map,
        torch_dtype=torch.float32, is_trainable=True)

    print(lora_apply_dir, 'loaded')

    return model

In [None]:
import torch
import time

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Identifier of the base checkpoint to load.
model_id = "tiiuae/falcon-40b"

# Quantization settings handed to the model loader: 4-bit NF4 weights with
# double quantization and bfloat16 as the compute dtype.
quantization_options = {
    "load_in_4bit": True,
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**quantization_options)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Falcon needs trust_remote_code=True: its architecture is not part of
# transformers yet, so the modelling code provided by the model authors in
# the repo has to be executed.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    cache_dir='/mnt/artifacts/falcon_40b/',
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True,
)


In [None]:
# Set the Falcon tokenizer's padding token: reuse the end-of-sequence token
# as the pad token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
from peft import LoraConfig, get_peft_model,PeftModel

# Names of the modules that receive LoRA adapters.
lora_target_modules = [
    "query_key_value",
    "dense",
    "dense_h_to_4h",
    "dense_4h_to_h",
]

# LoRA hyper-parameters for a causal language model.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

# Wrap the base model with a new adapter built from the config above.
# NOTE(review): a later cell passes this already-wrapped model to
# load_adapter, which calls PeftModel.from_pretrained on it - confirm that
# wrapping the model twice is intended.
model = get_peft_model(model, config)

In [None]:
# Set the use_cache flag on the model config, then load the saved adapter.
model.config.use_cache = True

# Change this path to your own checkpoint; it may be different for your model.
adapter_checkpoint_dir = '/mnt/artifacts/outputs_sample/checkpoint-63/'
model = load_adapter(model, lora_apply_dir=adapter_checkpoint_dir)

In [None]:
# Save the tokenizer and then the model to disk, both in the same directory.
pt_save_directory = "/mnt/artifacts/fine_tune_model/8bit"
for artifact in (tokenizer, model):
    artifact.save_pretrained(pt_save_directory)

In [None]:
from datasets import load_dataset
from random import randint

# Load the "test" split of the "samsum" dataset from the hub.
test_dataset = load_dataset("samsum", split="test")

In [None]:
# Select a random test sample. random.randint includes BOTH endpoints, so the
# upper bound has to be len - 1; using len could index one past the end and
# raise IndexError.
sample = test_dataset[randint(0, len(test_dataset) - 1)]

# Prompt layout; the {dialogue} placeholder is filled in by str.format below.
prompt_template = "Summarize the chat dialogue:\n{dialogue}\n---\nSummary:\n"

test_sample = prompt_template.format(dialogue=sample["dialogue"])

print(test_sample)

In [None]:
# Tokenize the prompt into PyTorch tensors and move the token ids to 'cuda'.
encoded_prompt = tokenizer(test_sample, return_tensors="pt")
input_ids = encoded_prompt.input_ids.to('cuda')

In [None]:
# Number of tokens set aside for the generated summary.
tokens_for_summary = 50
# Value passed as max_length: the prompt length plus the summary budget.
output_tokens = input_ids.shape[1] + tokens_for_summary

generation_kwargs = dict(
    inputs=input_ids,
    do_sample=True,
    pad_token_id=tokenizer.pad_token_id,
    max_length=output_tokens,
)

# Time only the generate call; gradients are disabled while it runs.
start_time = time.time()
with torch.no_grad():
    outputs = model.generate(**generation_kwargs)
end_time = time.time()

# Decode the batch and keep its first (only) sequence.
gen_text = tokenizer.batch_decode(outputs)[0]
print(gen_text)

In [None]:
# Report the time between start_time and end_time, rounded to 3 decimals.
elapsed_seconds = round(end_time - start_time, 3)
print(f'\nTook {elapsed_seconds} s')