In [1]:
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)

accelerator = Accelerator(fsdp_plugin=fsdp_plugin)



In [2]:
from datasets import load_dataset

train_dataset = load_dataset('samsum', split='train')
eval_dataset = load_dataset('samsum', split='validation')
test_dataset = load_dataset('samsum', split='test')

print(train_dataset)
print(eval_dataset)
print(test_dataset)

Dataset({
    features: ['id', 'dialogue', 'summary'],
    num_rows: 14732
})
Dataset({
    features: ['id', 'dialogue', 'summary'],
    num_rows: 818
})
Dataset({
    features: ['id', 'dialogue', 'summary'],
    num_rows: 819
})


In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

base_model_id = "mistralai/Mistral-7B-v0.1"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    cache_dir="models"
)

model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    model_max_length=512,
    padding_side="left",
    add_eos_token=True)

tokenizer.pad_token = tokenizer.eos_token

def tokenize(prompt):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=512,
        padding="max_length",
    )
    result["labels"] = result["input_ids"].copy()
    return result

# I'm giving a very simple prompt to see fine-tuning effect more clearly.
# With good enough prompt, Mistral-7B base model should be able to do summarization already pretty well.
def generate_and_tokenize_prompt(data_point):
    full_prompt =f"""
### Dialogue:
{data_point["dialogue"]}

### Summary:
{data_point["summary"]}
"""
    return tokenize(full_prompt)

tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
tokenized_val_dataset = eval_dataset.map(generate_and_tokenize_prompt)

In [5]:
print("Dialogue: " + test_dataset[1]['dialogue'])
print("Summary: " + test_dataset[1]['summary'] + "\n")

eval_prompt = """
### Dialogue:
Jennifer: The new Marvel movies haven't been doing well, and I am pretty sure the new one also sucks.
Lee: But I really wanted to see how Miss Marvel would look in theater!
Zain: Well, my friend have seen Miss Marvel movie already, and they said that it really sucked. That's even considering them having all the comic books too.
Lee: Huh... Well, I guess I'll see it by myself when it hits the VOD. So, which one should we watch now?
Jennifer: There's tickets to Avatar 2 in about an hour. We could get those and watch that instead.
Lee: Sounds good to me. I really liked the first one anyways.
Zain: I'm on board also. Let's see that one instead! I'll buy the popcorn.

### Summary:
"""

# Re-init the tokenizer so it doesn't add padding or eos token
eval_tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    add_bos_token=True,
)

model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to("cuda")

model.eval()
with torch.no_grad():
    print(eval_tokenizer.decode(model.generate(**model_input, max_new_tokens=256)[0], skip_special_tokens=True))

Dialogue: Eric: MACHINE!
Rob: That's so gr8!
Eric: I know! And shows how Americans see Russian ;)
Rob: And it's really funny!
Eric: I know! I especially like the train part!
Rob: Hahaha! No one talks to the machine like that!
Eric: Is this his only stand-up?
Rob: Idk. I'll check.
Eric: Sure.
Rob: Turns out no! There are some of his stand-ups on youtube.
Eric: Gr8! I'll watch them now!
Rob: Me too!
Eric: MACHINE!
Rob: MACHINE!
Eric: TTYL?
Rob: Sure :)
Summary: Eric and Rob are going to watch a stand-up on youtube.



Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



### Dialogue:
Jennifer: The new Marvel movies haven't been doing well, and I am pretty sure the new one also sucks.
Lee: But I really wanted to see how Miss Marvel would look in theater!
Zain: Well, my friend have seen Miss Marvel movie already, and they said that it really sucked. That's even considering them having all the comic books too.
Lee: Huh... Well, I guess I'll see it by myself when it hits the VOD. So, which one should we watch now?
Jennifer: There's tickets to Avatar 2 in about an hour. We could get those and watch that instead.
Lee: Sounds good to me. I really liked the first one anyways.
Zain: I'm on board also. Let's see that one instead! I'll buy the popcorn.

### Summary:
The new Marvel movies haven't been doing well, and I am pretty sure the new one also sucks. But I really wanted to see how Miss Marvel would look in theater! But my friend have seen Miss Marvel movie already, and they said that it really sucked. That's even considering them having all the comic book

In [6]:
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

# Apply the accelerator. You can comment this out to remove the accelerator.
model = accelerator.prepare_model(model)

trainable params: 21260288 || all params: 3773331456 || trainable%: 0.5634354746703705


In [None]:
import transformers
from datetime import datetime

project = "samsum-finetune"
base_model_name = "mistral"
run_name = base_model_name + "-" + project
output_dir = "./" + run_name

tokenizer.pad_token = tokenizer.eos_token

trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        warmup_steps=5,
        per_device_train_batch_size=2,
        gradient_checkpointing=True,
        gradient_accumulation_steps=4,
        gradient_checkpointing_kwargs={'use_reentrant':False},
        max_steps=2000,
        learning_rate=2.5e-5, # Want about 10x smaller than the Mistral learning rate
        logging_steps=100,
        bf16=True,
        optim="paged_adamw_8bit",
        logging_dir="./logs",        # Directory for storing logs
        save_strategy="steps",       # Save the model checkpoint every logging step
        save_steps=100,                # Save checkpoints every 50 steps
        evaluation_strategy="steps", # Evaluate the model every logging step
        eval_steps=100,               # Evaluate and save checkpoints every 50 steps
        do_eval=True,                # Perform evaluation at the end of training
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

In [7]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

base_model_id = "mistralai/Mistral-7B-v0.1"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    cache_dir="models"
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,  # Mistral, same as before
    quantization_config=bnb_config,  # Same quantization config as before
    device_map="auto",
    trust_remote_code=True,
    use_auth_token=True
)

eval_tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    add_bos_token=True,
    trust_remote_code=True,
)

from peft import PeftModel

ft_model = PeftModel.from_pretrained(base_model, "mistral-samsum-finetune/checkpoint-2000")



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
eval_prompt = """
### Dialogue:
Jennifer: The new Marvel movies haven't been doing well, and I am pretty sure the new one also sucks.
Lee: But I really wanted to see how Miss Marvel would look in theater!
Zain: Well, my friend have seen Miss Marvel movie already, and they said that it really sucked. That's even considering them having all the comic books too.
Lee: Huh... Well, I guess I'll see it by myself when it hits the VOD. So, which one should we watch now?
Jennifer: There's tickets to Avatar 2 in about an hour. We could get those and watch that instead.
Lee: Sounds good to me. I really liked the first one anyways.
Zain: I'm on board also. Let's see that one instead! I'll buy the popcorn.

### Summary:
"""

model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

ft_model.eval()
with torch.no_grad():
    print(eval_tokenizer.decode(ft_model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.



### Dialogue:
Jennifer: The new Marvel movies haven't been doing well, and I am pretty sure the new one also sucks.
Lee: But I really wanted to see how Miss Marvel would look in theater!
Zain: Well, my friend have seen Miss Marvel movie already, and they said that it really sucked. That's even considering them having all the comic books too.
Lee: Huh... Well, I guess I'll see it by myself when it hits the VOD. So, which one should we watch now?
Jennifer: There's tickets to Avatar 2 in about an hour. We could get those and watch that instead.
Lee: Sounds good to me. I really liked the first one anyways.
Zain: I'm on board also. Let's see that one instead! I'll buy the popcorn.

### Summary:
  Lee, Jennifer and Zain will watch Avatar 2 instead of Miss Marvel. Zain will buy the popcorn.

### Analysis:
Lee, Jennifer and Zain will watch Avatar 2 instead of Miss Marvel. Zain will buy the popcorn.

### Special notes:


### Comparison:


### Summary:
Lee, Jennifer and Zain will watch Avatar 2