In [1]:
from transformers import AutoTokenizer, LlamaForCausalLM
from transformers import LlamaModel, LlamaConfig
from datasets import load_dataset
from torch import nn, optim
import torch
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/shawn/anaconda3/envs/lavis_llama/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /home/shawn/anaconda3/envs/lavis_llama/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /home/shawn/anaconda3/envs/lavis_llama/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...
[2023-11-16 00:20:43,663] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
configuration = LlamaConfig(bos_token_id = 1,
                            eos_token_id = 2,
                            hidden_act = "silu",
                            hidden_size = 32,
                            initializer_range = 0.02,
                            intermediate_size = 256,
                            max_position_embeddings = 1024,
                            model_type = "llama",
                            num_attention_heads = 4,
                            num_hidden_layers = 4,
                            pad_token_id = 0,
                            rms_norm_eps = 1e-06,
                            tie_word_embeddings = False,
                            torch_dtype = "bfloat16",
                            transformers_version = "4.30.2",
                            use_cache = True,
                            vocab_size = 32000,
)

llama2_model = LlamaForCausalLM.from_pretrained("lmsys/vicuna-7b-v1.5")
llama2_model.config = configuration
llama2_model.model = LlamaModel(configuration)
llama2_model.vocab_size = configuration.vocab_size
llama2_model.lm_head = nn.Linear(configuration.hidden_size, configuration.vocab_size, bias=False)
tokenizer = AutoTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")

In [5]:
raw_datasets = load_dataset("glue", "mrpc")

def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], padding="max_length", truncation=True, return_tensors='pt')

def collate_batch(batch):
    return {
        'input_ids': torch.tensor([item['input_ids'] for item in batch]),
        'attention_mask': torch.tensor([item['attention_mask'] for item in batch]),
        'labels': torch.tensor([item['input_ids'] for item in batch])
    }

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
train_dataset = tokenized_datasets["train"]
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8, collate_fn=collate_batch)
valid_dataset = tokenized_datasets["validation"]
valid_dataloader = DataLoader(valid_dataset, shuffle=True, batch_size=8, collate_fn=collate_batch)

Map: 100%|██████████| 408/408 [00:00<00:00, 641.05 examples/s]


In [6]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(llama2_model.parameters(), lr=0.001)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
llama2_model.to(device)

num_epochs = 3
for epoch in range(num_epochs):
    llama2_model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['input_ids'].to(device)  # In causal LM, labels are usually the input_ids
        outputs = llama2_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    print(f"Average training loss: {total_loss / len(train_dataloader)}")

    llama2_model.eval()
    total_eval_loss = 0
    with torch.no_grad():
        for batch in valid_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['input_ids'].to(device)
            outputs = llama2_model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_eval_loss += loss.item()
    print(f"Validation loss: {total_eval_loss / len(valid_dataloader)}")

100%|██████████| 459/459 [02:18<00:00,  3.31it/s]


Average training loss: 1.1344809457984366
Validation loss: 0.12175728555987864


100%|██████████| 459/459 [02:17<00:00,  3.33it/s]


Average training loss: 0.10656059955394866
Validation loss: 0.0981914107413853


100%|██████████| 459/459 [02:17<00:00,  3.34it/s]


Average training loss: 0.089357374183233
Validation loss: 0.09006432728732333
