In [10]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import re
import numpy as np
from tqdm import tqdm
import math
import time
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
import pickle

In [11]:
# Tokenizer for the Qwen2.5-0.5B checkpoint; padding is applied on the right.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B", padding_side = 'right')
# Base-model load is disabled in this cell; `model` is assigned by a later
# cell that loads './latest_model'.
# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B")

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Disabled LoRA setup, kept for reference (rank 16, alpha 16, dropout 0.05,
# adapting only the q_proj / v_proj modules).
# NOTE(review): the block below wraps `base_model`, but the disabled load
# above assigns `model`; one of the two names has to change before this is
# re-enabled, otherwise it raises NameError.
# config = LoraConfig(
#     r=16,
#     lora_alpha=16,
#     lora_dropout=0.05,
#     bias="none",
#     task_type=TaskType.CAUSAL_LM,
#     target_modules=["q_proj", "v_proj"],  # or add k/o_proj for broader adaptation
# )

# model = get_peft_model(base_model, config)
# model.to(device)

In [12]:
# Disabled alternative: attach a saved LoRA adapter to `base_model` and
# re-enable gradients on the LoRA weights.
# NOTE(review): `base_model` is not assigned anywhere in the visible notebook.
# model = PeftModel.from_pretrained(base_model, './latest_model')
# for name, param in model.named_parameters():
#     if 'lora' in name:
#         param.requires_grad = True

# Register the pad / bos / eos tokens used by the ChatML-style template below.
# A later cell copies the resulting ids onto `model.config`.
# The value displayed under this cell is the call's return value
# (presumably the number of tokens that were new to the vocabulary - confirm
# against the tokenizer documentation).
tokenizer.add_special_tokens({'pad_token': '<|pad|>',
                              'bos_token': '<|im_start|>',
                              'eos_token': '<|im_end|>'})



1

In [13]:
# Jinja chat template passed to `tokenizer.apply_chat_template` by
# SFTSmolTalkDataset.
#
# Layout produced for every message:
#     <|im_start|>{role}\n{content}<|im_end|>\n
# A default system turn is emitted first when the conversation does not start
# with a system message.
#
# Assistant content is wrapped in {% generation %} ... {% endgeneration %};
# the dataset requests `return_assistant_tokens_mask=True` with this template
# to obtain a mask over those spans. The closing <|im_end|> sits outside the
# generation span, which is why the dataset adds it to the label mask itself.
potential_template = (
    # Counters used to number images / videos when `add_vision_id` is set.
    "{% set image_count = namespace(value=0) %}"
    "{% set video_count = namespace(value=0) %}"
    "{% for message in messages %}"
    # Inject the default system prompt ahead of a non-system first message.
    "{% if loop.first and message['role'] != 'system' %}"
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    "{% endif %}"
    "<|im_start|>{{ message['role'] }}\n"
    # Plain-string content.
    "{% if message['content'] is string %}"
    "{% if message['role'] == 'assistant' %}"
    "{% generation %}"
    "{{ message['content'] }}"
    "{% endgeneration %}"
    "{% else %}"
    "{{ message['content'] }}"
    "{% endif %}"
    "<|im_end|>\n"
    # List-of-parts content: image, video and text entries.
    "{% else %}"
    "{% for content in message['content'] %}"
    "{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}"
    "{% set image_count.value = image_count.value + 1 %}"
    "{% if add_vision_id %}"
    "Picture {{ image_count.value }}: "
    "{% endif %}"
    "<|vision_start|><|image_pad|><|vision_end|>"
    "{% elif content['type'] == 'video' or 'video' in content %}"
    "{% set video_count.value = video_count.value + 1 %}"
    "{% if add_vision_id %}"
    "Video {{ video_count.value }}: "
    "{% endif %}"
    "<|vision_start|><|video_pad|><|vision_end|>"
    "{% elif 'text' in content %}"
    "{% if message['role'] == 'assistant' %}"
    "{% generation %}"
    "{{ content['text'] }}"
    "{% endgeneration %}"
    "{% else %}"
    "{{ content['text'] }}"
    "{% endif %}"
    "{% endif %}"
    "{% endfor %}"
    "<|im_end|>\n"
    "{% endif %}"
    "{% endfor %}"
    # Optionally open an assistant turn for generation.
    "{% if add_generation_prompt %}"
    "<|im_start|>assistant\n"
    "{% endif %}")

In [14]:
class SFTSmolTalkDataset(Dataset):
    """Single-turn supervised fine-tuning view over a chat dataset.

    Every row of ``dataset`` must expose a ``'messages'`` list of
    ``{'role': ..., 'content': ...}`` dicts. Only the first user message and
    the first assistant message that follows it are kept. The pair is rendered
    with a chat template, padded to ``max_length`` and returned with labels
    that are ``-100`` everywhere except on the assistant tokens and the
    ``<|im_end|>`` token that closes the assistant turn.

    Rows that contain no such pair, or that the tokenizer rejects, yield a
    placeholder item whose labels are all ``-100`` so the training loop can
    detect and skip it.

    Args:
        dataset: indexable collection of rows with a ``'messages'`` entry.
        tokenizer: object providing ``apply_chat_template`` and
            ``convert_tokens_to_ids``.
        max_length: fixed sequence length of every returned tensor.
        chat_template: Jinja template handed to ``apply_chat_template``.
            ``None`` (the default) uses the module-level ``potential_template``.
    """

    def __init__(self, dataset, tokenizer, max_length = 620, chat_template = None):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.chat_template = chat_template

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` (1-D tensors) for row ``idx``."""
        pair = self._first_exchange(self.dataset[idx]['messages'])
        if pair is None:
            return self._empty_item()

        # Resolved outside the try block so a missing template fails loudly
        # instead of silently turning every row into a placeholder.
        template = potential_template if self.chat_template is None else self.chat_template

        try:
            tokenized = self.tokenizer.apply_chat_template(
                pair,
                tokenize = True,
                max_length = self.max_length,
                padding = 'max_length',
                # With a single rendered sequence, over-long conversations make
                # the tokenizer raise ("Truncation error: Second sequence not
                # provided", see the notebook's observations); those rows are
                # dropped by the handler below rather than truncated.
                truncation = 'only_second',
                return_dict = True,
                return_assistant_tokens_mask = True,
                add_generation_prompt = False,
                chat_template = template,
                return_tensors = 'pt',
            )
        except Exception:
            # Deliberate best-effort: rows the tokenizer rejects become
            # placeholders. KeyboardInterrupt / SystemExit still propagate.
            return self._empty_item()

        input_ids = tokenized['input_ids'].reshape(-1)
        attention_mask = tokenized['attention_mask'].reshape(-1)
        # `as_tensor` accepts both a tensor and a nested list for the mask.
        label_mask = torch.as_tensor(tokenized['assistant_masks']).reshape(-1).clone()

        # The template leaves the closing <|im_end|> outside the generation
        # span, so add it to the labels by hand. Only do so when it really
        # closes the assistant turn (i.e. it comes after the last assistant
        # token), and tolerate sequences that contain no <|im_end|> at all.
        end_id = self.tokenizer.convert_tokens_to_ids("<|im_end|>")
        end_positions = torch.nonzero(input_ids == end_id).flatten()
        if end_positions.numel() > 0:
            last_end = end_positions[-1]
            assistant_positions = torch.nonzero(label_mask).flatten()
            if assistant_positions.numel() == 0 or last_end > assistant_positions[-1]:
                label_mask[last_end] = 1

        labels = input_ids.clone()
        labels[label_mask == 0] = -100

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels,
        }

    @staticmethod
    def _first_exchange(messages):
        """Return ``[first user message, next assistant message]`` or ``None`` if absent."""
        first_user = None
        for message in messages:
            if first_user is None:
                if message["role"] == "user":
                    first_user = message
            elif message["role"] == "assistant":
                return [first_user, message]
        return None

    def _empty_item(self):
        """Placeholder item: zero ids / attention and all-ignored labels.

        Uses ``torch.long`` so placeholders have the same dtype as the tensors
        built from ``input_ids`` for real items.
        """
        return {
            'input_ids': torch.zeros(self.max_length, dtype=torch.long),
            'attention_mask': torch.zeros(self.max_length, dtype=torch.long),
            'labels': torch.full((self.max_length,), -100, dtype=torch.long),
        }

In [16]:
# Load both splits: the cell that builds the SFT datasets reads
# `train_smoltalk_loaded` as well as `val_smoltalk_loaded`, so leaving the
# train load disabled raises NameError on a fresh kernel.
train_smoltalk_loaded = load_dataset("HuggingFaceTB/smol-smoltalk", split="train")
val_smoltalk_loaded = load_dataset("HuggingFaceTB/smol-smoltalk", split="test")

Generating train split:   0%|          | 0/460341 [00:00<?, ? examples/s]

DatasetGenerationError: An error occurred while generating the dataset

In [None]:
def _window_or_head(split, start, stop):
    """Return ``range(start, stop)`` when it fits inside ``split``.

    If the split is shorter than ``stop``, fall back to the first
    ``stop - start`` rows (capped at the split length) instead of letting
    ``select`` raise IndexError, and say so in the cell output.
    """
    if stop <= len(split):
        return range(start, stop)
    size = min(stop - start, len(split))
    print(f'Rows {start}-{stop} exceed split of size {len(split)}; using the first {size} rows instead.')
    return range(size)


# Training resumes from row 72000 of the train split.
train_smoltalk = SFTSmolTalkDataset(
    train_smoltalk_loaded.select(_window_or_head(train_smoltalk_loaded, 72000, 200000)),
    tokenizer = tokenizer)
# NOTE(review): the 200000-210000 window is applied to the "test" split;
# confirm that split is really that large, otherwise the fallback above
# (first 10000 rows) is what gets used for validation.
val_smoltalk = SFTSmolTalkDataset(
    val_smoltalk_loaded.select(_window_or_head(val_smoltalk_loaded, 200000, 210000)),
    tokenizer = tokenizer)


In [None]:
# Hyper-parameters for the SmolTalk fine-tuning run.
smoltalk_batch_size, smoltalk_num_epochs = 8, 1

# Both loaders iterate their dataset in order with the same batch size.
train_smoltalk_loader, val_smoltalk_loader = (
    DataLoader(split, batch_size=smoltalk_batch_size)
    for split in (train_smoltalk, val_smoltalk)
)

In [None]:
from torch.utils.tensorboard import SummaryWriter

def train(train_loader, optimizer, writer, epoch, accumulation_steps=64):
    """Run one training epoch with gradient accumulation.

    Uses the notebook-level ``model`` and ``device``.

    Args:
        train_loader: sized iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: optimizer stepped once per ``accumulation_steps``
            contributing batches.
        writer: object with ``add_scalar(tag, value, step)``; receives the
            running ``"Loss/train"`` average.
        epoch: epoch index, used for the progress label and the logging step.
        accumulation_steps: number of contributing batches per optimizer step.

    Returns:
        Mean un-scaled loss over the batches that were actually trained on
        (``0.0`` if every batch was skipped).

    Raises:
        AssertionError: if a batch produces a NaN loss.
    """
    # Hugging Face `from_pretrained` returns models in eval mode, so enable
    # training behaviour (e.g. dropout) explicitly.
    model.train()

    num_batches = len(train_loader)
    total_loss = 0.0
    processed = 0   # batches that contributed gradients
    pending = 0     # contributing batches since the last optimizer step
    elapsed = 0.0
    progress = tqdm(train_loader, desc=f"Training Epoch {epoch}", leave=True)

    optimizer.zero_grad()

    for i, batch in enumerate(progress):
        start = time.time()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Batches made only of placeholder items carry no supervised token.
        if torch.all(labels == -100):
            continue

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss / accumulation_steps  # normalize loss for accumulation
        batch_loss = outputs.loss.item()

        assert not math.isnan(batch_loss), f'Loss: {loss}, \nOutputs: {outputs}'

        loss.backward()
        pending += 1

        # Step on the number of batches that really accumulated gradients, not
        # on the loader index: a skipped batch must not swallow a step.
        if pending == accumulation_steps or (i + 1) == num_batches:
            optimizer.step()
            optimizer.zero_grad()
            pending = 0

        processed += 1
        total_loss += batch_loss
        avg_loss = total_loss / processed

        if i % 50 == 0 and i != 0:
            writer.add_scalar("Loss/train", avg_loss, epoch * num_batches + i)

        elapsed += time.time() - start
        eta = (elapsed / processed) * (num_batches - (i + 1))
        eta_hr, remainder = divmod(int(eta), 3600)
        eta_min, eta_sec = divmod(remainder, 60)

        progress.set_postfix(loss=[batch_loss, avg_loss], eta=f"{eta_hr}h {eta_min}m {eta_sec}s")

        if i % 1000 == 0 and i != 0:
            torch.cuda.empty_cache()
            model.save_pretrained('./latest_model')
            # NOTE(review): this pickles the whole optimizer object, including
            # its own references to the parameter tensors. Kept because the
            # resume cell reads this file format.
            with open('latest_opt.pkl', 'wb') as f:
                pickle.dump(optimizer, f)

    # Apply gradients left over when the final batch(es) were skipped.
    if pending:
        optimizer.step()
        optimizer.zero_grad()

    return total_loss / processed if processed else 0.0


def test(test_loader, writer, epoch):
    total_loss = 0
    batch_times = []
    progress = tqdm(test_loader, desc=f"Testing Epoch {epoch}", leave=True)

    for i, batch in enumerate(progress):
        start = time.time()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        if torch.all(labels == -100):
            continue

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        loss = outputs.loss
        total_loss += loss.item()
        avg_loss = total_loss / (i + 1)

        assert not math.isnan(loss.item()), f'Loss: {loss}, \nOutputs: {outputs}'

        batch_time = time.time() - start
        batch_times.append(batch_time)
        avg_time = sum(batch_times) / len(batch_times)
        eta = avg_time * (len(test_loader) - (i + 1))
        eta_hr, remainder = divmod(int(eta), 3600)
        eta_min, eta_sec = divmod(remainder, 60)


        progress.set_postfix(loss=[loss.item(), avg_loss], eta=f"{eta_hr}h {eta_min}m {eta_sec}s")

        if i % 1000 == 0 and i != 0:
            torch.cuda.empty_cache()

    avg_loss = total_loss / len(test_loader)
    writer.add_scalar("Loss/val", avg_loss, epoch)
    return avg_loss

def fine_tune(train_loader, test_loader, optimizer, num_epochs, accumulation_steps=8):
    """Train and validate the notebook-level ``model`` for ``num_epochs`` epochs.

    After each training epoch the model is saved to ``'./smoltalk'``, then
    validated, and the two losses are printed.

    Args:
        train_loader: batches passed to ``train``.
        test_loader: batches passed to ``test``.
        optimizer: optimizer passed to ``train``.
        num_epochs: number of train/validate rounds.
        accumulation_steps: gradient-accumulation factor forwarded to
            ``train`` (defaults to 8, the value previously hard-coded here).
    """
    writer = SummaryWriter()
    try:
        for epoch in range(num_epochs):
            train_loss = train(train_loader, optimizer, writer, epoch,
                               accumulation_steps=accumulation_steps)
            model.save_pretrained('./smoltalk')
            val_loss = test(test_loader, writer, epoch)
            print(f'Epoch: {epoch}. Train Loss: {train_loss}. Val Loss: {val_loss}.')
    finally:
        # Close the writer even when an epoch aborts (e.g. the NaN assertion
        # in the training loop) so it is not left open.
        writer.close()


In [None]:
# Load the checkpointed model first so the optimizer can be bound to its
# parameters.
model = AutoModelForCausalLM.from_pretrained('./latest_model')

# Keep the model config in sync with the special tokens registered on the
# tokenizer.
model.config.pad_token_id = tokenizer.pad_token_id
model.config.bos_token_id = tokenizer.bos_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.to(device)

# An unpickled optimizer holds its own deserialized copies of the parameter
# tensors, not the parameters of the `model` loaded above, so stepping it
# would never update this model. Build the optimizer over the model's
# trainable parameters and restore only the saved state (moment estimates,
# step counts, param-group hyper-parameters) into it.
optim = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-5)

# SECURITY: pickle.load can execute arbitrary code; only load checkpoint files
# written by this notebook.
with open('latest_opt.pkl', 'rb') as f:
    saved_optim = pickle.load(f)

# Raises ValueError if the saved parameter groups do not match the model's
# trainable parameters, rather than silently training with a mismatched state.
optim.load_state_dict(saved_optim.state_dict())
del saved_optim

print(device)

cuda


In [None]:

# Free-form note kept as a string: explains why over-long conversations end up
# as placeholder items (the tokenizer raises on them and the dataset catches
# the exception).
observations = """
"Exception: Truncation error: Second sequence not provided" happens during right side truncation 
if the user input is longer than max length in which case, the messages is no longer a message pair
"""

# Launch fine-tuning with the loaders, optimizer and epoch count defined in
# the cells above.
fine_tune(
    train_loader = train_smoltalk_loader,
    test_loader = val_smoltalk_loader,
    optimizer = optim,
    num_epochs = smoltalk_num_epochs
)