In [None]:
%pip install --upgrade peft bitsandbytes

import torch
import torch.nn as nn
import torch.nn.functional as F

import transformers
from tqdm.auto import tqdm, trange
# Fail fast on runtimes without a GPU; everything below assumes CUDA.
assert torch.cuda.is_available(), "you need cuda for this part"
# NOTE(review): the 'cpu' branch is unreachable while the assert above is active.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hub id of the checkpoint used for both the tokenizer and the model.
model_name = 'unsloth/Qwen3-8B-Base-bnb-4bit'

# NOTE(review): `device_map` is a model-loading argument; presumably it has no effect on a tokenizer — confirm.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, device_map=device)
# Padding reuses the EOS id, so "token == pad id" cannot distinguish padding from a real EOS token.
tokenizer.pad_token_id = tokenizer.eos_token_id

def load_model():
    """Load the causal LM named by ``model_name`` with all of its weights frozen.

    Returns:
        The loaded model with every parameter's ``requires_grad`` set to False,
        gradient checkpointing enabled and input gradients enabled, so that
        modules attached later can be trained through the frozen network.
    """
    load_kwargs = dict(
        device_map='auto',
        low_cpu_mem_usage=True,
        offload_state_dict=True,
        torch_dtype=torch.float32,  # weights are 4-bit; layernorms and activations are fp32
    )
    frozen_model = transformers.AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

    # Freeze every pretrained weight; only parameters added afterwards will be trainable.
    for weight in frozen_model.parameters():
        weight.requires_grad = False

    # Only store a small subset of activations and re-compute the rest during backward.
    frozen_model.gradient_checkpointing_enable()
    # Override an implementation quirk in gradient checkpointing that disables
    # backprop unless the inputs require grad.
    frozen_model.enable_input_require_grads()
    return frozen_model

# Instantiate the frozen base model; later cells evaluate it and attach trainable prompts to it.
model = load_model()



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`torch_dtype` is deprecated! Use `dtype` instead!


Loading weights:   0%|          | 0/399 [00:00<?, ?it/s]

In [None]:
def generate(model, prompt, max_length=100):
    """Continue ``prompt`` with ``model`` and return the decoded text.

    Args:
        model: object exposing ``generate(**inputs, max_new_tokens=...)``.
        prompt: text passed to the global ``tokenizer``.
        max_length: forwarded as ``max_new_tokens``, i.e. the number of newly
            generated tokens rather than the total sequence length.

    Returns:
        The first generated sequence decoded with special tokens skipped
        (the prompt is part of the returned text).
    """
    encoded = tokenizer(prompt, return_tensors='pt').to(device)
    generated = model.generate(**encoded, max_new_tokens=max_length)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

In [None]:
# Code-completion prompts: a Python signature followed by a Russian-language comment
# ("Returns the n-th Fibonacci number", "Binary search implementation", "Client handling").
# The text inside the triple-quoted strings is fed to the model verbatim.
prompts = [
    """def fibonacci(n):
    # Возвращает n-е число Фибоначчи
    """,

    """def binary_search(arr, target):
    # Реализация бинарного поиска
    """,

    """async def handle_client(reader, writer):
    # Обработка клиента
    """
]

# Qualitative check of the base model: print each prompt's continuation.
for i, prompt in enumerate(prompts):
    print(f"Prompt {i}:\n{generate(model, prompt)}\n\n")

Prompt 0:
def fibonacci(n):
    # Возвращает n-е число Фибоначчи
    
    if n == 0:
        return 0
    elif n == 1:
        return 1
    else:
        return fibonacci(n-1) + fibonacci(n-2)

def lucas(n):
    # Возвращает n-е число Люка
    
    if n == 0:
        return 2
    elif n == 1:
        return 1
    else:
        return lucas(n-1) + lucas(n-2)

def


Prompt 1:
def binary_search(arr, target):
    # Реализация бинарного поиска
    
    left, right = 0, len(arr) - 1
    
    while left <= right:
        mid = (left + right) // 2
        
        if arr[mid] == target:
            return mid
        elif arr[mid] < target:
            left = mid + 1
        else:
            right = mid - 1
    
    return -1

def find_first_occurrence(arr, target):
    # Ищем индекс первого вхождения


Prompt 2:
async def handle_client(reader, writer):
    # Обработка клиента
     data = await reader.read(100)
     message = data.decode()
     addr = writer.get_extra_info('peername')
     pr

In [None]:
from datasets import load_dataset, Dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling

# Number of sequences per batch produced by the data loaders.
BATCH_SIZE = 4


def preprocess(example):
    """Tokenize the ``content`` field, truncated to 64 tokens and left unpadded.

    Args:
        example: mapping with a ``'content'`` entry holding the text to tokenize.

    Returns:
        Whatever the global ``tokenizer`` returns for that text.
    """
    tokenizer_options = {
        'truncation': True,
        'max_length': 64,
        'padding': False,  # padding is left to the collate function
    }
    return tokenizer(example['content'], **tokenizer_options)


def load_data(preprocess=preprocess, split='train', train_samples=300, val_samples=50):
    total = train_samples + val_samples

    stream_ds = load_dataset(
        'codeparrot/codeparrot-clean',
        split='train',
        streaming=True
    ).select_columns(['content'])

    stream_ds = stream_ds.take(total)

    data = list(stream_ds)
    ds = Dataset.from_list(data)
    ds = ds.map(preprocess, batched=True, remove_columns=['content'])

    match split:
        case 'train':
            ds = ds.shuffle(seed=42)
            ds = ds.take(train_samples)
        case 'validation':
            ds = ds.skip(train_samples).take(val_samples)


    ds = ds.with_format('torch')

    collator = DataCollatorForLanguageModeling(
        tokenizer,
        mlm=False,
        return_tensors='pt'
    )

    dataloader = DataLoader(
        ds,
        batch_size=BATCH_SIZE,
        collate_fn=collator,
        pin_memory=True,
    )

    return dataloader

In [None]:
def model_perplexity(model):
    """Return the mean per-sequence perplexity of ``model`` on the validation split.

    For each sequence the token-level cross-entropy is averaged over the valid
    next-token targets and exponentiated; the result is the arithmetic mean of
    those per-sequence perplexities. The model is switched to eval mode.

    Args:
        model: causal LM called with ``input_ids`` and ``attention_mask`` whose
            output exposes ``.logits``.

    Returns:
        The mean perplexity as a Python float.

    Raises:
        Exception: if no sequence contributed a valid target.
    """
    dataloader = load_data(split='validation')
    perplexities = []
    model.eval()

    with torch.inference_mode():
        for batch in dataloader:
            # Only the inputs are forwarded: passing the collator's labels would
            # make the model compute a loss that is never used here.
            input_ids = batch['input_ids'].to(device)
            # assumes the collated batch carries an `attention_mask` — TODO confirm
            attention_mask = batch['attention_mask'].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits[:, :-1, :]
            targets = input_ids[:, 1:]

            token_losses = F.cross_entropy(
                logits.flatten(0, 1),
                targets.flatten(),
                reduction='none'
            ).view(targets.shape)

            # The pad id equals the EOS id, so comparing targets with the pad id
            # would also drop genuine EOS tokens. Use the attention mask instead,
            # and require both the target and the position predicting it to be
            # real tokens (this matters when padding sits on the left).
            mask = (attention_mask[:, 1:] * attention_mask[:, :-1]).to(token_losses.dtype)
            mask_sum = mask.sum(dim=1)
            valid = mask_sum > 0

            seq_losses = (token_losses * mask).sum(dim=1)[valid] / mask_sum[valid]
            seq_perplexities = torch.exp(seq_losses)
            perplexities.extend(seq_perplexities.tolist())

    if not perplexities:
        raise Exception('Perplexities list is empty')

    return sum(perplexities) / len(perplexities)

### Base Perplexity

In [None]:
# Reference point: validation perplexity of the untouched base model.
print(f'Base perplexity: {model_perplexity(model)}')



Resolving data files:   0%|          | 0/54 [00:00<?, ?it/s]

Map:   0%|          | 0/350 [00:00<?, ? examples/s]

Base perplexity: 4.921199011802673


In [None]:
def train(model, dataloader, optimizer, accumulation_steps=4, num_epochs=5):
    """Train ``model`` with gradient accumulation and print the mean loss of each epoch.

    Args:
        model: called as ``model(**batch, use_cache=False)``; the output must
            expose ``.loss``, so every batch has to contain labels.
        dataloader: iterable of dicts of tensors, iterated once per epoch.
        optimizer: optimizer holding the trainable parameters.
        accumulation_steps: number of micro-batches accumulated per optimizer step.
        num_epochs: number of passes over ``dataloader``.

    Raises:
        ValueError: if ``accumulation_steps`` is smaller than 1 or the
            dataloader yields no batches.
    """
    if accumulation_steps < 1:
        raise ValueError(f'accumulation_steps must be >= 1, got {accumulation_steps}')

    model.train()
    # Drop gradients left over from an earlier (possibly interrupted) run so
    # they are not applied by the first optimizer step.
    optimizer.zero_grad()

    for _ in trange(num_epochs):
        epoch_loss = 0.0
        num_batches = 0
        for i, batch in enumerate(dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            output = model(**batch, use_cache=False)

            # Scale so that the accumulated gradient is the mean over the group.
            loss = output.loss / accumulation_steps
            epoch_loss += output.loss.item()
            num_batches += 1
            loss.backward()

            if (i + 1) % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

        if num_batches == 0:
            raise ValueError('dataloader yielded no batches')

        # Flush the trailing, incomplete accumulation group.
        # NOTE: its gradient is still scaled by 1 / accumulation_steps.
        if num_batches % accumulation_steps != 0:
            optimizer.step()
            optimizer.zero_grad()

        print(f'Loss: {epoch_loss / num_batches}')

# Prompt Tuning

In [None]:
# Number of placeholder positions reserved at the start of every sequence.
NUM_PROMPTS = 8


def preprocess_prompt_tuning(example):
    """Tokenize ``content`` to 64 padded tokens and prepend ``NUM_PROMPTS`` pad-id placeholders.

    Args:
        example: mapping with a ``'content'`` entry holding a batch of texts.

    Returns:
        The tokenizer output with ``input_ids`` and ``attention_mask`` extended
        by ``NUM_PROMPTS`` leading positions (pad id / attended), plus a
        ``labels`` copy of ``input_ids`` whose placeholder positions hold the pad id.
    """
    encoded = tokenizer(
        example['content'],
        truncation=True,
        max_length=64,
        padding='max_length',
        return_tensors='pt'
    )

    num_rows = encoded['input_ids'].size(0)
    pad_id = tokenizer.pad_token_id

    # Placeholder ids occupying the slots reserved for the prompt positions.
    placeholder_ids = torch.full((num_rows, NUM_PROMPTS), pad_id)
    placeholder_attention = torch.ones_like(placeholder_ids)

    encoded['input_ids'] = torch.cat([placeholder_ids, encoded['input_ids']], dim=1)
    encoded['attention_mask'] = torch.cat([placeholder_attention, encoded['attention_mask']], dim=1)

    label_ids = encoded['input_ids'].clone()
    label_ids[:, :NUM_PROMPTS] = pad_id
    encoded['labels'] = label_ids

    return encoded

In [None]:
def prompt_model_perplexity(model):
    """Return the mean per-sequence validation perplexity of a prompt-tuned ``model``.

    The validation split is preprocessed with ``preprocess_prompt_tuning``, so
    every sequence starts with ``NUM_PROMPTS`` placeholder positions. Those
    positions are excluded from scoring: only targets from position
    ``NUM_PROMPTS + 1`` onwards are evaluated. The model is switched to eval mode.

    Args:
        model: causal LM called with ``input_ids`` and ``attention_mask`` whose
            output exposes ``.logits``.

    Returns:
        The arithmetic mean of the per-sequence perplexities as a Python float.

    Raises:
        Exception: if no sequence contributed a valid target.
    """
    dataloader = load_data(preprocess_prompt_tuning, split='validation')
    perplexities = []
    model.eval()

    with torch.inference_mode():
        for batch in dataloader:
            # Only the inputs are forwarded: passing labels would make the model
            # compute a loss that is never used here.
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits[:, NUM_PROMPTS:-1, :]
            targets = input_ids[:, NUM_PROMPTS + 1:]

            token_losses = F.cross_entropy(
                logits.flatten(0, 1),
                targets.flatten(),
                reduction='none'
            ).view(targets.shape)

            # The pad id equals the EOS id, so comparing targets with the pad id
            # would also drop genuine EOS tokens. Use the attention mask instead,
            # and require both the target and the position predicting it to be
            # real tokens (this matters when padding sits on the left).
            text_attention = attention_mask[:, NUM_PROMPTS:]
            mask = (text_attention[:, 1:] * text_attention[:, :-1]).to(token_losses.dtype)
            mask_sum = mask.sum(dim=1)
            valid = mask_sum > 0

            seq_losses = (token_losses * mask).sum(dim=1)[valid] / mask_sum[valid]
            seq_perplexities = torch.exp(seq_losses)
            perplexities.extend(seq_perplexities.tolist())

    if not perplexities:
        raise Exception('Perplexities list is empty')

    return sum(perplexities) / len(perplexities)

In [None]:
class WordEmbeddingsWithLearnedPrompts(nn.Module):
    def __init__(self, word_embeddings: nn.Embedding, num_prompts: int):
        super().__init__()
        self.original_word_embeddings = word_embeddings
        self.num_prompts = num_prompts
        self.learnable_prompts = nn.Parameter(
            torch.randn(1, num_prompts, word_embeddings.embedding_dim), requires_grad=True)

    def forward(self, input_ids: torch.LongTensor):
        assert input_ids.dtype == torch.int64
        assert input_ids.shape[1] > self.num_prompts
        assert torch.all(input_ids[:, :self.num_prompts] == tokenizer.pad_token_id).item(), "don't forget to prepend several BOS tokens to input_ids"

        batch_size = input_ids.size(0)
        original_embs = self.original_word_embeddings(input_ids[:, self.num_prompts:])

        prompts = self.learnable_prompts.expand(batch_size, -1, -1)
        prompted_embs = torch.cat([
            prompts,
            original_embs
        ], dim=1)

        return prompted_embs

In [None]:
# Guard against running this cell twice: the wrapper must be installed on the original nn.Embedding only.
assert isinstance(model.model.embed_tokens, nn.Embedding), "you have already replaced the embedding layer. If the replacement is broken, please reload the model"
# Swap the input embedding for the wrapper that substitutes learned prompts in the first NUM_PROMPTS positions.
model.model.embed_tokens = WordEmbeddingsWithLearnedPrompts(model.model.embed_tokens, num_prompts=NUM_PROMPTS).to(device)

In [None]:
# Training loader (load_data defaults to split='train') with the prompt placeholders prepended.
dataloader = load_data(preprocess_prompt_tuning)

Resolving data files:   0%|          | 0/54 [00:00<?, ?it/s]

Map:   0%|          | 0/350 [00:00<?, ? examples/s]

In [None]:
def train_prompt(model, dataloader, optimizer, accumulation_steps=4, num_epochs=5):
    """Train a prompt-tuned ``model`` with gradient accumulation, printing the mean loss per epoch.

    The loss is the next-token cross-entropy over the positions after the
    ``NUM_PROMPTS`` placeholder prefix; targets equal to the pad id are ignored.

    Args:
        model: causal LM whose output exposes ``.logits``.
        dataloader: iterable of dicts of tensors containing ``input_ids``,
            iterated once per epoch.
        optimizer: optimizer holding the trainable parameters.
        accumulation_steps: number of micro-batches accumulated per optimizer step.
        num_epochs: number of passes over ``dataloader``.

    Raises:
        ValueError: if ``accumulation_steps`` is smaller than 1 or the
            dataloader yields no batches.
    """
    if accumulation_steps < 1:
        raise ValueError(f'accumulation_steps must be >= 1, got {accumulation_steps}')

    model.train()
    # Drop gradients left over from an earlier (possibly interrupted) run so
    # they are not applied by the first optimizer step.
    optimizer.zero_grad()

    for _ in trange(num_epochs):
        epoch_loss = 0.0
        num_batches = 0
        for i, batch in enumerate(dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            # The loss is computed below from the logits, so labels are not
            # forwarded; otherwise the model would compute a second, unused loss.
            inputs = {k: v for k, v in batch.items() if k != 'labels'}
            output = model(**inputs, use_cache=False).logits[:, NUM_PROMPTS:-1]
            target = batch['input_ids'][:, NUM_PROMPTS + 1:]

            batch_loss = F.cross_entropy(
                output.flatten(0, 1),
                target.flatten(),
                ignore_index=tokenizer.pad_token_id
            )

            # Scale so that the accumulated gradient is the mean over the group.
            loss = batch_loss / accumulation_steps
            epoch_loss += batch_loss.item()
            num_batches += 1
            loss.backward()

            if (i + 1) % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

        if num_batches == 0:
            raise ValueError('dataloader yielded no batches')

        # Flush the trailing, incomplete accumulation group.
        # NOTE: its gradient is still scaled by 1 / accumulation_steps.
        if num_batches % accumulation_steps != 0:
            optimizer.step()
            optimizer.zero_grad()

        print(f'Loss: {epoch_loss / num_batches}')

In [None]:
# Sanity check: only the learned prompts (shape 1 x NUM_PROMPTS x embedding_dim) should require grad.
print(f'Number of trainable parameters : {sum(p.numel() for p in model.parameters() if p.requires_grad)}')

Number of trainable parameters : 32768


In [None]:
# Optimize the learned prompt tensor only; every other weight was frozen in load_model.
optimizer = torch.optim.Adam([model.model.embed_tokens.learnable_prompts], lr=0.01)

In [None]:
# Run prompt tuning with the default accumulation_steps=4 and num_epochs=5.
train_prompt(model, dataloader, optimizer)

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 1.2909084645907085
Loss: 1.2075682707627615
Loss: 1.1820543309052784
Loss: 1.1735879453023275
Loss: 1.1667124380668004


In [None]:
# Validation perplexity with the learned prompts in place; compare with the base perplexity printed earlier.
prompt_model_perplexity(model)

Resolving data files:   0%|          | 0/54 [00:00<?, ?it/s]

Map:   0%|          | 0/350 [00:00<?, ? examples/s]

4.56043128490448

# LoRA

In [None]:
# Reload a fresh frozen model so the LoRA experiment does not inherit the prompt-tuning embedding wrapper.
model = load_model()

In [None]:
class LoRALayer(nn.Module):
    """Wrap a linear layer and add a trainable low-rank update to its output.

    ``forward`` returns ``module(x) + (alpha / rank) * (x @ A) @ B`` where ``A``
    has shape (in_features, rank) and ``B`` has shape (rank, out_features).
    ``B`` starts at zero, so a freshly wrapped layer reproduces ``module`` exactly.
    """

    def __init__(self, module: nn.Linear, rank: int, alpha: int=None):
        """
        Args:
            module: the linear layer to wrap; it is called unchanged.
            rank: inner dimension of the low-rank update.
            alpha: scaling numerator; defaults to ``2 * rank`` when None.
        """
        super().__init__()
        self.module = module
        self.rank = rank
        # `alpha or 2 * rank` would silently replace an explicit alpha=0.
        self.alpha = 2 * rank if alpha is None else alpha

        # kaiming_uniform_ takes fan_in from dim 1 of a 2-D tensor. Initialise in
        # (rank, in_features) layout so fan_in is in_features, then store the
        # transpose; initialising the (in_features, rank) tensor directly would
        # use fan_in = rank and produce far larger values.
        lora_a = torch.empty(rank, module.in_features, device=module.weight.device)
        nn.init.kaiming_uniform_(lora_a, a=5**0.5)
        self.A = nn.Parameter(lora_a.T.contiguous())

        self.B = nn.Parameter(
            torch.zeros(rank, module.out_features, device=module.weight.device)
        )

    def forward(self, input):
        """Return the wrapped layer's output plus the scaled low-rank update."""
        output = self.module(input)
        additive = (input @ self.A) @ self.B
        return output + self.alpha / self.rank * additive

In [None]:
lora_rank = 8

# Snapshot the module list first so that replacing children does not happen
# while named_modules() is still walking the tree.
for name, module in list(model.model.layers.named_modules()):
    if 'DecoderLayer' in repr(type(module)):
        for proj_name in ('q_proj', 'k_proj', 'v_proj'):
            proj = getattr(module.self_attn, proj_name)
            # Re-running this cell must not stack a second adapter on a projection that already has one.
            if isinstance(proj, LoRALayer):
                continue
            setattr(module.self_attn, proj_name, LoRALayer(proj, rank=lora_rank).to(device))

assert sum(isinstance(module, LoRALayer) for module in model.modules()) > 0, "Did not add any LoRA layers!"

In [None]:
# Sanity check: only the LoRA A/B matrices added above should require grad.
print(f'Number of trainable parameters : {sum(p.numel() for p in model.parameters() if p.requires_grad)}')

Number of trainable parameters : 5308416


In [None]:
# Training loader with the default preprocessing (no prompt placeholders).
dataloader = load_data()
# Optimize every parameter that requires grad; after load_model's freeze these are the LoRA matrices.
optimizer = torch.optim.Adam([p for p in model.parameters() if p.requires_grad], lr=2e-4)

# Uses the loss returned by the model itself (labels come from the collator).
train(model, dataloader, optimizer)

Resolving data files:   0%|          | 0/54 [00:00<?, ?it/s]

Map:   0%|          | 0/350 [00:00<?, ? examples/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 1.2230355234940846
Loss: 0.9654959708452224
Loss: 0.7800433580080668
Loss: 0.6159920521577199
Loss: 0.5361343980828921


In [None]:
# Validation perplexity after LoRA fine-tuning; compare with the base perplexity printed earlier.
print(f'LoRA perplexity: {model_perplexity(model)}')

Resolving data files:   0%|          | 0/54 [00:00<?, ?it/s]

Map:   0%|          | 0/350 [00:00<?, ? examples/s]

LoRA perplexity: 2.567058148384094
