In [3]:
from datasets import load_dataset
import lightning as L
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [4]:
class OTPForDocstring(L.LightningModule):
    def __init__(self, model_name: str, learning_rate=2e-5):
        super().__init__()
        self.learning_rate = learning_rate
        self.opt_model = AutoModelForCausalLM.from_pretrained(model_name, device_map='auto')
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, add_eos_token=True)
        
        for param in self.opt_model.parameters():
            param.requires_grad = False
            if param.ndim == 1:
                param.data = param.data.to(torch.float32)
        self.opt_model.gradient_checkpointing_enable()
        self.opt_model.enable_input_require_grads()
        
        class CastOutputToFloat(torch.nn.Sequential):
            def forward(self, x): return super().forward(x).to(torch.float32)
        self.opt_model.lm_head = CastOutputToFloat(self.opt_model.lm_head)

        self.lora_config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=["q_proj", "v_proj"],
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM"
        )

        self.peft_model = get_peft_model(self.opt_model, self.lora_config)
    
    def training_step(self, batch, batch_idx):
        outputs = self.peft_model(
            input_ids=batch['input_ids'], 
            attention_mask=batch['attention_mask'], 
            labels=batch['labels'],
        )
        loss = outputs.loss
        
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        outputs = self.peft_model(
            input_ids=batch['input_ids'], 
            attention_mask=batch['attention_mask'], 
            labels=batch['labels'],
        )
        loss = outputs.loss

        self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss
    
    def test_step(self, batch, batch_idx):
        outputs = self.peft_model(
            input_ids=batch['input_ids'], 
            attention_mask=batch['attention_mask'], 
            labels=batch['labels'],
        )
        loss = outputs.loss

        self.log('test_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss
    
    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
        return optimizer
    
model = OTPForDocstring(model_name="facebook/opt-125m")
tokenizer = model.tokenizer

In [5]:
from lightning.pytorch.callbacks import LearningRateFinder


class FineTuneLearningRateFinder(LearningRateFinder):
    def __init__(self, milestones, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.milestones = milestones

    def on_fit_start(self, *args, **kwargs):
        return

    def on_train_epoch_start(self, trainer, pl_module):
        if trainer.current_epoch in self.milestones or trainer.current_epoch == 0:
            self.lr_find(trainer, pl_module)

# trainer = L.Trainer(max_epochs=10, devices=1, accelerator='gpu', callbacks=[FineTuneLearningRateFinder(milestones=(5, 10))])
trainer = L.Trainer(max_epochs=10, devices=1, accelerator='gpu')

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [6]:
class CodeDataset(Dataset):
    
    #DATASET_NAME = 'calum/the-stack-smol-python-docstrings'
    DATASET_NAME = '/home/valvarl/docstring-generator/the-stack-small-python-docstrings'
    INSTRUCTION = '# code\n```Python\n%s\n```\n# docstring\n%s'
    
    def __init__(self, tokenizer, max_length=2048):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.tokenized_prompts = []
        
        ds = load_dataset(self.DATASET_NAME, split='train')
        for body, docstring in zip(ds['body_without_docstring'], ds['docstring']):
            prompt = self.INSTRUCTION % (body, docstring)
            tokenized_prompt = self.tokenizer(prompt)
            if len(tokenized_prompt) < self.max_length:
                tokenized_prompt['input_ids'].append(self.tokenizer.eos_token_id)
                tokenized_prompt['attention_mask'].append(1)
                self.tokenized_prompts.append(tokenized_prompt)
        
        self.tokenized_prompts.sort(key=lambda x: len(x['input_ids']))
    
    def __len__(self):
        return len(self.tokenized_prompts)
    
    def __getitem__(self, idx: int):
        return self.tokenized_prompts[idx]

    
train_data = CodeDataset(model.tokenizer, max_length=model.opt_model.config.max_position_embeddings)
    
train_size = int(0.8 * len(train_data))
test_size = len(train_data) - train_size
train_dataset, test_data = torch.utils.data.random_split(train_data, [train_size, test_size])

val_size = int(0.5 * len(test_data))
test_size = len(test_data) - val_size
val_dataset, test_dataset = torch.utils.data.random_split(test_data, [val_size, test_size])

In [9]:
def collate_fn(batch):
    max_len = max(len(tokenized_prompt['input_ids']) for tokenized_prompt in batch)
    padded_input_ids = []
    padded_attention_mask = []
    
    for tokenized_prompt in batch:
        input_ids = tokenized_prompt['input_ids']
        attention_mask = tokenized_prompt['attention_mask']
        
        padding_length = max_len - len(input_ids)
        padded_input_ids.append(input_ids + [tokenizer.pad_token_id] * padding_length)
        padded_attention_mask.append(attention_mask + [0] * padding_length)
    
    return {
        'input_ids': torch.tensor(padded_input_ids).cuda(),
        'attention_mask': torch.tensor(padded_attention_mask).cuda(),
        'labels': torch.tensor(padded_input_ids).cuda()
    }

train_loader = DataLoader(train_dataset, batch_size=2, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=2, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=2, collate_fn=collate_fn)

In [10]:
trainer.fit(model, train_loader, val_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type                 | Params
----------------------------------------------------
0 | opt_model  | OPTForCausalLM       | 125 M 
1 | peft_model | PeftModelForCausalLM | 125 M 
----------------------------------------------------
589 K     Trainable params
125 M     Non-trainable params
125 M     Total params
503.316   Total estimated model params size (MB)


Sanity Checking: |                                                                                | 0/? [00:00…

/home/valvarl/anaconda3/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


Training: |                                                                                       | 0/? [00:00…

../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [39,0,0], thread: [96,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [39,0,0], thread: [97,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [39,0,0], thread: [98,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [39,0,0], thread: [99,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [39,0,0], thread: [100,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [39,0,0], thread: [101,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [39,0,0],

RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [34]:
trainer.test(model, test_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/valvarl/anaconda3/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


Testing: |                                                                                        | 0/? [00:00…

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     test_loss_epoch       8.225456440413836e-07
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss_epoch': 8.225456440413836e-07}]

In [42]:
print(tokenizer.decode(test_dataset[0]['input_ids']))

</s># code
```Python
def _check_u_and_t_for_simulation(m, dt, u, t, isdiscrete):
    '\n    \n    '
    if (t is None):
        if (not isdiscrete):
            raise ValueError('Continuous time models need an evenly spaced time sequence from which the sampling period will be obtained.')
        else:
            u_samples = len(u)
            t = np.linspace(0, ((u_samples - 1) * dt), num=u_samples)
    else:
        t = np.asarray(t, dtype=float).squeeze()
        if (t.ndim!= 1):
            raise ValueError('Time array needs to be a 1D array.')
        t_diff = np.diff(t)
        if ((not np.allclose(t_diff, t_diff[0])) or (not (t_diff[0] > 0.0))):
            raise ValueError('Time array should be equally spaced and increasing.')
        if (isdiscrete and (not np.isclose(dt, t_diff[0]))):
            raise ValueError('Time array increment {} is not equal to the model sampling period {}.'.format(t_diff[0], dt))
    if (u.size < 1):
        raise ValueError('The input array should 

In [36]:
inputs = tokenizer('''# code
```Python
def load_excel(path):
    return pd.read_excel(path)
```
# docstring
''', return_tensors='pt')

doc_max_length = 128

generated_ids = model.peft_model.generate(
    **inputs,
    max_length=inputs.input_ids.shape[1] + doc_max_length,
    do_sample=False,
    return_dict_in_generate=True,
    num_return_sequences=1,
    output_scores=True,
    pad_token_id=50256,
    eos_token_id=50256  # <|endoftext|>
)

ret = tokenizer.decode(generated_ids.sequences[0], skip_special_tokens=False)
print(ret)

</s># code
```Python
def load_excel(path):
    return pd.read_excel(path)
```
# docstring
%s
%s</s># code

# docstring

# docstring

%s

%s

%s

%s

%s

%s

%s

%s

%s

%s

%s

%s

%s

%s

%s

%s

%s

%s

%s

%s

%s

%s

%s

%s

%s

%s

%s




In [43]:
%load_ext tensorboard 
%tensorboard --logdir lightning_logs