In [1]:
import torch
import re
from multiprocessing import Pool
import wandb
from transformers import AutoModelForCausalLM, AutoTokenizer
from omegaconf import OmegaConf
from torch.utils.data import DataLoader
from datasets import load_dataset
from torch.utils.data import Dataset
from tqdm import tqdm
from transformers import  AdamW, get_linear_schedule_with_warmup
from torch import optim
import transformers
import accelerate
import tensor_parallel as tp

[2023-06-30 21:42:40,311] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [4]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
base = 'pygmalion-6b-vicuna-chatml'
tokenizer = AutoTokenizer.from_pretrained(ft)
model = AutoModelForCausalLM.from_pretrained(ft, torch_dtype=torch.float16, low_cpu_mem_usage=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
tokenizer

GPT2TokenizerFast(name_or_path='gpt-j-onlyk_v2', vocab_size=50257, model_max_length=2048, is_fast=True, padding_side='left', truncation_side='left', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)

In [4]:
model.transformer.drop.p = 0.3

In [5]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [8]:
model.dtype

torch.float16

In [8]:
import json

# Открытие JSON-файла и чтение данных из него
with open('dialogs.json') as f:
    data = json.load(f)

In [9]:
tokenizer.model_max_length

2048

In [10]:
tokenizer.padding_side='right'

In [11]:
data.keys()

dict_keys(['topic', 'dialog', 'summary', "girl's persona", "boy's persona", 'role'])

In [18]:
class DialogDataset(Dataset):
    def __init__(self, tokenizer, data):
        self.tokenized = []
        dialog = ''
        data_len = len(data["topic"])
        for idx, sen in tqdm(enumerate(zip(data["topic"], data["girl's persona"], data["boy's persona"])), total=data_len):
            prompt = 'The first dialog participant who likes ' + sen[1][0] + ' talks with second participant who likes ' + sen[2][0] + ' having conversation about ' + sen[0][0] + ' <|endoftext|>'
            first_tensor = self._encode_test(text=dialog, tokenizer=tokenizer)['input_ids'][0]
            second_tensor = self._encode_test(text=prompt, tokenizer=tokenizer)['input_ids'][0]
            tens = torch.cat((first_tensor, second_tensor), dim=0)
            if len(tens) < 1028 - 300:
                dialog += prompt
            else:
                enc = self._encode(text=dialog, tokenizer=tokenizer)
                self.tokenized += [enc]
                dialog = prompt
            if idx >= 1929:
                break
            for rep in zip(data['role'][idx], data['dialog'][idx]):
                dialog += ' '
                try:
                    dialog += ': '.join(rep)
                    dialog += tokenizer.eos_token
                except:
                    try:
                        rep = rep[0]+ rep[1]['girl']
                        dialog += ': '.join(rep)
                    except:
                        try:
                            rep = rep[0]+ rep[1]['boy']
                            dialog += ': '.join(rep)
                        except:
                            rep = rep[0]+ rep[1]['message']
                            dialog += ': '.join(rep)

    def __len__(self):
        return len(self.tokenized)

    def __getitem__(self, item):
        return self.tokenized[item] 
    
    @staticmethod
    def _encode_test(text, tokenizer):
        encoded_sample = tokenizer(text, return_tensors='pt')

        return encoded_sample

    @staticmethod
    def _encode(text, tokenizer):
        encoded_sample = tokenizer(text, padding='max_length', max_length=1028, truncation=True, return_tensors='pt')

        return encoded_sample

In [19]:
dataset = DialogDataset(tokenizer=tokenizer, data=data)

100%|█████████▉| 1929/1931 [00:04<00:00, 435.38it/s]


In [20]:
train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

In [21]:
len(train_dataloader)

611

In [22]:
dataset.tokenized[10]

{'input_ids': tensor([[  464,   717, 17310,  ..., 50256, 50256, 50256]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0]])}

In [23]:
tokenizer.decode(dataset.tokenized[1]['input_ids'][0])

"The first dialog participant who likes Alina, 22 years old, from Estonia, likes music, video games, and hiking. She is a student. talks with second participant who likes Herman, 20 years old, from France, a programmer, and enjoys playing video games and hiking. He is looking for a girlfriend. having conversation about Hobbies <|endoftext|> boy: Hey there!<|endoftext|> girl: Hi, how are you?<|endoftext|> boy: I'm doing great! I'm practicing my guitar<|endoftext|> girl: Oh, that sounds cool! Can you send me a photo?<|endoftext|> boy: <photo> Photo of girl bot playing guitar </photo><|endoftext|> girl: You're really good! How long have you been playing?<|endoftext|> boy: Thanks! I've been playing for a few years now, it's really relaxing<|endoftext|> girl: I can imagine. I love playing video games in my free time<|endoftext|> boy: That's awesome! What kind of games do you like?<|endoftext|> girl: I'm really into strategy games and RPGs. I find them really challenging<|endoftext|> boy: I 

In [25]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=3, lora_alpha=32, lora_dropout=0.05, bias="none"
)
model = get_peft_model(model, lora_config)


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
For effortless bug reporting copy-paste your error into this form: https://docs.google.com/forms/d/e/1FAIpQLScPB8emS3Thkp66nvqwmjTEgxp8Y9ufuWTzFyr9kJ5AoI47dQ/viewform?usp=sf_link
CUDA SETUP: Loading binary /home/alexw/.local/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so...


  warn(
  warn(
  warn(


In [26]:
print_trainable_parameters(model)

trainable params: 1376256 || all params: 6052259040 || trainable%: 0.022739542225542284


In [28]:
wandb.init(project='gpt-j',name='chatbot')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [29]:
import torch.nn as nn
class EMA(nn.Module):
    def __init__(self, decay):
        super(EMA, self).__init__()
        self.decay = decay
        self.shadow_params = {}

    def forward(self, model):
        for name, param in model.named_parameters():
            if param.requires_grad:
                if name not in self.shadow_params:
                    self.shadow_params[name] = param.data.clone()
                else:
                    self.shadow_params[name] -= (1 - self.decay) * (self.shadow_params[name] - param.data)
                param.data = self.shadow_params[name]
                
ema = EMA(decay=0.992)

In [30]:
from transformers import  AdamW, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from tqdm import tqdm
lr = 2e-5

optimizer = AdamW(model.parameters(), lr=lr, betas=(0.9, 0.999))
total_steps =  len(train_dataloader)
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=lr, total_steps=total_steps, div_factor=25, pct_start=0.2)



In [32]:
len(train_dataloader)

611

In [33]:
from transformers import  AdamW, get_linear_schedule_with_warmup
import torch_optimizer
import gc

ema = EMA(decay=0.992)
optimizer = AdamW(model.parameters(), lr=1e-3, betas=(0.95, 0.99), weight_decay=0.1)
scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=100, num_training_steps=len(train_dataloader)
            )
model.to(device)
model.train()


PeftModel(
  (base_model): LoraModel(
    (model): GPTJForCausalLM(
      (transformer): GPTJModel(
        (wte): Embedding(50400, 4096)
        (drop): Dropout(p=0.3, inplace=False)
        (h): ModuleList(
          (0-27): 28 x GPTJBlock(
            (ln_1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
            (attn): GPTJAttention(
              (attn_dropout): Dropout(p=0.0, inplace=False)
              (resid_dropout): Dropout(p=0.0, inplace=False)
              (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
              (v_proj): Linear(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=3, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=3, o

In [34]:
for epoch in range(1):
    for batch in tqdm(train_dataloader):
    
        input_ids = batch['input_ids'].to(model.device)
        attention_mask = batch['attention_mask'].to(model.device)
        
        # Переносим тензоры на устройство (GPU)
        labels = input_ids

    
        optimizer.zero_grad()
        loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)[0]
        wandb.log({"loss":  loss})

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        loss.backward()
        
        optimizer.step()
        ema(model)
        scheduler.step()
        del loss, input_ids
        torch.cuda.empty_cache()
        gc.collect()
model.eval()
del optimizer
torch.cuda.empty_cache()

100%|██████████| 611/611 [09:55<00:00,  1.03it/s]


In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [22]:
model = model.merge_and_unload()
model.push_to_hub('zjkarina/ChatGPTJ_6B')
tokenizer.push_to_hub('zjkarina/ChatGPTJ_6B')

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/2.17G [00:00<?, ?B/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/zjkarina/ChatGPTJ_6B/commit/bf4bcd9b11efd9703d379dcc86bdf9ce58d97b20', commit_message='Upload tokenizer', commit_description='', oid='bf4bcd9b11efd9703d379dcc86bdf9ce58d97b20', pr_url=None, pr_revision=None, pr_num=None)