In [1]:
import os
import torch

import numpy as np
import pandas as pd

from transformers import (
    AutoModelWithLMHead, 
    AutoTokenizer, 
    Trainer,  
    TrainingArguments,
    DataCollatorForSeq2Seq,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model
from huggingface_hub import login
from tqdm import tqdm

'NoneType' object has no attribute 'cadam32bit_grad_fp32'


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [2]:
login()
np.random.seed(0)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid.
Your token has been saved to /Users/ivanevgenyevich/.cache/huggingface/token
Login successful


In [3]:
tokenizer = AutoTokenizer.from_pretrained('tinkoff-ai/ruDialoGPT-medium')
model = AutoModelWithLMHead.from_pretrained('tinkoff-ai/ruDialoGPT-medium')

def generate(prompt):
    data = tokenizer(prompt, return_tensors='pt')
    output_ids = model.generate(
        **data,
        top_k=50,
        top_p=0.95,
        num_beams=3,
        do_sample=True,
        no_repeat_ngram_size=16,
        temperature=1.2,
        repetition_penalty=1.2,
        length_penalty=1.0,
        eos_token_id=50257,
        max_new_tokens=40
    )[0]
    output_ids = output_ids[len(data["input_ids"][0]):]
    output = tokenizer.decode(output_ids, skip_special_tokens=True).strip()
    output = output[:output.find('@@')]
    return output

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
lora_config = LoraConfig(
    task_type='CAUSAL_LM', 
    r=64, 
    lora_alpha=16,
    lora_dropout=0.05,
    bias='none',
    target_modules=['c_attn', 'c_proj']
)

model = get_peft_model(model, lora_config)



In [5]:
def make_dialog(dialog):
    flag = 0
    final_dialog = ''
    for message in dialog:
        final_dialog += '@@ВТОРОЙ@@ 'if flag else '@@ПЕРВЫЙ@@ '
        flag = 1 - flag
        final_dialog += message + ' '
    return final_dialog    

def make_dataset(path):
    dataset = pd.read_csv(path)
    dialogs = []
    len_dataset = dataset.shape[0]
    for i in range(len_dataset):
        if dataset.loc[i].isna()['context_1']:
            if i:
                dialogs.append(make_dialog(dialog))
            dialog = []
        if not dataset.loc[i].isna()['response']:
            dialog.append(dataset.loc[i, 'response'])
    return dialogs

In [7]:
def convert(dataset):
    converter_dataset = []
    for i in tqdm(dataset):
        input_ids = tokenizer(dataset)['input_ids'][0]
        converter_dataset.append({'input_ids': input_ids, 'labels': input_ids})
    return converter_dataset

dataset = make_dataset('data/mmro.csv') + make_dataset('data/prac.csv') + make_dataset('data/bayes.csv') + make_dataset('data/nlp.csv')[:300]
np.random.shuffle(dataset)
train_dataset = convert(dataset[:int(0.8 * len(dataset))])
eval_dataset = convert(dataset[int(0.8 * len(dataset)):])

100%|██████████| 681/681 [00:25<00:00, 26.86it/s]
100%|██████████| 171/171 [00:01<00:00, 109.21it/s]


In [8]:
training_arguments = TrainingArguments(
    num_train_epochs=1,
    output_dir='./tg_bot',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=1e-4,
    lr_scheduler_type='cosine',
    warmup_steps=10,
    weight_decay=0.01,
    report_to='wandb',
    evaluation_strategy='steps',
    eval_steps=10,
    save_steps=10, 
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss'
)

training_arguments.set_dataloader(pin_memory=False)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
trainer = Trainer(model=model, args=training_arguments, 
    train_dataset=train_dataset, eval_dataset=eval_dataset, data_collator=data_collator)


In [9]:
trainer.train()
model.save_pretrained(os.path.join('./tg_bot', 'final_checkpoint'))

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mkadchenko-ivan[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/86 [00:00<?, ?it/s]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 5.1235, 'learning_rate': 0.0001, 'epoch': 0.12}


  0%|          | 0/22 [00:00<?, ?it/s]

{'eval_loss': 4.9384446144104, 'eval_runtime': 11.6274, 'eval_samples_per_second': 14.707, 'eval_steps_per_second': 1.892, 'epoch': 0.12}
{'loss': 4.8052, 'learning_rate': 9.578866633275288e-05, 'epoch': 0.23}


  0%|          | 0/22 [00:00<?, ?it/s]

{'eval_loss': 4.84628438949585, 'eval_runtime': 12.223, 'eval_samples_per_second': 13.99, 'eval_steps_per_second': 1.8, 'epoch': 0.23}
{'loss': 4.1023, 'learning_rate': 8.386407858128706e-05, 'epoch': 0.35}


  0%|          | 0/22 [00:00<?, ?it/s]

{'eval_loss': 4.663123607635498, 'eval_runtime': 12.5788, 'eval_samples_per_second': 13.594, 'eval_steps_per_second': 1.749, 'epoch': 0.35}
{'loss': 3.192, 'learning_rate': 6.623497346023418e-05, 'epoch': 0.47}


  0%|          | 0/22 [00:00<?, ?it/s]

{'eval_loss': 4.454667568206787, 'eval_runtime': 13.0068, 'eval_samples_per_second': 13.147, 'eval_steps_per_second': 1.691, 'epoch': 0.47}
{'loss': 2.4781, 'learning_rate': 4.5871032726383386e-05, 'epoch': 0.58}


  0%|          | 0/22 [00:00<?, ?it/s]

{'eval_loss': 4.317712783813477, 'eval_runtime': 12.7423, 'eval_samples_per_second': 13.42, 'eval_steps_per_second': 1.727, 'epoch': 0.58}
{'loss': 2.0172, 'learning_rate': 2.6202630348146324e-05, 'epoch': 0.7}


  0%|          | 0/22 [00:00<?, ?it/s]

{'eval_loss': 4.245809078216553, 'eval_runtime': 12.3928, 'eval_samples_per_second': 13.798, 'eval_steps_per_second': 1.775, 'epoch': 0.7}
{'loss': 1.7127, 'learning_rate': 1.0542974530180327e-05, 'epoch': 0.81}


  0%|          | 0/22 [00:00<?, ?it/s]

{'eval_loss': 4.214721202850342, 'eval_runtime': 13.0846, 'eval_samples_per_second': 13.069, 'eval_steps_per_second': 1.681, 'epoch': 0.81}
{'loss': 1.5714, 'learning_rate': 1.5299867030334814e-06, 'epoch': 0.93}


  0%|          | 0/22 [00:00<?, ?it/s]

{'eval_loss': 4.207455158233643, 'eval_runtime': 13.016, 'eval_samples_per_second': 13.138, 'eval_steps_per_second': 1.69, 'epoch': 0.93}
{'train_runtime': 193.9409, 'train_samples_per_second': 3.511, 'train_steps_per_second': 0.443, 'train_loss': 3.0131957586421523, 'epoch': 1.0}


In [10]:
model.push_to_hub('my_tg_bot')

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.bin:   0%|          | 0.00/69.3M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ivankadchenko/my_tg_bot/commit/56d5bdb80c8257fc1ef61f2ad2a791d3387d84b0', commit_message='Upload model', commit_description='', oid='56d5bdb80c8257fc1ef61f2ad2a791d3387d84b0', pr_url=None, pr_revision=None, pr_num=None)