In [None]:
# !pip install transformers datasets peft accelerate bitsandbytes --quiet

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
from datasets import Dataset
import pandas as pd
import torch

In [4]:
import pandas as pd

DATA_PATH = '../data/russianPoetryWithTheme_deduped.csv'
data = pd.read_csv(DATA_PATH)

In [5]:
# data.author.value_counts()

In [6]:
LORA_AUTHORS_LST = [
    'Александр Блок', 'Александр Пушкин', 'Алексей Толстой', 'Анна Ахматова', 'Афанасий Фет'
    'Борис Пастернак', 'Булат Окуджава', 'Валерий Брюсов', 'Велимир Хлебников', 'Владимир Маяковский',
    'Евгений Евтушенко', 'Зинаида Гиппиус', 'Иван Бунин', 'Иван Крылов',
    'Игорь Северянин', 'Иосиф Бродский', 'К. Р. (Константин Романов)', 'Константин Бальмонт', 'Константин Симонов',
    'Марина Цветаева', 'Михаил Лермонтов', 'Николай Гумилев', 'Осип Мандельштам',
    'Роберт Рождественский', 'Сергей Есенин', 'Федор Тютчев', 'Эдуард Асадов'

]

AUTHORS_MAP = {
    'Александр Блок': 'blok', 'Александр Пушкин': 'pushkin', 'Алексей Толстой': 'atolstoy', 'Анна Ахматова': 'akhmatova', 'Афанасий Фет': 'fet',
    'Борис Пастернак': 'pasternak', 'Булат Окуджава': 'okydzava', 'Валерий Брюсов': 'bryusov', 'Велимир Хлебников': 'hlebnikov', 
    'Владимир Маяковский': 'mayakovsky', 'Евгений Евтушенко': 'evtushenko', 'Зинаида Гиппиус': 'gippius', 'Иван Бунин': 'bynin', 'Иван Крылов': 'krylov',
    'Игорь Северянин': 'severyanin', 'Иосиф Бродский': 'brodsky', 'К. Р. (Константин Романов)': 'kromanov', 'Константин Бальмонт': 'balmont', 'Константин Симонов': 'simonov',
    'Марина Цветаева': 'tsvetaeva', 'Михаил Лермонтов': 'lermontov', 'Николай Гумилев': 'gumilev', 'Осип Мандельштам': 'mandelshtam',
    'Роберт Рождественский': 'rozhdest', 'Сергей Есенин': 'esenin', 'Федор Тютчев': 'tyutchev', 'Эдуард Асадов': 'asadov'
}

In [18]:
len(LORA_AUTHORS_LST)

26

In [7]:
data.loc[data.author.isin(LORA_AUTHORS_LST)].shape

(3906, 10)

In [8]:
def pre_prompt(name):
  return f"Запрос: Напиши стих в стиле автора - {name}.\nОтвет:\n"

In [9]:
data_lora = data.loc[data.author.isin(LORA_AUTHORS_LST)]
data_lora['pre_prompt'] = data_lora.author.apply(pre_prompt)
data_lora['prompt'] = data_lora.pre_prompt + data_lora.text

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_lora['pre_prompt'] = data_lora.author.apply(pre_prompt)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_lora['prompt'] = data_lora.pre_prompt + data_lora.text


In [10]:
dataset = Dataset.from_pandas(data_lora[["prompt"]]) 

In [11]:
MODEL_NAME = "t-tech/T-lite-it-1.0"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token 
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.float16,
    load_in_4bit=True
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [12]:
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)

In [13]:
def tokenize(example):
    result = tokenizer(
        example["prompt"],
        truncation=True,
        padding="max_length",
        max_length=512
    )
    result["labels"] = result["input_ids"].copy()
    return result

tokenized_dataset = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/3906 [00:00<?, ? examples/s]

In [14]:
training_args = TrainingArguments(
    output_dir="../data/lora-poetry2", #
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=3, #
    learning_rate=2e-4,
    fp16=True,
    logging_steps=40, #
    save_strategy="epoch",
    save_total_limit=2,
    report_to="none"
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_dataset
)


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [15]:
trainer.train()

  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Step,Training Loss
40,1.4976
80,1.1129
120,1.1283
160,1.1365
200,1.0959
240,1.0849
280,1.0636
320,1.0827
360,1.0935
400,1.0788


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=732, training_loss=1.1155457548756418, metrics={'train_runtime': 5551.9738, 'train_samples_per_second': 2.111, 'train_steps_per_second': 0.132, 'total_flos': 2.539172853354332e+17, 'train_loss': 1.1155457548756418, 'epoch': 2.9907881269191403})

In [16]:
model.save_pretrained("../data/lora-poetry2") #


In [17]:
tokenizer.save_pretrained("../data/lora-poetry2") #

('../data/lora-poetry2/tokenizer_config.json',
 '../data/lora-poetry2/special_tokens_map.json',
 '../data/lora-poetry2/vocab.json',
 '../data/lora-poetry2/merges.txt',
 '../data/lora-poetry2/added_tokens.json',
 '../data/lora-poetry2/tokenizer.json')