In [None]:
# !pip install transformers datasets peft accelerate bitsandbytes --quiet

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
from datasets import Dataset
import pandas as pd
import torch

In [2]:
import pandas as pd

DATA_PATH = '../data/russianPoetryWithTheme_deduped.csv'
data = pd.read_csv(DATA_PATH)

In [5]:
# data.author.value_counts()

In [3]:
LORA_AUTHORS_LST = ['Александр Пушкин']

In [4]:
data.loc[data.author.isin(LORA_AUTHORS_LST)].shape

(365, 10)

In [5]:
def pre_prompt(name):
  return f"Запрос: Напиши стих в стиле автора - {name}.\nОтвет:\n"

In [6]:
data_lora = data.loc[data.author.isin(LORA_AUTHORS_LST)]
data_lora['pre_prompt'] = data_lora.author.apply(pre_prompt)
data_lora['prompt'] = data_lora.pre_prompt + data_lora.text

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_lora['pre_prompt'] = data_lora.author.apply(pre_prompt)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_lora['prompt'] = data_lora.pre_prompt + data_lora.text


In [7]:
dataset = Dataset.from_pandas(data_lora[["prompt"]]) 

In [8]:
MODEL_NAME = "t-tech/T-lite-it-1.0"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token 
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.float16,
    load_in_4bit=True
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [9]:
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)

In [10]:
def tokenize(example):
    result = tokenizer(
        example["prompt"],
        truncation=True,
        padding="max_length",
        max_length=512
    )
    result["labels"] = result["input_ids"].copy()
    return result

tokenized_dataset = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/365 [00:00<?, ? examples/s]

In [14]:
training_args = TrainingArguments(
    output_dir="../data/lora-poetry-pushkin", #
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=5, #
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10, #
    save_strategy="epoch",
    save_total_limit=2,
    report_to="none"
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_dataset
)


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [15]:
trainer.train()

  return fn(*args, **kwargs)


Step,Training Loss
10,1.0223
20,0.9494
30,0.9367
40,0.91
50,1.0414
60,0.9059
70,0.7778
80,0.8574
90,0.9156
100,0.9012


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=115, training_loss=0.9137395775836447, metrics={'train_runtime': 881.6199, 'train_samples_per_second': 2.07, 'train_steps_per_second': 0.13, 'total_flos': 3.96609933017088e+16, 'train_loss': 0.9137395775836447, 'epoch': 5.0})

In [16]:
model.save_pretrained("../data/lora-poetry-pushkin") #


In [17]:
tokenizer.save_pretrained("../data/lora-poetry-pushkin") #

('../data/lora-poetry-pushkin/tokenizer_config.json',
 '../data/lora-poetry-pushkin/special_tokens_map.json',
 '../data/lora-poetry-pushkin/vocab.json',
 '../data/lora-poetry-pushkin/merges.txt',
 '../data/lora-poetry-pushkin/added_tokens.json',
 '../data/lora-poetry-pushkin/tokenizer.json')