In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
%%capture
!pip install -U datasets bitsandbytes accelerate

# Модель

In [14]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import torch

# 2. Загружаем токенизатор и модель
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
# tokenizer.pad_token = tokenizer.eos_token  # LLaMA не имеет pad_token по умолчанию

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,
    device_map="auto",
    torch_dtype=torch.float16
)

# 3. Готовим модель к дообучению с LoRA
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


# Датасет

In [4]:
from datasets import load_dataset

In [5]:
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/МФТИ/генерация в NLP/ДЗ1/dataset/house_answers.csv')

df = df[['line', 'response']].rename(columns={'line': 'instruction', 'response': 'output'})

In [6]:
df

Unnamed: 0,instruction,output
0,You can't go in there.,"Who are you, and why are you wearing a tie?"
1,I'm Dr. Cuddy's new assistant. Can I tell her...,Yes. I would like to know why she gets a secr...
2,"I'm her assistant, not her secretary. I gradu...",Hmm. I didn't know they had a secretarial sch...
3,"Dr. House, we are in the middle of a meeting.",What's with hiring a male secretary? JDate no...
4,He is cute. Be careful.,She's not like you. She can't just walk into ...
...,...,...
17868,"Phone. A million times he needed me, and the ...",Hi.
17869,How?,I got out of the back of the building.
17870,The body�,Just switched the dental records.
17871,You're destroying your entire life. You can't...,"I'm dead, Wilson. How do you want to spend yo..."


In [7]:
from datasets import Dataset
dataset = Dataset.from_dict(df)

In [8]:
# 4. Форматируем примеры
def format(example):
    prompt = f"### Question:\n{example['instruction']}\n\n### Answer:\n{example['output']}"
    return tokenizer(prompt, truncation=True, padding="max_length", max_length=512)

tokenized_dataset = dataset.map(format)

Map:   0%|          | 0/17873 [00:00<?, ? examples/s]

In [9]:
# 4. Форматируем примеры
def format(example):
    messages = [
        {"role": "system", "content": "Answer like Dr.House"},
        {"role": "user", "content": example['instruction']},
        {"role": "assistant", "content": example['output']}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    return tokenizer(text, truncation=True, padding="max_length", max_length=512)

tokenized_dataset = dataset.map(format)

Map:   0%|          | 0/17873 [00:00<?, ? examples/s]

In [10]:
dataset_dict = tokenized_dataset.train_test_split(test_size=0.3)

# Как модель отвечает до обучения

In [11]:
model.eval()

prompt = "You can't go in there."
messages = [
    {"role": "system", "content": "Answer like Dr.House"},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=32
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response


"Sorry, I made a mistake. The answer is that it's not clear what you're asking. Could you please provide more information or ask a different question?"

# Обучение

In [15]:
# 5. Аргументы тренировки
training_args = TrainingArguments(
    output_dir="./qwen-finetuned",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    logging_steps=100,
    # save_steps=100,
    # save_total_limit=2,
    learning_rate=2e-4,
    fp16=True,
    bf16=False,
    optim="paged_adamw_8bit",
    report_to="none"
)

# 6. Запуск тренировки
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_dict['train'],
    eval_dataset=dataset_dict['test'],
    tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

trainer.train()

# # 7. Сохраняем модель
# model.save_pretrained("./llama2-character-bot")
# tokenizer.save_pretrained("./llama2-character-bot")


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwar

Step,Training Loss
100,2.8726
200,2.3365
300,2.3094
400,2.3125
500,2.2935
600,2.209
700,2.2517
800,2.2628
900,2.2727
1000,2.2184


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=4692, training_loss=2.1827374626608456, metrics={'train_runtime': 7979.6957, 'train_samples_per_second': 4.704, 'train_steps_per_second': 0.588, 'total_flos': 4.139082523253146e+16, 'train_loss': 2.1827374626608456, 'epoch': 3.0})

In [16]:
model.eval()

prompt = "You can't go in there."
messages = [
    {"role": "system", "content": "Answer like Dr.House"},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=32
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response


" I'm not gonna be in here anymore."

In [17]:
torch.save(model.state_dict(), "/content/drive/MyDrive/Colab Notebooks/МФТИ/генерация в NLP/ДЗ2/models/qwen_House_2.pth")