Запустим инференс базовой и обученной модели, чтобы позже оценить результаты в ноутбуке eval.ipynb. Также напишем функцию инференса, чтобы можно было задавать модели любые вопросы

In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/Генерация в NLP/hw_task_2/data/house_answers.csv')
df = df[['line', 'response']].rename(columns={'line': 'instruction', 'response': 'output'})

# Базовая модель

In [None]:
%%capture
!pip install bitsandbytes

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import torch

# Загружаем токенизатор и модель
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
# tokenizer.pad_token = tokenizer.eos_token  # LLaMA не имеет pad_token по умолчанию

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,
    device_map="auto",
    torch_dtype=torch.float16
)

# 3. Готовим модель к дообучению с LoRA
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

In [None]:
# def get_model_answer(prompt):
#     messages = [
#         {"role": "system", "content": "Answer like Dr.House"},
#         {"role": "user", "content": prompt}
#     ]
#     text = tokenizer.apply_chat_template(
#         messages,
#         tokenize=False,
#         add_generation_prompt=True
#     )
#     model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

#     generated_ids = model.generate(
#         **model_inputs,
#         max_new_tokens=32
#     )
#     generated_ids = [
#         output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
#     ]

#     response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
#     return response


def prepare_df(prompt):
    messages = [
        {"role": "system", "content": "Answer like Dr.House"},
        {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    return text

In [None]:
# get_model_answer("Hi")

In [None]:
df.shape

In [None]:
%%time
df['prepared_instruction'] = df['instruction'].apply(prepare_df)

In [None]:
texts = df['prepared_instruction'].to_list()

In [None]:
from tqdm import tqdm

model.eval()
chunk_size = 10

answers = []
for step in tqdm(range(0, len(texts), chunk_size)):
  texts_chunk = texts[step:step + chunk_size]
  model_inputs = tokenizer(texts_chunk, return_tensors="pt", truncation=True, padding="max_length", max_length=512, padding_side='left').to(model.device)

  generated_ids = model.generate(
      **model_inputs,
      max_new_tokens=32
  )
  generated_ids = [
      output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
  ]

  response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
  answers.extend(response)

In [None]:
df['base_model_answers'] = answers

# Обученная модель

In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import torch

# Загружаем токенизатор и модель
# model_name = "Qwen/Qwen2.5-0.5B-Instruct"
# tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

finetuned_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,
    device_map="auto",
    torch_dtype=torch.float16
)

# 3. Готовим модель к дообучению с LoRA
finetuned_model = prepare_model_for_kbit_training(finetuned_model)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

finetuned_model = get_peft_model(finetuned_model, lora_config)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [8]:
finetuned_model.load_state_dict(torch.load('/content/drive/MyDrive/Генерация в NLP/hw_task_2/models/qwen_House_1.pth'), strict=False)

  finetuned_model.load_state_dict(torch.load('/content/drive/MyDrive/Генерация в NLP/hw_task_2/models/qwen_House_1.pth'), strict=False)


_IncompatibleKeys(missing_keys=[], unexpected_keys=['base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight.absmax', 'base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight.quant_map', 'base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight.quant_state.bitsandbytes__fp4', 'base_model.model.model.layers.0.self_attn.k_proj.base_layer.weight.absmax', 'base_model.model.model.layers.0.self_attn.k_proj.base_layer.weight.quant_map', 'base_model.model.model.layers.0.self_attn.k_proj.base_layer.weight.quant_state.bitsandbytes__fp4', 'base_model.model.model.layers.0.self_attn.v_proj.base_layer.weight.absmax', 'base_model.model.model.layers.0.self_attn.v_proj.base_layer.weight.quant_map', 'base_model.model.model.layers.0.self_attn.v_proj.base_layer.weight.quant_state.bitsandbytes__fp4', 'base_model.model.model.layers.0.self_attn.o_proj.base_layer.weight.absmax', 'base_model.model.model.layers.0.self_attn.o_proj.base_layer.weight.quant_map', 'base_model.model.mo

In [None]:
from tqdm import tqdm

finetuned_model.eval()
chunk_size = 10

ft_answers = []
for step in tqdm(range(0, len(texts), chunk_size)):
  texts_chunk = texts[step:step + chunk_size]
  model_inputs = tokenizer(texts_chunk, return_tensors="pt", truncation=True, padding="max_length", max_length=512, padding_side='left').to(model.device)

  generated_ids = finetuned_model.generate(
      **model_inputs,
      max_new_tokens=32
  )
  generated_ids = [
      output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
  ]

  response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
  ft_answers.extend(response)

In [None]:
df['finetuned_model_answers'] = ft_answers

In [None]:
df.head()

In [None]:
df.to_csv('/content/drive/MyDrive/Генерация в NLP/hw_task_2/data/answers.csv', index=False)

# Простая функция инференса

In [9]:
def get_model_answer(model, prompt):
    messages = [
        {"role": "system", "content": "Answer like Dr.House"},
        {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=32
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response

In [10]:
get_model_answer(model, 'Hi')

'Hello! How can I assist you today?'

In [12]:
get_model_answer(finetuned_model, 'Hi')

' What are you going to do?'

In [None]:
!pip freeze > requirements.txt