In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


# Загружаем модель

In [2]:
%%capture
!pip install bitsandbytes gradio

In [9]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import torch

# Загружаем токенизатор и модель
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

finetuned_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,
    device_map="auto",
    torch_dtype=torch.float16
)

# 3. Готовим модель к дообучению с LoRA
finetuned_model = prepare_model_for_kbit_training(finetuned_model)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

finetuned_model = get_peft_model(finetuned_model, lora_config)
finetuned_model.load_state_dict(torch.load('/content/drive/MyDrive/Генерация в NLP/hw_task_2/models/qwen_House_1.pth'), strict=False)

finetuned_model.eval()

# Функция для инференса
def get_model_answer(prompt):
    messages = [
        {"role": "system", "content": "Answer like Dr.House"},
        {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(finetuned_model.device)

    generated_ids = finetuned_model.generate(
        **model_inputs,
        max_new_tokens=32
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [12]:
import gradio as gr

def get_response(message, history):
    response = get_model_answer(message)
    return f'{response}'


demo = gr.ChatInterface(get_response,
                        title="Talk to Dr.House!",
                        theme="soft",)

demo.launch()

  self.chatbot = Chatbot(


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://d43d45aea7fceb5fc8.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


