# In [None]:
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from langchain.prompts import PromptTemplate

# Model identifier shared by the vLLM engine and the tokenizer loader.
model_name = "t-tech/T-lite-it-1.0"

# Run the engine on the CPU with a context window capped at 8192 tokens.
llm = LLM(
    model=model_name,
    max_model_len=8192,
    device="cpu",
)

# Sampling settings reused for every generation request.
# NOTE(review): max_tokens is not set, so the library default output length
# applies — confirm it is long enough for full answers.
sampling_params = SamplingParams(
    top_k=70,
    top_p=0.8,
    temperature=0.7,
    repetition_penalty=1.05,
)

from transformers import AutoTokenizer
# Tokenizer loaded under the same identifier as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Three-part prompt: system instruction, retrieved context, user question.
prompt = PromptTemplate(
    template=(
        "System: {system_prompt}\n"
        "Context: {context}\n"
        "Question: {question}"
    ),
    input_variables=["system_prompt", "context", "question"],
)

def generate_answer(query: str, retriever) -> str:
    """Answer *query* with the LLM, using documents fetched by *retriever*.

    The retrieved documents' ``page_content`` values are joined with newlines
    to form the context, which is inserted into the module-level ``prompt``
    template together with a fixed system prompt and the question.

    Args:
        query: The user's question.
        retriever: Object exposing ``get_relevant_documents(query)`` and
            returning items that have a ``page_content`` attribute.

    Returns:
        The text of the first completion of the first request output.
    """
    relevant_docs = retriever.get_relevant_documents(query)
    context = "\n".join(doc.page_content for doc in relevant_docs)
    system_prompt = "Ты T-lite, виртуальный ассистент в Т-Технологии. Твоя задача - быть полезным диалоговым ассистентом."
    input_text = prompt.format(
        system_prompt=system_prompt,
        context=context,
        question=query,
    )

    # vLLM takes token ids as plain Python lists (one list per prompt), not
    # as a 2-D torch tensor, so tokenize without `return_tensors` and wrap
    # the single prompt in an outer list.
    token_ids = list(tokenizer(input_text).input_ids)

    outputs = llm.generate(
        prompt_token_ids=[token_ids],
        sampling_params=sampling_params,
    )
    return outputs[0].outputs[0].text


# Sample questions used to exercise the retrieval + generation pipeline.
queries = [
    "Что такое FAISS?",
    "Как работает GPT?",
    "Что такое векторные базы данных?",
]

# NOTE(review): `tqdm` and `retriever` are not defined in this cell;
# presumably they come from earlier notebook cells — confirm.
for query in tqdm(queries, desc="Processing queries"):
    answer = generate_answer(query, retriever)
    # Print the question, then the answer followed by a blank line.
    print(f"Вопрос: {query}")
    print(f"Ответ: {answer}\n")
