In [2]:
!pip install faiss-cpu sentence-transformers datasets transformers


Collecting faiss-cpu
  Using cached faiss_cpu-1.12.0.tar.gz (69 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting sentence-transformers
  Using cached sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Collecting transformers
  Using cached transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Using cached huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Using cached sentence_transformers-5.1.2-py3-none-any.whl (488 kB)
Using cached transformers-4.57.1-py3-none-any.whl (12.0 MB)
Using cached huggingface_hub-0.36.0-py3-none-any.whl (566 kB)
Building wheels for collected packages: faiss-cpu
  Building wheel for faiss-cpu (pyproject.toml) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m√ó[0m [32mBuilding wheel for faiss-cpu [0m[1;32m

In [None]:
import pickle
from datasets import Dataset
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Load the pickled dataset from the working directory.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only run this on a data.pickle you trust.
with open("data.pickle", "rb") as f:
    data = pickle.load(f)

# Flatten every record into a single retrieval passage; each record must
# provide the 'question', 'options' and 'answer' keys read here.
texts = [
    f"Question: {ex['question']}\nOptions: {ex['options']}\nAnswer: {ex['answer']}"
    for ex in data
]

# Sentence encoder shared by the corpus (here) and the queries (in rag_answer).
encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# One embedding row per passage, returned as a NumPy array.
embs = encoder.encode(texts, convert_to_numpy=True)

# Flat L2 index sized to the embedding width; all passages are added at once.
d = embs.shape[1]
faiss_index = faiss.IndexFlatL2(d)
faiss_index.add(embs)

print("FAISS index built:", faiss_index.ntotal)


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load the Qwen instruct checkpoint and its tokenizer.
model_name = "Qwen/Qwen3-4B-Instruct-2507"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
)
# Switch to inference mode; eval() returns the same module, so calling it as a
# separate statement leaves `model` bound to the same object as the chained form.
model.eval()

def rag_answer(question, top_k=3):
    """Answer a question with retrieval-augmented generation.

    The question is embedded with the notebook-level ``encoder``, the nearest
    passages are looked up in ``faiss_index``, and the notebook-level
    ``tokenizer``/``model`` generate an answer from a prompt that contains
    those passages.

    Args:
        question: The question text to answer.
        top_k: Number of passages to retrieve (default 3, the original
            hard-coded value).

    Returns:
        The generated continuation (prompt tokens excluded), stripped of
        surrounding whitespace.
    """
    query = f"Question: {question}"

    # 1. Embed the question in the same "Question: ..." form used for the corpus.
    q_emb = encoder.encode([query], convert_to_numpy=True)

    # 2. Find the most similar passages.
    _distances, neighbor_ids = faiss_index.search(q_emb, top_k)

    # FAISS pads the result with -1 when the index holds fewer than top_k
    # vectors; without the filter, texts[-1] would silently inject the last
    # passage once per missing neighbour.
    retrieved = "\n\n".join(texts[i] for i in neighbor_ids[0] if i >= 0)

    # 3. Build the prompt around the retrieved context.
    prompt = f"""
You are an expert at solving math and physics multiple-choice questions.

Use ONLY the context below to answer.

### Context:
{retrieved}

### Question:
{question}

### Your answer (just letters without spaces):
"""

    # 4. Generate, then decode only the tokens produced after the prompt.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=100)

    answer = tokenizer.decode(out[0][prompt_length:], skip_special_tokens=True)

    return answer.strip()


In [None]:
# Smoke test: run one question through retrieval + generation and print the reply.
print(rag_answer("The perimeter of a square with side 3 is:"))


In [None]:
from rank_bm25 import BM25Okapi

# Prepare the whitespace-tokenized corpus for BM25.
# `docs` is not defined by any earlier cell, so on a fresh kernel this cell
# raised NameError; index the same passages (`texts`) that the FAISS cell built.
docs = texts
tokenized_docs = [doc.split() for doc in docs]
bm25 = BM25Okapi(tokenized_docs)

def retrieve_bm25(query, top_k=3):
    """Return the ``top_k`` passages from ``docs`` with the highest BM25 score.

    The query is split on whitespace and scored against every document by the
    notebook-level ``bm25`` object; documents with equal scores keep their
    original corpus order.

    Args:
        query: Free-text query.
        top_k: Maximum number of passages to return.

    Returns:
        A list of passages from ``docs``, best score first.
    """
    scores = bm25.get_scores(query.split())
    # sorted() is stable, so ties stay in corpus order even with reverse=True.
    by_score = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    best = by_score[:top_k]
    return [docs[position] for position, _score in best]

# Example: retrieve with BM25, then let the loaded causal LM answer from that context.
question = "What is the formula for the perimeter of a square?"
retrieved = retrieve_bm25(question)

# No earlier cell defines generate_answer(question, retrieved), and the
# generate_answer defined later in the notebook takes a single prompt argument,
# so the generation step is spelled out here with the `tokenizer`/`model`
# loaded in the RAG cell above.
context = "\n\n".join(retrieved)
prompt = f"""
Use ONLY the context below to answer.

### Context:
{context}

### Question:
{question}

### Your answer:
"""
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=100)
# Decode only the tokens generated after the prompt.
answer = tokenizer.decode(
    out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
).strip()

print("=== BM25 Retrieved ===")
print(retrieved)
print("=== Answer ===")
print(answer)


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import requests
import json

# 1. Load the models
# NOTE(review): this cell rebinds `model_name`, `tokenizer` and `model`, which the
# earlier `rag_answer` function reads as globals — once this cell has run,
# `rag_answer` generates with the checkpoint loaded here instead of Qwen.
print("–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª–∏...")

# Embedding model (multilingual checkpoint; used here for Russian text)
embedding_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# Generation model, loaded in float16 with automatic device placement
model_name = "IlyaGusev/saiga_yandexgpt_8b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

# 2. Knowledge base preparation (example with Russian texts)
def create_knowledge_base():
    """Return the demo knowledge base as a fresh list of short fact passages.

    The passages are hard-coded and grouped by topic; every call builds and
    returns a new list, so callers may mutate the result freely.

    Returns:
        list[str]: one passage per fact, in the fixed order written below.
    """
    return [
        # Geography and cities
        "–ú–æ—Å–∫–≤–∞ - —Å—Ç–æ–ª–∏—Ü–∞ –†–æ—Å—Å–∏–∏, –∫—Ä—É–ø–Ω–µ–π—à–∏–π –≥–æ—Ä–æ–¥ —Å—Ç—Ä–∞–Ω—ã —Å –Ω–∞—Å–µ–ª–µ–Ω–∏–µ–º –±–æ–ª–µ–µ 12 –º–∏–ª–ª–∏–æ–Ω–æ–≤ —á–µ–ª–æ–≤–µ–∫. –û—Å–Ω–æ–≤–∞–Ω–∞ –≤ 1147 –≥–æ–¥—É –Æ—Ä–∏–µ–º –î–æ–ª–≥–æ—Ä—É–∫–∏–º.",
        "–°–∞–Ω–∫—Ç-–ü–µ—Ç–µ—Ä–±—É—Ä–≥ –±—ã–ª –æ—Å–Ω–æ–≤–∞–Ω –ü–µ—Ç—Ä–æ–º I –≤ 1703 –≥–æ–¥—É –∏ —è–≤–ª—è–µ—Ç—Å—è –∫—É–ª—å—Ç—É—Ä–Ω–æ–π —Å—Ç–æ–ª–∏—Ü–µ–π –†–æ—Å—Å–∏–∏. –ù–∞—Å–µ–ª–µ–Ω–∏–µ –æ–∫–æ–ª–æ 5 –º–∏–ª–ª–∏–æ–Ω–æ–≤ —á–µ–ª–æ–≤–µ–∫.",
        "–ù–æ–≤–æ—Å–∏–±–∏—Ä—Å–∫ - —Ç—Ä–µ—Ç–∏–π –ø–æ —á–∏—Å–ª–µ–Ω–Ω–æ—Å—Ç–∏ –Ω–∞—Å–µ–ª–µ–Ω–∏—è –≥–æ—Ä–æ–¥ –†–æ—Å—Å–∏–∏, –∫—Ä—É–ø–Ω–µ–π—à–∏–π –Ω–∞—É—á–Ω—ã–π –∏ –æ–±—Ä–∞–∑–æ–≤–∞—Ç–µ–ª—å–Ω—ã–π —Ü–µ–Ω—Ç—Ä –°–∏–±–∏—Ä–∏.",
        "–°–æ—á–∏ - –∫—É—Ä–æ—Ä—Ç–Ω—ã–π –≥–æ—Ä–æ–¥ –Ω–∞ —á–µ—Ä–Ω–æ–º–æ—Ä—Å–∫–æ–º –ø–æ–±–µ—Ä–µ–∂—å–µ –†–æ—Å—Å–∏–∏, –∏–∑–≤–µ—Å—Ç–Ω—ã–π —Å–≤–æ–∏–º–∏ –ø–ª—è–∂–∞–º–∏ –∏ –≥–æ—Ä–Ω–æ–ª—ã–∂–Ω—ã–º–∏ –∫—É—Ä–æ—Ä—Ç–∞–º–∏.",
        "–ö–∞–ª–∏–Ω–∏–Ω–≥—Ä–∞–¥ - —Å–∞–º—ã–π –∑–∞–ø–∞–¥–Ω—ã–π –≥–æ—Ä–æ–¥ –†–æ—Å—Å–∏–∏, —Ä–∞—Å–ø–æ–ª–æ–∂–µ–Ω–Ω—ã–π –º–µ–∂–¥—É –ü–æ–ª—å—à–µ–π –∏ –õ–∏—Ç–≤–æ–π –Ω–∞ –±–µ—Ä–µ–≥—É –ë–∞–ª—Ç–∏–π—Å–∫–æ–≥–æ –º–æ—Ä—è.",

        # Nature and geography
        "–ë–∞–π–∫–∞–ª - —Å–∞–º–æ–µ –≥–ª—É–±–æ–∫–æ–µ –æ–∑–µ—Ä–æ –≤ –º–∏—Ä–µ, —Ä–∞—Å–ø–æ–ª–æ–∂–µ–Ω–Ω–æ–µ –≤ –°–∏–±–∏—Ä–∏. –ì–ª—É–±–∏–Ω–∞ –¥–æ—Å—Ç–∏–≥–∞–µ—Ç 1642 –º–µ—Ç—Ä–æ–≤, —Å–æ–¥–µ—Ä–∂–∏—Ç 20% –º–∏—Ä–æ–≤—ã—Ö –∑–∞–ø–∞—Å–æ–≤ –ø—Ä–µ—Å–Ω–æ–π –≤–æ–¥—ã.",
        "–í–æ–ª–≥–∞ - —Å–∞–º–∞—è –¥–ª–∏–Ω–Ω–∞—è —Ä–µ–∫–∞ –≤ –ï–≤—Ä–æ–ø–µ, –ø—Ä–æ—Ç—è–∂–µ–Ω–Ω–æ—Å—Ç—å—é 3530 –∫–º. –ü—Ä–æ—Ç–µ–∫–∞–µ—Ç —á–µ—Ä–µ–∑ 15 —Å—É–±—ä–µ–∫—Ç–æ–≤ –†–æ—Å—Å–∏–π—Å–∫–æ–π –§–µ–¥–µ—Ä–∞—Ü–∏–∏.",
        "–≠–ª—å–±—Ä—É—Å - –≤—ã—Å–æ—á–∞–π—à–∞—è –≥–æ—Ä–Ω–∞—è –≤–µ—Ä—à–∏–Ω–∞ –†–æ—Å—Å–∏–∏ –∏ –ï–≤—Ä–æ–ø—ã, –≤—ã—Å–æ—Ç–∞ 5642 –º–µ—Ç—Ä–∞. –†–∞—Å–ø–æ–ª–æ–∂–µ–Ω –Ω–∞ –ö–∞–≤–∫–∞–∑–µ.",
        "–ö–∞–º—á–∞—Ç–∫–∞ - –ø–æ–ª—É–æ—Å—Ç—Ä–æ–≤ –Ω–∞ –î–∞–ª—å–Ω–µ–º –í–æ—Å—Ç–æ–∫–µ –†–æ—Å—Å–∏–∏, –∏–∑–≤–µ—Å—Ç–Ω—ã–π —Å–≤–æ–∏–º–∏ –≤—É–ª–∫–∞–Ω–∞–º–∏ –∏ –≥–µ–π–∑–µ—Ä–∞–º–∏.",
        "–¢–∞–π–≥–∞ - –∫—Ä—É–ø–Ω–µ–π—à–∏–π –≤ –º–∏—Ä–µ –ª–µ—Å–Ω–æ–π –º–∞—Å—Å–∏–≤, –∑–∞–Ω–∏–º–∞—é—â–∏–π –±–æ–ª—å—à—É—é —á–∞—Å—Ç—å –°–∏–±–∏—Ä–∏ –∏ –î–∞–ª—å–Ω–µ–≥–æ –í–æ—Å—Ç–æ–∫–∞.",

        # History
        "–í–µ–ª–∏–∫–∞—è –û—Ç–µ—á–µ—Å—Ç–≤–µ–Ω–Ω–∞—è –≤–æ–π–Ω–∞ –¥–ª–∏–ª–∞—Å—å —Å 1941 –ø–æ 1945 –≥–æ–¥ –∏ –∑–∞–≤–µ—Ä—à–∏–ª–∞—Å—å –ø–æ–±–µ–¥–æ–π –°–æ–≤–µ—Ç—Å–∫–æ–≥–æ –°–æ—é–∑–∞ –Ω–∞–¥ –Ω–∞—Ü–∏—Å—Ç—Å–∫–æ–π –ì–µ—Ä–º–∞–Ω–∏–µ–π.",
        "–û–∫—Ç—è–±—Ä—å—Å–∫–∞—è —Ä–µ–≤–æ–ª—é—Ü–∏—è 1917 –≥–æ–¥–∞ –ø—Ä–∏–≤–µ–ª–∞ –∫ —É—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω–∏—é —Å–æ–≤–µ—Ç—Å–∫–æ–π –≤–ª–∞—Å—Ç–∏ –≤ –†–æ—Å—Å–∏–∏ –∏ —Å–æ–∑–¥–∞–Ω–∏—é –°–°–°–†.",
        "–ü–µ—Ç—Ä I –í–µ–ª–∏–∫–∏–π - —Ä—É—Å—Å–∫–∏–π —Ü–∞—Ä—å, –ø—Ä–æ–≤–µ–ª –º–∞—Å—à—Ç–∞–±–Ω—ã–µ —Ä–µ—Ñ–æ—Ä–º—ã –∏ –æ—Å–Ω–æ–≤–∞–ª –°–∞–Ω–∫—Ç-–ü–µ—Ç–µ—Ä–±—É—Ä–≥. –ü—Ä–∞–≤–∏–ª —Å 1682 –ø–æ 1725 –≥–æ–¥.",
        "–ï–∫–∞—Ç–µ—Ä–∏–Ω–∞ II –í–µ–ª–∏–∫–∞—è - –∏–º–ø–µ—Ä–∞—Ç—Ä–∏—Ü–∞ –†–æ—Å—Å–∏–∏ —Å 1762 –ø–æ 1796 –≥–æ–¥, –∑–Ω–∞—á–∏—Ç–µ–ª—å–Ω–æ —Ä–∞—Å—à–∏—Ä–∏–ª–∞ —Ç–µ—Ä—Ä–∏—Ç–æ—Ä–∏—é –†–æ—Å—Å–∏–π—Å–∫–æ–π –∏–º–ø–µ—Ä–∏–∏.",

        # Culture and art
        "–≠—Ä–º–∏—Ç–∞–∂ –≤ –°–∞–Ω–∫—Ç-–ü–µ—Ç–µ—Ä–±—É—Ä–≥–µ - –æ–¥–∏–Ω –∏–∑ –∫—Ä—É–ø–Ω–µ–π—à–∏—Ö —Ö—É–¥–æ–∂–µ—Å—Ç–≤–µ–Ω–Ω—ã—Ö –º—É–∑–µ–µ–≤ –º–∏—Ä–∞, –æ—Å–Ω–æ–≤–∞–Ω–Ω—ã–π –ï–∫–∞—Ç–µ—Ä–∏–Ω–æ–π II –≤ 1764 –≥–æ–¥—É.",
        "–¢—Ä–µ—Ç—å—è–∫–æ–≤—Å–∫–∞—è –≥–∞–ª–µ—Ä–µ—è –≤ –ú–æ—Å–∫–≤–µ - –≥–ª–∞–≤–Ω—ã–π –º—É–∑–µ–π —Ä—É—Å—Å–∫–æ–≥–æ –∏–∑–æ–±—Ä–∞–∑–∏—Ç–µ–ª—å–Ω–æ–≥–æ –∏—Å–∫—É—Å—Å—Ç–≤–∞, –æ—Å–Ω–æ–≤–∞–Ω–Ω—ã–π –ü–∞–≤–ª–æ–º –¢—Ä–µ—Ç—å—è–∫–æ–≤—ã–º.",
        "–ë–æ–ª—å—à–æ–π —Ç–µ–∞—Ç—Ä –≤ –ú–æ—Å–∫–≤–µ - –æ–¥–∏–Ω –∏–∑ –≤–µ–¥—É—â–∏—Ö —Ç–µ–∞—Ç—Ä–æ–≤ –æ–ø–µ—Ä—ã –∏ –±–∞–ª–µ—Ç–∞ –≤ –º–∏—Ä–µ, –æ—Å–Ω–æ–≤–∞–Ω–Ω—ã–π –≤ 1776 –≥–æ–¥—É.",
        "–ê–ª–µ–∫—Å–∞–Ω–¥—Ä –ü—É—à–∫–∏–Ω - –≤–µ–ª–∏–∫–∏–π —Ä—É—Å—Å–∫–∏–π –ø–æ—ç—Ç, –æ—Å–Ω–æ–≤–æ–ø–æ–ª–æ–∂–Ω–∏–∫ —Å–æ–≤—Ä–µ–º–µ–Ω–Ω–æ–≥–æ —Ä—É—Å—Å–∫–æ–≥–æ –ª–∏—Ç–µ—Ä–∞—Ç—É—Ä–Ω–æ–≥–æ —è–∑—ã–∫–∞. –†–æ–¥–∏–ª—Å—è –≤ 1799 –≥–æ–¥—É.",
        "–õ–µ–≤ –¢–æ–ª—Å—Ç–æ–π - –∫–ª–∞—Å—Å–∏–∫ —Ä—É—Å—Å–∫–æ–π –ª–∏—Ç–µ—Ä–∞—Ç—É—Ä—ã, –∞–≤—Ç–æ—Ä —Ä–æ–º–∞–Ω–æ–≤ '–í–æ–π–Ω–∞ –∏ –º–∏—Ä' –∏ '–ê–Ω–Ω–∞ –ö–∞—Ä–µ–Ω–∏–Ω–∞'.",

        # Science and technology
        "–î–º–∏—Ç—Ä–∏–π –ú–µ–Ω–¥–µ–ª–µ–µ–≤ - —Ä—É—Å—Å–∫–∏–π —É—á–µ–Ω—ã–π, —Å–æ–∑–¥–∞—Ç–µ–ª—å –ø–µ—Ä–∏–æ–¥–∏—á–µ—Å–∫–æ–π —Å–∏—Å—Ç–µ–º—ã —Ö–∏–º–∏—á–µ—Å–∫–∏—Ö —ç–ª–µ–º–µ–Ω—Ç–æ–≤ –≤ 1869 –≥–æ–¥—É.",
        "–Æ—Ä–∏–π –ì–∞–≥–∞—Ä–∏–Ω - –ø–µ—Ä–≤—ã–π —á–µ–ª–æ–≤–µ–∫ –≤ –∫–æ—Å–º–æ—Å–µ, —Å–æ–≤–µ—Ä—à–∏–ª –ø–æ–ª–µ—Ç 12 –∞–ø—Ä–µ–ª—è 1961 –≥–æ–¥–∞ –Ω–∞ –∫–æ—Ä–∞–±–ª–µ '–í–æ—Å—Ç–æ–∫-1'.",
        "–ú–∏—Ö–∞–∏–ª –õ–æ–º–æ–Ω–æ—Å–æ–≤ - —Ä—É—Å—Å–∫–∏–π —É—á–µ–Ω—ã–π-—ç–Ω—Ü–∏–∫–ª–æ–ø–µ–¥–∏—Å—Ç, –æ—Å–Ω–æ–≤–∞—Ç–µ–ª—å –ú–æ—Å–∫–æ–≤—Å–∫–æ–≥–æ —É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç–∞ –≤ 1755 –≥–æ–¥—É.",
        "–°–µ—Ä–≥–µ–π –ö–æ—Ä–æ–ª–µ–≤ - —Å–æ–≤–µ—Ç—Å–∫–∏–π –∫–æ–Ω—Å—Ç—Ä—É–∫—Ç–æ—Ä —Ä–∞–∫–µ—Ç–Ω–æ-–∫–æ—Å–º–∏—á–µ—Å–∫–æ–π —Ç–µ—Ö–Ω–∏–∫–∏, —Ä—É–∫–æ–≤–æ–¥–∏—Ç–µ–ª—å –ø—Ä–æ–≥—Ä–∞–º–º—ã –ø–µ—Ä–≤–æ–≥–æ –ø–æ–ª–µ—Ç–∞ —á–µ–ª–æ–≤–µ–∫–∞ –≤ –∫–æ—Å–º–æ—Å.",

        # Economy and industry
        "–ì–∞–∑–ø—Ä–æ–º - –∫—Ä—É–ø–Ω–µ–π—à–∞—è –≥–∞–∑–æ–≤–∞—è –∫–æ–º–ø–∞–Ω–∏—è –º–∏—Ä–∞, –æ—Å–Ω–æ–≤–∞–Ω–∞ –≤ 1989 –≥–æ–¥—É. –®—Ç–∞–±-–∫–≤–∞—Ä—Ç–∏—Ä–∞ —Ä–∞—Å–ø–æ–ª–æ–∂–µ–Ω–∞ –≤ –ú–æ—Å–∫–≤–µ.",
        "–†–æ—Å–Ω–µ—Ñ—Ç—å - –æ–¥–Ω–∞ –∏–∑ –∫—Ä—É–ø–Ω–µ–π—à–∏—Ö –Ω–µ—Ñ—Ç—è–Ω—ã—Ö –∫–æ–º–ø–∞–Ω–∏–π –º–∏—Ä–∞, –æ—Å–Ω–æ–≤–∞–Ω–∞ –≤ 1993 –≥–æ–¥—É.",
        "–¢—Ä–∞–Ω—Å—Å–∏–±–∏—Ä—Å–∫–∞—è –º–∞–≥–∏—Å—Ç—Ä–∞–ª—å - —Å–∞–º–∞—è –¥–ª–∏–Ω–Ω–∞—è –∂–µ–ª–µ–∑–Ω–∞—è –¥–æ—Ä–æ–≥–∞ –≤ –º–∏—Ä–µ, –ø—Ä–æ—Ç—è–∂–µ–Ω–Ω–æ—Å—Ç—å—é 9288 –∫–º –æ—Ç –ú–æ—Å–∫–≤—ã –¥–æ –í–ª–∞–¥–∏–≤–æ—Å—Ç–æ–∫–∞.",
        "–†—É–±–ª—å - –Ω–∞—Ü–∏–æ–Ω–∞–ª—å–Ω–∞—è –≤–∞–ª—é—Ç–∞ –†–æ—Å—Å–∏–∏, –≤–≤–µ–¥–µ–Ω–∞ –≤ –æ–±—Ä–∞—â–µ–Ω–∏–µ –≤ 14 –≤–µ–∫–µ. –°–æ–≤—Ä–µ–º–µ–Ω–Ω—ã–π –∫–æ–¥ –≤–∞–ª—é—Ç—ã - RUB.",

        # Politics
        "–í–ª–∞–¥–∏–º–∏—Ä –ü—É—Ç–∏–Ω —è–≤–ª—è–µ—Ç—Å—è –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç–æ–º –†–æ—Å—Å–∏–π—Å–∫–æ–π –§–µ–¥–µ—Ä–∞—Ü–∏–∏ —Å 2012 –≥–æ–¥–∞. –†–æ–¥–∏–ª—Å—è –≤ 1952 –≥–æ–¥—É –≤ –õ–µ–Ω–∏–Ω–≥—Ä–∞–¥–µ.",
        "–ì–æ—Å—É–¥–∞—Ä—Å—Ç–≤–µ–Ω–Ω–∞—è –î—É–º–∞ - –Ω–∏–∂–Ω—è—è –ø–∞–ª–∞—Ç–∞ –ø–∞—Ä–ª–∞–º–µ–Ω—Ç–∞ –†–æ—Å—Å–∏–∏, —Å–æ—Å—Ç–æ–∏—Ç –∏–∑ 450 –¥–µ–ø—É—Ç–∞—Ç–æ–≤.",
        "–ö–æ–Ω—Å—Ç–∏—Ç—É—Ü–∏—è –†–æ—Å—Å–∏–∏ –±—ã–ª–∞ –ø—Ä–∏–Ω—è—Ç–∞ 12 –¥–µ–∫–∞–±—Ä—è 1993 –≥–æ–¥–∞ –∏ —è–≤–ª—è–µ—Ç—Å—è –æ—Å–Ω–æ–≤–Ω—ã–º –∑–∞–∫–æ–Ω–æ–º —Å—Ç—Ä–∞–Ω—ã.",
        "–ö—Ä–µ–º–ª—å - –∏—Å—Ç–æ—Ä–∏—á–µ—Å–∫–∏–π –∫–æ–º–ø–ª–µ–∫—Å –≤ –ú–æ—Å–∫–≤–µ, —Ä–µ–∑–∏–¥–µ–Ω—Ü–∏—è –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç–∞ –†–æ—Å—Å–∏–∏. –ü–æ—Å—Ç—Ä–æ–µ–Ω –≤ 15 –≤–µ–∫–µ.",

        # Education
        "–ú–ì–£ –∏–º–µ–Ω–∏ –õ–æ–º–æ–Ω–æ—Å–æ–≤–∞ - —Å—Ç–∞—Ä–µ–π—à–∏–π –∏ –∫—Ä—É–ø–Ω–µ–π—à–∏–π —É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç –†–æ—Å—Å–∏–∏, –æ—Å–Ω–æ–≤–∞–Ω –≤ 1755 –≥–æ–¥—É.",
        "–ï–ì–≠ - –µ–¥–∏–Ω—ã–π –≥–æ—Å—É–¥–∞—Ä—Å—Ç–≤–µ–Ω–Ω—ã–π —ç–∫–∑–∞–º–µ–Ω, –æ–±—è–∑–∞—Ç–µ–ª—å–Ω—ã–π –¥–ª—è –≤—Å–µ—Ö –≤—ã–ø—É—Å–∫–Ω–∏–∫–æ–≤ —à–∫–æ–ª –†–æ—Å—Å–∏–∏ —Å 2009 –≥–æ–¥–∞.",
        "–†–æ—Å—Å–∏–π—Å–∫–∞—è –∞–∫–∞–¥–µ–º–∏—è –Ω–∞—É–∫ - –≥–ª–∞–≤–Ω–∞—è –Ω–∞—É—á–Ω–∞—è –æ—Ä–≥–∞–Ω–∏–∑–∞—Ü–∏—è —Å—Ç—Ä–∞–Ω—ã, –æ—Å–Ω–æ–≤–∞–Ω–∞ –ü–µ—Ç—Ä–æ–º I –≤ 1724 –≥–æ–¥—É.",

        # Sports
        "–•–æ–∫–∫–µ–π —Å —à–∞–π–±–æ–π - –æ–¥–∏–Ω –∏–∑ —Å–∞–º—ã—Ö –ø–æ–ø—É–ª—è—Ä–Ω—ã—Ö –≤–∏–¥–æ–≤ —Å–ø–æ—Ä—Ç–∞ –≤ –†–æ—Å—Å–∏–∏. –°–±–æ—Ä–Ω–∞—è –†–æ—Å—Å–∏–∏ –º–Ω–æ–≥–æ–∫—Ä–∞—Ç–Ω—ã–π —á–µ–º–ø–∏–æ–Ω –º–∏—Ä–∞.",
        "–§–∏–≥—É—Ä–Ω–æ–µ –∫–∞—Ç–∞–Ω–∏–µ - —Ç—Ä–∞–¥–∏—Ü–∏–æ–Ω–Ω–æ —Å–∏–ª—å–Ω—ã–π –≤–∏–¥ —Å–ø–æ—Ä—Ç–∞ –¥–ª—è –†–æ—Å—Å–∏–∏, –º–Ω–æ–∂–µ—Å—Ç–≤–æ –æ–ª–∏–º–ø–∏–π—Å–∫–∏—Ö —á–µ–º–ø–∏–æ–Ω–æ–≤.",
        "–§—É—Ç–±–æ–ª—å–Ω—ã–π –∫–ª—É–± –ó–µ–Ω–∏—Ç –∏–∑ –°–∞–Ω–∫—Ç-–ü–µ—Ç–µ—Ä–±—É—Ä–≥–∞ - –æ–¥–∏–Ω –∏–∑ –≤–µ–¥—É—â–∏—Ö —Ñ—É—Ç–±–æ–ª—å–Ω—ã—Ö –∫–ª—É–±–æ–≤ –†–æ—Å—Å–∏–∏.",
        "–û–ª–∏–º–ø–∏–∞–¥–∞ 1980 –≥–æ–¥–∞ –ø—Ä–æ–≤–æ–¥–∏–ª–∞—Å—å –≤ –ú–æ—Å–∫–≤–µ, –±—ã–ª–∞ –ø–µ—Ä–≤–æ–π –û–ª–∏–º–ø–∏–∞–¥–æ–π –≤ —Å–æ—Ü–∏–∞–ª–∏—Å—Ç–∏—á–µ—Å–∫–æ–π —Å—Ç—Ä–∞–Ω–µ.",

        # Traditions and holidays
        "–î–µ–Ω—å –ü–æ–±–µ–¥—ã –æ—Ç–º–µ—á–∞–µ—Ç—Å—è 9 –º–∞—è –≤ —á–µ—Å—Ç—å –ø–æ–±–µ–¥—ã –≤ –í–µ–ª–∏–∫–æ–π –û—Ç–µ—á–µ—Å—Ç–≤–µ–Ω–Ω–æ–π –≤–æ–π–Ω–µ. –ì–ª–∞–≤–Ω—ã–π –ø–∞—Ä–∞–¥ –ø—Ä–æ—Ö–æ–¥–∏—Ç –Ω–∞ –ö—Ä–∞—Å–Ω–æ–π –ø–ª–æ—â–∞–¥–∏.",
        "–ú–∞—Å–ª–µ–Ω–∏—Ü–∞ - —Ç—Ä–∞–¥–∏—Ü–∏–æ–Ω–Ω—ã–π —Ä—É—Å—Å–∫–∏–π –ø—Ä–∞–∑–¥–Ω–∏–∫ –ø—Ä–æ–≤–æ–¥–æ–≤ –∑–∏–º—ã, –æ—Ç–º–µ—á–∞–µ—Ç—Å—è –∑–∞ –Ω–µ–¥–µ–ª—é –¥–æ –í–µ–ª–∏–∫–æ–≥–æ –ø–æ—Å—Ç–∞.",
        "–ù–æ–≤—ã–π –≥–æ–¥ - –≥–ª–∞–≤–Ω—ã–π –ø—Ä–∞–∑–¥–Ω–∏–∫ –≤ –†–æ—Å—Å–∏–∏, –æ—Ç–º–µ—á–∞–µ—Ç—Å—è –≤ –Ω–æ—á—å —Å 31 –¥–µ–∫–∞–±—Ä—è –Ω–∞ 1 —è–Ω–≤–∞—Ä—è.",
        "–î–µ–Ω—å –†–æ—Å—Å–∏–∏ - –≥–æ—Å—É–¥–∞—Ä—Å—Ç–≤–µ–Ω–Ω—ã–π –ø—Ä–∞–∑–¥–Ω–∏–∫ 12 –∏—é–Ω—è, –æ—Ç–º–µ—á–∞–µ—Ç—Å—è —Å 1992 –≥–æ–¥–∞.",
    ]

# Sample questions used to exercise the RAG system (iterated by run_test_questions)
test_questions = [
    "–ö–∞–∫–∞—è —Å–∞–º–∞—è –¥–ª–∏–Ω–Ω–∞—è —Ä–µ–∫–∞ –≤ –ï–≤—Ä–æ–ø–µ?",
    "–ö–æ–≥–¥–∞ –±—ã–ª –æ—Å–Ω–æ–≤–∞–Ω –°–∞–Ω–∫—Ç-–ü–µ—Ç–µ—Ä–±—É—Ä–≥?",
    "–ö—Ç–æ –±—ã–ª –ø–µ—Ä–≤—ã–º —á–µ–ª–æ–≤–µ–∫–æ–º –≤ –∫–æ—Å–º–æ—Å–µ?",
    "–°–∫–æ–ª—å–∫–æ —á–µ–ª–æ–≤–µ–∫ –∂–∏–≤–µ—Ç –≤ –ú–æ—Å–∫–≤–µ?",
    "–ö–∞–∫–æ–µ –æ–∑–µ—Ä–æ —Å–∞–º–æ–µ –≥–ª—É–±–æ–∫–æ–µ –≤ –º–∏—Ä–µ?",
    "–ö—Ç–æ —Å–æ–∑–¥–∞–ª –ø–µ—Ä–∏–æ–¥–∏—á–µ—Å–∫—É—é —Ç–∞–±–ª–∏—Ü—É —ç–ª–µ–º–µ–Ω—Ç–æ–≤?",
    "–ö–∞–∫ –Ω–∞–∑—ã–≤–∞–µ—Ç—Å—è –≤–∞–ª—é—Ç–∞ –†–æ—Å—Å–∏–∏?",
    "–ö–æ–≥–¥–∞ –±—ã–ª–∞ –û–∫—Ç—è–±—Ä—å—Å–∫–∞—è —Ä–µ–≤–æ–ª—é—Ü–∏—è?",
    "–ö–∞–∫–æ–π —É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç —Å–∞–º—ã–π —Å—Ç–∞—Ä—ã–π –≤ –†–æ—Å—Å–∏–∏?",
    "–ß—Ç–æ —Ç–∞–∫–æ–µ –≠—Ä–º–∏—Ç–∞–∂?",
    "–ö—Ç–æ —è–≤–ª—è–µ—Ç—Å—è –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç–æ–º –†–æ—Å—Å–∏–∏?",
    "–ö–∞–∫–æ–π –ø—Ä–∞–∑–¥–Ω–∏–∫ –æ—Ç–º–µ—á–∞–µ—Ç—Å—è 9 –º–∞—è?",
    "–°–∫–æ–ª—å–∫–æ –∫–∏–ª–æ–º–µ—Ç—Ä–æ–≤ —Å–æ—Å—Ç–∞–≤–ª—è–µ—Ç –¢—Ä–∞–Ω—Å—Å–∏–±–∏—Ä—Å–∫–∞—è –º–∞–≥–∏—Å—Ç—Ä–∞–ª—å?",
    "–ö—Ç–æ –∞–≤—Ç–æ—Ä —Ä–æ–º–∞–Ω–∞ '–í–æ–π–Ω–∞ –∏ –º–∏—Ä'?",
    "–í –∫–∞–∫–æ–º –≥–æ–¥—É –Æ—Ä–∏–π –ì–∞–≥–∞—Ä–∏–Ω –ø–æ–ª–µ—Ç–µ–ª –≤ –∫–æ—Å–º–æ—Å?"
]

def run_test_questions():
    """Run every entry of ``test_questions`` through the RAG pipeline and print the outcome.

    The knowledge base and its vector index are built once up front. For each
    question the answer and the number of context documents are printed; an
    exception raised for one question is printed and the loop moves on to the
    next question.
    """
    index, documents = create_vector_db(create_knowledge_base())

    banner = "=" * 50
    print(banner)
    print("–¢–ï–°–¢–ò–†–û–í–ê–ù–ò–ï RAG –°–ò–°–¢–ï–ú–´")
    print(banner)

    separator = "-" * 30
    for number, question in enumerate(test_questions, 1):
        print(f"\n{number}. –í–æ–ø—Ä–æ—Å: {question}")
        try:
            answer, context_docs = rag_pipeline(question, index, documents)
            print(f"–û—Ç–≤–µ—Ç: {answer}")
            print(f"–ò—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–æ –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤: {len(context_docs)}")
        except Exception as exc:
            print(f"–û—à–∏–±–∫–∞: {exc}")
        # One separator per question, after either the answer or the error.
        print(separator)


# 3. Vector database construction
def create_vector_db(documents):
    """Embed ``documents`` and load them into a FAISS inner-product index.

    Args:
        documents: Passages to index, encoded with the notebook-level
            ``embedding_model``.

    Returns:
        A ``(index, documents)`` tuple: the populated index and the same
        ``documents`` object that was passed in.
    """
    print("–°–æ–∑–¥–∞–µ–º –≤–µ–∫—Ç–æ—Ä–Ω—É—é –±–∞–∑—É...")

    # One embedding row per document.
    doc_vectors = embedding_model.encode(documents)

    # Inner-product index sized to the embedding width.
    vector_index = faiss.IndexFlatIP(doc_vectors.shape[1])

    # Normalizing the rows (in place) makes inner product behave as cosine similarity.
    faiss.normalize_L2(doc_vectors)
    vector_index.add(doc_vectors)

    return vector_index, documents

# 4. Relevant-document search
def search_similar_documents(query, index, documents, top_k=3):
    """Return the documents most similar to ``query``.

    Args:
        query: Query text, encoded with the notebook-level ``embedding_model``.
        index: Index object whose ``search(vectors, k)`` returns
            ``(scores, indices)``.
        documents: Sequence of passages, positionally aligned with the
            vectors stored in ``index``.
        top_k: Maximum number of hits to request.

    Returns:
        A list of ``{'text': ..., 'score': ...}`` dicts in the order the index
        returned them. Hits whose position does not refer to an entry of
        ``documents`` are dropped, so the list may be shorter than ``top_k``.
    """
    # Embed and L2-normalize the query (in place) before searching.
    query_embedding = embedding_model.encode([query])
    faiss.normalize_L2(query_embedding)

    scores, indices = index.search(query_embedding, top_k)

    results = []
    for score, idx in zip(scores[0], indices[0]):
        # FAISS pads with -1 when the index holds fewer than top_k vectors.
        # The lower bound matters: documents[-1] would otherwise return the
        # last passage as a bogus hit.
        if 0 <= idx < len(documents):
            results.append({
                'text': documents[idx],
                'score': score
            })

    return results

# 5. Prompt construction with context
def create_prompt(question, context_docs):
    """Build the generation prompt from the question and the retrieved documents.

    Args:
        question: The user's question.
        context_docs: Iterable of dicts; the ``'text'`` entry of each becomes
            one ``- ``-prefixed line of the context section.

    Returns:
        The prompt string: a context section, a blank line, the question line,
        a blank line, and a trailing answer cue that ends with a space.
    """
    bullet_lines = [f"- {entry['text']}" for entry in context_docs]
    context = "\n".join(bullet_lines)

    # Adjacent literals are concatenated, giving the same text as one
    # triple-quoted f-string.
    return (
        "–ö–æ–Ω—Ç–µ–∫—Å—Ç:\n"
        f"{context}\n"
        "\n"
        f"–í–æ–ø—Ä–æ—Å: {question}\n"
        "\n"
        "–û—Ç–≤–µ—Ç: "
    )

# 6. Answer generation
def generate_answer(prompt):
    """Generate a continuation of ``prompt`` with the notebook-level model.

    The prompt is tokenized (truncated to 1024 tokens), moved to the model's
    device, and continued with nucleus sampling (``do_sample=True``), so
    repeated calls can return different text.

    Args:
        prompt: Full prompt text, e.g. the output of ``create_prompt``.

    Returns:
        The newly generated text only, stripped of surrounding whitespace.
    """
    inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)
    # The model is loaded with device_map="auto"; the input tensors must live
    # on the model's device (rag_answer already does this).
    inputs = inputs.to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )

    # Drop the prompt by token count. Slicing the decoded string by
    # len(prompt) goes wrong whenever decoding does not reproduce the prompt
    # character-for-character (special tokens, whitespace, truncation).
    new_tokens = outputs[0][prompt_length:]
    answer = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    return answer

# 7. Main RAG function
def rag_pipeline(question, index, documents):
    """Run retrieval, prompt construction and generation for one question.

    Progress and the retrieved documents (with their similarity scores) are
    printed along the way.

    Args:
        question: The user's question.
        index: Vector index passed through to ``search_similar_documents``.
        documents: Passages aligned with ``index``.

    Returns:
        A ``(answer, relevant_docs)`` tuple: the generated answer and the list
        of retrieved document dicts.
    """
    print(f"–í–æ–ø—Ä–æ—Å: {question}")

    # Step 1: retrieve the relevant documents.
    print("–ò—â–µ–º —Ä–µ–ª–µ–≤–∞–Ω—Ç–Ω—ã–µ –¥–æ–∫—É–º–µ–Ω—Ç—ã...")
    relevant_docs = search_similar_documents(question, index, documents)

    print("–ù–∞–π–¥–µ–Ω–Ω—ã–µ –¥–æ–∫—É–º–µ–Ω—Ç—ã:")
    for position, doc in enumerate(relevant_docs, start=1):
        print(f"{position}. {doc['text']} (—Å—Ö–æ–¥—Å—Ç–≤–æ: {doc['score']:.3f})")

    # Steps 2 and 3: build the prompt, then generate from it.
    print("\n–ì–µ–Ω–µ—Ä–∏—Ä—É–µ–º –æ—Ç–≤–µ—Ç...")
    answer = generate_answer(create_prompt(question, relevant_docs))

    return answer, relevant_docs

# 8. Running the system
def main():
    """Interactive question-answering loop over the demo knowledge base.

    Builds the knowledge base and vector index, then keeps reading questions
    from standard input. Blank input is ignored, one of the exit words ends
    the loop, and an exception raised while answering is printed without
    stopping the loop.
    """
    # Initialization
    index, documents = create_vector_db(create_knowledge_base())

    print("RAG —Å–∏—Å—Ç–µ–º–∞ –≥–æ—Ç–æ–≤–∞! –í–≤–µ–¥–∏—Ç–µ –≤–∞—à –≤–æ–ø—Ä–æ—Å (–∏–ª–∏ '–≤—ã—Ö–æ–¥' –¥–ª—è –∑–∞–≤–µ—Ä—à–µ–Ω–∏—è):")

    exit_words = ('–≤—ã—Ö–æ–¥', 'exit', 'quit')

    # Interactive loop
    while True:
        question = input("\n–í–∞—à –≤–æ–ø—Ä–æ—Å: ").strip()

        if question.lower() in exit_words:
            return
        if not question:
            continue

        try:
            answer, context_docs = rag_pipeline(question, index, documents)
            print(f"\n–û—Ç–≤–µ—Ç: {answer}")
            print(f"\n–ò—Å—Ç–æ—á–Ω–∏–∫–∏: {len(context_docs)} –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤")
        except Exception as exc:
            print(f"–û—à–∏–±–∫–∞: {exc}")

# Entry point: run the scripted test questions (the interactive main() is
# defined above but is not called here).
if __name__ == "__main__":
    run_test_questions()

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import requests
import json

# 1. Load the models
# NOTE(review): this cell appears to repeat the previous cell's model loading and
# definitions verbatim — confirm whether one of the two copies can be dropped.
# It also rebinds `model_name`, `tokenizer` and `model`, which the earlier
# `rag_answer` function reads as globals.
print("–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª–∏...")

# Embedding model (multilingual checkpoint; used here for Russian text)
embedding_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# Generation model, loaded in float16 with automatic device placement
model_name = "IlyaGusev/saiga_yandexgpt_8b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

# 2. Knowledge base preparation (example with Russian texts)
def create_knowledge_base():
    """Return the demo knowledge base as a fresh list of short fact passages.

    The passages are hard-coded and grouped by topic; every call builds and
    returns a new list, so callers may mutate the result freely.

    Returns:
        list[str]: one passage per fact, in the fixed order written below.
    """
    return [
        # Geography and cities
        "–ú–æ—Å–∫–≤–∞ - —Å—Ç–æ–ª–∏—Ü–∞ –†–æ—Å—Å–∏–∏, –∫—Ä—É–ø–Ω–µ–π—à–∏–π –≥–æ—Ä–æ–¥ —Å—Ç—Ä–∞–Ω—ã —Å –Ω–∞—Å–µ–ª–µ–Ω–∏–µ–º –±–æ–ª–µ–µ 12 –º–∏–ª–ª–∏–æ–Ω–æ–≤ —á–µ–ª–æ–≤–µ–∫. –û—Å–Ω–æ–≤–∞–Ω–∞ –≤ 1147 –≥–æ–¥—É –Æ—Ä–∏–µ–º –î–æ–ª–≥–æ—Ä—É–∫–∏–º.",
        "–°–∞–Ω–∫—Ç-–ü–µ—Ç–µ—Ä–±—É—Ä–≥ –±—ã–ª –æ—Å–Ω–æ–≤–∞–Ω –ü–µ—Ç—Ä–æ–º I –≤ 1703 –≥–æ–¥—É –∏ —è–≤–ª—è–µ—Ç—Å—è –∫—É–ª—å—Ç—É—Ä–Ω–æ–π —Å—Ç–æ–ª–∏—Ü–µ–π –†–æ—Å—Å–∏–∏. –ù–∞—Å–µ–ª–µ–Ω–∏–µ –æ–∫–æ–ª–æ 5 –º–∏–ª–ª–∏–æ–Ω–æ–≤ —á–µ–ª–æ–≤–µ–∫.",
        "–ù–æ–≤–æ—Å–∏–±–∏—Ä—Å–∫ - —Ç—Ä–µ—Ç–∏–π –ø–æ —á–∏—Å–ª–µ–Ω–Ω–æ—Å—Ç–∏ –Ω–∞—Å–µ–ª–µ–Ω–∏—è –≥–æ—Ä–æ–¥ –†–æ—Å—Å–∏–∏, –∫—Ä—É–ø–Ω–µ–π—à–∏–π –Ω–∞—É—á–Ω—ã–π –∏ –æ–±—Ä–∞–∑–æ–≤–∞—Ç–µ–ª—å–Ω—ã–π —Ü–µ–Ω—Ç—Ä –°–∏–±–∏—Ä–∏.",
        "–°–æ—á–∏ - –∫—É—Ä–æ—Ä—Ç–Ω—ã–π –≥–æ—Ä–æ–¥ –Ω–∞ —á–µ—Ä–Ω–æ–º–æ—Ä—Å–∫–æ–º –ø–æ–±–µ—Ä–µ–∂—å–µ –†–æ—Å—Å–∏–∏, –∏–∑–≤–µ—Å—Ç–Ω—ã–π —Å–≤–æ–∏–º–∏ –ø–ª—è–∂–∞–º–∏ –∏ –≥–æ—Ä–Ω–æ–ª—ã–∂–Ω—ã–º–∏ –∫—É—Ä–æ—Ä—Ç–∞–º–∏.",
        "–ö–∞–ª–∏–Ω–∏–Ω–≥—Ä–∞–¥ - —Å–∞–º—ã–π –∑–∞–ø–∞–¥–Ω—ã–π –≥–æ—Ä–æ–¥ –†–æ—Å—Å–∏–∏, —Ä–∞—Å–ø–æ–ª–æ–∂–µ–Ω–Ω—ã–π –º–µ–∂–¥—É –ü–æ–ª—å—à–µ–π –∏ –õ–∏—Ç–≤–æ–π –Ω–∞ –±–µ—Ä–µ–≥—É –ë–∞–ª—Ç–∏–π—Å–∫–æ–≥–æ –º–æ—Ä—è.",

        # Nature and geography
        "–ë–∞–π–∫–∞–ª - —Å–∞–º–æ–µ –≥–ª—É–±–æ–∫–æ–µ –æ–∑–µ—Ä–æ –≤ –º–∏—Ä–µ, —Ä–∞—Å–ø–æ–ª–æ–∂–µ–Ω–Ω–æ–µ –≤ –°–∏–±–∏—Ä–∏. –ì–ª—É–±–∏–Ω–∞ –¥–æ—Å—Ç–∏–≥–∞–µ—Ç 1642 –º–µ—Ç—Ä–æ–≤, —Å–æ–¥–µ—Ä–∂–∏—Ç 20% –º–∏—Ä–æ–≤—ã—Ö –∑–∞–ø–∞—Å–æ–≤ –ø—Ä–µ—Å–Ω–æ–π –≤–æ–¥—ã.",
        "–í–æ–ª–≥–∞ - —Å–∞–º–∞—è –¥–ª–∏–Ω–Ω–∞—è —Ä–µ–∫–∞ –≤ –ï–≤—Ä–æ–ø–µ, –ø—Ä–æ—Ç—è–∂–µ–Ω–Ω–æ—Å—Ç—å—é 3530 –∫–º. –ü—Ä–æ—Ç–µ–∫–∞–µ—Ç —á–µ—Ä–µ–∑ 15 —Å—É–±—ä–µ–∫—Ç–æ–≤ –†–æ—Å—Å–∏–π—Å–∫–æ–π –§–µ–¥–µ—Ä–∞—Ü–∏–∏.",
        "–≠–ª—å–±—Ä—É—Å - –≤—ã—Å–æ—á–∞–π—à–∞—è –≥–æ—Ä–Ω–∞—è –≤–µ—Ä—à–∏–Ω–∞ –†–æ—Å—Å–∏–∏ –∏ –ï–≤—Ä–æ–ø—ã, –≤—ã—Å–æ—Ç–∞ 5642 –º–µ—Ç—Ä–∞. –†–∞—Å–ø–æ–ª–æ–∂–µ–Ω –Ω–∞ –ö–∞–≤–∫–∞–∑–µ.",
        "–ö–∞–º—á–∞—Ç–∫–∞ - –ø–æ–ª—É–æ—Å—Ç—Ä–æ–≤ –Ω–∞ –î–∞–ª—å–Ω–µ–º –í–æ—Å—Ç–æ–∫–µ –†–æ—Å—Å–∏–∏, –∏–∑–≤–µ—Å—Ç–Ω—ã–π —Å–≤–æ–∏–º–∏ –≤—É–ª–∫–∞–Ω–∞–º–∏ –∏ –≥–µ–π–∑–µ—Ä–∞–º–∏.",
        "–¢–∞–π–≥–∞ - –∫—Ä—É–ø–Ω–µ–π—à–∏–π –≤ –º–∏—Ä–µ –ª–µ—Å–Ω–æ–π –º–∞—Å—Å–∏–≤, –∑–∞–Ω–∏–º–∞—é—â–∏–π –±–æ–ª—å—à—É—é —á–∞—Å—Ç—å –°–∏–±–∏—Ä–∏ –∏ –î–∞–ª—å–Ω–µ–≥–æ –í–æ—Å—Ç–æ–∫–∞.",

        # History
        "–í–µ–ª–∏–∫–∞—è –û—Ç–µ—á–µ—Å—Ç–≤–µ–Ω–Ω–∞—è –≤–æ–π–Ω–∞ –¥–ª–∏–ª–∞—Å—å —Å 1941 –ø–æ 1945 –≥–æ–¥ –∏ –∑–∞–≤–µ—Ä—à–∏–ª–∞—Å—å –ø–æ–±–µ–¥–æ–π –°–æ–≤–µ—Ç—Å–∫–æ–≥–æ –°–æ—é–∑–∞ –Ω–∞–¥ –Ω–∞—Ü–∏—Å—Ç—Å–∫–æ–π –ì–µ—Ä–º–∞–Ω–∏–µ–π.",
        "–û–∫—Ç—è–±—Ä—å—Å–∫–∞—è —Ä–µ–≤–æ–ª—é—Ü–∏—è 1917 –≥–æ–¥–∞ –ø—Ä–∏–≤–µ–ª–∞ –∫ —É—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω–∏—é —Å–æ–≤–µ—Ç—Å–∫–æ–π –≤–ª–∞—Å—Ç–∏ –≤ –†–æ—Å—Å–∏–∏ –∏ —Å–æ–∑–¥–∞–Ω–∏—é –°–°–°–†.",
        "–ü–µ—Ç—Ä I –í–µ–ª–∏–∫–∏–π - —Ä—É—Å—Å–∫–∏–π —Ü–∞—Ä—å, –ø—Ä–æ–≤–µ–ª –º–∞—Å—à—Ç–∞–±–Ω—ã–µ —Ä–µ—Ñ–æ—Ä–º—ã –∏ –æ—Å–Ω–æ–≤–∞–ª –°–∞–Ω–∫—Ç-–ü–µ—Ç–µ—Ä–±—É—Ä–≥. –ü—Ä–∞–≤–∏–ª —Å 1682 –ø–æ 1725 –≥–æ–¥.",
        "–ï–∫–∞—Ç–µ—Ä–∏–Ω–∞ II –í–µ–ª–∏–∫–∞—è - –∏–º–ø–µ—Ä–∞—Ç—Ä–∏—Ü–∞ –†–æ—Å—Å–∏–∏ —Å 1762 –ø–æ 1796 –≥–æ–¥, –∑–Ω–∞—á–∏—Ç–µ–ª—å–Ω–æ —Ä–∞—Å—à–∏—Ä–∏–ª–∞ —Ç–µ—Ä—Ä–∏—Ç–æ—Ä–∏—é –†–æ—Å—Å–∏–π—Å–∫–æ–π –∏–º–ø–µ—Ä–∏–∏.",

        # Culture and art
        "–≠—Ä–º–∏—Ç–∞–∂ –≤ –°–∞–Ω–∫—Ç-–ü–µ—Ç–µ—Ä–±—É—Ä–≥–µ - –æ–¥–∏–Ω –∏–∑ –∫—Ä—É–ø–Ω–µ–π—à–∏—Ö —Ö—É–¥–æ–∂–µ—Å—Ç–≤–µ–Ω–Ω—ã—Ö –º—É–∑–µ–µ–≤ –º–∏—Ä–∞, –æ—Å–Ω–æ–≤–∞–Ω–Ω—ã–π –ï–∫–∞—Ç–µ—Ä–∏–Ω–æ–π II –≤ 1764 –≥–æ–¥—É.",
        "–¢—Ä–µ—Ç—å—è–∫–æ–≤—Å–∫–∞—è –≥–∞–ª–µ—Ä–µ—è –≤ –ú–æ—Å–∫–≤–µ - –≥–ª–∞–≤–Ω—ã–π –º—É–∑–µ–π —Ä—É—Å—Å–∫–æ–≥–æ –∏–∑–æ–±—Ä–∞–∑–∏—Ç–µ–ª—å–Ω–æ–≥–æ –∏—Å–∫—É—Å—Å—Ç–≤–∞, –æ—Å–Ω–æ–≤–∞–Ω–Ω—ã–π –ü–∞–≤–ª–æ–º –¢—Ä–µ—Ç—å—è–∫–æ–≤—ã–º.",
        "–ë–æ–ª—å—à–æ–π —Ç–µ–∞—Ç—Ä –≤ –ú–æ—Å–∫–≤–µ - –æ–¥–∏–Ω –∏–∑ –≤–µ–¥—É—â–∏—Ö —Ç–µ–∞—Ç—Ä–æ–≤ –æ–ø–µ—Ä—ã –∏ –±–∞–ª–µ—Ç–∞ –≤ –º–∏—Ä–µ, –æ—Å–Ω–æ–≤–∞–Ω–Ω—ã–π –≤ 1776 –≥–æ–¥—É.",
        "–ê–ª–µ–∫—Å–∞–Ω–¥—Ä –ü—É—à–∫–∏–Ω - –≤–µ–ª–∏–∫–∏–π —Ä—É—Å—Å–∫–∏–π –ø–æ—ç—Ç, –æ—Å–Ω–æ–≤–æ–ø–æ–ª–æ–∂–Ω–∏–∫ —Å–æ–≤—Ä–µ–º–µ–Ω–Ω–æ–≥–æ —Ä—É—Å—Å–∫–æ–≥–æ –ª–∏—Ç–µ—Ä–∞—Ç—É—Ä–Ω–æ–≥–æ —è–∑—ã–∫–∞. –†–æ–¥–∏–ª—Å—è –≤ 1799 –≥–æ–¥—É.",
        "–õ–µ–≤ –¢–æ–ª—Å—Ç–æ–π - –∫–ª–∞—Å—Å–∏–∫ —Ä—É—Å—Å–∫–æ–π –ª–∏—Ç–µ—Ä–∞—Ç—É—Ä—ã, –∞–≤—Ç–æ—Ä —Ä–æ–º–∞–Ω–æ–≤ '–í–æ–π–Ω–∞ –∏ –º–∏—Ä' –∏ '–ê–Ω–Ω–∞ –ö–∞—Ä–µ–Ω–∏–Ω–∞'.",

        # Science and technology
        "–î–º–∏—Ç—Ä–∏–π –ú–µ–Ω–¥–µ–ª–µ–µ–≤ - —Ä—É—Å—Å–∫–∏–π —É—á–µ–Ω—ã–π, —Å–æ–∑–¥–∞—Ç–µ–ª—å –ø–µ—Ä–∏–æ–¥–∏—á–µ—Å–∫–æ–π —Å–∏—Å—Ç–µ–º—ã —Ö–∏–º–∏—á–µ—Å–∫–∏—Ö —ç–ª–µ–º–µ–Ω—Ç–æ–≤ –≤ 1869 –≥–æ–¥—É.",
        "–Æ—Ä–∏–π –ì–∞–≥–∞—Ä–∏–Ω - –ø–µ—Ä–≤—ã–π —á–µ–ª–æ–≤–µ–∫ –≤ –∫–æ—Å–º–æ—Å–µ, —Å–æ–≤–µ—Ä—à–∏–ª –ø–æ–ª–µ—Ç 12 –∞–ø—Ä–µ–ª—è 1961 –≥–æ–¥–∞ –Ω–∞ –∫–æ—Ä–∞–±–ª–µ '–í–æ—Å—Ç–æ–∫-1'.",
        "–ú–∏—Ö–∞–∏–ª –õ–æ–º–æ–Ω–æ—Å–æ–≤ - —Ä—É—Å—Å–∫–∏–π —É—á–µ–Ω—ã–π-—ç–Ω—Ü–∏–∫–ª–æ–ø–µ–¥–∏—Å—Ç, –æ—Å–Ω–æ–≤–∞—Ç–µ–ª—å –ú–æ—Å–∫–æ–≤—Å–∫–æ–≥–æ —É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç–∞ –≤ 1755 –≥–æ–¥—É.",
        "–°–µ—Ä–≥–µ–π –ö–æ—Ä–æ–ª–µ–≤ - —Å–æ–≤–µ—Ç—Å–∫–∏–π –∫–æ–Ω—Å—Ç—Ä—É–∫—Ç–æ—Ä —Ä–∞–∫–µ—Ç–Ω–æ-–∫–æ—Å–º–∏—á–µ—Å–∫–æ–π —Ç–µ—Ö–Ω–∏–∫–∏, —Ä—É–∫–æ–≤–æ–¥–∏—Ç–µ–ª—å –ø—Ä–æ–≥—Ä–∞–º–º—ã –ø–µ—Ä–≤–æ–≥–æ –ø–æ–ª–µ—Ç–∞ —á–µ–ª–æ–≤–µ–∫–∞ –≤ –∫–æ—Å–º–æ—Å.",

        # Economy and industry
        "–ì–∞–∑–ø—Ä–æ–º - –∫—Ä—É–ø–Ω–µ–π—à–∞—è –≥–∞–∑–æ–≤–∞—è –∫–æ–º–ø–∞–Ω–∏—è –º–∏—Ä–∞, –æ—Å–Ω–æ–≤–∞–Ω–∞ –≤ 1989 –≥–æ–¥—É. –®—Ç–∞–±-–∫–≤–∞—Ä—Ç–∏—Ä–∞ —Ä–∞—Å–ø–æ–ª–æ–∂–µ–Ω–∞ –≤ –ú–æ—Å–∫–≤–µ.",
        "–†–æ—Å–Ω–µ—Ñ—Ç—å - –æ–¥–Ω–∞ –∏–∑ –∫—Ä—É–ø–Ω–µ–π—à–∏—Ö –Ω–µ—Ñ—Ç—è–Ω—ã—Ö –∫–æ–º–ø–∞–Ω–∏–π –º–∏—Ä–∞, –æ—Å–Ω–æ–≤–∞–Ω–∞ –≤ 1993 –≥–æ–¥—É.",
        "–¢—Ä–∞–Ω—Å—Å–∏–±–∏—Ä—Å–∫–∞—è –º–∞–≥–∏—Å—Ç—Ä–∞–ª—å - —Å–∞–º–∞—è –¥–ª–∏–Ω–Ω–∞—è –∂–µ–ª–µ–∑–Ω–∞—è –¥–æ—Ä–æ–≥–∞ –≤ –º–∏—Ä–µ, –ø—Ä–æ—Ç—è–∂–µ–Ω–Ω–æ—Å—Ç—å—é 9288 –∫–º –æ—Ç –ú–æ—Å–∫–≤—ã –¥–æ –í–ª–∞–¥–∏–≤–æ—Å—Ç–æ–∫–∞.",
        "–†—É–±–ª—å - –Ω–∞—Ü–∏–æ–Ω–∞–ª—å–Ω–∞—è –≤–∞–ª—é—Ç–∞ –†–æ—Å—Å–∏–∏, –≤–≤–µ–¥–µ–Ω–∞ –≤ –æ–±—Ä–∞—â–µ–Ω–∏–µ –≤ 14 –≤–µ–∫–µ. –°–æ–≤—Ä–µ–º–µ–Ω–Ω—ã–π –∫–æ–¥ –≤–∞–ª—é—Ç—ã - RUB.",

        # Politics
        "–í–ª–∞–¥–∏–º–∏—Ä –ü—É—Ç–∏–Ω —è–≤–ª—è–µ—Ç—Å—è –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç–æ–º –†–æ—Å—Å–∏–π—Å–∫–æ–π –§–µ–¥–µ—Ä–∞—Ü–∏–∏ —Å 2012 –≥–æ–¥–∞. –†–æ–¥–∏–ª—Å—è –≤ 1952 –≥–æ–¥—É –≤ –õ–µ–Ω–∏–Ω–≥—Ä–∞–¥–µ.",
        "–ì–æ—Å—É–¥–∞—Ä—Å—Ç–≤–µ–Ω–Ω–∞—è –î—É–º–∞ - –Ω–∏–∂–Ω—è—è –ø–∞–ª–∞—Ç–∞ –ø–∞—Ä–ª–∞–º–µ–Ω—Ç–∞ –†–æ—Å—Å–∏–∏, —Å–æ—Å—Ç–æ–∏—Ç –∏–∑ 450 –¥–µ–ø—É—Ç–∞—Ç–æ–≤.",
        "–ö–æ–Ω—Å—Ç–∏—Ç—É—Ü–∏—è –†–æ—Å—Å–∏–∏ –±—ã–ª–∞ –ø—Ä–∏–Ω—è—Ç–∞ 12 –¥–µ–∫–∞–±—Ä—è 1993 –≥–æ–¥–∞ –∏ —è–≤–ª—è–µ—Ç—Å—è –æ—Å–Ω–æ–≤–Ω—ã–º –∑–∞–∫–æ–Ω–æ–º —Å—Ç—Ä–∞–Ω—ã.",
        "–ö—Ä–µ–º–ª—å - –∏—Å—Ç–æ—Ä–∏—á–µ—Å–∫–∏–π –∫–æ–º–ø–ª–µ–∫—Å –≤ –ú–æ—Å–∫–≤–µ, —Ä–µ–∑–∏–¥–µ–Ω—Ü–∏—è –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç–∞ –†–æ—Å—Å–∏–∏. –ü–æ—Å—Ç—Ä–æ–µ–Ω –≤ 15 –≤–µ–∫–µ.",

        # Education
        "–ú–ì–£ –∏–º–µ–Ω–∏ –õ–æ–º–æ–Ω–æ—Å–æ–≤–∞ - —Å—Ç–∞—Ä–µ–π—à–∏–π –∏ –∫—Ä—É–ø–Ω–µ–π—à–∏–π —É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç –†–æ—Å—Å–∏–∏, –æ—Å–Ω–æ–≤–∞–Ω –≤ 1755 –≥–æ–¥—É.",
        "–ï–ì–≠ - –µ–¥–∏–Ω—ã–π –≥–æ—Å—É–¥–∞—Ä—Å—Ç–≤–µ–Ω–Ω—ã–π —ç–∫–∑–∞–º–µ–Ω, –æ–±—è–∑–∞—Ç–µ–ª—å–Ω—ã–π –¥–ª—è –≤—Å–µ—Ö –≤—ã–ø—É—Å–∫–Ω–∏–∫–æ–≤ —à–∫–æ–ª –†–æ—Å—Å–∏–∏ —Å 2009 –≥–æ–¥–∞.",
        "–†–æ—Å—Å–∏–π—Å–∫–∞—è –∞–∫–∞–¥–µ–º–∏—è –Ω–∞—É–∫ - –≥–ª–∞–≤–Ω–∞—è –Ω–∞—É—á–Ω–∞—è –æ—Ä–≥–∞–Ω–∏–∑–∞—Ü–∏—è —Å—Ç—Ä–∞–Ω—ã, –æ—Å–Ω–æ–≤–∞–Ω–∞ –ü–µ—Ç—Ä–æ–º I –≤ 1724 –≥–æ–¥—É.",

        # Sports
        "–•–æ–∫–∫–µ–π —Å —à–∞–π–±–æ–π - –æ–¥–∏–Ω –∏–∑ —Å–∞–º—ã—Ö –ø–æ–ø—É–ª—è—Ä–Ω—ã—Ö –≤–∏–¥–æ–≤ —Å–ø–æ—Ä—Ç–∞ –≤ –†–æ—Å—Å–∏–∏. –°–±–æ—Ä–Ω–∞—è –†–æ—Å—Å–∏–∏ –º–Ω–æ–≥–æ–∫—Ä–∞—Ç–Ω—ã–π —á–µ–º–ø–∏–æ–Ω –º–∏—Ä–∞.",
        "–§–∏–≥—É—Ä–Ω–æ–µ –∫–∞—Ç–∞–Ω–∏–µ - —Ç—Ä–∞–¥–∏—Ü–∏–æ–Ω–Ω–æ —Å–∏–ª—å–Ω—ã–π –≤–∏–¥ —Å–ø–æ—Ä—Ç–∞ –¥–ª—è –†–æ—Å—Å–∏–∏, –º–Ω–æ–∂–µ—Å—Ç–≤–æ –æ–ª–∏–º–ø–∏–π—Å–∫–∏—Ö —á–µ–º–ø–∏–æ–Ω–æ–≤.",
        "–§—É—Ç–±–æ–ª—å–Ω—ã–π –∫–ª—É–± –ó–µ–Ω–∏—Ç –∏–∑ –°–∞–Ω–∫—Ç-–ü–µ—Ç–µ—Ä–±—É—Ä–≥–∞ - –æ–¥–∏–Ω –∏–∑ –≤–µ–¥—É—â–∏—Ö —Ñ—É—Ç–±–æ–ª—å–Ω—ã—Ö –∫–ª—É–±–æ–≤ –†–æ—Å—Å–∏–∏.",
        "–û–ª–∏–º–ø–∏–∞–¥–∞ 1980 –≥–æ–¥–∞ –ø—Ä–æ–≤–æ–¥–∏–ª–∞—Å—å –≤ –ú–æ—Å–∫–≤–µ, –±—ã–ª–∞ –ø–µ—Ä–≤–æ–π –û–ª–∏–º–ø–∏–∞–¥–æ–π –≤ —Å–æ—Ü–∏–∞–ª–∏—Å—Ç–∏—á–µ—Å–∫–æ–π —Å—Ç—Ä–∞–Ω–µ.",

        # Traditions and holidays
        "–î–µ–Ω—å –ü–æ–±–µ–¥—ã –æ—Ç–º–µ—á–∞–µ—Ç—Å—è 9 –º–∞—è –≤ —á–µ—Å—Ç—å –ø–æ–±–µ–¥—ã –≤ –í–µ–ª–∏–∫–æ–π –û—Ç–µ—á–µ—Å—Ç–≤–µ–Ω–Ω–æ–π –≤–æ–π–Ω–µ. –ì–ª–∞–≤–Ω—ã–π –ø–∞—Ä–∞–¥ –ø—Ä–æ—Ö–æ–¥–∏—Ç –Ω–∞ –ö—Ä–∞—Å–Ω–æ–π –ø–ª–æ—â–∞–¥–∏.",
        "–ú–∞—Å–ª–µ–Ω–∏—Ü–∞ - —Ç—Ä–∞–¥–∏—Ü–∏–æ–Ω–Ω—ã–π —Ä—É—Å—Å–∫–∏–π –ø—Ä–∞–∑–¥–Ω–∏–∫ –ø—Ä–æ–≤–æ–¥–æ–≤ –∑–∏–º—ã, –æ—Ç–º–µ—á–∞–µ—Ç—Å—è –∑–∞ –Ω–µ–¥–µ–ª—é –¥–æ –í–µ–ª–∏–∫–æ–≥–æ –ø–æ—Å—Ç–∞.",
        "–ù–æ–≤—ã–π –≥–æ–¥ - –≥–ª–∞–≤–Ω—ã–π –ø—Ä–∞–∑–¥–Ω–∏–∫ –≤ –†–æ—Å—Å–∏–∏, –æ—Ç–º–µ—á–∞–µ—Ç—Å—è –≤ –Ω–æ—á—å —Å 31 –¥–µ–∫–∞–±—Ä—è –Ω–∞ 1 —è–Ω–≤–∞—Ä—è.",
        "–î–µ–Ω—å –†–æ—Å—Å–∏–∏ - –≥–æ—Å—É–¥–∞—Ä—Å—Ç–≤–µ–Ω–Ω—ã–π –ø—Ä–∞–∑–¥–Ω–∏–∫ 12 –∏—é–Ω—è, –æ—Ç–º–µ—á–∞–µ—Ç—Å—è —Å 1992 –≥–æ–¥–∞.",
    ]

# Sample questions used to exercise the RAG system (iterated by run_test_questions)
test_questions = [
    "–ö–∞–∫–∞—è —Å–∞–º–∞—è –¥–ª–∏–Ω–Ω–∞—è —Ä–µ–∫–∞ –≤ –ï–≤—Ä–æ–ø–µ?",
    "–ö–æ–≥–¥–∞ –±—ã–ª –æ—Å–Ω–æ–≤–∞–Ω –°–∞–Ω–∫—Ç-–ü–µ—Ç–µ—Ä–±—É—Ä–≥?",
    "–ö—Ç–æ –±—ã–ª –ø–µ—Ä–≤—ã–º —á–µ–ª–æ–≤–µ–∫–æ–º –≤ –∫–æ—Å–º–æ—Å–µ?",
    "–°–∫–æ–ª—å–∫–æ —á–µ–ª–æ–≤–µ–∫ –∂–∏–≤–µ—Ç –≤ –ú–æ—Å–∫–≤–µ?",
    "–ö–∞–∫–æ–µ –æ–∑–µ—Ä–æ —Å–∞–º–æ–µ –≥–ª—É–±–æ–∫–æ–µ –≤ –º–∏—Ä–µ?",
    "–ö—Ç–æ —Å–æ–∑–¥–∞–ª –ø–µ—Ä–∏–æ–¥–∏—á–µ—Å–∫—É—é —Ç–∞–±–ª–∏—Ü—É —ç–ª–µ–º–µ–Ω—Ç–æ–≤?",
    "–ö–∞–∫ –Ω–∞–∑—ã–≤–∞–µ—Ç—Å—è –≤–∞–ª—é—Ç–∞ –†–æ—Å—Å–∏–∏?",
    "–ö–æ–≥–¥–∞ –±—ã–ª–∞ –û–∫—Ç—è–±—Ä—å—Å–∫–∞—è —Ä–µ–≤–æ–ª—é—Ü–∏—è?",
    "–ö–∞–∫–æ–π —É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç —Å–∞–º—ã–π —Å—Ç–∞—Ä—ã–π –≤ –†–æ—Å—Å–∏–∏?",
    "–ß—Ç–æ —Ç–∞–∫–æ–µ –≠—Ä–º–∏—Ç–∞–∂?",
    "–ö—Ç–æ —è–≤–ª—è–µ—Ç—Å—è –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç–æ–º –†–æ—Å—Å–∏–∏?",
    "–ö–∞–∫–æ–π –ø—Ä–∞–∑–¥–Ω–∏–∫ –æ—Ç–º–µ—á–∞–µ—Ç—Å—è 9 –º–∞—è?",
    "–°–∫–æ–ª—å–∫–æ –∫–∏–ª–æ–º–µ—Ç—Ä–æ–≤ —Å–æ—Å—Ç–∞–≤–ª—è–µ—Ç –¢—Ä–∞–Ω—Å—Å–∏–±–∏—Ä—Å–∫–∞—è –º–∞–≥–∏—Å—Ç—Ä–∞–ª—å?",
    "–ö—Ç–æ –∞–≤—Ç–æ—Ä —Ä–æ–º–∞–Ω–∞ '–í–æ–π–Ω–∞ –∏ –º–∏—Ä'?",
    "–í –∫–∞–∫–æ–º –≥–æ–¥—É –Æ—Ä–∏–π –ì–∞–≥–∞—Ä–∏–Ω –ø–æ–ª–µ—Ç–µ–ª –≤ –∫–æ—Å–º–æ—Å?"
]

def run_test_questions():
    """Run every entry of ``test_questions`` through the RAG pipeline and print the outcome.

    The knowledge base and its vector index are built once up front. For each
    question the answer and the number of context documents are printed; an
    exception raised for one question is printed and the loop moves on to the
    next question.
    """
    index, documents = create_vector_db(create_knowledge_base())

    banner = "=" * 50
    print(banner)
    print("–¢–ï–°–¢–ò–†–û–í–ê–ù–ò–ï RAG –°–ò–°–¢–ï–ú–´")
    print(banner)

    separator = "-" * 30
    for number, question in enumerate(test_questions, 1):
        print(f"\n{number}. –í–æ–ø—Ä–æ—Å: {question}")
        try:
            answer, context_docs = rag_pipeline(question, index, documents)
            print(f"–û—Ç–≤–µ—Ç: {answer}")
            print(f"–ò—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–æ –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤: {len(context_docs)}")
        except Exception as exc:
            print(f"–û—à–∏–±–∫–∞: {exc}")
        # One separator per question, after either the answer or the error.
        print(separator)


# 3. Vector database construction
def create_vector_db(documents):
    """Embed ``documents`` and load them into a FAISS inner-product index.

    Args:
        documents: Passages to index, encoded with the notebook-level
            ``embedding_model``.

    Returns:
        A ``(index, documents)`` tuple: the populated index and the same
        ``documents`` object that was passed in.
    """
    print("–°–æ–∑–¥–∞–µ–º –≤–µ–∫—Ç–æ—Ä–Ω—É—é –±–∞–∑—É...")

    # One embedding row per document.
    doc_vectors = embedding_model.encode(documents)

    # Inner-product index sized to the embedding width.
    vector_index = faiss.IndexFlatIP(doc_vectors.shape[1])

    # Normalizing the rows (in place) makes inner product behave as cosine similarity.
    faiss.normalize_L2(doc_vectors)
    vector_index.add(doc_vectors)

    return vector_index, documents

# 4. Relevant-document search
def search_similar_documents(query, index, documents, top_k=3):
    """Return the documents most similar to ``query``.

    Args:
        query: Query text, encoded with the notebook-level ``embedding_model``.
        index: Index object whose ``search(vectors, k)`` returns
            ``(scores, indices)``.
        documents: Sequence of passages, positionally aligned with the
            vectors stored in ``index``.
        top_k: Maximum number of hits to request.

    Returns:
        A list of ``{'text': ..., 'score': ...}`` dicts in the order the index
        returned them. Hits whose position does not refer to an entry of
        ``documents`` are dropped, so the list may be shorter than ``top_k``.
    """
    # Embed and L2-normalize the query (in place) before searching.
    query_embedding = embedding_model.encode([query])
    faiss.normalize_L2(query_embedding)

    scores, indices = index.search(query_embedding, top_k)

    results = []
    for score, idx in zip(scores[0], indices[0]):
        # FAISS pads with -1 when the index holds fewer than top_k vectors.
        # The lower bound matters: documents[-1] would otherwise return the
        # last passage as a bogus hit.
        if 0 <= idx < len(documents):
            results.append({
                'text': documents[idx],
                'score': score
            })

    return results

# 5. –°–æ–∑–¥–∞–Ω–∏–µ –ø—Ä–æ–º–ø—Ç–∞ —Å –∫–æ–Ω—Ç–µ–∫—Å—Ç–æ–º
def create_prompt(question, context_docs):
    """–°–æ–∑–¥–∞–µ–º –ø—Ä–æ–º–ø—Ç –¥–ª—è –º–æ–¥–µ–ª–∏ —Å –∫–æ–Ω—Ç–µ–∫—Å—Ç–æ–º"""
    context = "\n".join([f"- {doc['text']}" for doc in context_docs])
    
    prompt = f"""–ö–æ–Ω—Ç–µ–∫—Å—Ç:
{context}

–í–æ–ø—Ä–æ—Å: {question}

–û—Ç–≤–µ—Ç: """
    
    return prompt

# 6. Generating the answer
def generate_answer(prompt):
    """Generate a sampled completion for ``prompt`` and return only the new text.

    Args:
        prompt: Full prompt string; it is truncated to 1024 tokens.

    Returns:
        The decoded continuation, without the prompt and stripped of
        surrounding whitespace.
    """
    # The inputs must be on the same device as the model (it may have been
    # placed on an accelerator), otherwise generate() fails or warns.
    inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )

    # Drop the prompt by token count rather than by character count: the decoded
    # prompt is not guaranteed to equal `prompt` (truncation, tokenizer
    # normalisation), so slicing the string by len(prompt) could cut the answer.
    prompt_length = inputs["input_ids"].shape[1]
    answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

    return answer

# 7. Main RAG function
def rag_pipeline(question, index, documents):
    """Answer ``question`` with retrieval-augmented generation, logging each step.

    Retrieves the most similar documents, prints them with their similarity
    scores, builds a prompt from them and generates the answer.

    Returns:
        An ``(answer, relevant_docs)`` tuple.
    """
    print(f"–í–æ–ø—Ä–æ—Å: {question}")

    # Step 1: retrieval
    print("–ò—â–µ–º —Ä–µ–ª–µ–≤–∞–Ω—Ç–Ω—ã–µ –¥–æ–∫—É–º–µ–Ω—Ç—ã...")
    hits = search_similar_documents(question, index, documents)

    print("–ù–∞–π–¥–µ–Ω–Ω—ã–µ –¥–æ–∫—É–º–µ–Ω—Ç—ã:")
    for rank, hit in enumerate(hits, start=1):
        print(f"{rank}. {hit['text']} (—Å—Ö–æ–¥—Å—Ç–≤–æ: {hit['score']:.3f})")

    # Step 2: prompt construction
    full_prompt = create_prompt(question, hits)

    # Step 3: generation
    print("\n–ì–µ–Ω–µ—Ä–∏—Ä—É–µ–º –æ—Ç–≤–µ—Ç...")
    return generate_answer(full_prompt), hits

# 8. Running the system
def main():
    """Interactive question-answering loop over the knowledge base.

    Builds the knowledge base and vector index, then reads questions from
    standard input until one of the exit words is entered. Blank input is
    ignored; errors while answering are printed and the loop continues.
    """
    # Initialisation
    vector_index, corpus = create_vector_db(create_knowledge_base())

    print("RAG —Å–∏—Å—Ç–µ–º–∞ –≥–æ—Ç–æ–≤–∞! –í–≤–µ–¥–∏—Ç–µ –≤–∞—à –≤–æ–ø—Ä–æ—Å (–∏–ª–∏ '–≤—ã—Ö–æ–¥' –¥–ª—è –∑–∞–≤–µ—Ä—à–µ–Ω–∏—è):")

    exit_words = ['–≤—ã—Ö–æ–¥', 'exit', 'quit']

    # Interactive loop
    while True:
        user_query = input("\n–í–∞—à –≤–æ–ø—Ä–æ—Å: ").strip()

        if user_query.lower() in exit_words:
            break
        if not user_query:
            continue

        try:
            reply, sources = rag_pipeline(user_query, vector_index, corpus)
            print(f"\n–û—Ç–≤–µ—Ç: {reply}")
            print(f"\n–ò—Å—Ç–æ—á–Ω–∏–∫–∏: {len(sources)} –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤")
        except Exception as err:
            print(f"–û—à–∏–±–∫–∞: {err}")

# Entry point: when this code runs as the main module, execute the prepared
# test questions (the interactive main() loop is not started here).
if __name__ == "__main__":
    run_test_questions()

In [None]:
# quick_test.py - –±—ã—Å—Ç—Ä—ã–π —Ç–µ—Å—Ç —Å–∏—Å—Ç–µ–º—ã
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from datasets import load_dataset

def quick_rag_test():
    """Smoke-test the whole RAG flow on a small slice of real news.

    Loads the embedding and generation models, reads the first 500 rows of
    ``IlyaGusev/ru_news`` (keeping rows that have both a title and a text),
    indexes them in a FAISS inner-product index over L2-normalised embeddings,
    then answers a few fixed questions and prints each answer.
    """

    print("üöÄ –ë—ã—Å—Ç—Ä–∞—è –∑–∞–≥—Ä—É–∑–∫–∞ RAG —Å–∏—Å—Ç–µ–º—ã...")

    # Load the models
    embedding_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

    tokenizer = AutoTokenizer.from_pretrained("IlyaGusev/saiga_yandexgpt_8b")
    model = AutoModelForCausalLM.from_pretrained(
        "IlyaGusev/saiga_yandexgpt_8b",
        torch_dtype=torch.float16,
        device_map="auto"
    )

    # Load the dataset
    print("üì∞ –ó–∞–≥—Ä—É–∂–∞–µ–º –Ω–æ–≤–æ—Å—Ç–∏...")
    dataset = load_dataset("IlyaGusev/ru_news", split="train[:500]")
    documents = []

    for item in dataset:
        text = item.get('text', '')
        title = item.get('title', '')
        if title and text:
            documents.append(f"{title}. {text[:200]}...")

    print(f"‚úÖ –ó–∞–≥—Ä—É–∂–µ–Ω–æ {len(documents)} –Ω–æ–≤–æ—Å—Ç–µ–π")

    # Build the vector index; normalised vectors make inner product a cosine score.
    embeddings = embedding_model.encode(documents)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)
    faiss.normalize_L2(embeddings)
    index.add(embeddings)

    # Simple test questions
    questions = [
        "–ö–∞–∫–∏–µ —ç–∫–æ–Ω–æ–º–∏—á–µ—Å–∫–∏–µ –Ω–æ–≤–æ—Å—Ç–∏?",
        "–ß—Ç–æ –Ω–æ–≤–æ–≥–æ –≤ –ø–æ–ª–∏—Ç–∏–∫–µ?",
        "–ö–∞–∫–∏–µ –∫—É–ª—å—Ç—É—Ä–Ω—ã–µ —Å–æ–±—ã—Ç–∏—è?",
        "–ö–∞–∫–∏–µ —Å–ø–æ—Ä—Ç–∏–≤–Ω—ã–µ –Ω–æ–≤–æ—Å—Ç–∏?"
    ]

    print("\nüß™ –¢–µ—Å—Ç–∏—Ä—É–µ–º —Å–∏—Å—Ç–µ–º—É...")

    for question in questions:
        print(f"\n‚ùì –í–æ–ø—Ä–æ—Å: {question}")

        # Retrieve the two closest documents
        query_embedding = embedding_model.encode([question])
        faiss.normalize_L2(query_embedding)
        _, indices = index.search(query_embedding, 2)

        # FAISS pads missing results with -1; without the lower bound,
        # documents[-1] would be injected into the context as a fake hit.
        context = "\n".join([f"- {documents[idx]}" for idx in indices[0] if 0 <= idx < len(documents)])

        # NOTE(review): the turn markers below are hard-coded; confirm they match
        # the chat template this tokenizer/model actually expects.
        prompt = f"""<bos><start_of_turn>user
–ù–∞ –æ—Å–Ω–æ–≤–µ –Ω–æ–≤–æ—Å—Ç–µ–π –æ—Ç–≤–µ—Ç—å –Ω–∞ –≤–æ–ø—Ä–æ—Å:

{context}

–í–æ–ø—Ä–æ—Å: {question}

–ö—Ä–∞—Ç–∫–∏–π –æ—Ç–≤–µ—Ç:<end_of_turn>
<start_of_turn>model
"""

        # device_map="auto" may place the model on an accelerator; the inputs
        # have to be moved to the same device before generation.
        inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True).to(model.device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=200,
                do_sample=True,
                temperature=0.7
            )

        # Decode only the newly generated tokens. Searching the decoded text for
        # a marker silently printed nothing whenever the marker was missing.
        prompt_length = inputs["input_ids"].shape[1]
        answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
        answer = answer.replace("<end_of_turn>", "").strip()
        print(f"ü§ñ –û—Ç–≤–µ—Ç: {answer}")

# Entry point: run the quick smoke test when this code runs as the main module.
if __name__ == "__main__":
    quick_rag_test()

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import requests
import json
from datasets import load_dataset

# 1. Loading the models
print("–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª–∏...")

# Embedding model (multilingual; used here for Russian text)
embedding_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# Generation model: loaded in float16 with automatic device placement
model_name = "IlyaGusev/saiga_yandexgpt_8b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

# 2. Loading a real news dataset
def create_knowledge_base():
    """Build the list of document strings from the ``IlyaGusev/ru_news`` dataset.

    Reads the first 1000 training rows. A row with both title and text becomes
    ``"<title>. <first 300 chars of text>..."``; a row with only text is cut to
    400 characters; a row with only a title is kept as is; empty rows are
    skipped. If anything fails, the error is printed and a small hard-coded
    list of sample news sentences is returned instead.
    """
    print("–ó–∞–≥—Ä—É–∂–∞–µ–º –¥–∞—Ç–∞—Å–µ—Ç IlyaGusev/ru_news...")

    def _to_document(record):
        """Convert one dataset row to a document string, or None if it is empty."""
        body = record.get('text', '')
        headline = record.get('title', '')
        if headline and body:
            return f"{headline}. {body[:300]}..."  # cap the length
        if body:
            return body[:400] + "..."
        if headline:
            return headline
        return None

    try:
        # Take the first 1000 news items
        dataset = load_dataset("IlyaGusev/ru_news", split="train[:1000]")

        documents = [
            doc for doc in (_to_document(row) for row in dataset) if doc is not None
        ]

        print(f"–ó–∞–≥—Ä—É–∂–µ–Ω–æ {len(documents)} –Ω–æ–≤–æ—Å—Ç–µ–π –∏–∑ –¥–∞—Ç–∞—Å–µ—Ç–∞")
        return documents

    except Exception as load_error:
        print(f"–û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏ –¥–∞—Ç–∞—Å–µ—Ç–∞: {load_error}")
        # Fallback: a handful of sample documents
        return [
            "–†–æ—Å—Å–∏—è –∏ –ö–∏—Ç–∞–π —É—Å–∏–ª–∏–≤–∞—é—Ç —ç–∫–æ–Ω–æ–º–∏—á–µ—Å–∫–æ–µ —Å–æ—Ç—Ä—É–¥–Ω–∏—á–µ—Å—Ç–≤–æ –≤ —É—Å–ª–æ–≤–∏—è—Ö —Å–∞–Ω–∫—Ü–∏–π.",
            "–¶–µ–Ω—Ç—Ä–æ–±–∞–Ω–∫ –†–æ—Å—Å–∏–∏ —Å–æ—Ö—Ä–∞–Ω–∏–ª –∫–ª—é—á–µ–≤—É—é —Å—Ç–∞–≤–∫—É –Ω–∞ –ø—Ä–µ–∂–Ω–µ–º —É—Ä–æ–≤–Ω–µ.",
            "–í –ú–æ—Å–∫–≤–µ –æ—Ç–∫—Ä—ã–ª–∞—Å—å –Ω–æ–≤–∞—è –≤–µ—Ç–∫–∞ –º–µ—Ç—Ä–æ, —Å–æ–µ–¥–∏–Ω—è—é—â–∞—è —Ü–µ–Ω—Ç—Ä —Å —Å–ø–∞–ª—å–Ω—ã–º–∏ —Ä–∞–π–æ–Ω–∞–º–∏.",
            "–£—á–µ–Ω—ã–µ —Ä–∞–∑—Ä–∞–±–æ—Ç–∞–ª–∏ –Ω–æ–≤—É—é –≤–∞–∫—Ü–∏–Ω—É –æ—Ç –≥—Ä–∏–ø–ø–∞ —Å –ø–æ–≤—ã—à–µ–Ω–Ω–æ–π —ç—Ñ—Ñ–µ–∫—Ç–∏–≤–Ω–æ—Å—Ç—å—é.",
            "–¶–µ–Ω—ã –Ω–∞ –Ω–µ—Ñ—Ç—å –≤—ã—Ä–æ—Å–ª–∏ –Ω–∞ –º–∏—Ä–æ–≤—ã—Ö —Ä—ã–Ω–∫–∞—Ö –∏–∑-–∑–∞ —Å–æ–∫—Ä–∞—â–µ–Ω–∏—è –¥–æ–±—ã—á–∏.",
        ]

# Sample questions for testing (adapted to the news domain)
test_questions = [
    "–ö–∞–∫–∏–µ –ø–æ—Å–ª–µ–¥–Ω–∏–µ –Ω–æ–≤–æ—Å—Ç–∏ –æ —ç–∫–æ–Ω–æ–º–∏–∫–µ –†–æ—Å—Å–∏–∏?",
    "–ß—Ç–æ –ø—Ä–æ–∏—Å—Ö–æ–¥–∏—Ç —Å –∫—É—Ä—Å–æ–º —Ä—É–±–ª—è?",
    "–ö–∞–∫–∏–µ –Ω–æ–≤—ã–µ –ø—Ä–æ–µ–∫—Ç—ã –≤ –º–µ—Ç—Ä–æ –ú–æ—Å–∫–≤—ã?",
    "–ö–∞–∫–∏–µ –Ω–∞—É—á–Ω—ã–µ —Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∏ –ø–æ—Å–ª–µ–¥–Ω–µ–≥–æ –≤—Ä–µ–º–µ–Ω–∏?",
    "–ö–∞–∫ –∏–∑–º–µ–Ω–∏–ª–∏—Å—å —Ü–µ–Ω—ã –Ω–∞ –Ω–µ—Ñ—Ç—å?",
    "–ö–∞–∫–∏–µ –º–µ–∂–¥—É–Ω–∞—Ä–æ–¥–Ω—ã–µ –æ—Ç–Ω–æ—à–µ–Ω–∏—è —É –†–æ—Å—Å–∏–∏?",
    "–ß—Ç–æ –Ω–æ–≤–æ–≥–æ –≤ –±–∞–Ω–∫–æ–≤—Å–∫–æ–π —Å–∏—Å—Ç–µ–º–µ?",
    "–ö–∞–∫–∏–µ —Å–æ–±—ã—Ç–∏—è –≤ –∫—É–ª—å—Ç—É—Ä–Ω–æ–π –∂–∏–∑–Ω–∏?",
    "–ö–∞–∫–∏–µ —Å–ø–æ—Ä—Ç–∏–≤–Ω—ã–µ –Ω–æ–≤–æ—Å—Ç–∏?",
    "–ß—Ç–æ –ø—Ä–æ–∏—Å—Ö–æ–¥–∏—Ç –≤ —Å—Ñ–µ—Ä–µ —Ç–µ—Ö–Ω–æ–ª–æ–≥–∏–π?",
    "–ö–∞–∫–∏–µ –∏–∑–º–µ–Ω–µ–Ω–∏—è –≤ –æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–∏?",
    "–ö–∞–∫–∏–µ –º–µ–¥–∏—Ü–∏–Ω—Å–∫–∏–µ –Ω–æ–≤–æ—Å—Ç–∏?",
    "–ö–∞–∫–∏–µ –ø–æ–ª–∏—Ç–∏—á–µ—Å–∫–∏–µ —Å–æ–±—ã—Ç–∏—è?",
    "–ß—Ç–æ –Ω–æ–≤–æ–≥–æ –≤ —Å—Ç—Ä–æ–∏—Ç–µ–ª—å—Å—Ç–≤–µ?",
    "–ö–∞–∫–∏–µ —ç–∫–æ–ª–æ–≥–∏—á–µ—Å–∫–∏–µ –ø—Ä–æ–±–ª–µ–º—ã –æ–±—Å—É–∂–¥–∞—é—Ç—Å—è?"
]

def run_test_questions():
    """Run every entry of ``test_questions`` through the RAG pipeline and print the results.

    Builds the knowledge base and vector index, then for each question prints
    the answer, the number of context documents and a 100-character preview of
    each source with its similarity score. A failure on one question is
    printed and does not stop the loop.
    """
    vector_index, corpus = create_vector_db(create_knowledge_base())

    banner = "=" * 60
    divider = "-" * 80

    print(banner)
    print("–¢–ï–°–¢–ò–†–û–í–ê–ù–ò–ï RAG –°–ò–°–¢–ï–ú–´ –° –†–ï–ê–õ–¨–ù–´–ú–ò –ù–û–í–û–°–¢–Ø–ú–ò")
    print(banner)

    for number, query in enumerate(test_questions, start=1):
        print(f"\n{number}. –í–æ–ø—Ä–æ—Å: {query}")
        try:
            reply, sources = rag_pipeline(query, vector_index, corpus)
            print(f"ü§ñ –û—Ç–≤–µ—Ç: {reply}")
            print(f"üìä –ò—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–æ –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤: {len(sources)}")
            print("üì∞ –ò—Å—Ç–æ—á–Ω–∏–∫–∏:")
            for rank, source in enumerate(sources, start=1):
                print(f"   {rank}. {source['text'][:100]}... (—Å—Ö–æ–¥—Å—Ç–≤–æ: {source['score']:.3f})")
            print(divider)
        except Exception as err:
            # Report the failure for this question and move on to the next one.
            print(f"‚ùå –û—à–∏–±–∫–∞: {err}")
            print(divider)

# 3. Building the vector database
def create_vector_db(documents):
    """Embed ``documents`` and store the vectors in a FAISS inner-product index.

    The embeddings are L2-normalised in place before being added, so the
    inner-product scores returned by the index are cosine similarities.

    Returns:
        A ``(index, documents)`` tuple; ``documents`` is returned unchanged.
    """
    print("–°–æ–∑–¥–∞–µ–º –≤–µ–∫—Ç–æ—Ä–Ω—É—é –±–∞–∑—É –∏–∑ –Ω–æ–≤–æ—Å—Ç–µ–π...")

    doc_vectors = embedding_model.encode(documents)

    # Index dimensionality is taken from the embedding width.
    vector_index = faiss.IndexFlatIP(doc_vectors.shape[1])

    # Unit-length vectors turn inner product into cosine similarity.
    faiss.normalize_L2(doc_vectors)
    vector_index.add(doc_vectors)

    print(f"–í–µ–∫—Ç–æ—Ä–Ω–∞—è –±–∞–∑–∞ —Å–æ–∑–¥–∞–Ω–∞: {len(documents)} –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤")
    return vector_index, documents

# 4. Retrieving relevant documents
def search_similar_documents(query, index, documents, top_k=3):
    """Return the documents most similar to ``query``.

    Args:
        query: Query text to embed and search for.
        index: FAISS index to search.
        documents: Sequence of texts; positions are assumed to match the
            ids stored in ``index`` (TODO confirm against the index builder).
        top_k: Maximum number of hits to request from the index.

    Returns:
        A list of ``{'text': ..., 'score': ...}`` dicts, in the order the
        index returned them. May hold fewer than ``top_k`` entries.
    """
    # The query is L2-normalised before the search.
    query_embedding = embedding_model.encode([query])
    faiss.normalize_L2(query_embedding)

    scores, indices = index.search(query_embedding, top_k)

    # FAISS pads the id array with -1 when fewer than top_k neighbours exist.
    # A bare `idx < len(documents)` check lets -1 through, and documents[-1]
    # would then silently return the last document as a bogus hit.
    return [
        {'text': documents[idx], 'score': score}
        for score, idx in zip(scores[0], indices[0])
        if 0 <= idx < len(documents)
    ]

# 5. Building the prompt with context
def create_prompt(question, context_docs):
    """Assemble the chat-style generation prompt from the question and retrieved news.

    Each entry of ``context_docs`` contributes one ``- <text>`` bullet line
    taken from its ``'text'`` key. The user turn and the opening of the model
    turn are written with literal turn markers.
    """
    bullets = "\n".join(f"- {entry['text']}" for entry in context_docs)

    # NOTE(review): the turn markers are hard-coded; confirm they match the chat
    # template of the tokenizer/model that consumes this prompt.
    sections = [
        "<bos><start_of_turn>user",
        "–ù–∞ –æ—Å–Ω–æ–≤–µ —Å–ª–µ–¥—É—é—â–∏—Ö –Ω–æ–≤–æ—Å—Ç–Ω—ã—Ö –º–∞—Ç–µ—Ä–∏–∞–ª–æ–≤ –æ—Ç–≤–µ—Ç—å –Ω–∞ –≤–æ–ø—Ä–æ—Å:",
        "",
        bullets,
        "",
        f"–í–æ–ø—Ä–æ—Å: {question}",
        "",
        "–û—Ç–≤–µ—Ç—å –∏–Ω—Ñ–æ—Ä–º–∞—Ç–∏–≤–Ω–æ –∏ —Ç–æ—á–Ω–æ, –∏—Å–ø–æ–ª—å–∑—É—è —Ç–æ–ª—å–∫–æ –ø—Ä–µ–¥–æ—Å—Ç–∞–≤–ª–µ–Ω–Ω—É—é –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏—é.",
        "–ï—Å–ª–∏ –≤ –º–∞—Ç–µ—Ä–∏–∞–ª–∞—Ö –Ω–µ—Ç —Ç–æ—á–Ω–æ–≥–æ –æ—Ç–≤–µ—Ç–∞, —Å–∫–∞–∂–∏ –æ–± —ç—Ç–æ–º.<end_of_turn>",
        "<start_of_turn>model",
        "",
    ]
    return "\n".join(sections)

# 6. Generating the answer
def generate_answer(prompt):
    """Generate a sampled completion for ``prompt`` and return only the model's reply.

    Args:
        prompt: Full prompt string; it is truncated to 2048 tokens.

    Returns:
        The decoded continuation with special tokens and any literal
        ``<end_of_turn>`` / ``<eos>`` text removed, stripped of whitespace.
    """
    # The model is loaded with device_map="auto", so it may live on an
    # accelerator; the inputs have to be moved to the same device.
    inputs = tokenizer(prompt, return_tensors="pt", max_length=2048, truncation=True).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.1
        )

    # Keep only the newly generated tokens. Splitting the decoded text on a turn
    # marker returns the whole prompt when the marker is absent, and leaves the
    # tokenizer's own special tokens in the answer.
    prompt_length = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Remove marker text the model may have emitted as ordinary tokens.
    response = response.replace("<end_of_turn>", "").replace("<eos>", "").strip()

    return response

# 7. Main RAG function
def rag_pipeline(question, index, documents):
    """Answer ``question`` with retrieval-augmented generation.

    Retrieves the three most similar documents, builds a prompt from them and
    generates the answer.

    Returns:
        An ``(answer, relevant_docs)`` tuple.
    """
    # Step 1: retrieval
    hits = search_similar_documents(question, index, documents, top_k=3)

    # Steps 2 and 3: prompt construction, then generation
    full_prompt = create_prompt(question, hits)
    return generate_answer(full_prompt), hits

# 8. Running the system
def main():
    """Run the prepared test questions, then an interactive question loop.

    Builds the knowledge base and vector index, prints a banner, calls
    ``run_test_questions()`` and then reads questions from standard input
    until an exit word is entered. Blank input is ignored; errors while
    answering are printed and the loop continues.
    """
    # Initialisation
    vector_index, corpus = create_vector_db(create_knowledge_base())

    rule = "=" * 60
    print("\n" + rule)
    print("RAG –°–ò–°–¢–ï–ú–ê –ù–ê –û–°–ù–û–í–ï –†–ï–ê–õ–¨–ù–´–• –ù–û–í–û–°–¢–ï–ô")
    print(rule)
    print(f"–ó–∞–≥—Ä—É–∂–µ–Ω–æ {len(corpus)} –Ω–æ–≤–æ—Å—Ç–Ω—ã—Ö –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤")
    print("–°–∏—Å—Ç–µ–º–∞ –≥–æ—Ç–æ–≤–∞! –í–≤–µ–¥–∏—Ç–µ –≤–∞—à –≤–æ–ø—Ä–æ—Å (–∏–ª–∏ '–≤—ã—Ö–æ–¥' –¥–ª—è –∑–∞–≤–µ—Ä—à–µ–Ω–∏—è):")

    # Run the prepared questions first.
    # NOTE(review): run_test_questions() takes no arguments, so it cannot reuse
    # the index built above; check whether it rebuilds the knowledge base.
    run_test_questions()

    exit_words = ['–≤—ã—Ö–æ–¥', 'exit', 'quit']

    # Interactive loop
    while True:
        user_query = input("\nüéØ –í–∞—à –≤–æ–ø—Ä–æ—Å: ").strip()

        if user_query.lower() in exit_words:
            print("–î–æ —Å–≤–∏–¥–∞–Ω–∏—è!")
            break
        if not user_query:
            continue

        try:
            print("üîç –ò—â–µ–º —Ä–µ–ª–µ–≤–∞–Ω—Ç–Ω—ã–µ –Ω–æ–≤–æ—Å—Ç–∏...")
            reply, sources = rag_pipeline(user_query, vector_index, corpus)
            print(f"\nü§ñ –û—Ç–≤–µ—Ç: {reply}")
            print(f"\nüìä –ò—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–æ –∏—Å—Ç–æ—á–Ω–∏–∫–æ–≤: {len(sources)}")
            print("üì∞ –†–µ–ª–µ–≤–∞–Ω—Ç–Ω—ã–µ –Ω–æ–≤–æ—Å—Ç–∏:")
            for rank, source in enumerate(sources, start=1):
                print(f"   {rank}. {source['text'][:120]}... (—Ä–µ–ª–µ–≤–∞–Ω—Ç–Ω–æ—Å—Ç—å: {source['score']:.3f})")
        except Exception as err:
            print(f"‚ùå –û—à–∏–±–∫–∞: {err}")

# Entry point: start main() when this code runs as the main module.
if __name__ == "__main__":
    main()

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import requests
import json
from datasets import load_dataset

# 1. Loading the models
print("–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª–∏...")

# Embedding model (multilingual; used here for Russian text)
embedding_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# Generation model: loaded in float16 with automatic device placement
model_name = "Qwen/Qwen3-4B-Instruct-2507"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

# 2. Loading a real news dataset
def create_knowledge_base():
    """Build the list of document strings from the ``zloelias/lenta-ru`` dataset.

    Reads the first 1000 training rows. A row with both title and text becomes
    ``"<title>. <first 300 chars of text>..."``; a row with only text is cut to
    400 characters; a row with only a title is kept as is; empty rows are
    skipped. If anything fails, the error is printed and a small hard-coded
    list of sample news sentences is returned instead.
    """
    print("–ó–∞–≥—Ä—É–∂–∞–µ–º –¥–∞—Ç–∞—Å–µ—Ç")

    def _to_document(record):
        """Convert one dataset row to a document string, or None if it is empty."""
        body = record.get('text', '')
        headline = record.get('title', '')
        if headline and body:
            return f"{headline}. {body[:300]}..."  # cap the length
        if body:
            return body[:400] + "..."
        if headline:
            return headline
        return None

    try:
        # Take the first 1000 news items
        dataset = load_dataset("zloelias/lenta-ru", split="train[:1000]")

        documents = [
            doc for doc in (_to_document(row) for row in dataset) if doc is not None
        ]

        print(f"–ó–∞–≥—Ä—É–∂–µ–Ω–æ {len(documents)} –Ω–æ–≤–æ—Å—Ç–µ–π –∏–∑ –¥–∞—Ç–∞—Å–µ—Ç–∞")
        return documents

    except Exception as load_error:
        print(f"–û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏ –¥–∞—Ç–∞—Å–µ—Ç–∞: {load_error}")
        # Fallback: a handful of sample documents
        return [
            "–†–æ—Å—Å–∏—è –∏ –ö–∏—Ç–∞–π —É—Å–∏–ª–∏–≤–∞—é—Ç —ç–∫–æ–Ω–æ–º–∏—á–µ—Å–∫–æ–µ —Å–æ—Ç—Ä—É–¥–Ω–∏—á–µ—Å—Ç–≤–æ –≤ —É—Å–ª–æ–≤–∏—è—Ö —Å–∞–Ω–∫—Ü–∏–π.",
            "–¶–µ–Ω—Ç—Ä–æ–±–∞–Ω–∫ –†–æ—Å—Å–∏–∏ —Å–æ—Ö—Ä–∞–Ω–∏–ª –∫–ª—é—á–µ–≤—É—é —Å—Ç–∞–≤–∫—É –Ω–∞ –ø—Ä–µ–∂–Ω–µ–º —É—Ä–æ–≤–Ω–µ.",
            "–í –ú–æ—Å–∫–≤–µ –æ—Ç–∫—Ä—ã–ª–∞—Å—å –Ω–æ–≤–∞—è –≤–µ—Ç–∫–∞ –º–µ—Ç—Ä–æ, —Å–æ–µ–¥–∏–Ω—è—é—â–∞—è —Ü–µ–Ω—Ç—Ä —Å —Å–ø–∞–ª—å–Ω—ã–º–∏ —Ä–∞–π–æ–Ω–∞–º–∏.",
            "–£—á–µ–Ω—ã–µ —Ä–∞–∑—Ä–∞–±–æ—Ç–∞–ª–∏ –Ω–æ–≤—É—é –≤–∞–∫—Ü–∏–Ω—É –æ—Ç –≥—Ä–∏–ø–ø–∞ —Å –ø–æ–≤—ã—à–µ–Ω–Ω–æ–π —ç—Ñ—Ñ–µ–∫—Ç–∏–≤–Ω–æ—Å—Ç—å—é.",
            "–¶–µ–Ω—ã –Ω–∞ –Ω–µ—Ñ—Ç—å –≤—ã—Ä–æ—Å–ª–∏ –Ω–∞ –º–∏—Ä–æ–≤—ã—Ö —Ä—ã–Ω–∫–∞—Ö –∏–∑-–∑–∞ —Å–æ–∫—Ä–∞—â–µ–Ω–∏—è –¥–æ–±—ã—á–∏.",
        ]

# Sample questions for testing (adapted to the news domain)
test_questions = [
    "–ö–∞–∫–∏–µ –ø–æ—Å–ª–µ–¥–Ω–∏–µ –Ω–æ–≤–æ—Å—Ç–∏ –æ —ç–∫–æ–Ω–æ–º–∏–∫–µ –†–æ—Å—Å–∏–∏?",
    "–ß—Ç–æ –ø—Ä–æ–∏—Å—Ö–æ–¥–∏—Ç —Å –∫—É—Ä—Å–æ–º —Ä—É–±–ª—è?",
    "–ö–∞–∫–∏–µ –Ω–æ–≤—ã–µ –ø—Ä–æ–µ–∫—Ç—ã –≤ –º–µ—Ç—Ä–æ –ú–æ—Å–∫–≤—ã?",
    "–ö–∞–∫–∏–µ –Ω–∞—É—á–Ω—ã–µ —Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∏ –ø–æ—Å–ª–µ–¥–Ω–µ–≥–æ –≤—Ä–µ–º–µ–Ω–∏?",
    "–ö–∞–∫ –∏–∑–º–µ–Ω–∏–ª–∏—Å—å —Ü–µ–Ω—ã –Ω–∞ –Ω–µ—Ñ—Ç—å?",
    "–ö–∞–∫–∏–µ –º–µ–∂–¥—É–Ω–∞—Ä–æ–¥–Ω—ã–µ –æ—Ç–Ω–æ—à–µ–Ω–∏—è —É –†–æ—Å—Å–∏–∏?",
    "–ß—Ç–æ –Ω–æ–≤–æ–≥–æ –≤ –±–∞–Ω–∫–æ–≤—Å–∫–æ–π —Å–∏—Å—Ç–µ–º–µ?",
    "–ö–∞–∫–∏–µ —Å–æ–±—ã—Ç–∏—è –≤ –∫—É–ª—å—Ç—É—Ä–Ω–æ–π –∂–∏–∑–Ω–∏?",
    "–ö–∞–∫–∏–µ —Å–ø–æ—Ä—Ç–∏–≤–Ω—ã–µ –Ω–æ–≤–æ—Å—Ç–∏?",
    "–ß—Ç–æ –ø—Ä–æ–∏—Å—Ö–æ–¥–∏—Ç –≤ —Å—Ñ–µ—Ä–µ —Ç–µ—Ö–Ω–æ–ª–æ–≥–∏–π?",
    "–ö–∞–∫–∏–µ –∏–∑–º–µ–Ω–µ–Ω–∏—è –≤ –æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–∏?",
    "–ö–∞–∫–∏–µ –º–µ–¥–∏—Ü–∏–Ω—Å–∫–∏–µ –Ω–æ–≤–æ—Å—Ç–∏?",
    "–ö–∞–∫–∏–µ –ø–æ–ª–∏—Ç–∏—á–µ—Å–∫–∏–µ —Å–æ–±—ã—Ç–∏—è?",
    "–ß—Ç–æ –Ω–æ–≤–æ–≥–æ –≤ —Å—Ç—Ä–æ–∏—Ç–µ–ª—å—Å—Ç–≤–µ?",
    "–ö–∞–∫–∏–µ —ç–∫–æ–ª–æ–≥–∏—á–µ—Å–∫–∏–µ –ø—Ä–æ–±–ª–µ–º—ã –æ–±—Å—É–∂–¥–∞—é—Ç—Å—è?"
]

def run_test_questions():
    """Run every entry of ``test_questions`` through the RAG pipeline and print the results.

    Builds the knowledge base and vector index, then for each question prints
    the answer, the number of context documents and a 100-character preview of
    each source with its similarity score. A failure on one question is
    printed and does not stop the loop.
    """
    vector_index, corpus = create_vector_db(create_knowledge_base())

    banner = "=" * 60
    divider = "-" * 80

    print(banner)
    print("–¢–ï–°–¢–ò–†–û–í–ê–ù–ò–ï RAG –°–ò–°–¢–ï–ú–´ –° –†–ï–ê–õ–¨–ù–´–ú–ò –ù–û–í–û–°–¢–Ø–ú–ò")
    print(banner)

    for number, query in enumerate(test_questions, start=1):
        print(f"\n{number}. –í–æ–ø—Ä–æ—Å: {query}")
        try:
            reply, sources = rag_pipeline(query, vector_index, corpus)
            print(f"ü§ñ –û—Ç–≤–µ—Ç: {reply}")
            print(f"üìä –ò—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–æ –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤: {len(sources)}")
            print("üì∞ –ò—Å—Ç–æ—á–Ω–∏–∫–∏:")
            for rank, source in enumerate(sources, start=1):
                print(f"   {rank}. {source['text'][:100]}... (—Å—Ö–æ–¥—Å—Ç–≤–æ: {source['score']:.3f})")
            print(divider)
        except Exception as err:
            # Report the failure for this question and move on to the next one.
            print(f"‚ùå –û—à–∏–±–∫–∞: {err}")
            print(divider)

# 3. Building the vector database
def create_vector_db(documents):
    """Embed ``documents`` and store the vectors in a FAISS inner-product index.

    The embeddings are L2-normalised in place before being added, so the
    inner-product scores returned by the index are cosine similarities.

    Returns:
        A ``(index, documents)`` tuple; ``documents`` is returned unchanged.
    """
    print("–°–æ–∑–¥–∞–µ–º –≤–µ–∫—Ç–æ—Ä–Ω—É—é –±–∞–∑—É –∏–∑ –Ω–æ–≤–æ—Å—Ç–µ–π...")

    doc_vectors = embedding_model.encode(documents)

    # Index dimensionality is taken from the embedding width.
    vector_index = faiss.IndexFlatIP(doc_vectors.shape[1])

    # Unit-length vectors turn inner product into cosine similarity.
    faiss.normalize_L2(doc_vectors)
    vector_index.add(doc_vectors)

    print(f"–í–µ–∫—Ç–æ—Ä–Ω–∞—è –±–∞–∑–∞ —Å–æ–∑–¥–∞–Ω–∞: {len(documents)} –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤")
    return vector_index, documents

# 4. Retrieving relevant documents
def search_similar_documents(query, index, documents, top_k=3):
    """Return the documents most similar to ``query``.

    Args:
        query: Query text to embed and search for.
        index: FAISS index to search.
        documents: Sequence of texts; positions are assumed to match the
            ids stored in ``index`` (TODO confirm against the index builder).
        top_k: Maximum number of hits to request from the index.

    Returns:
        A list of ``{'text': ..., 'score': ...}`` dicts, in the order the
        index returned them. May hold fewer than ``top_k`` entries.
    """
    # The query is L2-normalised before the search.
    query_embedding = embedding_model.encode([query])
    faiss.normalize_L2(query_embedding)

    scores, indices = index.search(query_embedding, top_k)

    # FAISS pads the id array with -1 when fewer than top_k neighbours exist.
    # A bare `idx < len(documents)` check lets -1 through, and documents[-1]
    # would then silently return the last document as a bogus hit.
    return [
        {'text': documents[idx], 'score': score}
        for score, idx in zip(scores[0], indices[0])
        if 0 <= idx < len(documents)
    ]

# 5. Building the prompt with context
def create_prompt(question, context_docs):
    """Assemble the chat prompt from the question and the retrieved news.

    Each entry of ``context_docs`` contributes one ``- <text>`` bullet line
    taken from its ``'text'`` key. The instruction text is wrapped as a single
    user message and rendered with the tokenizer's own chat template, with the
    generation prompt appended.

    Returns:
        The rendered prompt string, ready to be passed to ``generate_answer``.
    """
    context = "\n".join(f"- {doc['text']}" for doc in context_docs)

    user_message = f"""–ù–∞ –æ—Å–Ω–æ–≤–µ —Å–ª–µ–¥—É—é—â–∏—Ö –Ω–æ–≤–æ—Å—Ç–Ω—ã—Ö –º–∞—Ç–µ—Ä–∏–∞–ª–æ–≤ –æ—Ç–≤–µ—Ç—å –Ω–∞ –≤–æ–ø—Ä–æ—Å:

{context}

–í–æ–ø—Ä–æ—Å: {question}

–û—Ç–≤–µ—Ç—å –∏–Ω—Ñ–æ—Ä–º–∞—Ç–∏–≤–Ω–æ –∏ —Ç–æ—á–Ω–æ, –∏—Å–ø–æ–ª—å–∑—É—è —Ç–æ–ª—å–∫–æ –ø—Ä–µ–¥–æ—Å—Ç–∞–≤–ª–µ–Ω–Ω—É—é –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏—é.
–ï—Å–ª–∏ –≤ –º–∞—Ç–µ—Ä–∏–∞–ª–∞—Ö –Ω–µ—Ç —Ç–æ—á–Ω–æ–≥–æ –æ—Ç–≤–µ—Ç–∞, —Å–∫–∞–∂–∏ –æ–± —ç—Ç–æ–º."""

    # The previous hand-written "<start_of_turn>" markers belong to a different
    # model family than the Qwen model loaded in this cell. Letting the
    # tokenizer render its own template keeps the prompt in the expected format.
    return tokenizer.apply_chat_template(
        [{"role": "user", "content": user_message}],
        tokenize=False,
        add_generation_prompt=True,
    )

# 6. Generating the answer
def generate_answer(prompt):
    """Generate a sampled completion for ``prompt`` and return only the model's reply.

    Args:
        prompt: Full prompt string; it is truncated to 2048 tokens.

    Returns:
        The decoded continuation with special tokens and any literal
        ``<end_of_turn>`` / ``<eos>`` text removed, stripped of whitespace.
    """
    # The model is loaded with device_map="auto", so it may live on an
    # accelerator; the inputs have to be moved to the same device.
    inputs = tokenizer(prompt, return_tensors="pt", max_length=2048, truncation=True).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.5,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.1
        )

    # Keep only the newly generated tokens, so the result does not depend on
    # finding a particular turn marker in the decoded text.
    prompt_length = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Still strip the old marker text, for callers that pass a hand-built prompt.
    response = response.replace("<end_of_turn>", "").replace("<eos>", "").strip()

    return response

# 7. Main RAG function
def rag_pipeline(question, index, documents):
    """Answer ``question`` with retrieval-augmented generation.

    Retrieves the three most similar documents, builds a prompt from them and
    generates the answer.

    Returns:
        An ``(answer, relevant_docs)`` tuple.
    """
    # Step 1: retrieval
    hits = search_similar_documents(question, index, documents, top_k=3)

    # Steps 2 and 3: prompt construction, then generation
    full_prompt = create_prompt(question, hits)
    return generate_answer(full_prompt), hits

# 8. Running the system
def main():
    """Run the prepared test questions, then an interactive question loop.

    Builds the knowledge base and vector index, prints a banner, calls
    ``run_test_questions()`` and then reads questions from standard input
    until an exit word is entered. Blank input is ignored; errors while
    answering are printed and the loop continues.
    """
    # Initialisation
    vector_index, corpus = create_vector_db(create_knowledge_base())

    rule = "=" * 60
    print("\n" + rule)
    print("RAG –°–ò–°–¢–ï–ú–ê –ù–ê –û–°–ù–û–í–ï –†–ï–ê–õ–¨–ù–´–• –ù–û–í–û–°–¢–ï–ô")
    print(rule)
    print(f"–ó–∞–≥—Ä—É–∂–µ–Ω–æ {len(corpus)} –Ω–æ–≤–æ—Å—Ç–Ω—ã—Ö –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤")
    print("–°–∏—Å—Ç–µ–º–∞ –≥–æ—Ç–æ–≤–∞! –í–≤–µ–¥–∏—Ç–µ –≤–∞—à –≤–æ–ø—Ä–æ—Å (–∏–ª–∏ '–≤—ã—Ö–æ–¥' –¥–ª—è –∑–∞–≤–µ—Ä—à–µ–Ω–∏—è):")

    # Run the prepared questions first.
    # NOTE(review): run_test_questions() takes no arguments, so it cannot reuse
    # the index built above; check whether it rebuilds the knowledge base.
    run_test_questions()

    exit_words = ['–≤—ã—Ö–æ–¥', 'exit', 'quit']

    # Interactive loop
    while True:
        user_query = input("\nüéØ –í–∞—à –≤–æ–ø—Ä–æ—Å: ").strip()

        if user_query.lower() in exit_words:
            print("–î–æ —Å–≤–∏–¥–∞–Ω–∏—è!")
            break
        if not user_query:
            continue

        try:
            print("üîç –ò—â–µ–º —Ä–µ–ª–µ–≤–∞–Ω—Ç–Ω—ã–µ –Ω–æ–≤–æ—Å—Ç–∏...")
            reply, sources = rag_pipeline(user_query, vector_index, corpus)
            print(f"\nü§ñ –û—Ç–≤–µ—Ç: {reply}")
            print(f"\nüìä –ò—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–æ –∏—Å—Ç–æ—á–Ω–∏–∫–æ–≤: {len(sources)}")
            print("üì∞ –†–µ–ª–µ–≤–∞–Ω—Ç–Ω—ã–µ –Ω–æ–≤–æ—Å—Ç–∏:")
            for rank, source in enumerate(sources, start=1):
                print(f"   {rank}. {source['text'][:120]}... (—Ä–µ–ª–µ–≤–∞–Ω—Ç–Ω–æ—Å—Ç—å: {source['score']:.3f})")
        except Exception as err:
            print(f"‚ùå –û—à–∏–±–∫–∞: {err}")

# Entry point: start main() when this code runs as the main module.
if __name__ == "__main__":
    main()

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import requests
import json
from datasets import load_dataset
import pickle
import os


# Load the models used by the rest of this cell.
print("–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª–∏...")

# Embedding model (multilingual; used here for Russian text)
embedding_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# Generation model: loaded in float16 with automatic device placement
model_name = "Qwen/Qwen3-4B-Instruct-2507"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

# 2. Loading a real news dataset
def create_knowledge_base():
    """Build the list of document strings from the ``zloelias/lenta-ru`` dataset.

    Reads the first 1000 training rows. A row with both title and text becomes
    ``"<title>. <first 300 chars of text>..."``; a row with only text is cut to
    400 characters; a row with only a title is kept as is; empty rows are
    skipped. If anything fails, the error is printed and a small hard-coded
    list of sample news sentences is returned instead.
    """
    print("–ó–∞–≥—Ä—É–∂–∞–µ–º –¥–∞—Ç–∞—Å–µ—Ç...")

    def _to_document(record):
        """Convert one dataset row to a document string, or None if it is empty."""
        body = record.get('text', '')
        headline = record.get('title', '')
        if headline and body:
            return f"{headline}. {body[:300]}..."  # cap the length
        if body:
            return body[:400] + "..."
        if headline:
            return headline
        return None

    try:
        # Take the first 1000 news items
        dataset = load_dataset("zloelias/lenta-ru", split="train[:1000]")

        documents = [
            doc for doc in (_to_document(row) for row in dataset) if doc is not None
        ]

        print(f"–ó–∞–≥—Ä—É–∂–µ–Ω–æ {len(documents)} –Ω–æ–≤–æ—Å—Ç–µ–π –∏–∑ –¥–∞—Ç–∞—Å–µ—Ç–∞")
        return documents

    except Exception as load_error:
        print(f"–û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏ –¥–∞—Ç–∞—Å–µ—Ç–∞: {load_error}")
        # Fallback: a handful of sample documents
        return [
            "–†–æ—Å—Å–∏—è –∏ –ö–∏—Ç–∞–π —É—Å–∏–ª–∏–≤–∞—é—Ç —ç–∫–æ–Ω–æ–º–∏—á–µ—Å–∫–æ–µ —Å–æ—Ç—Ä—É–¥–Ω–∏—á–µ—Å—Ç–≤–æ –≤ —É—Å–ª–æ–≤–∏—è—Ö —Å–∞–Ω–∫—Ü–∏–π.",
            "–¶–µ–Ω—Ç—Ä–æ–±–∞–Ω–∫ –†–æ—Å—Å–∏–∏ —Å–æ—Ö—Ä–∞–Ω–∏–ª –∫–ª—é—á–µ–≤—É—é —Å—Ç–∞–≤–∫—É –Ω–∞ –ø—Ä–µ–∂–Ω–µ–º —É—Ä–æ–≤–Ω–µ.",
            "–í –ú–æ—Å–∫–≤–µ –æ—Ç–∫—Ä—ã–ª–∞—Å—å –Ω–æ–≤–∞—è –≤–µ—Ç–∫–∞ –º–µ—Ç—Ä–æ, —Å–æ–µ–¥–∏–Ω—è—é—â–∞—è —Ü–µ–Ω—Ç—Ä —Å —Å–ø–∞–ª—å–Ω—ã–º–∏ —Ä–∞–π–æ–Ω–∞–º–∏.",
            "–£—á–µ–Ω—ã–µ —Ä–∞–∑—Ä–∞–±–æ—Ç–∞–ª–∏ –Ω–æ–≤—É—é –≤–∞–∫—Ü–∏–Ω—É –æ—Ç –≥—Ä–∏–ø–ø–∞ —Å –ø–æ–≤—ã—à–µ–Ω–Ω–æ–π —ç—Ñ—Ñ–µ–∫—Ç–∏–≤–Ω–æ—Å—Ç—å—é.",
            "–¶–µ–Ω—ã –Ω–∞ –Ω–µ—Ñ—Ç—å –≤—ã—Ä–æ—Å–ª–∏ –Ω–∞ –º–∏—Ä–æ–≤—ã—Ö —Ä—ã–Ω–∫–∞—Ö –∏–∑-–∑–∞ —Å–æ–∫—Ä–∞—â–µ–Ω–∏—è –¥–æ–±—ã—á–∏.",
        ]

# 3. –ó–∞–≥—Ä—É–∑–∫–∞ –≤–æ–ø—Ä–æ—Å–æ–≤ –∏–∑ pickle —Ñ–∞–π–ª–∞
def load_questions_from_pickle(filename="input.pickle"):
    """Read the questions stored in a pickle file.

    Accepted payloads:
      * a list - returned unchanged;
      * a dict - the value under 'questions', otherwise the value under
        'queries', otherwise every string value of the dict.

    A missing file, any other payload type, or any exception raised while
    loading falls back to get_default_questions().

    Args:
        filename: Path of the pickle file to read.

    Returns:
        The loaded questions, or the default questions on fallback.
    """
    try:
        # Guard clause: nothing to read, use the built-in questions.
        if not os.path.exists(filename):
            print(f"–§–∞–π–ª {filename} –Ω–µ –Ω–∞–π–¥–µ–Ω. –ò—Å–ø–æ–ª—å–∑—É—é —Ç–µ—Å—Ç–æ–≤—ã–µ –≤–æ–ø—Ä–æ—Å—ã.")
            return get_default_questions()

        # SECURITY: pickle.load executes code embedded in the file, so this
        # must only be pointed at files from a trusted source.
        with open(filename, 'rb') as handle:
            payload = pickle.load(handle)

        if isinstance(payload, list):
            # A plain list of questions.
            loaded = payload
        elif isinstance(payload, dict):
            # Prefer the well-known keys, in priority order.
            for key in ('questions', 'queries'):
                if key in payload:
                    loaded = payload[key]
                    break
            else:
                # No known key: collect every string value instead.
                loaded = [value for value in payload.values() if isinstance(value, str)]
        else:
            print(f"–ù–µ–∏–∑–≤–µ—Å—Ç–Ω—ã–π —Ñ–æ—Ä–º–∞—Ç —Ñ–∞–π–ª–∞ {filename}. –ò—Å–ø–æ–ª—å–∑—É—é —Ç–µ—Å—Ç–æ–≤—ã–µ –≤–æ–ø—Ä–æ—Å—ã.")
            return get_default_questions()

        print(f"–ó–∞–≥—Ä—É–∂–µ–Ω–æ {len(loaded)} –≤–æ–ø—Ä–æ—Å–æ–≤ –∏–∑ {filename}")
        return loaded

    except Exception as e:
        print(f"–û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏ –≤–æ–ø—Ä–æ—Å–æ–≤ –∏–∑ {filename}: {e}")
        return get_default_questions()

def get_default_questions():
    """Provide the built-in questions used when the pickle file is unavailable.

    Returns:
        A new list of ten question strings on every call.
    """
    fallback_questions = (
        "–ö–∞–∫–∏–µ –ø–æ—Å–ª–µ–¥–Ω–∏–µ –Ω–æ–≤–æ—Å—Ç–∏ –æ —ç–∫–æ–Ω–æ–º–∏–∫–µ –†–æ—Å—Å–∏–∏?",
        "–ß—Ç–æ –ø—Ä–æ–∏—Å—Ö–æ–¥–∏—Ç —Å –∫—É—Ä—Å–æ–º —Ä—É–±–ª—è?",
        "–ö–∞–∫–∏–µ –Ω–æ–≤—ã–µ –ø—Ä–æ–µ–∫—Ç—ã –≤ –º–µ—Ç—Ä–æ –ú–æ—Å–∫–≤—ã?",
        "–ö–∞–∫–∏–µ –Ω–∞—É—á–Ω—ã–µ —Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∏ –ø–æ—Å–ª–µ–¥–Ω–µ–≥–æ –≤—Ä–µ–º–µ–Ω–∏?",
        "–ö–∞–∫ –∏–∑–º–µ–Ω–∏–ª–∏—Å—å —Ü–µ–Ω—ã –Ω–∞ –Ω–µ—Ñ—Ç—å?",
        "–ö–∞–∫–∏–µ –º–µ–∂–¥—É–Ω–∞—Ä–æ–¥–Ω—ã–µ –æ—Ç–Ω–æ—à–µ–Ω–∏—è —É –†–æ—Å—Å–∏–∏?",
        "–ß—Ç–æ –Ω–æ–≤–æ–≥–æ –≤ –±–∞–Ω–∫–æ–≤—Å–∫–æ–π —Å–∏—Å—Ç–µ–º–µ?",
        "–ö–∞–∫–∏–µ —Å–æ–±—ã—Ç–∏—è –≤ –∫—É–ª—å—Ç—É—Ä–Ω–æ–π –∂–∏–∑–Ω–∏?",
        "–ö–∞–∫–∏–µ —Å–ø–æ—Ä—Ç–∏–≤–Ω—ã–µ –Ω–æ–≤–æ—Å—Ç–∏?",
        "–ß—Ç–æ –ø—Ä–æ–∏—Å—Ö–æ–¥–∏—Ç –≤ —Å—Ñ–µ—Ä–µ —Ç–µ—Ö–Ω–æ–ª–æ–≥–∏–π?",
    )
    # Copy into a list so each caller gets its own mutable object.
    return list(fallback_questions)

def run_test_questions(questions_file="input.pickle", index=None, documents=None):
    """Run the RAG pipeline over the questions stored in a pickle file.

    Each question is answered independently; a failure on one question is
    recorded in the results and does not stop the run. The collected results
    are passed to save_results() before being returned.

    Args:
        questions_file: Pickle file handed to load_questions_from_pickle().
        index: Optional, already built vector index. When omitted, the index
            is built here with create_vector_db().
        documents: Optional documents matching ``index``. When omitted (and
            no index is given) they are loaded with create_knowledge_base().
            Passing both lets a caller that already built the index avoid
            loading and embedding the corpus a second time.

    Returns:
        A list with one dict per question, holding the keys 'question',
        'answer', 'sources' and 'scores'.

    Raises:
        ValueError: If ``index`` is given without its ``documents``.
    """
    # Load the questions from the pickle file
    test_questions = load_questions_from_pickle(questions_file)

    # Build the knowledge base only when the caller did not supply one
    if index is None:
        if documents is None:
            documents = create_knowledge_base()
        index, documents = create_vector_db(documents)
    elif documents is None:
        raise ValueError("documents must be provided together with index")

    print("=" * 60)
    print("–¢–ï–°–¢–ò–†–û–í–ê–ù–ò–ï RAG –°–ò–°–¢–ï–ú–´ –° –†–ï–ê–õ–¨–ù–´–ú–ò –ù–û–í–û–°–¢–Ø–ú–ò")
    print("=" * 60)
    print(f"–ë—É–¥–µ—Ç –æ–±—Ä–∞–±–æ—Ç–∞–Ω–æ {len(test_questions)} –≤–æ–ø—Ä–æ—Å–æ–≤")

    # Collected per-question results
    results = []

    for i, question in enumerate(test_questions, 1):
        print(f"\n{i}. –í–æ–ø—Ä–æ—Å: {question}")
        try:
            answer, context_docs = rag_pipeline(question, index, documents)
            print(f"ü§ñ –û—Ç–≤–µ—Ç: {answer}")
            print(f"üìä –ò—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–æ –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤: {len(context_docs)}")
            print("üì∞ –ò—Å—Ç–æ—á–Ω–∏–∫–∏:")
            for j, doc in enumerate(context_docs, 1):
                print(f"   {j}. {doc['text'][:100]}... (—Å—Ö–æ–¥—Å—Ç–≤–æ: {doc['score']:.3f})")

            # Record the successful answer with its sources
            results.append({
                'question': question,
                'answer': answer,
                'sources': [doc['text'] for doc in context_docs],
                'scores': [doc['score'] for doc in context_docs]
            })

            print("-" * 80)
        except Exception as e:
            # Best effort: keep going, but leave a trace of the failure
            print(f"‚ùå –û—à–∏–±–∫–∞: {e}")
            results.append({
                'question': question,
                'answer': f"–û—à–∏–±–∫–∞: {e}",
                'sources': [],
                'scores': []
            })
            print("-" * 80)

    # Persist the results to disk
    save_results(results)
    return results

def save_results(results, filename="rag_results.pickle", text_filename=None):
    """Persist RAG results as a pickle plus a human-readable text report.

    Args:
        results: List of dicts with the keys 'question', 'answer' and
            'sources' (each source is sliced to its first 100 characters in
            the text report).
        filename: Destination of the pickle dump.
        text_filename: Destination of the text report. When omitted it is
            derived from ``filename`` by swapping the extension for ".txt",
            so the default pair stays rag_results.pickle / rag_results.txt
            and runs saved under different names no longer overwrite one
            shared report.

    Returns:
        None. Any failure is reported on stdout instead of being raised.
    """
    import os  # local import keeps this block self-contained

    if text_filename is None:
        pickle_path = os.fspath(filename)
        base, ext = os.path.splitext(pickle_path)
        if ext.lower() == ".txt":
            # Never let the report overwrite the pickle file itself.
            text_filename = pickle_path + ".report.txt"
        else:
            text_filename = base + ".txt"

    try:
        with open(filename, 'wb') as f:
            pickle.dump(results, f)
        print(f"\nüíæ –†–µ–∑—É–ª—å—Ç–∞—Ç—ã —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤ {filename}")

        # Also write the results in a readable format
        with open(text_filename, "w", encoding="utf-8") as f:
            f.write("–†–ï–ó–£–õ–¨–¢–ê–¢–´ RAG –°–ò–°–¢–ï–ú–´\n")
            f.write("=" * 50 + "\n")
            for i, result in enumerate(results, 1):
                f.write(f"\n{i}. –í–û–ü–†–û–°: {result['question']}\n")
                f.write(f"   –û–¢–í–ï–¢: {result['answer']}\n")
                f.write(f"   –ò–°–¢–û–ß–ù–ò–ö–ò: {len(result['sources'])}\n")
                for j, source in enumerate(result['sources'], 1):
                    f.write(f"      {j}. {source[:100]}...\n")
                f.write("-" * 80 + "\n")

        print(f"üìÑ –¢–µ–∫—Å—Ç–æ–≤—ã–π –æ—Ç—á–µ—Ç —Å–æ—Ö—Ä–∞–Ω–µ–Ω –≤ {text_filename}")

    except Exception as e:
        print(f"–û—à–∏–±–∫–∞ —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏—è —Ä–µ–∑—É–ª—å—Ç–∞—Ç–æ–≤: {e}")

# 4. Building the vector database
def create_vector_db(documents):
    """Embed the documents and index them for similarity search.

    The embeddings are L2-normalized before being added to an inner-product
    index, so the scores returned by searches are cosine similarities.

    Args:
        documents: Non-empty sequence of document texts.

    Returns:
        A tuple ``(index, documents)``: the populated FAISS index and the
        documents that were passed in.

    Raises:
        ValueError: If ``documents`` is empty. Without this check the failure
            would surface later as an obscure IndexError when the embedding
            dimension is read.
    """
    if len(documents) == 0:
        raise ValueError("create_vector_db() needs at least one document to index")

    print("–°–æ–∑–¥–∞–µ–º –≤–µ–∫—Ç–æ—Ä–Ω—É—é –±–∞–∑—É –∏–∑ –Ω–æ–≤–æ—Å—Ç–µ–π...")

    # Compute the embeddings for all documents
    embeddings = embedding_model.encode(documents)

    # Create the FAISS index; inner product on unit vectors = cosine similarity
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)

    # Normalize in place (the return value is not used), then index
    faiss.normalize_L2(embeddings)
    index.add(embeddings)

    print(f"–í–µ–∫—Ç–æ—Ä–Ω–∞—è –±–∞–∑–∞ —Å–æ–∑–¥–∞–Ω–∞: {len(documents)} –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤")
    return index, documents

# 5. Searching for relevant documents
def search_similar_documents(query, index, documents, top_k=3):
    """Find the documents most similar to the query.

    Args:
        query: Query text to embed and search for.
        index: Vector index exposing ``search(vectors, k)``.
        documents: Documents in the same order they were added to ``index``.
        top_k: Maximum number of hits to return.

    Returns:
        A list of dicts with the keys 'text' (the document) and 'score' (the
        value reported by the index), in the order returned by the index.
        Fewer than ``top_k`` entries are returned when the index cannot
        supply that many hits.
    """
    # Embed the query and normalize it in place
    query_embedding = embedding_model.encode([query])
    faiss.normalize_L2(query_embedding)

    # Look up the nearest documents
    scores, indices = index.search(query_embedding, top_k)

    results = []
    for score, idx in zip(scores[0], indices[0]):
        # FAISS fills missing hits with the label -1 when it has fewer than
        # top_k results. A bare `idx < len(documents)` check would accept it
        # and silently return documents[-1], so reject negative labels too.
        if 0 <= idx < len(documents):
            results.append({
                'text': documents[idx],
                'score': score
            })

    return results

# 6. Building the prompt with context
def create_prompt(question, context_docs):
    """Assemble the chat prompt that carries the retrieved context.

    The prompt consists of a system turn with the answering instructions, a
    user turn listing each context document as a "- " bullet followed by the
    question, and an opened assistant turn for the model to complete.

    Args:
        question: The user's question.
        context_docs: Iterable of dicts whose 'text' entry is put in the prompt.

    Returns:
        The full prompt string.
    """
    context = "\n".join(f"- {doc['text']}" for doc in context_docs)

    # The turns are delimited with the <|im_start|> / <|im_end|> markers
    system_turn = (
        "<|im_start|>system\n"
        "–¢—ã - –ø–æ–º–æ—â–Ω–∏–∫, –∫–æ—Ç–æ—Ä—ã–π –æ—Ç–≤–µ—á–∞–µ—Ç –Ω–∞ –≤–æ–ø—Ä–æ—Å—ã –Ω–∞ –æ—Å–Ω–æ–≤–µ –ø—Ä–µ–¥–æ—Å—Ç–∞–≤–ª–µ–Ω–Ω—ã—Ö –Ω–æ–≤–æ—Å—Ç–Ω—ã—Ö –º–∞—Ç–µ—Ä–∏–∞–ª–æ–≤.\n"
        "–û—Ç–≤–µ—á–∞–π —Ç–æ—á–Ω–æ –∏ –∏–Ω—Ñ–æ—Ä–º–∞—Ç–∏–≤–Ω–æ, –∏—Å–ø–æ–ª—å–∑—É—è —Ç–æ–ª—å–∫–æ –ø—Ä–µ–¥–æ—Å—Ç–∞–≤–ª–µ–Ω–Ω—É—é –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏—é.\n"
        "–ï—Å–ª–∏ –≤ –º–∞—Ç–µ—Ä–∏–∞–ª–∞—Ö –Ω–µ—Ç —Ç–æ—á–Ω–æ–≥–æ –æ—Ç–≤–µ—Ç–∞, —Å–∫–∞–∂–∏ –æ–± —ç—Ç–æ–º.<|im_end|>\n"
    )
    user_turn = (
        "<|im_start|>user\n"
        "–ù–∞ –æ—Å–Ω–æ–≤–µ —Å–ª–µ–¥—É—é—â–∏—Ö –Ω–æ–≤–æ—Å—Ç–Ω—ã—Ö –º–∞—Ç–µ—Ä–∏–∞–ª–æ–≤ –æ—Ç–≤–µ—Ç—å –Ω–∞ –≤–æ–ø—Ä–æ—Å:\n"
        "\n"
        f"{context}\n"
        "\n"
        f"–í–æ–ø—Ä–æ—Å: {question}<|im_end|>\n"
    )
    assistant_header = "<|im_start|>assistant\n"

    return system_turn + user_turn + assistant_header

# 7. Generating the answer
def generate_answer(prompt):
    """Generate the model's answer for an already formatted prompt.

    Args:
        prompt: Complete prompt text, ending with the opened assistant turn.

    Returns:
        The generated answer: the text produced after the prompt, cut at the
        first "<|im_end|>" marker and stripped of surrounding whitespace.
    """
    inputs = tokenizer(prompt, return_tensors="pt", max_length=2048, truncation=True)
    # The tokenizer output is not placed on the model's device automatically;
    # move it there so generation also works when the model is not on the CPU.
    inputs = inputs.to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.5,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.1
        )

    # Decode only the newly generated tokens. Splitting the decoded
    # prompt+answer on the assistant tag returns the whole prompt whenever
    # truncation has removed that tag from the end of the prompt.
    generated_ids = outputs[0][prompt_length:]
    response = tokenizer.decode(generated_ids, skip_special_tokens=False)

    # Keep only the assistant's turn
    return response.split("<|im_end|>", 1)[0].strip()

# 8. Main RAG function
def rag_pipeline(question, index, documents, top_k=3):
    """Run the full retrieve-then-generate pipeline for one question.

    Args:
        question: The user's question.
        index: Vector index built over ``documents``.
        documents: The indexed documents.
        top_k: Number of documents to retrieve as context. Defaults to 3,
            the value that was previously hard-coded.

    Returns:
        A tuple ``(answer, relevant_docs)``: the generated answer and the
        retrieved documents that were placed in the prompt.
    """
    # Step 1: retrieve the relevant documents
    relevant_docs = search_similar_documents(question, index, documents, top_k=top_k)

    # Step 2: build the prompt
    prompt = create_prompt(question, relevant_docs)

    # Step 3: generate the answer
    answer = generate_answer(prompt)

    return answer, relevant_docs

# 9. Running the system
def main():
    """Build the index, run the batch test, then answer questions interactively.

    The interactive loop ends when the user types one of the quit words, or
    when the input stream is closed or interrupted at the prompt. Errors
    raised while answering a single question are printed and the loop
    continues.

    Returns:
        None.
    """
    # Initialization
    documents = create_knowledge_base()
    index, documents = create_vector_db(documents)

    print("\n" + "="*60)
    print("RAG –°–ò–°–¢–ï–ú–ê –ù–ê –û–°–ù–û–í–ï –†–ï–ê–õ–¨–ù–´–• –ù–û–í–û–°–¢–ï–ô")
    print("="*60)
    print(f"–ó–∞–≥—Ä—É–∂–µ–Ω–æ {len(documents)} –Ω–æ–≤–æ—Å—Ç–Ω—ã—Ö –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤")

    # Run the batch test on the questions from the pickle file. The returned
    # results are not needed here; they are saved by the call itself.
    run_test_questions()

    # Interactive loop
    print("\n" + "="*60)
    print("–ò–ù–¢–ï–†–ê–ö–¢–ò–í–ù–´–ô –†–ï–ñ–ò–ú")
    print("="*60)
    print("–í–≤–µ–¥–∏—Ç–µ –≤–∞—à –≤–æ–ø—Ä–æ—Å (–∏–ª–∏ '–≤—ã—Ö–æ–¥' –¥–ª—è –∑–∞–≤–µ—Ä—à–µ–Ω–∏—è):")

    while True:
        try:
            question = input("\nüéØ –í–∞—à –≤–æ–ø—Ä–æ—Å: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed or interrupted input is treated as a request to quit
            # instead of ending the session with a traceback.
            print("–î–æ —Å–≤–∏–¥–∞–Ω–∏—è!")
            break

        if question.lower() in ['–≤—ã—Ö–æ–¥', 'exit', 'quit']:
            print("–î–æ —Å–≤–∏–¥–∞–Ω–∏—è!")
            break

        if not question:
            continue

        try:
            print("üîç –ò—â–µ–º —Ä–µ–ª–µ–≤–∞–Ω—Ç–Ω—ã–µ –Ω–æ–≤–æ—Å—Ç–∏...")
            answer, context_docs = rag_pipeline(question, index, documents)
            print(f"\nü§ñ –û—Ç–≤–µ—Ç: {answer}")
            print(f"\nüìä –ò—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–æ –∏—Å—Ç–æ—á–Ω–∏–∫–æ–≤: {len(context_docs)}")
            print("üì∞ –†–µ–ª–µ–≤–∞–Ω—Ç–Ω—ã–µ –Ω–æ–≤–æ—Å—Ç–∏:")
            for i, doc in enumerate(context_docs, 1):
                print(f"   {i}. {doc['text'][:120]}... (—Ä–µ–ª–µ–≤–∞–Ω—Ç–Ω–æ—Å—Ç—å: {doc['score']:.3f})")

        except Exception as e:
            print(f"‚ùå –û—à–∏–±–∫–∞: {e}")

# Entry point: start the RAG session only when this code is executed as the
# main module, not when it is imported.
if __name__ == "__main__":
    main()