# Simple RAG Pipeline - Простой RAG без векторных БД

Пайплайн для:
- Retrieval без FAISS (TF-IDF, BM25, эмбеддинги sentence-transformers)
- Простое хранилище документов
- Context injection в промпты
- Question Answering с LLM

In [None]:
!pip install transformers torch pandas numpy scikit-learn rank-bm25 sentence-transformers -q

In [None]:
import pandas as pd
import numpy as np
from typing import List, Dict, Tuple
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings('ignore')

print("✓ Библиотеки загружены!")
print(f"CUDA available: {torch.cuda.is_available()}")

## 1. Загрузка документов

In [None]:
# === YOUR DATA ===
# Loads the corpus into `docs_df`. The rest of the notebook reads the 'text'
# column of this frame (see SimpleDocumentStore), so that column is required.
# Option 1: CSV file with the documents
docs_df = pd.read_csv('documents.csv')
# Columns: 'id', 'text', 'title' (optional)

# Option 2: plain-text files, one document per file
# import os
# documents = []
# for filename in os.listdir('docs_folder/'):
#     with open(f'docs_folder/{filename}', 'r', encoding='utf-8') as f:
#         documents.append({'id': filename, 'text': f.read()})
# docs_df = pd.DataFrame(documents)

# Option 3: a simple in-memory list of strings
# documents = [
#     "Документ 1: Машинное обучение - это...",
#     "Документ 2: Нейронные сети применяются...",
# ]
# docs_df = pd.DataFrame({'id': range(len(documents)), 'text': documents})

# Quick sanity check: corpus size and the first few rows.
print(f"Загружено документов: {len(docs_df)}")
print(f"\nПервые документы:")
print(docs_df.head())

## 2. Simple Document Store

In [None]:
class SimpleDocumentStore:
    """In-memory document store built from a DataFrame.

    Every row is kept as a plain dict (``DataFrame.to_dict('records')``), and
    the ``'text'`` value of each row is cached in a parallel list in row
    order. Accessors return the stored dict objects themselves, not copies.
    """

    def __init__(self, documents: pd.DataFrame):
        """Convert ``documents`` to row dicts and cache their ``'text'`` values."""
        rows = documents.to_dict('records')
        self.documents = rows
        self.texts = [row['text'] for row in rows]

    def __len__(self):
        """Return the number of stored documents."""
        return len(self.documents)

    def get_all_texts(self) -> List[str]:
        """Return the cached list of document texts, in row order."""
        return self.texts

    def get_document(self, idx: int) -> Dict:
        """Return the row dict stored at position ``idx``."""
        return self.documents[idx]

    def get_documents_by_ids(self, ids: List[int]) -> List[Dict]:
        """Return the row dicts at the given positions, in the given order."""
        selected = []
        for position in ids:
            selected.append(self.documents[position])
        return selected

# Build the in-memory store from the loaded documents frame.
doc_store = SimpleDocumentStore(docs_df)
print(f"✓ Document store создан! Документов: {len(doc_store)}")

## 3. Retriever - TF-IDF

In [None]:
class TFIDFRetriever:
    """Lexical retriever that ranks documents by TF-IDF cosine similarity.

    The vectorizer is fitted once, in the constructor, on every text held by
    the document store; queries are projected into the same vocabulary.
    """

    def __init__(self, doc_store: "SimpleDocumentStore"):
        """Fit the TF-IDF index.

        Args:
            doc_store: Store exposing ``get_all_texts()`` and
                ``get_document(idx)``.
        """
        self.doc_store = doc_store
        self.vectorizer = TfidfVectorizer(
            max_features=5000,
            ngram_range=(1, 2),
            stop_words='english'  # use None for a Russian-language corpus
        )

        # Index the documents: one TF-IDF row per stored text.
        self.doc_vectors = self.vectorizer.fit_transform(doc_store.get_all_texts())
        print(f"✓ TF-IDF индекс построен! Shape: {self.doc_vectors.shape}")

    def retrieve(self, query: str, top_k: int = 3) -> List[Dict]:
        """Return the ``top_k`` documents most similar to ``query``.

        Args:
            query: Free-text query.
            top_k: Maximum number of documents to return. Values <= 0 yield
                an empty list (a negative value used to slice ``[:top_k]``
                and silently return almost the whole corpus).

        Returns:
            A list of *copies* of the stored document dicts, best match
            first, each with an added ``'score'`` key (cosine similarity).
        """
        if top_k <= 0:
            return []

        # Project the query into the fitted TF-IDF space.
        query_vector = self.vectorizer.transform([query])

        # Cosine similarity of the query against every indexed document.
        similarities = cosine_similarity(query_vector, self.doc_vectors)[0]

        # Indices of the best matches, highest similarity first.
        top_indices = np.argsort(similarities)[::-1][:top_k]

        # Copy each record before attaching the score: the store hands out its
        # own dict objects, and writing into them would leak this query's
        # score into the results of every other retriever and query.
        return [
            {**self.doc_store.get_document(int(idx)), 'score': float(similarities[idx])}
            for idx in top_indices
        ]

# Build the TF-IDF retriever (fits the index over the whole store).
tfidf_retriever = TFIDFRetriever(doc_store)
print("✓ TF-IDF Retriever готов!")

## 4. Retriever - BM25

In [None]:
class BM25Retriever:
    """Lexical retriever that ranks documents with Okapi BM25.

    Documents and queries go through the same tokenizer so that their tokens
    are comparable.
    """

    def __init__(self, doc_store: "SimpleDocumentStore"):
        """Tokenize every stored text and build the BM25 index.

        Args:
            doc_store: Store exposing ``get_all_texts()`` and
                ``get_document(idx)``.
        """
        self.doc_store = doc_store

        # Tokenize the corpus once; BM25Okapi expects pre-tokenized documents.
        self.tokenized_docs = [self._tokenize(text) for text in doc_store.get_all_texts()]

        # Build the BM25 index.
        self.bm25 = BM25Okapi(self.tokenized_docs)
        print(f"✓ BM25 индекс построен!")

    @staticmethod
    def _tokenize(text: str) -> List[str]:
        """Lower-case ``text`` and split it into word tokens.

        Splitting on whitespace alone keeps punctuation glued to words, so a
        query token such as ``"learning?"`` never matches ``"learning"`` in a
        document. ``\\w+`` is Unicode-aware for ``str`` patterns, so
        non-Latin text such as Russian is tokenized too.
        """
        import re  # local import keeps this cell self-contained

        return re.findall(r"\w+", text.lower())

    def retrieve(self, query: str, top_k: int = 3) -> List[Dict]:
        """Return the ``top_k`` documents with the highest BM25 score.

        Args:
            query: Free-text query.
            top_k: Maximum number of documents to return. Values <= 0 yield
                an empty list.

        Returns:
            A list of *copies* of the stored document dicts, best match
            first, each with an added ``'score'`` key (BM25 score).
        """
        if top_k <= 0:
            return []

        # Tokenize the query exactly like the indexed documents.
        tokenized_query = self._tokenize(query)

        # One BM25 score per indexed document.
        scores = self.bm25.get_scores(tokenized_query)

        # Indices of the best matches, highest score first.
        top_indices = np.argsort(scores)[::-1][:top_k]

        # Copy each record before attaching the score so the dicts owned by
        # the store (and shared with other retrievers) are never mutated.
        return [
            {**self.doc_store.get_document(int(idx)), 'score': float(scores[idx])}
            for idx in top_indices
        ]

# Build the BM25 retriever (tokenizes and indexes the whole store).
bm25_retriever = BM25Retriever(doc_store)
print("✓ BM25 Retriever готов!")

## 5. Retriever - Sentence Transformers

In [None]:
class SemanticRetriever:
    """Dense retriever based on sentence-transformers embeddings.

    All documents are embedded once in the constructor; each query is
    embedded on demand and compared by cosine similarity.
    """

    def __init__(self, doc_store: "SimpleDocumentStore", model_name: str = 'all-MiniLM-L6-v2'):
        """Load the embedding model and encode every stored text.

        Args:
            doc_store: Store exposing ``get_all_texts()`` and
                ``get_document(idx)``.
            model_name: Name of the sentence-transformers model to load.
        """
        self.doc_store = doc_store
        self.model = SentenceTransformer(model_name)

        # Encode the whole corpus up front; kept as a tensor for fast scoring.
        print("Кодирование документов...")
        self.doc_embeddings = self.model.encode(
            doc_store.get_all_texts(),
            convert_to_tensor=True,
            show_progress_bar=True
        )
        print(f"✓ Embeddings готовы! Shape: {self.doc_embeddings.shape}")

    def retrieve(self, query: str, top_k: int = 3) -> List[Dict]:
        """Return the ``top_k`` documents most similar to ``query``.

        Args:
            query: Free-text query.
            top_k: Maximum number of documents to return; capped at the
                number of indexed documents. Values <= 0 yield an empty list
                (``torch.topk`` would raise on a negative ``k``).

        Returns:
            A list of *copies* of the stored document dicts, best match
            first, each with an added ``'score'`` key (cosine similarity).
        """
        k = min(top_k, len(self.doc_embeddings))
        if k <= 0:
            return []

        # Embed the query with the same model as the documents.
        query_embedding = self.model.encode(query, convert_to_tensor=True)

        # Cosine similarity of the query against every document embedding.
        similarities = torch.nn.functional.cosine_similarity(
            query_embedding.unsqueeze(0),
            self.doc_embeddings
        )

        top_scores, top_indices = torch.topk(similarities, k=k)

        # Copy each record before attaching the score so the dicts owned by
        # the store (and shared with other retrievers) are never mutated.
        # tolist() yields plain Python ints/floats regardless of device.
        return [
            {**self.doc_store.get_document(idx), 'score': score}
            for idx, score in zip(top_indices.tolist(), top_scores.tolist())
        ]

# Build the semantic retriever (loads the model and embeds the whole store).
semantic_retriever = SemanticRetriever(doc_store)
print("✓ Semantic Retriever готов!")

## 6. Тестирование Retrieval

In [None]:
# === TEST QUERY ===
# Runs the same query through all three retrievers and prints the top hits.
# Each retriever's results are printed before the next retriever runs.
test_query = "What is machine learning?"

print(f"Запрос: {test_query}\n")

# TF-IDF
print("=" * 60)
print("TF-IDF Results:")
print("=" * 60)
tfidf_results = tfidf_retriever.retrieve(test_query, top_k=3)
for i, doc in enumerate(tfidf_results, 1):
    print(f"{i}. Score: {doc['score']:.4f}")
    # Only the first 200 characters of each document are shown.
    print(f"   Text: {doc['text'][:200]}...\n")

# BM25
print("=" * 60)
print("BM25 Results:")
print("=" * 60)
bm25_results = bm25_retriever.retrieve(test_query, top_k=3)
for i, doc in enumerate(bm25_results, 1):
    print(f"{i}. Score: {doc['score']:.4f}")
    print(f"   Text: {doc['text'][:200]}...\n")

# Semantic
print("=" * 60)
print("Semantic Results:")
print("=" * 60)
semantic_results = semantic_retriever.retrieve(test_query, top_k=3)
for i, doc in enumerate(semantic_results, 1):
    print(f"{i}. Score: {doc['score']:.4f}")
    print(f"   Text: {doc['text'][:200]}...\n")

## 7. RAG Pipeline - Генерация ответов

In [None]:
class SimpleRAGPipeline:
    """Minimal RAG pipeline: retrieve context, build a prompt, generate.

    The retriever is any object with a ``retrieve(query, top_k=...)`` method
    returning a list of dicts that contain a ``'text'`` key.
    """

    def __init__(self, retriever, llm_model_name: str = "microsoft/phi-2"):
        """Load the LLM and wrap it in a text-generation pipeline.

        Args:
            retriever: Object providing ``retrieve(query, top_k=...)``.
            llm_model_name: Hugging Face model id of the causal LM to load.
        """
        self.retriever = retriever

        # Load the LLM.
        # NOTE(review): trust_remote_code=True executes code shipped with the
        # model repository — only use it with model ids you trust.
        print(f"Загрузка LLM: {llm_model_name}...")
        self.tokenizer = AutoTokenizer.from_pretrained(llm_model_name, trust_remote_code=True)

        # Half precision is only a safe default on GPU; on a CPU-only host
        # fall back to float32 instead of forcing float16.
        dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        self.model = AutoModelForCausalLM.from_pretrained(
            llm_model_name,
            device_map="auto",
            torch_dtype=dtype,
            trust_remote_code=True
        )

        self.generator = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            max_new_tokens=256,
            temperature=0.7,
            do_sample=True
        )
        print("✓ LLM загружена!")

    def create_prompt(self, query: str, context_docs: List[Dict]) -> str:
        """Build the generation prompt from the query and retrieved documents.

        Args:
            query: The user question.
            context_docs: Retrieved documents; only ``doc['text']`` is used.

        Returns:
            The prompt text, ending with ``"Answer:"``.
        """
        context = "\n\n".join(
            f"Document {number}: {doc['text']}"
            for number, doc in enumerate(context_docs, start=1)
        )

        return (
            "Use the following context to answer the question.\n\n"
            f"Context:\n{context}\n\n"
            f"Question: {query}\n\n"
            "Answer:"
        )

    def answer(self, query: str, top_k: int = 3) -> Dict:
        """Run the full pipeline for one query.

        Args:
            query: The user question.
            top_k: Number of documents to retrieve as context.

        Returns:
            Dict with keys ``'query'``, ``'answer'``, ``'retrieved_docs'``
            and ``'prompt'``.
        """
        # 1. Retrieval
        retrieved_docs = self.retriever.retrieve(query, top_k=top_k)

        # 2. Prompt construction
        prompt = self.create_prompt(query, retrieved_docs)

        # 3. Generation
        response = self.generator(prompt)[0]['generated_text']

        # Keep only the newly generated text. Searching for the first
        # "Answer:" is unsafe: a retrieved document or the query itself may
        # contain that marker, and if the generator does not echo the prompt
        # the search fails and chops the start of the answer. Stripping the
        # known prompt prefix is exact in both situations.
        completion = response[len(prompt):] if response.startswith(prompt) else response
        answer = completion.strip()

        return {
            'query': query,
            'answer': answer,
            'retrieved_docs': retrieved_docs,
            'prompt': prompt
        }

# Build the RAG pipeline (pick the retriever to use)
rag = SimpleRAGPipeline(retriever=semantic_retriever)  # or tfidf_retriever, bm25_retriever
print("\n✓ RAG Pipeline готов!")

## 8. Тестирование RAG

In [None]:
# === TEST QUESTIONS ===
# Runs each question through the RAG pipeline and prints the answer together
# with the documents that were used as context.
questions = [
    "What is machine learning?",
    "How do neural networks work?",
    "Explain deep learning"
]

for question in questions:
    print("\n" + "="*80)
    print(f"ВОПРОС: {question}")
    print("="*80)
    
    # Two context documents per question.
    result = rag.answer(question, top_k=2)
    
    print(f"\nОТВЕТ: {result['answer']}")
    
    print(f"\nИСПОЛЬЗОВАНЫ ДОКУМЕНТЫ:")
    for i, doc in enumerate(result['retrieved_docs'], 1):
        # Show the retrieval score and the first 150 characters of the text.
        print(f"{i}. Score: {doc['score']:.4f} - {doc['text'][:150]}...")
    
    print("="*80)

## 9. Batch Processing для Competition

In [None]:
# === LOAD TEST DATA ===
# test_df = pd.read_csv('test.csv')
# Expected columns: 'id', 'query'

# Example data
test_df = pd.DataFrame({
    'id': [1, 2, 3],
    'query': [
        "What is AI?",
        "Explain supervised learning",
        "How to train neural networks?"
    ]
})

# Generate one answer per query, in row order.
answers = []
for position, query in enumerate(test_df['query'], start=1):
    result = rag.answer(query, top_k=3)
    answers.append(result['answer'])

    # Count progress by position rather than by DataFrame index label: the
    # label is only a valid counter for a default 0..n-1 index and is wrong
    # after filtering, sorting or loading with a custom index.
    if position % 10 == 0:
        print(f"Обработано: {position}/{len(test_df)}")

print(f"\n✓ Все ответы сгенерированы!")

## 10. Submission

In [None]:
# Pair every test id with its generated answer; `answers` was filled in the
# same row order as `test_df`.
submission = pd.DataFrame({
    'id': test_df['id'],
    'answer': answers
})

# Write the submission file without the DataFrame index column.
submission.to_csv('rag_submission.csv', index=False)
print("\n✓ Submission сохранен!")
print(submission.head())

## 11. Evaluation (если есть ground truth)

In [None]:
# Use this template if ground-truth answers are available.
# from sklearn.metrics import accuracy_score
# from nltk.translate.bleu_score import sentence_bleu
# from rouge import Rouge

# val_df = pd.read_csv('val.csv')  # contains the ground-truth answers

# rouge = Rouge()
# rouge_scores = []

# for idx, row in val_df.iterrows():
#     result = rag.answer(row['query'])
#     score = rouge.get_scores(result['answer'], row['ground_truth_answer'])[0]
#     rouge_scores.append(score)

# print(f"Average ROUGE-L F1: {np.mean([s['rouge-l']['f'] for s in rouge_scores])}")

# Placeholder: nothing is evaluated until the template above is enabled.
print("Добавьте свою метрику оценки!")