Загрузка данных

In [1]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pymorphy3
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
import hashlib
import pickle
import os

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Используется устройство: {device}")

Используется устройство: cuda


In [3]:
CHUNKS_CACHE_PATH = "chunks_cache_save.pkl"
VECTOR_STORE_CACHE_PATH = "vector_store.pkl"
PROCESSED_QUESTIONS_PATH = "questions_processed.csv"

In [18]:
if os.path.exists(CHUNKS_CACHE_PATH):
    print("Загружаем чанки из кэша...")
    with open(CHUNKS_CACHE_PATH, 'rb') as f:
        (all_chunks, chunk_metadata) = pickle.load(f)

Загружаем чанки из кэша...


Векторизация

In [5]:
from langchain_community.vectorstores.faiss import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document

In [19]:
documents = []
for i, chunk_text in enumerate(all_chunks):
    doc = Document(
        page_content=chunk_text,
        metadata=chunk_metadata[i]
    )
    documents.append(doc)

In [None]:
if os.path.exists(VECTOR_STORE_CACHE_PATH):
    print("Загружаем vector_store из кэша...")
    vector_store = FAISS.load_local(
        VECTOR_STORE_CACHE_PATH, 
        HuggingFaceEmbeddings(model_name="ai-forever/ru-en-RoSBERTa"),
        allow_dangerous_deserialization=True
    )
else:
    print("Создаем vector_store...")
    embedding_model = HuggingFaceEmbeddings(
        model_name="ai-forever/ru-en-RoSBERTa",
        model_kwargs={'device': device}
    )
    
    print("Создание vector_store...")
    
    batch_size = 100
    total_batches = (len(documents) + batch_size - 1) // batch_size
    
    vector_store = None
    
    for i in tqdm(range(0, len(documents), batch_size), desc="Обработка батчей"):
        batch_docs = documents[i:i + batch_size]
        
        if vector_store is None:
            vector_store = FAISS.from_documents(
                batch_docs,
                embedding=embedding_model
            )
        else:
            vector_store.add_documents(batch_docs)
    
    vector_store.save_local(VECTOR_STORE_CACHE_PATH)
    print(f"Vector_store сохранен в {VECTOR_STORE_CACHE_PATH}")

'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 435c9e7f-4e7e-4bb4-8811-24af5b3f3f4e)')' thrown while requesting HEAD https://huggingface.co/ai-forever/ru-en-RoSBERTa/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].


Создаем vector_store...


Some weights of RobertaModel were not initialized from the model checkpoint at ai-forever/ru-en-RoSBERTa and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Создание vector_store...


Обработка батчей:   0%|          | 0/116 [00:00<?, ?it/s]

Векторный поиск

In [None]:
from rank_bm25 import BM25Okapi

In [None]:
def hybrid_search(query, vector_store, documents, faiss_k=30, bm25_k=20):
    """Каскадный гибридный поиск: FAISS -> BM25 переранжирование"""
    
    # 1. FAISS: семантический поиск (большой пул кандидатов)
    semantic_results = vector_store.similarity_search_with_score(
        query, 
        k=faiss_k
    )
    
    candidate_texts = []
    candidate_chunk_ids = []
    
    for doc, score in semantic_results:
        candidate_texts.append(doc.page_content)
        candidate_chunk_ids.append(doc.metadata['chunk_id'])
    
    tokenized_candidates = [text.split() for text in candidate_texts]
    candidate_bm25 = BM25Okapi(tokenized_candidates)
    
    # Оцениваем BM25 релевантность
    tokenized_query = query.split()
    bm25_scores = candidate_bm25.get_scores(tokenized_query)
    
    BM25_candidates = []
    
    for i, (chunk_id, bm25_score) in enumerate(zip(candidate_chunk_ids, bm25_scores)):
        BM25_candidates.append({
            'chunk_id': chunk_id,
            'bm25_score': bm25_score
        })
    
    # Сортируем по комбинированному score
    BM25_candidates.sort(key=lambda x: x['bm25_score'], reverse=True)
    
    # Возвращаем топ-K chunk_ids
    top_chunk_ids = [candidate['chunk_id'] for candidate in BM25_candidates[:bm25_k]]
    
    return top_chunk_ids

Re-Ranking

In [1]:
from sentence_transformers import CrossEncoder

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
reranker = CrossEncoder('sberbank-ai/sbert_large_nlu_ru')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sberbank-ai/sbert_large_nlu_ru and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def search_with_reranking(query, vector_store, documents, top_k=5):
    """Поиск с реранжированием"""
    candidate_chunk_ids = hybrid_search(query, vector_store, documents)
    
    candidate_texts = []
    chunk_id_to_web_id = {}
    
    for chunk_id in candidate_chunk_ids:
        doc = next((d for d in documents if d.metadata.get('chunk_id') == chunk_id), None)
        candidate_texts.append(doc.page_content)
        chunk_id_to_web_id[chunk_id] = doc.metadata['web_id']
    
    pairs = [[query, doc_text] for doc_text in candidate_texts]
    scores = reranker.predict(pairs)
    
    scored_candidates = []
    for i, (chunk_id, score) in enumerate(zip(candidate_chunk_ids, scores)):
        web_id = chunk_id_to_web_id.get(chunk_id)
        scored_candidates.append((web_id, score, chunk_id))
    
    scored_candidates.sort(key=lambda x: x[1], reverse=True)
    
    unique_web_ids = []
    seen_web_ids = set()
    
    for web_id, score, chunk_id in scored_candidates:
        if web_id not in seen_web_ids:
            unique_web_ids.append(web_id)
            seen_web_ids.add(web_id)
        if len(unique_web_ids) >= top_k:
            break
    
    return unique_web_ids[:top_k]

Выполнение поиска

In [None]:
questions_df = pd.read_csv('questions_processed.csv')
questions_df.set_index('q_id', inplace=True)

In [None]:
results = []

for q_id, row in tqdm(questions_df.iterrows(), total=len(questions_df)):
    query = row['processed_query']
    
    top_web_ids = search_with_reranking(
        query, vector_store, documents, top_k=5
    )
    
    web_list_str = f"[{', '.join(map(str, top_web_ids))}]"
    results.append({'q_id': q_id, 'web_list': web_list_str})

# Сохранение результатов
submission_df = pd.DataFrame(results)
with open('submit.csv', 'w', encoding='utf-8') as f:
    f.write('q_id,web_list\n')
    for result in results:
        f.write(f'{result["q_id"]},"{result["web_list"]}"\n')

submission_df.head()

100%|██████████| 6977/6977 [59:46<00:00,  1.95it/s]  


Unnamed: 0,q_id,web_list
0,1,"[372, 1125, 1812, 1811, 1705]"
1,2,"[372, 1157, 1030, 108, 368]"
2,3,"[341, 856, 116, 1035, 1760]"
3,4,"[1038, 1027, 938, 163, 1043]"
4,5,"[920, 165, 921, 1032, 164]"
