Загрузка данных

In [16]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pymorphy3
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
import hashlib
import pickle
import os

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Используется устройство: {device}")

Используется устройство: cuda


In [17]:
CHUNKS_CACHE_PATH = "chunks_cache_save.pkl"
VECTOR_STORE_CACHE_PATH = "vector_store.pkl"
PROCESSED_QUESTIONS_PATH = "questions_processed.csv"

In [18]:
if os.path.exists(CHUNKS_CACHE_PATH):
    print("Загружаем чанки из кэша...")
    with open(CHUNKS_CACHE_PATH, 'rb') as f:
        (all_chunks, chunk_metadata) = pickle.load(f)

Загружаем чанки из кэша...


Векторизация

In [5]:
from langchain_community.vectorstores.faiss import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document

In [19]:
documents = []
for i, chunk_text in enumerate(all_chunks):
    doc = Document(
        page_content=chunk_text,
        metadata=chunk_metadata[i]
    )
    documents.append(doc)

In [None]:
if os.path.exists(VECTOR_STORE_CACHE_PATH):
    print("Загружаем vector_store из кэша...")
    vector_store = FAISS.load_local(
        VECTOR_STORE_CACHE_PATH, 
        HuggingFaceEmbeddings(model_name="ai-forever/ru-en-RoSBERTa"),
        allow_dangerous_deserialization=True
    )
else:
    print("Создаем vector_store...")
    embedding_model = HuggingFaceEmbeddings(
        model_name="ai-forever/ru-en-RoSBERTa",
        model_kwargs={'device': device}
    )
    
    print("Создание vector_store...")
    
    batch_size = 100
    total_batches = (len(documents) + batch_size - 1) // batch_size
    
    vector_store = None
    
    for i in tqdm(range(0, len(documents), batch_size), desc="Обработка батчей"):
        batch_docs = documents[i:i + batch_size]
        
        if vector_store is None:
            vector_store = FAISS.from_documents(
                batch_docs,
                embedding=embedding_model
            )
        else:
            vector_store.add_documents(batch_docs)
    
    vector_store.save_local(VECTOR_STORE_CACHE_PATH)
    print(f"Vector_store сохранен в {VECTOR_STORE_CACHE_PATH}")

'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 435c9e7f-4e7e-4bb4-8811-24af5b3f3f4e)')' thrown while requesting HEAD https://huggingface.co/ai-forever/ru-en-RoSBERTa/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].


Создаем vector_store...


Some weights of RobertaModel were not initialized from the model checkpoint at ai-forever/ru-en-RoSBERTa and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Создание vector_store...


Обработка батчей:   0%|          | 0/116 [00:00<?, ?it/s]

Векторный поиск

In [None]:
from rank_bm25 import BM25Okapi

In [None]:
texts_for_bm25 = [doc.page_content for doc in documents]
tokenized_corpus = [text.split() for text in texts_for_bm25]
bm25_index = BM25Okapi(tokenized_corpus)

In [None]:
def hybrid_search(query, vector_store, bm25_index, documents, top_k=5, alpha=0.7):
    """Гибридный поиск: комбинация семантического (FAISS) и ключевого (BM25)"""
    semantic_results = vector_store.similarity_search_with_score(
        query, 
        k=top_k * 6
    )
    
    semantic_scores = {}
    for doc, score in semantic_results:
        chunk_id = doc.metadata['chunk_id']
        similarity = 1 - score
        if chunk_id not in semantic_scores:
            semantic_scores[chunk_id] = similarity
    
    # Ключевой поиск через BM25
    tokenized_query = query.split()
    if tokenized_query:
        bm25_scores = bm25_index.get_scores(tokenized_query)
        
        keyword_scores = {}
        for idx, score in enumerate(bm25_scores):
            chunk_id = documents[idx].metadata['chunk_id']
            if chunk_id not in keyword_scores:
                keyword_scores[chunk_id] = score
    else:
        keyword_scores = {}
    
    # Нормализация scores
    if semantic_scores:
        max_semantic = max(semantic_scores.values())
        for chunk_id in semantic_scores:
            semantic_scores[chunk_id] /= max_semantic if max_semantic > 0 else 1

    if keyword_scores:
        max_keyword = max(keyword_scores.values())
        for chunk_id in keyword_scores:
            keyword_scores[chunk_id] /= max_keyword if max_keyword > 0 else 1

    # Комбинирование scores
    combined_scores = {}
    all_chunk_ids = set(list(semantic_scores.keys()) + list(keyword_scores.keys()))
    
    for chunk_id in all_chunk_ids:
        semantic_score = semantic_scores.get(chunk_id, 0)
        keyword_score = keyword_scores.get(chunk_id, 0)
        combined_score = alpha * semantic_score + (1 - alpha) * keyword_score
        combined_scores[chunk_id] = combined_score
    
    # Возвращаем топ-K документов
    top_chunk_ids = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:top_k*3]
    return [chunk_id for chunk_id, score in top_chunk_ids]

Re-Ranking

In [11]:
from sentence_transformers import CrossEncoder

In [None]:
reranker = CrossEncoder('cross-encoder/mmarco-mMiniLMv2-L12-H384-v1')

In [None]:
def search_with_reranking(query, vector_store, bm25_index, documents, top_k=5):
    """Поиск с реранжированием"""
    candidate_chunk_ids = hybrid_search(query, vector_store, bm25_index, documents)
    
    candidate_texts = []
    chunk_id_to_web_id = {}
    
    for chunk_id in candidate_chunk_ids:
        doc = next((d for d in documents if d.metadata.get('chunk_id') == chunk_id), None)
        candidate_texts.append(doc.page_content)
        chunk_id_to_web_id[chunk_id] = doc.metadata['web_id']
    
    # Оценка релевантности пар (запрос, документ)
    pairs = [[query, doc_text] for doc_text in candidate_texts]
    scores = reranker.predict(pairs)
    
    # Сортировка по убыванию релевантности
    scored_candidates = []
    for i, (chunk_id, score) in enumerate(zip(candidate_chunk_ids, scores)):
        web_id = chunk_id_to_web_id.get(chunk_id)
        scored_candidates.append((web_id, score, chunk_id))
    
    # Сортировка по убыванию релевантности
    scored_candidates.sort(key=lambda x: x[1], reverse=True)
    
    # Убираем дубликаты web_id, оставляя только лучший результат для каждого web_id
    unique_web_ids = []
    seen_web_ids = set()
    
    for web_id, score, chunk_id in scored_candidates:
        if web_id not in seen_web_ids:
            unique_web_ids.append(web_id)
            seen_web_ids.add(web_id)
        if len(unique_web_ids) >= top_k:
            break
    
    return unique_web_ids[:top_k]

Выполнение поиска

In [None]:
questions_df = pd.read_csv('questions_processed.csv')
questions_df.set_index('q_id', inplace=True)

In [None]:
results = []

for q_id, row in tqdm(questions_df.iterrows(), total=len(questions_df)):
    query = row['processed_query']
    
    top_web_ids = search_with_reranking(
        query, vector_store, bm25_index, documents, top_k=5
    )
    
    web_list_str = f"[{', '.join(map(str, top_web_ids))}]"
    results.append({'q_id': q_id, 'web_list': web_list_str})

# Сохранение результатов
submission_df = pd.DataFrame(results)
with open('submit.csv', 'w', encoding='utf-8') as f:
    f.write('q_id,web_list\n')
    for result in results:
        f.write(f'{result["q_id"]},"{result["web_list"]}"\n')

submission_df.head()

100%|██████████| 6977/6977 [59:46<00:00,  1.95it/s]  


Unnamed: 0,q_id,web_list
0,1,"[372, 1125, 1812, 1811, 1705]"
1,2,"[372, 1157, 1030, 108, 368]"
2,3,"[341, 856, 116, 1035, 1760]"
3,4,"[1038, 1027, 938, 163, 1043]"
4,5,"[920, 165, 921, 1032, 164]"
