In [1]:
import emoji
import pandas as pd
import numpy as np
import faiss
import pickle
import re
from pymorphy3 import MorphAnalyzer
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
QUESTIONS_PATH = "../Data/Common/questions_clean.csv"
WEBSITES_PATH = "../Data/Common/websites_updated.csv"
SUBMISSION_PATH = "../Data/Common/sample_submission.csv"

OUTPUT_SUBMISSION_FILE = "submission.csv"

EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-large"
RERANKING_MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-12-v2"

INDEX_PATH = "faiss_index.bin"
METADATA_PATH = "metadata.pkl"

MAX_TOKENS = 512
MIN_TOKENS = 64
SIM_THRESHOLD = 0.8
OVERLAP_SENTENCES = 2

TOP_K = 5

In [3]:
morph = MorphAnalyzer()


def clean_text(text: str) -> str:
    """
    –û—á–∏—â–∞–µ—Ç —Ç–µ–∫—Å—Ç –æ—Ç HTML-—Ç–µ–≥–æ–≤, –ª–∏—à–Ω–∏—Ö —Å–∏–º–≤–æ–ª–æ–≤ –∏ –ø—Ä–∏–≤–æ–¥–∏—Ç –∫ –Ω–∏–∂–Ω–µ–º—É —Ä–µ–≥–∏—Å—Ç—Ä—É.
    """
    # –£–¥–∞–ª—è–µ–º HTML-—Ç–µ–≥–∏
    text = re.sub(r'<[^>]+>', ' ', text)
    # –£–¥–∞–ª—è–µ–º URL-–∞–¥—Ä–µ—Å–∞
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # –£–¥–∞–ª—è–µ–º –∑–Ω–∞–∫–∏ –ø—Ä–µ–ø–∏–Ω–∞–Ω–∏—è –∏ —Ü–∏—Ñ—Ä—ã, –æ—Å—Ç–∞–≤–ª—è–µ–º —Ç–æ–ª—å–∫–æ —Ä—É—Å—Å–∫–∏–µ –±—É–∫–≤—ã –∏ –ø—Ä–æ–±–µ–ª—ã
    text = re.sub(r'[^–∞-—è–ê-–Ø—ë–Å\s]', ' ', text)
    # –ó–∞–º–µ–Ω—è–µ–º –º–Ω–æ–∂–µ—Å—Ç–≤–µ–Ω–Ω—ã–µ –ø—Ä–æ–±–µ–ª—ã –Ω–∞ –æ–¥–∏–Ω
    text = re.sub(r'\s+', ' ', text).strip()
    # –ü—Ä–∏–≤–æ–¥–∏–º –∫ –Ω–∏–∂–Ω–µ–º—É —Ä–µ–≥–∏—Å—Ç—Ä—É
    text = text.lower()
    return text


def lemmatize_text(text: str) -> str:
    """
    –õ–µ–º–º–∞—Ç–∏–∑–∏—Ä—É–µ—Ç —Ç–µ–∫—Å—Ç (–ø—Ä–∏–≤–æ–¥–∏—Ç —Å–ª–æ–≤–∞ –∫ –Ω–∞—á–∞–ª—å–Ω–æ–π —Ñ–æ—Ä–º–µ).
    """
    words = text.split()
    lemmatized_words = [morph.parse(word)[0].normal_form for word in words]
    return " ".join(lemmatized_words)


def preprocess_query(query: str) -> str:
    """
    –ü–æ–ª–Ω—ã–π —Ü–∏–∫–ª –ø—Ä–µ–¥–æ–±—Ä–∞–±–æ—Ç–∫–∏ –¥–ª—è –ø–æ–∏—Å–∫–æ–≤–æ–≥–æ –∑–∞–ø—Ä–æ—Å–∞.
    """
    cleaned_query = clean_text(query)
    lemmatized_query = lemmatize_text(cleaned_query)
    return lemmatized_query


def preprocess_document(text: str) -> str:
    """
    –ü–æ–ª–Ω—ã–π —Ü–∏–∫–ª –ø—Ä–µ–¥–æ–±—Ä–∞–±–æ—Ç–∫–∏ –¥–ª—è —Ç–µ–∫—Å—Ç–∞ –¥–æ–∫—É–º–µ–Ω—Ç–∞.
    –í –¥–∞–Ω–Ω–æ–º –±–µ–π–∑–ª–∞–π–Ω–µ –æ–Ω —Ç–∞–∫–æ–π –∂–µ, –∫–∞–∫ –¥–ª—è –∑–∞–ø—Ä–æ—Å–∞.
    """
    cleaned_doc = clean_text(text)
    # –î–ª—è –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤ –ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è –º–æ–∂–µ—Ç –±—ã—Ç—å –æ–ø—Ü–∏–æ–Ω–∞–ª—å–Ω–æ–π, –≤ –∑–∞–≤–∏—Å–∏–º–æ—Å—Ç–∏ –æ—Ç –º–æ–¥–µ–ª–∏
    # –ù–æ –¥–ª—è —Å–∏–º–º–µ—Ç—Ä–∏—á–Ω–æ–≥–æ –ø–æ–∏—Å–∫–∞ (–∫–æ–≥–¥–∞ –∏ –∑–∞–ø—Ä–æ—Å, –∏ –¥–æ–∫—É–º–µ–Ω—Ç –æ–±—Ä–∞–±–∞—Ç—ã–≤–∞—é—Ç—Å—è –æ–¥–∏–Ω–∞–∫–æ–≤–æ)
    # –ª—É—á—à–µ –µ–µ –æ—Å—Ç–∞–≤–∏—Ç—å.
    lemmatized_doc = lemmatize_text(cleaned_doc)
    return lemmatized_doc


def easy_preprocess_text(text):
    text = re.sub(r'<[^>]+>', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


1. –ó–∞–≥—Ä—É–∂–∞–µ—Ç –∏ –æ–±—Ä–∞–±–∞—Ç—ã–≤–∞–µ—Ç –¥–æ–∫—É–º–µ–Ω—Ç—ã.

2. –†–∞–∑–±–∏–≤–∞–µ—Ç –∏—Ö –Ω–∞ —á–∞–Ω–∫–∏.

3. –í–µ–∫—Ç–æ—Ä–∏–∑—É–µ—Ç —á–∞–Ω–∫–∏.

4. –°–æ–∑–¥–∞–µ—Ç –∏ —Å–æ—Ö—Ä–∞–Ω—è–µ—Ç FAISS –∏–Ω–¥–µ–∫—Å –∏ –º–µ—Ç–∞–¥–∞–Ω–Ω—ã–µ.

In [4]:
# 1. –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö –∏ –º–æ–¥–µ–ª–µ–π

df_websites = pd.read_csv(WEBSITES_PATH)
df_websites['title'] = df_websites['title'].fillna('')
df_websites['text'] = df_websites['text'].fillna('')

In [5]:
df_websites[df_websites['web_id'] == 1705].iloc[0]

web_id                                                 1705
url       https://alfabank.servicecdn.ru/site-upload/f4/...
kind                                               document
title                               dogovor_cbo_1072025.pdf
text      –ü—Ä–∏–ª–æ–∂–µ–Ω–∏–µ –∫ –†–∞—Å–ø–æ—Ä—è–∂–µ–Ω–∏—é –ê–û ¬´–ê–õ–¨–§–ê-–ë–ê–ù–ö¬ª \n–æ—Ç...
Name: 1704, dtype: object

In [6]:
print(f"–ó–∞–≥—Ä—É–∑–∫–∞ –º–æ–¥–µ–ª–∏ —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤ –∏ —Ç–æ–∫–µ–Ω–∞–π–∑–µ—Ä–∞: {EMBEDDING_MODEL_NAME}")

tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME, device='cuda' if torch.cuda.is_available() else 'cpu')
embeddings_model = SentenceTransformer(EMBEDDING_MODEL_NAME, device='cuda' if torch.cuda.is_available() else 'cpu')

–ó–∞–≥—Ä—É–∑–∫–∞ –º–æ–¥–µ–ª–∏ —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤ –∏ —Ç–æ–∫–µ–Ω–∞–π–∑–µ—Ä–∞: intfloat/multilingual-e5-large


'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /intfloat/multilingual-e5-large/resolve/main/1_Pooling/config.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001B32121DE50>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: 567326ea-a533-40e3-8344-fac317f5744f)')' thrown while requesting HEAD https://huggingface.co/intfloat/multilingual-e5-large/resolve/main/1_Pooling/config.json
Retrying in 1s [Retry 1/5].
'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /intfloat/multilingual-e5-large/resolve/main/1_Pooling/config.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001B32121CE10>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: 5a6e93aa-9cba-4472-ae2e-9634a98e6a52)')' thrown while requesting HEAD https://huggingface

In [7]:
# 2. –°–æ–∑–¥–∞–Ω–∏–µ —á–∞–Ω–∫–æ–≤

def tokenize_len(text):
    return len(tokenizer.encode(text, add_special_tokens=False))


def embed_sentences(sentences):
    # –ú–µ—Ç–æ–¥ .encode() —Å–∞–º –æ–±—Ä–∞–±–∞—Ç—ã–≤–∞–µ—Ç —Ç–æ–∫–µ–Ω–∏–∑–∞—Ü–∏—é, –ø–∞–¥–¥–∏–Ω–≥ –∏ –∑–∞–ø—É—Å–∫ –Ω–∞ –Ω—É–∂–Ω–æ–º —É—Å—Ç—Ä–æ–π—Å—Ç–≤–µ
    embeddings = embeddings_model.encode(sentences, convert_to_numpy=True, show_progress_bar=False)
    return embeddings


def semantic_chunk_text(text):
    sentences = re.split(r'(?<=[.!?])\s+', text)
    sentences = [s.strip() for s in sentences if len(s.strip()) > 0]
    if len(sentences) <= 1:
        return [text]

    embs = embed_sentences(sentences)
    sims = cosine_similarity(embs[:-1], embs[1:]).diagonal()

    breaks = [i for i, sim in enumerate(sims) if sim < SIM_THRESHOLD]
    chunks, start = [], 0
    for b in breaks + [len(sentences) - 1]:
        chunk = " ".join(sentences[start:b + 1])
        chunks.append(chunk)
        start = b + 1

    merged = []
    buffer = ""
    for c in chunks:
        if tokenize_len(c) < MIN_TOKENS:
            buffer += " " + c
        else:
            if buffer:
                merged.append(buffer.strip())
                buffer = ""
            merged.append(c)
    if buffer:
        merged.append(buffer.strip())

    final_chunks = []
    for i, c in enumerate(merged):
        overlap = " ".join(sentences[max(0, i - OVERLAP_SENTENCES):i])
        full = (overlap + " " + c).strip()
        final_chunks.append(full)
    return final_chunks


def auto_summary(text):
    sents = re.split(r'(?<=[.!?])\s+', text)
    return " ".join(sents[:2]).strip()


def create_chunks_with_headers(df_websites):
    all_chunks = []
    metadata = []
    for _, row in tqdm(df_websites.iterrows(), total=len(df_websites), desc="Semantic chunking"):
        title = str(row['title'])
        text = str(row['text'])
        full_text = f"{title}. {text}"
        sem_chunks = semantic_chunk_text(full_text)
        for ch in sem_chunks:
            header = f"[–î–æ–∫—É–º–µ–Ω—Ç: {title}] [–ö—Ä–∞—Ç–∫–æ–µ —Å–æ–¥–µ—Ä–∂–∞–Ω–∏–µ: {auto_summary(ch)}] "
            all_chunks.append(header + ch)
            metadata.append({'web_id': row['web_id'], 'title': title})
    return all_chunks, metadata

In [24]:
print("–°–æ–∑–¥–∞–Ω–∏–µ —á–∞–Ω–∫–æ–≤ –∏–∑ –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤...")
chunks, metadata = create_chunks_with_headers(df_websites[:1000])
print(f"–°–æ–∑–¥–∞–Ω–æ {len(chunks)} —á–∞–Ω–∫–æ–≤.")

–°–æ–∑–¥–∞–Ω–∏–µ —á–∞–Ω–∫–æ–≤ –∏–∑ –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤...


Semantic chunking: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1000/1000 [05:40<00:00,  2.94it/s]

–°–æ–∑–¥–∞–Ω–æ 5979 —á–∞–Ω–∫–æ–≤.





In [25]:
chunks[10]

'[–î–æ–∫—É–º–µ–Ω—Ç: –ê-–ö–ª—É–±. –î–µ–Ω—å–≥–∏ –∏–º–µ—é—Ç –∑–Ω–∞—á–µ–Ω–∏–µ] [–ö—Ä–∞—Ç–∫–æ–µ —Å–æ–¥–µ—Ä–∂–∞–Ω–∏–µ: –ê-–ö–ª—É–±. –ê–û ¬´–ê–ª—å—Ñ–∞-–ë–∞–Ω–∫¬ª —è–≤–ª—è–µ—Ç—Å—è —É—á–∞—Å—Ç–Ω–∏–∫–æ–º —Å–∏—Å—Ç–µ–º—ã –æ–±—è–∑–∞—Ç–µ–ª—å–Ω–æ–≥–æ —Å—Ç—Ä–∞—Ö–æ–≤–∞–Ω–∏—è –≤–∫–ª–∞–¥–æ–≤.] –ê-–ö–ª—É–±. –ê–û ¬´–ê–ª—å—Ñ–∞-–ë–∞–Ω–∫¬ª —è–≤–ª—è–µ—Ç—Å—è —É—á–∞—Å—Ç–Ω–∏–∫–æ–º —Å–∏—Å—Ç–µ–º—ã –æ–±—è–∑–∞—Ç–µ–ª—å–Ω–æ–≥–æ —Å—Ç—Ä–∞—Ö–æ–≤–∞–Ω–∏—è –≤–∫–ª–∞–¥–æ–≤. –ò–Ω—Ñ–æ—Ä–º–∞—Ü–∏—è –æ –ø—Ä–æ—Ü–µ–Ω—Ç–Ω—ã—Ö —Å—Ç–∞–≤–∫–∞—Ö –ø–æ –¥–æ–≥–æ–≤–æ—Ä–∞–º –±–∞–Ω–∫–æ–≤—Å–∫–æ–≥–æ –≤–∫–ª–∞–¥–∞ —Å —Ñ–∏–∑–∏—á–µ—Å–∫–∏–º–∏ –ª–∏—Ü–∞–º–∏. –¶–µ–Ω—Ç—Ä —Ä–∞—Å–∫—Ä—ã—Ç–∏—è –∫–æ—Ä–ø–æ—Ä–∞—Ç–∏–≤–Ω–æ–π –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏–∏. –ò–Ω—Ñ–æ—Ä–º–∞—Ü–∏—è –ø—Ä–æ—Ñ–µ—Å—Å–∏–æ–Ω–∞–ª—å–Ω–æ–≥–æ —É—á–∞—Å—Ç–Ω–∏–∫–∞ —Ä—ã–Ω–∫–∞ —Ü–µ–Ω–Ω—ã—Ö –±—É–º–∞–≥. –ò–Ω—Ñ–æ—Ä–º–∞—Ü–∏—è –æ –ª–∏—Ü–∞—Ö, –ø–æ–¥ –∫–æ–Ω—Ç—Ä–æ–ª–µ–º –ª–∏–±–æ –∑–Ω–∞—á–∏—Ç–µ–ª—å–Ω—ã–º –≤–ª–∏—è–Ω–∏–µ–º –∫–æ—Ç–æ—Ä—ã—Ö –Ω–∞—Ö–æ–¥–∏—Ç—Å—è –ë–∞–Ω–∫. –£–ª. –ö–∞–ª–∞–Ω—á–µ–≤—Å–∫–∞—è, 27, –ú–æ—Å–∫–≤–∞, 107078.

In [26]:
metadata[10]

{'web_id': 3, 'title': '–ê-–ö–ª—É–±. –î–µ–Ω—å–≥–∏ –∏–º–µ—é—Ç –∑–Ω–∞—á–µ–Ω–∏–µ'}

In [27]:
# !nvidia-smi
torch.__version__

'2.9.1+cu130'

In [28]:
# 4. –í–µ–∫—Ç–æ—Ä–∏–∑–∞—Ü–∏—è —á–∞–Ω–∫–æ–≤
def clean_for_embeddings(text: str) -> str:
    if not isinstance(text, str):
        return ""
    text = text.replace('\xa0', ' ')  # –Ω–µ—Ä–∞–∑—Ä—ã–≤–Ω—ã–µ –ø—Ä–æ–±–µ–ª—ã
    text = re.sub(r'\n+', '. ', text)  # –ø–µ—Ä–µ–Ω–æ—Å—ã -> —Ç–æ—á–∫–∏
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'http\S+', '', text)  # —É–±—Ä–∞—Ç—å URL
    text = re.sub(r'\d+[\s]*(–ö–±|–±–∞–π—Ç|Mb|GB|Kb|MB|pdf|–∞—Ä—Ö–∏–≤)', '', text, flags=re.IGNORECASE)
    text = re.sub(r'(\.|,){2,}', '.', text)  # —É–±—Ä–∞—Ç—å –ø–æ–≤—Ç–æ—Ä—è—é—â–∏–µ—Å—è —Ç–æ—á–∫–∏/–∑–∞–ø—è—Ç—ã–µ
    text = re.sub(r'[¬´¬ª‚Äú‚Äù"]', '"', text)
    text = emoji.replace_emoji(text, replace='')  # —É–±—Ä–∞—Ç—å —ç–º–æ–¥–∑–∏ üöÄüìöüî•
    text = re.sub(r'\s{2,}', ' ', text)
    return text.strip()


passage_chunks = ["passage: " + clean_for_embeddings(chunk) for chunk in chunks]
embeddings = embeddings_model.encode(
    passage_chunks,
    batch_size=32,
    convert_to_numpy=True,
    normalize_embeddings=False,
    show_progress_bar=True
)
print(f"–†–∞–∑–º–µ—Ä–Ω–æ—Å—Ç—å —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤: {embeddings.shape}")

# –ù–æ—Ä–º–∞–ª–∏–∑—É–µ–º —ç–º–±–µ–¥–¥–∏–Ω–≥–∏ –¥–ª—è –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏—è —Å –∫–æ—Å–∏–Ω—É—Å–Ω—ã–º —Ä–∞—Å—Å—Ç–æ—è–Ω–∏–µ–º (IndexFlatIP)
faiss.normalize_L2(embeddings)

d = embeddings.shape[1]  # –†–∞–∑–º–µ—Ä–Ω–æ—Å—Ç—å –≤–µ–∫—Ç–æ—Ä–æ–≤
index = faiss.IndexFlatIP(d)  # IndexFlatIP –ø–æ–¥—Ö–æ–¥–∏—Ç –¥–ª—è –Ω–æ—Ä–º–∞–ª–∏–∑–æ–≤–∞–Ω–Ω—ã—Ö –≤–µ–∫—Ç–æ—Ä–æ–≤ (–∫–æ—Å–∏–Ω—É—Å–Ω–æ–µ —Å—Ö–æ–¥—Å—Ç–≤–æ)
index.add(embeddings)



Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 187/187 [03:05<00:00,  1.01it/s]

–†–∞–∑–º–µ—Ä–Ω–æ—Å—Ç—å —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤: (5979, 1024)





In [29]:
# 5. –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –∏–Ω–¥–µ–∫—Å–∞ –∏ –º–µ—Ç–∞–¥–∞–Ω–Ω—ã—Ö...
faiss.write_index(index, INDEX_PATH)
with open(METADATA_PATH, 'wb') as f:
    pickle.dump(metadata, f)

print("–ò–Ω–¥–µ–∫—Å–∞—Ü–∏—è –∑–∞–≤–µ—Ä—à–µ–Ω–∞ —É—Å–ø–µ—à–Ω–æ!")

–ò–Ω–¥–µ–∫—Å–∞—Ü–∏—è –∑–∞–≤–µ—Ä—à–µ–Ω–∞ —É—Å–ø–µ—à–Ω–æ!


1. –ó–∞–≥—Ä—É–∂–∞–µ—Ç –∏–Ω–¥–µ–∫—Å, –º–æ–¥–µ–ª—å –∏ –¥–∞–Ω–Ω—ã–µ.

2. –û–±—Ä–∞–±–∞—Ç—ã–≤–∞–µ—Ç –∫–∞–∂–¥—ã–π –≤–æ–ø—Ä–æ—Å.

3. –í—ã–ø–æ–ª–Ω—è–µ—Ç –ø–æ–∏—Å–∫ –≤ –∏–Ω–¥–µ–∫—Å–µ.

4. –ê–≥—Ä–µ–≥–∏—Ä—É–µ—Ç —Ä–µ–∑—É–ª—å—Ç–∞—Ç—ã –∏ —Ñ–æ—Ä–º–∏—Ä—É–µ—Ç —Ñ–∞–π–ª submission.csv.

In [30]:
print("1. –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö")
index = faiss.read_index(INDEX_PATH)
with open(METADATA_PATH, 'rb') as f:
    metadata = pickle.load(f)

1. –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö


In [31]:
# embeddings_model = SentenceTransformer(EMBEDDING_MODEL_NAME, device='cuda' if torch.cuda.is_available() else 'cpu')
df_questions = pd.read_csv(QUESTIONS_PATH)

all_web_lists = []

In [33]:
print("2.0. –ü–æ–∏—Å–∫ —Ä–µ–ª–µ–≤–∞–Ω—Ç–Ω—ã—Ö –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤ –¥–ª—è –æ–¥–Ω–æ–≥–æ –≤–æ–ø—Ä–æ—Å–∞ (—Ç–µ—Å—Ç)...")
test_query = "–ì–¥–µ —É–∑–Ω–∞—Ç—å –±–∏–∫ –∏ —Å—á—ë—Ç"
preprocessed_test_query = clean_for_embeddings(test_query)
test_query_embedding = embeddings_model.encode(["query: " + preprocessed_test_query], convert_to_numpy=True)
faiss.normalize_L2(test_query_embedding)  # –ù–æ—Ä–º–∞–ª–∏–∑—É–µ–º –≤–µ–∫—Ç–æ—Ä –∑–∞–ø—Ä–æ—Å–∞
result = index.search(test_query_embedding, TOP_K * 5)
print("–ù–∞–π–¥–µ–Ω–Ω—ã–µ web_id –¥–ª—è —Ç–µ—Å—Ç–æ–≤–æ–≥–æ –∑–∞–ø—Ä–æ—Å–∞:")
print("–ó–∞–ø—Ä–æ—Å:", test_query)
print("–ü—Ä–µ–¥–æ–±—Ä–∞–±–æ—Ç–∞–Ω–Ω—ã–π –∑–∞–ø—Ä–æ—Å:", preprocessed_test_query)
print("–í–µ–∫—Ç–æ—Ä –∑–∞–ø—Ä–æ—Å–∞:", test_query_embedding[0][:5])
for i in range(TOP_K):
    print(f"–†–µ–∑—É–ª—å—Ç–∞—Ç {i + 1}: web_id = {metadata[result[1][0][i]]['web_id']}, —Ä–∞—Å—Å—Ç–æ—è–Ω–∏–µ = {result[0][0][i]:.4f}")
    print("–¢–µ–∫—Å—Ç:", chunks[i])
print(result[0], result[1])

2.0. –ü–æ–∏—Å–∫ —Ä–µ–ª–µ–≤–∞–Ω—Ç–Ω—ã—Ö –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤ –¥–ª—è –æ–¥–Ω–æ–≥–æ –≤–æ–ø—Ä–æ—Å–∞ (—Ç–µ—Å—Ç)...
–ù–∞–π–¥–µ–Ω–Ω—ã–µ web_id –¥–ª—è —Ç–µ—Å—Ç–æ–≤–æ–≥–æ –∑–∞–ø—Ä–æ—Å–∞:
–ó–∞–ø—Ä–æ—Å: –ì–¥–µ —É–∑–Ω–∞—Ç—å –±–∏–∫ –∏ —Å—á—ë—Ç
–ü—Ä–µ–¥–æ–±—Ä–∞–±–æ—Ç–∞–Ω–Ω—ã–π –∑–∞–ø—Ä–æ—Å: –ì–¥–µ —É–∑–Ω–∞—Ç—å –±–∏–∫ –∏ —Å—á—ë—Ç
–í–µ–∫—Ç–æ—Ä –∑–∞–ø—Ä–æ—Å–∞: [ 0.02718943 -0.0318291  -0.04523505 -0.05356316  0.00961234]
–†–µ–∑—É–ª—å—Ç–∞—Ç 1: web_id = 372, —Ä–∞—Å—Å—Ç–æ—è–Ω–∏–µ = 0.8329
–¢–µ–∫—Å—Ç: [–î–æ–∫—É–º–µ–Ω—Ç: –ê–ª—å—Ñ–∞-–ë–∞–Ω–∫ - –∫—Ä–µ–¥–∏—Ç–Ω—ã–µ –∏ –¥–µ–±–µ—Ç–æ–≤—ã–µ –∫–∞—Ä—Ç—ã, –∫—Ä–µ–¥–∏—Ç—ã –Ω–∞–ª–∏—á–Ω—ã–º–∏, –∞–≤—Ç–æ–∫—Ä–µ–¥–∏—Ç–æ–≤–∞–Ω–∏–µ, –∏–ø–æ—Ç–µ–∫–∞ –∏ –¥—Ä—É–≥–∏–µ –±–∞–Ω–∫–æ–≤—Å–∫–∏–µ —É—Å–ª—É–≥–∏ —Ñ–∏–∑–∏—á–µ—Å–∫–∏–º –∏ —é—Ä–∏–¥–∏—á–µ—Å–∫–∏–º –ª–∏—Ü–∞–º ‚Äì –ê–ª—å—Ñ–∞-–ë–∞–Ω–∫] [–ö—Ä–∞—Ç–∫–æ–µ —Å–æ–¥–µ—Ä–∂–∞–Ω–∏–µ: –ê–ª—å—Ñ–∞-–ë–∞–Ω–∫ - –∫—Ä–µ–¥–∏—Ç–Ω—ã–µ –∏ –¥–µ–±–µ—Ç–æ–≤—ã–µ –∫–∞—Ä—Ç—ã, –∫—Ä–µ–¥–∏—Ç—ã –Ω–∞–ª–∏—á–Ω—ã–º–∏, –∞–≤—Ç–æ–∫—Ä–µ–¥–∏—Ç–æ–≤–∞–Ω–∏–µ, –∏–ø–æ—Ç–µ–∫–∞ –∏ –¥—Ä

In [6]:
print("2. –ü–æ–∏—Å–∫ —Ä–µ–ª–µ–≤–∞–Ω—Ç–Ω—ã—Ö –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤ –¥–ª—è –∫–∞–∂–¥–æ–≥–æ –≤–æ–ø—Ä–æ—Å–∞...")
for _, row in tqdm(df_questions.iterrows(), total=df_questions.shape[0], desc="–û–±—Ä–∞–±–æ—Ç–∫–∞ –≤–æ–ø—Ä–æ—Å–æ–≤"):
    query = easy_preprocess_text(row['query'])
    query_embedding = embeddings_model.encode(["query: " + query], convert_to_numpy=True)
    faiss.normalize_L2(query_embedding)  # –ù–æ—Ä–º–∞–ª–∏–∑—É–µ–º –≤–µ–∫—Ç–æ—Ä –∑–∞–ø—Ä–æ—Å–∞

    # –ò—â–µ–º TOP_K * 5 –∫–∞–Ω–¥–∏–¥–∞—Ç–æ–≤, —á—Ç–æ–±—ã –∏–º–µ—Ç—å –∑–∞–ø–∞—Å –¥–ª—è –≤—ã–±–æ—Ä–∞ —É–Ω–∏–∫–∞–ª—å–Ω—ã—Ö web_id
    distances, indices = index.search(query_embedding, TOP_K * 5)

    found_web_ids = []
    # –°–æ–±–∏—Ä–∞–µ–º —É–Ω–∏–∫–∞–ª—å–Ω—ã–µ web_id –≤ –ø–æ—Ä—è–¥–∫–µ –∏—Ö –Ω–∞—Ö–æ–∂–¥–µ–Ω–∏—è
    for idx in indices[0]:
        if idx != -1:  # FAISS –º–æ–∂–µ—Ç –≤–µ—Ä–Ω—É—Ç—å -1, –µ—Å–ª–∏ –Ω–∏—á–µ–≥–æ –Ω–µ –Ω–∞–π–¥–µ–Ω–æ
            web_id = metadata[idx]['web_id']
            if web_id not in found_web_ids:
                found_web_ids.append(web_id)
            if len(found_web_ids) == TOP_K:
                break

    # –ï—Å–ª–∏ –Ω–∞–π–¥–µ–Ω–æ –º–µ–Ω—å—à–µ TOP_K –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤, –¥–æ–ø–æ–ª–Ω—è–µ–º –ª—é–±—ã–º–∏ –¥—Ä—É–≥–∏–º–∏ web_id
    # –≠—Ç–æ–≥–æ –Ω–µ –¥–æ–ª–∂–Ω–æ –ø—Ä–æ–∏—Å—Ö–æ–¥–∏—Ç—å –ø—Ä–∏ K*5 –ø–æ–∏—Å–∫–µ, –Ω–æ —ç—Ç–æ –∑–∞—â–∏—Ç–∞
    if len(found_web_ids) < TOP_K:
        # –ü—Ä–æ—Å—Ç–æ –¥–æ–±–∞–≤–∏–º –ø–µ—Ä–≤—ã–µ web_id, –∫–æ—Ç–æ—Ä—ã—Ö –µ—â–µ –Ω–µ—Ç –≤ —Å–ø–∏—Å–∫–µ
        all_ids = [m['web_id'] for m in metadata]
        for web_id in all_ids:
            if web_id not in found_web_ids:
                found_web_ids.append(web_id)
            if len(found_web_ids) == TOP_K:
                break

    all_web_lists.append(str(found_web_ids))

2. –ü–æ–∏—Å–∫ —Ä–µ–ª–µ–≤–∞–Ω—Ç–Ω—ã—Ö –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤ –¥–ª—è –∫–∞–∂–¥–æ–≥–æ –≤–æ–ø—Ä–æ—Å–∞...


–û–±—Ä–∞–±–æ—Ç–∫–∞ –≤–æ–ø—Ä–æ—Å–æ–≤: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6977/6977 [00:55<00:00, 124.63it/s]


In [7]:
print("3. –°–æ–∑–¥–∞–Ω–∏–µ —Ñ–∞–π–ª–∞ –¥–ª—è –æ—Ç–ø—Ä–∞–≤–∫–∏...")
submission_df = pd.DataFrame({
    'q_id': df_questions['q_id'],
    'web_list': all_web_lists
})

submission_df.to_csv(OUTPUT_SUBMISSION_FILE, index=False)
print(f"–§–∞–π–ª '{OUTPUT_SUBMISSION_FILE}' —É—Å–ø–µ—à–Ω–æ —Å–æ–∑–¥–∞–Ω.")

3. –°–æ–∑–¥–∞–Ω–∏–µ —Ñ–∞–π–ª–∞ –¥–ª—è –æ—Ç–ø—Ä–∞–≤–∫–∏...
–§–∞–π–ª 'submission.csv' —É—Å–ø–µ—à–Ω–æ —Å–æ–∑–¥–∞–Ω.
