In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import numpy as np
import torch
import ast
from lex_index import LexIndex, Chunk

# Notebook-level sentence-embedding model (multilingual paraphrase MiniLM checkpoint),
# instantiated once when this cell runs.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')



In [2]:
def cosine_to_unit(scores: torch.Tensor) -> torch.Tensor:
    """Linearly rescale scores so that -1 maps to 0 and +1 maps to 1.

    Args:
        scores: Tensor of similarity scores (cosine similarities at the call site).

    Returns:
        A tensor of the same shape holding ``(scores + 1) / 2``.
    """
    shifted = scores + 1.0
    return shifted / 2.0

def load_faq_data(csv_path: str) -> pd.DataFrame:
    """Read the FAQ table from CSV and decode its serialized embeddings.

    The ``embedding`` column is stored as text (a Python list literal per row);
    each cell is parsed and converted to a ``float32`` NumPy array.

    Args:
        csv_path: Path to the FAQ CSV file; it must contain an ``embedding`` column.

    Returns:
        The loaded DataFrame with ``embedding`` holding one array per row.
    """

    def _parse_vector(serialized):
        # literal_eval only accepts Python literals, so no arbitrary code is executed.
        return np.array(ast.literal_eval(serialized), dtype=np.float32)

    faq = pd.read_csv(csv_path)
    faq["embedding"] = faq["embedding"].apply(_parse_vector)
    return faq

def build_lex_index(df: pd.DataFrame) -> LexIndex:
    """Build a lexical index over the FAQ questions.

    Each row becomes one ``Chunk`` whose ``id`` is the row's position, whose
    ``text`` is the question, and whose ``meta`` carries the matching answer.

    Args:
        df: FAQ table with ``question`` and ``answer`` columns.

    Returns:
        A ``LexIndex`` constructed from the chunks, in row order.
    """
    # Walk the two columns in lockstep instead of calling df.iloc[i] per row:
    # iloc materializes a whole row Series (embedding included) for every question.
    questions = df["question"].tolist()
    answers = df["answer"].tolist()
    chunks = [
        Chunk(id=position, text=question, meta={"answer": answer})
        for position, (question, answer) in enumerate(zip(questions, answers))
    ]
    return LexIndex(chunks)

def search_faq(query: str, csv_path: str, w_embed: float, w_lex: float, top_k: int = 5):
    """Rank FAQ entries for ``query`` by a weighted blend of embedding and lexical scores.

    The hybrid score of each FAQ row is
    ``w_embed * cosine_to_unit(cosine_similarity) + w_lex * lexical_score``.

    Args:
        query: User question to search for.
        csv_path: Path to the FAQ CSV (read via ``load_faq_data`` on every call).
        w_embed: Weight applied to the rescaled cosine similarity.
        w_lex: Weight applied to the lexical score from ``LexIndex.score_all``.
        top_k: Maximum number of results to return.

    Returns:
        A list of at most ``top_k`` dicts, best match first, with the keys
        ``query``, ``answer``, ``lex_weight``, ``embed_weight``, ``lex_score``,
        ``embed_score`` and ``hybrid_score``. Empty if the FAQ table has no rows.
    """
    df = load_faq_data(csv_path)
    if df.empty:
        # np.stack cannot stack zero arrays, and there is nothing to rank anyway.
        return []

    lex_index = build_lex_index(df)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Reuse the notebook-level `model` (same checkpoint) rather than constructing a
    # new SentenceTransformer on every call; this function is invoked once per
    # (query, weight) pair, so reloading the model each time dominated the runtime.
    query_emb = model.encode([query], convert_to_tensor=True).to(device)
    all_embs = torch.tensor(np.stack(df["embedding"].values), device=device)

    # Cosine similarity of the query against every FAQ embedding, rescaled via
    # cosine_to_unit so that -1 maps to 0 and +1 maps to 1.
    cos_scores = util.cos_sim(query_emb, all_embs)[0]
    cos_unit = cosine_to_unit(cos_scores)

    # NOTE(review): the scale of score_all() is not visible here; the blend below
    # presumably expects it to be comparable to [0, 1] — confirm against LexIndex.
    lex_scores = torch.tensor(lex_index.score_all(query), dtype=torch.float32, device=device)

    hybrid = w_embed * cos_unit + w_lex * lex_scores

    # Slicing after the sort also copes with top_k larger than the number of rows.
    top_idx = torch.argsort(hybrid, descending=True)[:top_k].tolist()

    answers = df["answer"]
    results = []
    for idx in top_idx:
        results.append({
            "query": query,
            "answer": answers.iloc[idx],
            "lex_weight": w_lex,
            "embed_weight": w_embed,
            "lex_score": float(lex_scores[idx]),
            "embed_score": float(cos_unit[idx]),
            "hybrid_score": float(hybrid[idx]),
        })

    return results

In [6]:
def process_queries(queries_file: str, faq_csv: str, output_file: str,
                    lex_weights=(0.6, 0.5, 0.4, 0.3, 0.2, 0.1)):
    """Run every query through ``search_faq`` for a sweep of weights and save the results.

    For each non-blank line of ``queries_file`` and each lexical weight, the
    embedding weight is set to ``1 - w_lex`` and the returned rows are collected
    into one table written to ``output_file`` as CSV.

    Args:
        queries_file: UTF-8 text file with one query per line; blank lines are skipped.
        faq_csv: Path to the FAQ CSV passed on to ``search_faq``.
        output_file: Destination path of the results CSV.
        lex_weights: Lexical weights to sweep, in order. The default reproduces the
            original 0.6 → 0.1 sweep in steps of 0.1.
    """
    with open(queries_file, "r", encoding="utf-8") as f:
        stripped = [line.strip() for line in f]
    # Blank or whitespace-only lines are not queries; searching "" only adds noise rows.
    queries = [query for query in stripped if query]

    all_results = []

    for query in queries:
        print(query)

        # Explicit decimal weights instead of np.arange(0.6, 0, -0.1): arange with a
        # fractional step accumulates float error (e.g. 0.09999999999999998), and
        # those drifted values ended up in the weight columns of the output CSV.
        for w_lex in lex_weights:
            w_lex = float(w_lex)
            w_embed = round(1.0 - w_lex, 10)
            results = search_faq(query, faq_csv, w_embed=w_embed, w_lex=w_lex)
            all_results.extend(results)

    df_results = pd.DataFrame(all_results)
    df_results.to_csv(output_file, index=False)

    print(output_file)

In [7]:
# Run configuration: FAQ table to search, file with one test query per line,
# and the CSV that receives the collected search results.
# NOTE(review): FAQ_CSV_PATH is an absolute path inside one user's home directory,
# so this cell only runs on that machine — consider a path relative to the notebook
# or a configurable data directory.
FAQ_CSV_PATH = "/Users/vozamhcak/Desktop/delAI/KBC/bot_files/faq_data.csv"  
QUESTIONS_FILE = "questions_for_test.txt"  
OUTPUT_FILE = "faq_search_results.csv"  

# Search the FAQ for every query in QUESTIONS_FILE and write the results to OUTPUT_FILE;
# each query and finally the output path are printed as progress.
process_queries(QUESTIONS_FILE, FAQ_CSV_PATH, OUTPUT_FILE)

нас топят соседи сверху, что делать?




как вызвать сантехника?




как я могу сдать показания счетчиков?




когда будет капитальный ремонт




нет горячей воды, когда появится?




не работает ключ от домофона, что делать?




faq_search_results.csv
