In [None]:
!pip install faiss-cpu
!pip install sentence-transformers
!pip install numpy
!pip install tensorflow==2.12.0 keras==2.12.0


In [7]:
import json

def preprocess_news(file_path, output_path):
    """
    Convert raw tab-separated news data into a JSON file.

    Every input line is stripped and split on tab characters. A line that
    yields exactly three fields (label, title, content) is stored as
    "label: title. content" under its zero-based line number (as a string).
    Any other line is skipped, so the keys may contain gaps.

    :param file_path: Path to the raw news file (read as UTF-8)
    :param output_path: Path of the JSON file to write (UTF-8, non-ASCII kept as-is)
    """
    records = {}
    with open(file_path, 'r', encoding='utf-8') as source:
        for line_no, raw_line in enumerate(source):
            fields = raw_line.strip().split('\t')
            if len(fields) != 3:
                # Not a label/title/content triple: leave it out of the output.
                continue
            label, title, content = fields
            records[str(line_no)] = f"{label}: {title}. {content}"

    serialized = json.dumps(records, ensure_ascii=False, indent=4)
    with open(output_path, 'w', encoding='utf-8') as sink:
        sink.write(serialized)
    print(f"Preprocessing complete, saved to: {output_path}")

if __name__ == "__main__":
    # Raw tab-separated news in, id -> text JSON out.
    file_path, output_path = '../data/news.txt', 'data.json'
    preprocess_news(file_path, output_path)


Preprocessing complete, saved to: data.json


In [8]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import json

def build_index(data_path, model_name, index_path, meta_path="metadata.json"):
    """
    Build and persist a FAISS semantic-search index for the news data.

    The JSON file at ``data_path`` is expected to map document ids to texts.
    Each text is embedded with the given sentence-transformers model and added
    to a flat L2 index. The ids are written to ``meta_path`` in the same order
    as the vectors, so row ``i`` of the index corresponds to entry ``i`` of the
    metadata list.

    :param data_path: Path to the news data in JSON format (id -> text)
    :param model_name: Hugging Face model name
    :param index_path: Path where the FAISS index is saved
    :param meta_path: Path where the list of ids is saved as JSON
        (defaults to "metadata.json", the location used previously)
    :raises ValueError: if the data file contains no documents
    """
    with open(data_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Fail early and clearly: with no texts there is no embedding dimension
    # to build the index from, and loading the model would be wasted work.
    if not data:
        raise ValueError(f"No documents found in {data_path}; nothing to index")

    ids = list(data.keys())
    texts = list(data.values())

    model = SentenceTransformer(model_name)
    print("Start generating vectors...")
    # FAISS works on C-contiguous float32 matrices; coerce explicitly so the
    # index does not depend on the encoder's output dtype or memory layout.
    embeddings = np.ascontiguousarray(
        model.encode(texts, show_progress_bar=True), dtype=np.float32
    )

    index = faiss.IndexFlatL2(embeddings.shape[1])  # L2 distance
    index.add(embeddings)
    print(f"Index construction is complete with {index.ntotal} vectors")

    faiss.write_index(index, index_path)
    with open(meta_path, "w", encoding="utf-8") as meta_file:
        json.dump(ids, meta_file)
    print("Indexes and metadata are saved")

if __name__ == "__main__":
    # Embed the preprocessed articles and persist the resulting index to disk.
    data_path, index_path = 'data.json', "vector_index.faiss"
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    build_index(data_path, model_name, index_path)


Start generating vectors...


Batches: 100%|██████████| 313/313 [01:23<00:00,  3.77it/s]

Index construction is complete with 10000 vectors
Indexes and metadata are saved





In [None]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import json

def search_query(query, model_name, index_path, meta_path, distance_threshold=1.0, top_k=None):
    """
    Semantic search with filtering by a distance threshold.

    The query is embedded with the given sentence-transformers model and
    compared against the vectors of the FAISS index. Hits whose distance is
    strictly below ``distance_threshold`` are printed and returned, in the
    order FAISS reports them.

    Note: for an IndexFlatL2 index FAISS reports *squared* L2 distances, so
    the threshold is compared on that squared scale.

    :param query: Query text
    :param model_name: Hugging Face model name (should be the model the index was built with)
    :param index_path: Path to the FAISS index file
    :param meta_path: Path to the metadata file (JSON list of ids, one per indexed vector)
    :param distance_threshold: Distance threshold for filtering
    :param top_k: Optional cap on the number of nearest neighbours examined;
        ``None`` (default) examines every vector in the index
    :return: List of ``{"id": ..., "distance": float}`` dicts
    :raises ValueError: if ``top_k`` is not positive, or if the metadata length
        does not match the number of vectors in the index
    """
    if top_k is not None and top_k < 1:
        raise ValueError(f"top_k must be a positive integer, got {top_k!r}")

    index = faiss.read_index(index_path)
    with open(meta_path, "r", encoding="utf-8") as meta_file:
        metadata = json.load(meta_file)

    # Row i of the index is labelled with metadata[i]; a length mismatch means
    # the two files come from different builds and hits would be mislabelled.
    if len(metadata) != index.ntotal:
        raise ValueError(
            f"Metadata has {len(metadata)} entries but the index holds "
            f"{index.ntotal} vectors"
        )

    results = []
    if index.ntotal > 0:  # an empty index cannot be searched (k would be 0)
        model = SentenceTransformer(model_name)
        query_vec = np.ascontiguousarray(model.encode([query]), dtype=np.float32)

        k = index.ntotal if top_k is None else min(top_k, index.ntotal)
        distances, indices = index.search(query_vec, k)
        for distance, position in zip(distances[0], indices[0]):
            # FAISS pads missing neighbours with label -1; indexing metadata
            # with it would silently return the last id.
            if position < 0:
                continue
            if distance < distance_threshold:
                results.append({
                    "id": metadata[int(position)],
                    # Plain float keeps the result JSON-serialisable.
                    "distance": float(distance),
                })

    print(f"query: {query}")
    if results:
        print("Search results:")
        for result in results:
            print(f"ID: {result['id']}, Distance: {result['distance']:.4f}")
    else:
        print("No matching results were found")
    return results

if __name__ == "__main__":
    # Artifacts written by the indexing step, plus the model used to embed the query.
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    index_path, meta_path = "vector_index.faiss", "metadata.json"

    query = input("Enter request text: ")
    search_query(query, model_name, index_path, meta_path, distance_threshold=0.9)


Запрос: Исторические события в Китае
Результаты поиска:
ID: 9100, Distance: 0.8356
ID: 3995, Distance: 0.8398
ID: 1416, Distance: 0.8679
ID: 2231, Distance: 0.8701
ID: 6690, Distance: 0.8715
ID: 1862, Distance: 0.8733
ID: 874, Distance: 0.8886
ID: 676, Distance: 0.8891
ID: 8272, Distance: 0.8958
ID: 4923, Distance: 0.8976
