In [1]:
!pip install sentence-transformers
!pip install faiss-cpu



In [2]:
from sentence_transformers import SentenceTransformer
import numpy as np
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import pickle
import os

  from .autonotebook import tqdm as notebook_tqdm


In [6]:



class MessageManager:
    """Message store with embedding-based near-duplicate detection.

    Each message is encoded with a SentenceTransformer model, L2-normalised
    and added to a flat FAISS L2 index. For unit-length vectors the squared
    L2 distance equals ``2 - 2 * cosine``, so the L2 nearest neighbour returned
    by FAISS is also the neighbour with the highest cosine similarity. Without
    the normalisation the two orderings can disagree and real duplicates are
    missed.

    Messages and their embeddings are persisted with pickle, the index with
    ``faiss.write_index``.
    """

    def __init__(self, model_name='paraphrase-multilingual-MiniLM-L12-v2', index_path='faiss.index', data_path='data.pkl'):
        """Load the model, create an empty index and restore saved data if present.

        Args:
            model_name: name passed to ``SentenceTransformer``.
            index_path: file used to persist the FAISS index.
            data_path: file used to persist messages and embeddings (pickle).
        """
        self.model = SentenceTransformer(model_name)
        self.dimension = self.model.get_sentence_embedding_dimension()
        self.messages = []
        self.embeddings = np.empty((0, self.dimension), dtype='float32')
        self.index_path = index_path
        self.data_path = data_path
        self.index = faiss.IndexFlatL2(self.dimension)
        self.load_data()

    @staticmethod
    def _normalize(vectors):
        """Return ``vectors`` as a C-contiguous float32 2-D array with unit-length rows."""
        vectors = np.atleast_2d(np.asarray(vectors, dtype='float32'))
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        # Leave all-zero rows untouched instead of dividing by zero.
        norms[norms == 0] = 1.0
        return np.ascontiguousarray(vectors / norms, dtype='float32')

    def _embed(self, message):
        """Encode a single message into a normalised array of shape (1, dimension)."""
        return self._normalize(self.model.encode([message]))

    def _rebuild_index(self):
        """Recreate the FAISS index from ``self.embeddings``."""
        self.index = faiss.IndexFlatL2(self.dimension)
        if len(self.embeddings):
            self.index.add(self.embeddings)

    def add_new_message(self, new_message):
        """Encode ``new_message`` and add it to the index and the in-memory store."""
        new_embedding = self._embed(new_message)
        self.index.add(new_embedding)
        self.messages.append(new_message)
        self.embeddings = np.vstack([self.embeddings, new_embedding])

    def is_similar(self, new_message, threshold=0.8):
        """Tell whether a stored message is close enough to ``new_message``.

        Args:
            new_message: text to compare against the stored messages.
            threshold: cosine similarity that must be exceeded.

        Returns:
            bool: True if the cosine similarity to the nearest stored message
            is greater than ``threshold``; False otherwise, including when
            nothing is stored yet.
        """
        if len(self.messages) == 0 or self.index.ntotal == 0:
            return False
        query = self._embed(new_message)

        # Nearest-neighbour search in FAISS.
        _, neighbours = self.index.search(query, k=1)
        nearest = int(neighbours[0][0])
        # FAISS reports -1 when it has no result; also protect against an index
        # that has more entries than the embedding matrix.
        if nearest < 0 or nearest >= len(self.embeddings):
            return False

        # Both vectors are unit length, so the dot product is the cosine similarity.
        similarity = float(np.dot(query[0], self.embeddings[nearest]))
        return similarity > threshold

    def save_data(self):
        """Write the FAISS index and the messages/embeddings to disk."""
        faiss.write_index(self.index, self.index_path)
        with open(self.data_path, 'wb') as f:
            pickle.dump({'messages': self.messages, 'embeddings': self.embeddings}, f)

    def load_data(self):
        """Restore messages, embeddings and index from disk when both files exist.

        Embeddings saved without normalisation are normalised here and the
        index is rebuilt from them, so old and new vectors stay comparable.
        The index is also rebuilt when its size does not match the stored data.
        """
        if os.path.exists(self.data_path) and os.path.exists(self.index_path):
            # SECURITY: pickle.load can execute arbitrary code; only load files
            # this notebook wrote itself.
            with open(self.data_path, 'rb') as f:
                data = pickle.load(f)
            self.messages = data['messages']
            stored = np.asarray(data['embeddings'], dtype='float32')
            self.embeddings = self._normalize(stored)

            loaded_index = faiss.read_index(self.index_path)
            already_unit = (
                stored.shape == self.embeddings.shape
                and np.allclose(stored, self.embeddings, atol=1e-5)
            )
            in_sync = loaded_index.ntotal == len(self.messages) == len(self.embeddings)
            if already_unit and in_sync:
                self.index = loaded_index
            else:
                self._rebuild_index()
            print("Дані завантажено успішно!")
        else:
            print("Файли даних не знайдено. Починаємо з порожньої бази.")

    def shutdown(self):
        """Persist the current state to disk."""
        self.save_data()


# Module-level manager used by process_message and the cells below. Constructing it
# loads the sentence-transformer model and restores saved data if the files exist.
manager = MessageManager()

def process_message(new_msg, threshold=0.8, message_manager=None):
    """Store ``new_msg`` unless a similar message is already known.

    Args:
        new_msg: text of the incoming message.
        threshold: similarity threshold forwarded to ``is_similar``.
        message_manager: object providing ``is_similar`` and ``add_new_message``;
            defaults to the module-level ``manager``.

    Returns:
        bool: True if a similar message was already stored (nothing is added),
        False if the message was new and has been added.
    """
    store = manager if message_manager is None else message_manager
    # Coerce to a plain bool so callers never receive a one-element array.
    similar = bool(store.is_similar(new_msg, threshold))
    if not similar:
        store.add_new_message(new_msg)
    return similar
 




Дані завантажено успішно!


In [7]:
test_messages = [
    "стіл",
    "Ще одне нове повідомлення",
    "Повідомлення про запуск нового продукту",
    "Схоже повідомлення"
]

# Collect the messages that were not flagged as similar to a stored one.
# process_message adds each such message to the manager as a side effect, so the
# result depends on what is already stored (including data restored from disk).
res_mes = [msg for msg in test_messages if not process_message(msg)]

# Persist the index and the stored messages to disk.
manager.shutdown()

In [8]:
# Show the messages that were accepted as new (not flagged as similar) by the loop above.
# NOTE(review): the saved output is [] - presumably the data restored from disk already
# contained these messages from an earlier run; confirm by running with no saved files.
res_mes

[]