In [None]:
!pip install sentence-transformers
!pip install faiss-cpu

In [142]:
import os
import pickle
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from datetime import datetime, timedelta

class MessageManager:
    """Store of messages, their timestamps and their sentence embeddings.

    Every message is encoded with a SentenceTransformer model and added to a
    FAISS ``IndexFlatL2``. The message texts, timestamps and raw embeddings are
    kept in parallel containers: position ``i`` in ``messages``, ``timestamps``
    and ``embeddings`` corresponds to id ``i`` in the FAISS index.

    State can be persisted with ``save_data`` (FAISS index file + pickle file)
    and is restored automatically by the constructor when both files exist.

    NOTE(review): the neighbour is selected by L2 distance but judged by cosine
    similarity; for non-normalised embeddings the L2-nearest vector is not
    necessarily the one with the highest cosine similarity — confirm this is
    acceptable for the model in use.
    """

    def __init__(self, model_name='paraphrase-multilingual-MiniLM-L12-v2', index_path='faiss.index', data_path='data.pkl'):
        """Load the encoder, create an empty index and restore saved state if present.

        Args:
            model_name: Name passed to ``SentenceTransformer``.
            index_path: File used to persist the FAISS index.
            data_path: File used to persist messages, timestamps and embeddings (pickle).
        """
        self.model = SentenceTransformer(model_name)
        # Query the embedding size once and reuse it everywhere below.
        self.dimension = self.model.get_sentence_embedding_dimension()
        self.messages = []
        self.timestamps = []
        self.embeddings = np.empty((0, self.dimension), dtype='float32')
        self.index_path = index_path
        self.data_path = data_path
        self.index = faiss.IndexFlatL2(self.dimension)
        self.load_data()

    def add_new_message(self, new_message, current_time):
        """Encode ``new_message`` and append it, with ``current_time``, to the store.

        Args:
            new_message: Message text to encode and remember.
            current_time: Timestamp stored alongside the message (kept as given).
        """
        new_embedding = self.model.encode([new_message]).astype('float32')
        self.index.add(new_embedding)
        self.messages.append(new_message)
        self.timestamps.append(current_time)
        # Keep the raw vectors too: is_similar() needs them for the cosine check.
        self.embeddings = np.vstack([self.embeddings, new_embedding])

    def is_similar(self, new_message, threshold=0.8):
        """Look up the nearest stored message and compare it with ``new_message``.

        Args:
            new_message: Message text to check.
            threshold: Cosine similarity that must be strictly exceeded for the
                messages to count as similar.

        Returns:
            A 2-tuple whose shape depends on the outcome:

            * ``(False, None)`` – the store holds no messages;
            * ``(None, None)`` – the index search reported no neighbour (id ``-1``);
            * ``((nearest_message, nearest_timestamp), flag)`` – otherwise, where
              ``flag`` is a plain ``bool`` telling whether the cosine similarity
              is greater than ``threshold``.
        """
        if len(self.messages) == 0:
            return False, None

        new_embedding = self.model.encode([new_message]).astype('float32')
        _, neighbour_ids = self.index.search(new_embedding, k=1)
        nearest_index = int(neighbour_ids[0][0])

        if nearest_index == -1:
            return None, None

        # Text and date of the nearest stored message.
        nearest_message = self.messages[nearest_index]
        nearest_timestamp = self.timestamps[nearest_index]

        existing_embedding = self.embeddings[nearest_index]
        query = new_embedding[0]
        norm_product = float(np.linalg.norm(query) * np.linalg.norm(existing_embedding))
        if norm_product == 0.0:
            # A zero vector has no direction; treat it as "not similar" instead
            # of dividing by zero and propagating NaN.
            similarity = 0.0
        else:
            similarity = float(np.dot(query, existing_embedding)) / norm_product

        return (nearest_message, nearest_timestamp), bool(similarity > threshold)

    def save_data(self):
        """Write the FAISS index to ``index_path`` and the metadata to ``data_path``."""
        faiss.write_index(self.index, self.index_path)
        with open(self.data_path, 'wb') as f:
            pickle.dump({'messages': self.messages, 'timestamps': self.timestamps, 'embeddings': self.embeddings}, f)

    def load_data(self):
        """Restore state from disk when both the data file and the index file exist.

        If the loaded index does not hold exactly one vector per stored
        embedding (the two files are written separately and can get out of
        step), the index is rebuilt from the stored embeddings so that index
        ids keep matching list positions.
        """
        if os.path.exists(self.data_path) and os.path.exists(self.index_path):
            with open(self.data_path, 'rb') as f:
                # SECURITY: pickle.load can execute arbitrary code contained in
                # the file. Only load data files written by save_data().
                data = pickle.load(f)
            self.messages = data.get('messages', [])
            self.timestamps = data.get('timestamps', [])
            self.embeddings = np.asarray(
                data.get('embeddings', np.empty((0, self.dimension), dtype='float32')),
                dtype='float32',
            )
            self.index = faiss.read_index(self.index_path)

            if self.index.ntotal != self.embeddings.shape[0]:
                # Index and metadata disagree: rebuild the index from the
                # embeddings so lookups by index id stay valid.
                self.index = faiss.IndexFlatL2(self.dimension)
                if self.embeddings.shape[0] > 0:
                    self.index.add(np.ascontiguousarray(self.embeddings))

    def shutdown(self):
        """Persist the current state to disk (delegates to ``save_data``)."""
        self.save_data()


# Module-level manager instance; rm_dublicates() below reads it as a global.
manager = MessageManager()
# NOTE(review): shutdown() only calls save_data(), and it runs here before any
# message has been added in this session, so it just re-writes whatever was
# loaded. Nothing visible in this notebook saves after de-duplication —
# confirm whether that is intended.
manager.shutdown()

 




In [143]:

# Width of the duplicate-detection window, in days.
day_range = 1

# Messages to de-duplicate. A preceding read of "table1.csv" was dropped: its
# result was overwritten by this load on the very next line, so it only cost
# I/O and could fail on a file that is never used.
# NOTE(review): absolute local path — consider a configurable data directory.
df_current = pd.read_csv("/Users/ulanagusar/Desktop/ML_week/current.csv")


# Timestamp layouts accepted by _parse_timestamp, tried in order.
_TIMESTAMP_FORMATS = ("%Y-%m-%d %H:%M:%S.%f", "%Y-%m-%d %H:%M:%S")


def _parse_timestamp(value):
    """Return ``value`` as a ``datetime``; accepts datetimes and formatted strings."""
    if isinstance(value, datetime):
        return value
    text = str(value).strip()
    for fmt in _TIMESTAMP_FORMATS:
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            continue
    raise ValueError(f"Unsupported timestamp format: {value!r}")


def rm_dublicates(df_current, day_range=1, message_manager=None):
    """Filter out messages that duplicate a recently stored, similar message.

    Rows are processed in order. A message is kept (and added to the manager)
    when the manager holds no messages, finds no neighbour, finds only a
    dissimilar neighbour, or finds a similar neighbour whose timestamp lies
    outside the window ``[message time - day_range days, message time]``.

    Args:
        df_current: DataFrame with the columns ``Message``, ``MessageDate`` and
            ``TelegramPostInfoID``.
        day_range: Size of the look-back window in days.
        message_manager: Object providing ``is_similar(message)`` and
            ``add_new_message(message, timestamp)``. Defaults to the
            module-level ``manager``.

    Returns:
        List of ``TelegramPostInfoID`` values of the messages that were kept.

    Raises:
        ValueError: If a timestamp that has to be compared cannot be parsed.
    """
    if message_manager is None:
        message_manager = manager

    rows = zip(
        df_current['Message'].to_list(),
        df_current['MessageDate'].to_list(),
        df_current['TelegramPostInfoID'].to_list(),
    )
    res_ids = []

    for new_msg, new_timestamp, new_id in rows:
        result, similar = message_manager.is_similar(new_msg)
        print(new_msg)

        if result is None:
            print("the nearest neighbor is not found - write ")
            keep = True
        elif result is False:
            print("the database is empty - write")
            keep = True
        elif not similar:
            print("no similar  - write ")
            keep = True
        else:
            print("There are similar ones - check the time")
            _, nearest_timestamp = result

            nearest_dt = _parse_timestamp(nearest_timestamp)
            # The window ends at the current message's time and starts
            # day_range days earlier.
            end_dt = _parse_timestamp(new_timestamp)
            start_dt = end_dt - timedelta(days=day_range)

            keep = not (start_dt <= nearest_dt <= end_dt)
            if keep:
                print("there are no similar ones in the specified time range - write ")
            else:
                print("similar record in the specified time range - don't write")

        if keep:
            message_manager.add_new_message(new_msg, new_timestamp)
            res_ids.append(new_id)

    return res_ids



In [144]:
# Pass the configured window explicitly so that editing `day_range` in the
# configuration cell actually takes effect (it was silently ignored before).
res_ids = rm_dublicates(df_current, day_range=day_range)

Мы с этим будем жить в ближайшие годы. Путин предупредил, что экономика России будет испытывать дефицит кадров и объяснил, как с этим бороться.
the database is empty - write
Проиранские формирования в Ираке заявили, что атаковали ракетами базу американских войск Айн аль-Асад в провинции Анбар, Ирак.
no similar  - write 
Президент США Джо Байден внезапно прервал свой отдых и возвращается в Белый дом для консультаций с командой по безопасности из-за событий на Ближнем Востоке
no similar  - write 
Президент прервал свой отдых и возвращается в Белый дом для консультаций с командой по безопасности
There are similar ones - check the time
similar record in the specified time range - don't write
Владимир Путин вместе с президентами из других стран возложил цветы к Могиле Неизвестного Солдата 
no similar  - write 
 Путин возложил цветы к Могиле Неизвестного Солдата 
There are similar ones - check the time
there are no similar ones in the specified time range - write 
 я прийшла вас порадувати 


In [145]:
# TelegramPostInfoID values of the messages kept by rm_dublicates().
res_ids

[1, 3, 5, 7, 8, 9, 10]