**This holds the main flow of the application**

In [None]:
# includes all main imports from sub-directories
%load_ext autoreload
%autoreload 2

import os 
import sys 
import sqlite3
from pprint import pprint # text formatting 
from dotenv import load_dotenv  # from python-dotenv
import numpy as np
import pandas as pd

import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sentence_transformers import SentenceTransformer, SimilarityFunction
from openai import OpenAI
import torch

# The notebook is expected to be launched from the project root.
project_dir = os.getcwd()
print(project_dir)

# Project-relative directory holding the local helper modules that later cells import.
message_dir = "data/processing"

# Register the absolute path so imports keep working if the working directory
# changes later, and skip the append on re-runs so sys.path does not pile up
# duplicate entries.
message_path = os.path.join(project_dir, message_dir)
if message_path not in sys.path:
    sys.path.append(message_path)


## Chosen embedding approach : Asymmetric Semantic Search 

For the first MVP I will use asymmetric search instead of symmetric search because the queries and the text they should retrieve are not symmetrical. For example, a query like "SUBJECT_NAME : When was I really annoyed at this person?" maps to a longer block of text containing the relevant information. 

This contrasts with a query like "How to learn JavaScript" being matched against an entry such as "How to learn JavaScript on the web?", where the query and the entry have a similar form and the search would be symmetrical. 

Pre-Trained MS MARCO Models will be used for this first implementation. 
Specifically, models tuned with normalized embeddings will be used initially instead of models tuned with dot products. Normalized embeddings are more generalized, but dot-product models can be tried as an experiment later, as they are more dynamic and may pick up additional semantic information.

`SentenceTransformer.encode_query` and `SentenceTransformer.encode_document` are used specifically for encoding the query and the corpus, respectively. 

In [None]:
# Pre-trained multi-QA model tuned for cosine similarity (see the markdown above).
bert_model_name = "multi-qa-mpnet-base-cos-v1"  # produces 768-dimensional embeddings
embedder = SentenceTransformer(bert_model_name, similarity_fn_name=SimilarityFunction.COSINE)

# Read the embedding width from the loaded model instead of hard-coding it, so the
# FAISS indexes created later stay in sync if `bert_model_name` is changed.
current_dim = embedder.get_sentence_embedding_dimension()

# Smoke test: encode one sentence to confirm the model loads and returns vectors of
# the expected width. The list is not called `input` so the builtin is not shadowed.
smoke_test_sentences = ["This is a test input"]
embeddings = embedder.encode(smoke_test_sentences)
assert embeddings.shape == (len(smoke_test_sentences), current_dim)

## DATA LOADING

In [None]:
import file
from datetime import datetime

subject_phone = "9365539666"
subject_name = "Paris"
messages_per_subject = 4700

# Puts the subject's data into the text file; the returned rows are indexed below
# with [0] to read a timestamp string (presumably one row per message - confirm in `file`).
messages = file.addToTextFile(subject_phone, messages_per_subject, subject_name)

TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S'
MAX_GAP_MINUTES = 60 * 24 * 14  # two weeks, expressed in minutes like the gaps themselves

# Gap in minutes between each pair of consecutive messages.
time_differences = []
for idx in range(len(messages) - 1):  # every adjacent pair, including the final one
    first = datetime.strptime(messages[idx][0], TIMESTAMP_FORMAT)
    second = datetime.strptime(messages[idx + 1][0], TIMESTAMP_FORMAT)
    time_diff_minutes = abs((first - second).total_seconds() / 60)
    if time_diff_minutes > MAX_GAP_MINUTES:
        # Gaps longer than two weeks are skipped so they don't dilute the average.
        continue
    time_differences.append(time_diff_minutes)

# With fewer than two usable gaps the statistics below come out as NaN.
time_tensor = torch.tensor(time_differences)
mean = torch.mean(time_tensor)
standard_deviation = torch.std(time_tensor)

print("This is the average distance in minutes between two text messages : " + str(mean))
print("This is the standard deviation across all text messages : " + str(standard_deviation))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set style for better-looking plots
sns.set_style("whitegrid")

# Everything below needs at least one measured gap (min/max/percentiles fail on an empty list).
assert len(time_differences) > 0, "time_differences is empty; run the data loading cell first"

# Unwrap the scalar tensors once instead of calling .item() at every use.
mean_minutes = mean.item()
std_minutes = standard_deviation.item()
mean_label = f'Mean: {mean_minutes:.2f} min'

# Create figure with multiple subplots
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# 1 & 2. The same histogram twice: linear y-axis, then log y-axis so that sparsely
# populated bins remain visible next to the dominant ones.
histogram_panels = (
    (axes[0, 0], 'steelblue', 'Frequency', 'Distribution of Time Differences', False),
    (axes[0, 1], 'coral', 'Frequency (log scale)', 'Distribution with Log Scale', True),
)
for ax, color, ylabel, title, log_y in histogram_panels:
    ax.hist(time_differences, bins=50, edgecolor='black', color=color)
    ax.set_xlabel('Time Difference (minutes)', fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    if log_y:
        ax.set_yscale('log')
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.axvline(mean_minutes, color='red', linestyle='--', linewidth=2, label=mean_label)
    ax.legend()

# 3. Box plot to visualize outliers and quartiles
axes[1, 0].boxplot(time_differences, vert=True, patch_artist=True,
                    boxprops=dict(facecolor='lightgreen', alpha=0.7),
                    medianprops=dict(color='red', linewidth=2))
axes[1, 0].set_ylabel('Time Difference (minutes)', fontsize=12)
axes[1, 0].set_title('Box Plot: Outliers & Spread', fontsize=14, fontweight='bold')
axes[1, 0].grid(True, alpha=0.3)

# 4. Histogram on a log-scaled x-axis
# Filter out zeros to avoid log(0)
time_diffs_nonzero = [t for t in time_differences if t > 0]
# Use log-spaced bin edges so bars have equal width on the log axis; linearly spaced
# bins would be squeezed together at the right-hand end and stretched at the left.
if time_diffs_nonzero and min(time_diffs_nonzero) < max(time_diffs_nonzero):
    log_bins = np.logspace(np.log10(min(time_diffs_nonzero)),
                           np.log10(max(time_diffs_nonzero)), 51)  # 51 edges -> 50 bins
else:
    log_bins = 50  # degenerate data: fall back to the default binning
axes[1, 1].hist(time_diffs_nonzero, bins=log_bins, edgecolor='black', color='mediumpurple')
axes[1, 1].set_xlabel('Time Difference (minutes, log scale)', fontsize=12)
axes[1, 1].set_ylabel('Frequency', fontsize=12)
axes[1, 1].set_xscale('log')
axes[1, 1].set_title('Distribution with Log-Transformed X-axis', fontsize=14, fontweight='bold')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Calculate and display percentiles
percentiles = [25, 50, 75, 90, 95, 99]
# Guard the ratio: a mean of exactly 0 (all gaps zero) would raise ZeroDivisionError.
std_to_mean = std_minutes / mean_minutes if mean_minutes else float('nan')
shortest_gap = min(time_differences)
longest_gap = max(time_differences)

print("\n" + "="*50)
print("STATISTICAL SUMMARY")
print("="*50)
print(f"Mean:              {mean_minutes:.2f} minutes ({mean_minutes/60:.2f} hours)")
print(f"Std Deviation:     {std_minutes:.2f} minutes ({std_minutes/60:.2f} hours)")
print(f"Std/Mean Ratio:    {std_to_mean:.2f}x")
# len(time_differences) counts gaps between messages, not the messages themselves.
print(f"\nTotal intervals:   {len(time_differences)}")
print(f"Min time diff:     {shortest_gap:.2f} minutes")
print(f"Max time diff:     {longest_gap:.2f} minutes ({longest_gap/60:.2f} hours)")
print("\nPercentiles:")
for p in percentiles:
    val = np.percentile(time_differences, p)
    print(f"  {p}th percentile: {val:.2f} minutes ({val/60:.2f} hours)")
print("="*50)

In [None]:
import file

# Subject whose conversation is embedded, and how many messages to request.
subject_phone = "9365539666"
subject_name = "Paris"
messages_per_subject = 5000

# Puts the data into the text file; the return value is not needed in this cell.
file.addToTextFile(subject_phone, messages_per_subject, subject_name)

# How many sentences go into one embedding, and the matching multiplier used
# later when looking up indexes in the text file.
sentences_per_embedding = 1
index_multiplier = 1 * sentences_per_embedding

batch_data = file.getTextFile(sentences_per_embedding)

# One corpus entry per batch: the batch's sentences, each followed by a single space.
corpus = ["".join(sentence + " " for sentence in batch) for batch in batch_data]
print(f"number of text messages : {len(corpus)}")

In [None]:
# Encode every corpus entry with the document-side encoder.
np_embeddings = embedder.encode_document(corpus)
num_of_vectors = len(np_embeddings)

# Copy the result into a float32 tensor for the cells that follow.
embeddings = torch.tensor(np_embeddings, dtype=torch.float32)

print(embeddings.shape)
print(f"Numbers of embedding vectors {num_of_vectors}")

In [None]:
import faiss 

index_one = faiss.IndexFlatL2(current_dim) # per subject, euclidean distance
global_index = faiss.IndexFlatL2(current_dim) # includes all of the clusters

# FAISS's Python API is documented for C-contiguous float32 NumPy arrays, so convert
# explicitly rather than relying on a torch tensor being converted implicitly.
corpus_vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
if corpus_vectors.ndim != 2 or corpus_vectors.shape[1] != current_dim:
    raise ValueError(
        f"expected embeddings of shape (n_vectors, {current_dim}), got {corpus_vectors.shape}"
    )

index_one.add(corpus_vectors) # to add an embedding, shape : (n_vectors, current_dim)
# index.is_trained, for seeing if the index is trained
print("Number of vectors in FAISS : " + str((index_one.ntotal)))

In [None]:
query_one = "What time is the kickback?"
query_two = "all kickbacks around fall"

current_query = query_two

# Encode with the query-side encoder (the corpus uses the document-side one).
np_query_embedding = embedder.encode_query(current_query)

query_embedding = torch.tensor(np_query_embedding, dtype=torch.float32)
query_embedding = torch.unsqueeze(query_embedding, dim=0) # add a leading n_queries axis -> (1, dimension)
print(query_embedding.shape)

# Report the vector width (last axis); len() would return the n_queries axis, i.e. always 1.
print("Length of embedding " + str(query_embedding.shape[-1]))

k = 5
xq = query_embedding # shape : (n_queries, dimension)
# index.search finds the similar vectors in the FAISS DB. FAISS's Python API is
# documented for C-contiguous float32 NumPy input, so convert at the call site.
# I has shape : (number_of_queries, k), D has shape : (number_of_queries, k).
# For IndexFlatL2, D holds squared L2 distances.
D, I = index_one.search(np.ascontiguousarray(xq, dtype=np.float32), k)

In [None]:
print("Shape of Indices matrix : " + str(I.shape))
print("Shape of Distances matrix : " + str(D.shape))
print("ID of indices : " + str(I[0, :]) + "\n")

# torch.as_tensor accepts both a torch tensor and a NumPy array, whereas
# torch.from_numpy raises TypeError for a tensor. The conversion is loop-invariant,
# so it is done once here. Result shape: (dimension,).
query_tensor = torch.as_tensor(query_embedding, dtype=torch.float32).squeeze(0)

vec_ids = I[0, :].tolist()
for index in vec_ids:
    if index < 0:
        # FAISS pads the result with -1 when the index holds fewer than k vectors.
        continue
    sentence = file.getTextFileLine(index, index_multiplier)
    print(sentence)
    # Pull the stored vector back out of the index to score it against the query.
    vec = index_one.reconstruct(index)
    reconstructed_vec = torch.from_numpy(vec)
    similarity_score = embedder.similarity(query_tensor.unsqueeze(0), reconstructed_vec.unsqueeze(0))
    euclidean_distance = torch.sqrt(torch.sum((query_tensor - reconstructed_vec) ** 2))
    print("similarity score : " + str(similarity_score))
    print("euclidean distance score : " + str(euclidean_distance) + "\n")

In [None]:
# Sanity check outside FAISS: score one hand-picked message directly against the query.
# The query/document encoders are used (not the generic encode) so the vectors are
# produced the same way as the ones in the indexed search.
# NOTE: this rebinds the notebook-level `query_embedding` to the encoder's raw output;
# re-run the query cell before reusing it where a torch tensor is expected.
target_text = "They want to kickback on the Thursday after Halloweekend"
target_embedding = embedder.encode_document([target_text])
query_embedding = embedder.encode_query(["all kickbacks around fall"])
similarity = embedder.similarity(query_embedding, target_embedding)
print(f"Direct similarity: {similarity}")