**This holds the main flow of the application**

In [None]:
# includes all main imports from sub-directories
%load_ext autoreload
%autoreload 2

import os 
import sys 
import sqlite3
from pprint import pprint # text formatting 
from dotenv import load_dotenv  # from python-dotenv
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sentence_transformers import SentenceTransformer
from openai import OpenAI
import torch

# Load variables from a local .env file (if one exists) into the process
# environment so GPT_API_KEY is visible to os.getenv below. Without this call
# the key is only found when it was exported in the shell that started Jupyter.
# By default load_dotenv does not override variables that are already set.
load_dotenv()

# OpenAI client for the hosted embedding models (currently only referenced by
# commented-out code further down).
client = OpenAI(api_key=os.getenv("GPT_API_KEY"))

# Directory the kernel is running in; the relative paths below resolve against it.
project_dir = os.getcwd()
print(project_dir)

# Make modules under data/processing importable (presumably where the `file`
# helper imported later lives - TODO confirm).
# Guarded so that re-running this cell does not append the same entry again.
message_dir = "data/processing"
if message_dir not in sys.path:
    sys.path.append(message_dir)


## Chosen embedding approach : Asymmetric Semantic Search 

For the first MVP I will be using this instead of symmetric search because the queries themselves are not symmetrical. For example, a query like "SUBJECT_NAME : When was I really annoyed at this person?" is short, but it maps to a longer block of text containing the relevant information. 

This contrasts with a query like "How to learn Javascript" that is matched against an entry such as "How to learn JavaScript on the web?", where the query and the entry have similar length and content, so the search would be symmetrical. 

Pre-Trained MS MARCO Models will be used for this first implementation. 
Specifically, models tuned with normalized embeddings will be used initially instead of models tuned with dot products. Normalized embeddings are more generalized, but dot-product models can be tried as an experiment later, as they are more dynamic and may pick up additional semantic information.

`SentenceTransformer.encode_query` and `SentenceTransformer.encode_document` are used specifically for encoding the query and the corpus, respectively. 

In [None]:
# OpenAI hosted embedding models, kept for later experiments (not used yet).
large_embedding_model = "text-embedding-3-large"  # dim-size : 3072
small_embedding_model = "text-embedding-3-small"  # dim-size : 1536

# Local MS MARCO sentence-transformers model tuned for cosine similarity.
bert_model_name = "msmarco-MiniLM-L6-cos-v5"  # dim-size : 384
embedder = SentenceTransformer(bert_model_name)

# Ask the loaded model for its embedding width instead of hard-coding it, so
# the FAISS index dimension stays correct if bert_model_name is changed.
current_dim = embedder.get_sentence_embedding_dimension()

# Smoke test: encode one sentence and show the vector. Dedicated names are used
# so the builtin `input` is not shadowed and the corpus `embeddings` created
# later is not confused with this sample.
sample_texts = ["This is a test input"]
sample_embeddings = embedder.encode(sample_texts)
print(sample_embeddings)

# Equivalent request against the OpenAI API, left disabled. Note that the
# OpenAI vector sizes above differ from current_dim, so the FAISS index
# dimension would have to change with it.
# response = client.embeddings.create(
#     input=sample_texts,
#     model=small_embedding_model,
# )
# print(response.data[0].embedding)


In [None]:
# List the working directory so the relative paths used by the following cells
# (e.g. "data.txt") can be checked by eye. `os` comes from the import cell at
# the top of the notebook; no chdir is needed because the kernel is already in
# its working directory.
print(os.listdir('.'))

In [None]:
file_name = "data.txt"
print(f"File name: {file_name}")

# sentences_per_embedding is assigned further down, in the data-loading
# section, so on a fresh kernel it does not exist yet when this cell runs.
# Look it up defensively instead of failing with NameError.
print(f"Sentences per embedding: {globals().get('sentences_per_embedding', 'not set yet')}")

# Also check the raw file content:
if os.path.exists(file_name):
    with open(file_name, 'r') as f:
        lines = f.readlines()
    print(f"Total lines in file: {len(lines)}")
    for i, line in enumerate(lines):
        print(f"Line {i}: '{line.strip()}'")
else:
    # Nothing to preview yet; report it rather than leaving a traceback in the notebook.
    lines = []
    print(f"{file_name} not found in {os.getcwd()}")

## DATA LOADING

In [None]:
# Project helper module (not the stdlib); presumably resolved through the
# data/processing entry added to sys.path in the first cell - TODO confirm.
import file

# How many messages to pull, and for whom. The identifiers are left blank here.
messages_per_subject = 100
subject_name = ""
subject_phone = ""

# Puts the subject's message data into the text file.
file.addToTextFile(subject_phone, messages_per_subject)

In [None]:
sentences_per_embedding = 2  # sentences per embedding
# Used later when looking up indexes in the text file (passed to
# file.getTextFileLine together with a vector id).
index_multiplier = sentences_per_embedding

# Each batch is iterated as a sequence of sentence strings
# (presumably sentences_per_embedding of them per batch - TODO confirm in `file`).
batch_data = file.getTextFile(sentences_per_embedding)

corpus = []

for batch in batch_data:
  # Concatenate the batch into one corpus entry, each sentence followed by a
  # single space. Building the string fresh per batch avoids reading
  # `sentences` before it has ever been assigned, which raised NameError on
  # the first iteration of a fresh kernel.
  sentences = "".join(sentence + " " for sentence in batch)
  corpus.append(sentences)
  print(sentences)

# (author's note) conn.close() closes the connection to the database;
# no `conn` object is created in this cell.

In [None]:
# Encode every corpus entry with the model's document-side encoder; the search
# query is encoded separately with encode_query in a later cell.
np_embeddings = embedder.encode_document(corpus) 

In [None]:
# Copy the encoded corpus into a float32 torch tensor and report its size.
num_of_vectors = len(np_embeddings)
embeddings = torch.tensor(np_embeddings, dtype=torch.float32)

print(embeddings.shape)
print(f"Numbers of embedding vectors {num_of_vectors}")

In [None]:
query = "Did someone's family ever get killed?"

# Encode the question with the query-side encoder (the corpus uses encode_document).
np_query_embedding = embedder.encode_query(query)

query_embedding = torch.tensor(np_query_embedding, dtype=torch.float32)
# Add a leading axis of size 1 so the tensor is laid out as (n_queries, dim),
# the layout the FAISS search cell expects.
query_embedding = torch.unsqueeze(query_embedding, dim=0)
print(query_embedding.shape)

# Report the embedding width (last axis). len() on the batched tensor would
# return the number of queries (1), not the length of the embedding.
print("Length of embedding " + str(query_embedding.shape[-1]))

In [None]:
import faiss

# Exact (brute-force) L2 indexes over vectors of width current_dim.
index_one = faiss.IndexFlatL2(current_dim)  # per subject
global_index = faiss.IndexFlatL2(current_dim)  # includes all of the clusters

# FAISS's Python API is documented for float32 NumPy arrays, and older wrappers
# reject torch tensors, so hand it the tensor's NumPy view.
index_one.add(embeddings.numpy())  # to add embeddings, shape : (n_vectors, current_dim)
# index.is_trained, for seeing if the index is trained

In [None]:
# How many vectors the per-subject index currently holds.
print(f"Number of vectors in FAISS : {index_one.ntotal}")

In [None]:
k = 5  # number of nearest neighbours to retrieve per query
# FAISS's Python API is documented for float32 NumPy arrays, so pass the query
# tensor's NumPy view rather than the torch tensor itself.
xq = query_embedding.numpy()  # shape : (n_queries, dimension)
# index.search finds the similar vectors in the FAISS DB
# I has shape : (number_of_queries, k), D has shape : (number_of_queries, k)
D, I = index_one.search(xq, k)

In [None]:
# Inspect the search result: matrix shapes, then the ids returned for the first query.
print(f"Shape of Indices matrix : {I.shape}")
print(f"Shape of Distances matrix : {D.shape}")
print(f"ID of indices : {I[0, :]}")

In [None]:
# Row 0 of I holds the neighbour ids for the first (and only) query;
# .tolist() converts the NumPy integers to plain Python ints.
vec_ids = I[0, :].tolist()
for vec_id in vec_ids:
  # FAISS fills missing results with -1 when the index holds fewer than k
  # vectors; those are not real ids, so skip them instead of looking them up.
  if vec_id < 0:
    continue
  sentence = file.getTextFileLine(vec_id, index_multiplier)
  print(sentence)