# Top K with Vector Stores

Tools:
1. LangChain: standardize way to implement (set up, create, and query) multiple vector stores
2. Vector Stores:
    1. Chroma
3. Embedding Models
    1. HuggingFace

[LangChain-Chroma](https://python.langchain.com/docs/integrations/vectorstores/chroma/)

In [1]:
import os
import sys

import pandas as pd

from tqdm import tqdm
from uuid import uuid4

from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

from langchain_core.documents import Document

# Get the current working directory of the notebook
notebook_dir = os.getcwd()
# Add the parent directory to the system path
sys.path.append(os.path.join(notebook_dir, '../'))

import log_files
from data_processing import DataProcessing

In [2]:
pd.set_option('max_colwidth', 800)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Load Prediction and Observations Data

In [3]:
log_file_path = "data/prediction_logs"
predictions = True
predictions_df = log_files.read_data(notebook_dir, log_file_path, predictions)

log_file_path = "data/observation_logs"
predictions = False
observations_df = log_files.read_data(notebook_dir, log_file_path, predictions)

col_name = 'Base Sentence'
predictions = DataProcessing.df_to_list(predictions_df, col_name)
observations = DataProcessing.df_to_list(observations_df, col_name)
len(predictions), len(observations)

Start logging batch
log_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/notebook_experiments/../data/prediction_logs
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/notebook_experiments/../data/prediction_logs/batch_1-prediction
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/notebook_experiments/../data/prediction_logs/batch_1-prediction/batch_1-from_df.csv
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/notebook_experiments/../data/prediction_logs/batch_2-prediction
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/notebook_experiments/../data/prediction_logs/batch_2-prediction/batch_2-from_df.csv
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/pr

(705, 1903)

## Embedding Model(s)

In [4]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

#

## Create Vector Store: Collect Name + Connect Embedding Model(s)

- Collection [store your embeddings, documents, and any additional metadata.][[Getting Started with Chroma](https://docs.trychroma.com/docs/overview/getting-started)]
- Collections index your embeddings and documents, and enable efficient retrieval and filtering.

In [5]:
vector_store = Chroma(
    collection_name="prediction_collection",
    embedding_function=embeddings,
)

## Create a Chroma Client

In [6]:
import chromadb

client = chromadb.Client()

## Add Observations to Vector Store

In [7]:
# observations_df.loc

In [8]:
documents = []

for index, row in tqdm(observations_df.iterrows()):
    # print(f"""Index: {index},
    #       Name: {row['Base Sentence']},
    #       Sentence Label: {row['Sentence Label']},
    #       Domain: {row['Domain']},
    #       Model Name: {row['Model Name']},
    #       API Name: {row['API Name']},
    #       Template Number: {row['Template Number']}
    #       """)
    idx = index
    base_sentence = row['Base Sentence']
    sentence_label = row['Sentence Label']
    domain = row['Domain']
    model_name = row['Model Name']
    api_name = row['API Name']
    template_number = row['Template Number']

    document = Document(
        page_content=base_sentence,
        metadata={"sentence_label": sentence_label,
                  "domain": domain,
                  "model_name": model_name,
                  "api_name": api_name,
                  "template_number": template_number
                  },
        id=idx,)
    
    documents.append(document)

# documents

1903it [00:00, 72789.75it/s]


In [9]:
uuids = [str(uuid4()) for _ in range(len(documents))]
# uuids

## Query Vector Store

In [10]:
predictions[:3]

['JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.',
 'On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.',
 'Citigroup predicts on 2024-08-21, the operating income at Alphabet may rise.']

In [None]:
for idx, prediction in enumerate(predictions):
    print(idx, prediction)
    print("-------Similarity-------")
    results = vector_store.similarity_search(
        prediction,
        k=3,
    )
    for res in results:
        print(f"* {res.page_content} [{res.metadata}]")

    print()
    print("-------Similarity with score-------")
    results = vector_store.similarity_search_with_score(
        prediction, k=1,
    )
    for res, score in results:
        print(f"* [SIM={score:3f}] {res.page_content} [{res.metadata}]")

    print()
    print("-------Similarity by vector-------")
    results = vector_store.similarity_search_by_vector(
        embedding=embeddings.embed_query(prediction), k=3
    )
    for doc in results:
        print(f"* {doc.page_content} [{doc.metadata}]")

    print()
    print("-------Retriever-------")
    retriever = vector_store.as_retriever(
        search_type="mmr", search_kwargs={"k": 3, "fetch_k": 5}
    )
    retriever.invoke(prediction)

0 JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.
-------Similarity-------

-------Similarity with score-------

-------Similarity by vector-------

-------Retriever-------
1 On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.
-------Similarity-------

-------Similarity with score-------

-------Similarity by vector-------

-------Retriever-------
2 Citigroup predicts on 2024-08-21, the operating income at Alphabet may rise.
-------Similarity-------

-------Similarity with score-------

-------Similarity by vector-------

-------Retriever-------
