# Build any Vector Store with any Embedding Model

**Tools:**

1. LangChain: a standardized way to implement (set up, create, and query) multiple vector stores
2. Vector Stores supported:
    1. Chroma
3. Embedding Models supported:
    1. HuggingFace

**References:**

1. [LangChain-Chroma](https://python.langchain.com/docs/integrations/vectorstores/chroma/)

In [1]:
import os
import sys
import warnings

import pandas as pd

from tqdm import tqdm
from uuid import uuid4

from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

from langchain_core.documents import Document

# Working directory of the notebook kernel; the other cells build paths from it.
notebook_dir = os.getcwd()
# Make the project-local modules in the parent directory importable.
# The path is normalised and only appended when missing, so re-running this
# cell does not keep stacking duplicate entries onto sys.path.
parent_dir = os.path.abspath(os.path.join(notebook_dir, os.pardir))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

import log_files
from data_processing import DataProcessing
from vector_stores import ChromaVectorStore, VectorStoreDirector

In [2]:
pd.set_option('max_colwidth', 800)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
warnings.simplefilter(action='ignore', category=Warning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

## Load Data

In [3]:
# Root data folder, expressed relative to the notebook's working directory.
data_rel_path = '../data/'
base_path = os.path.join(notebook_dir, data_rel_path)

### Select Dataset

In [4]:
# Dataset switch: True loads a sample of the Financial PhraseBank CSV,
# False loads the observation batches through DataProcessing.
financial_phrasebank = False
# Number of leading Financial PhraseBank rows kept when that dataset is selected.
FINANCIAL_SAMPLE_SIZE = 40

if financial_phrasebank:
    financial_full_path = os.path.join(base_path, 'financial_phrase_bank/all_data-adjusted_header.csv')
    # Undecodable bytes in the CSV are dropped instead of raising.
    financial_df = pd.read_csv(financial_full_path, encoding_errors='ignore')

    # NOTE(review): this adds a lowercase 'domain' column, while the observation
    # data shown below uses 'Domain' -- confirm which spelling downstream code expects.
    financial_df['domain'] = 'financial'
    df = financial_df.head(FINANCIAL_SAMPLE_SIZE)
    collection_name = "prediction_collection-real_data-financial_phrase_bank"
    persist_directory = "../data/chroma/chroma_langchain_db-financial_phrase_bank"
else:
    df = DataProcessing.load_multiple_batches(notebook_dir, sep=',', data_type='observation')
    # NOTE(review): "oberservations" is misspelled, but the strings are left
    # untouched because they presumably name a collection/directory that already
    # exists on disk -- rename only together with the stored data.
    collection_name = "prediction_collection-synthetic_data-oberservations"
    persist_directory = "../data/chroma/chroma_langchain_db-oberservations"

# Preview the first rows of whichever dataset was selected.
df.head(3)

Loading: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/predictions_with_rag/../data/observation_logs/batch_1-observation/batch_1-from_df.csv
Loading: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/predictions_with_rag/../data/observation_logs/batch_2-observation/batch_2-from_df.csv
Loading: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/predictions_with_rag/../data/observation_logs/batch_3-observation/batch_3-from_df.csv
Loading: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/predictions_with_rag/../data/observation_logs/batch_4-observation/batch_4-from_df.csv
Loading: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/predictions_with_rag/../data/observation_logs/batch_5-observation/batch_5-from_df.csv
✓ Loaded batch 5
Loading: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/u

Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,JPMorgan Chase observed that the net profit at Amazon had remained stable in Q2 2026.,0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,1
1,"On 08/20/2024 to 08/20/2025, Bank of America speculated the operating income at Microsoft changed.",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,2
2,"Citigroup noted on 2024-08-20, the research and development expenses at Alphabet fell.",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,3


## Vector Store

In [7]:
# Positional arguments for the builder: collection name, persist directory,
# and the 'Base Sentence' column name (presumably the column holding the
# document text -- confirm against ChromaVectorStore).
builder_args = (collection_name, persist_directory, 'Base Sentence')
chroma_builder = ChromaVectorStore(*builder_args)
# Leave the builder as the cell's last expression so its state is displayed.
chroma_builder

	Collection Name: prediction_collection-synthetic_data-oberservations
	Persist Directory: ../data/chroma/chroma_langchain_db-oberservations
	Vector Store: None
	Docments: []
	UUIDS: None
	Embedding Model: None


<vector_stores.ChromaVectorStore at 0x3553c4810>

In [8]:
# Name of the embedding backend handed to the director.
embedding_model_name = "Hugging Face"
# Wrap the builder in a director and run the construction on the selected data.
chroma_director = VectorStoreDirector(builder=chroma_builder)
chroma_director.construct(embedding_model_name, df)

### BUILDER ###
	<vector_stores.ChromaVectorStore object at 0x3553c4810>
### EMBEDDING MODEL ###
	Hugging Face
### INITIALIZE VECTOR STORE ###
	Collection Name: prediction_collection-synthetic_data-oberservations
	Embedding Model: model_name='sentence-transformers/all-mpnet-base-v2' cache_folder=None model_kwargs={} encode_kwargs={} query_encode_kwargs={} multi_process=False show_progress=False
	Persist Directory: ../data/chroma/chroma_langchain_db-oberservations
	Vector Store (Original): <langchain_chroma.vectorstores.Chroma object at 0x350483650>
	Vector Store (Prediction's Wrapper): <vector_stores.ChromaVectorStore object at 0x3553c4810>
### BUILD DOCUMENT ###
	Metadata Columns: ['Sentence Label', 'Domain', 'Model Name', 'API Name', 'Batch ID', 'Template Number']


1112it [00:00, 62383.85it/s]

	UUIDS (N = D): 1112
	Documents (D) 1112
	Documents (D) 0
### ADD DOCUMENTS TO VECTOR STORE ###





	Documents added: <langchain_chroma.vectorstores.Chroma object at 0x350483650>
