# Load Vector Store 

In [1]:
import os
import sys
import pprint

import pandas as pd

from tqdm import tqdm

# Anchor all relative paths on the directory the kernel was started in.
notebook_dir = os.getcwd()

# Put the parent of the working directory on the import path so that the
# project-local modules imported right after this (data_processing,
# vector_stores, ...) can be resolved -- presumably they live one level up.
project_root = os.path.join(notebook_dir, '../')
sys.path.append(project_root)

from data_processing import DataProcessing
from prediction_properties import PredictionProperties
from text_generation_models import TextGenerationModelFactory
from vector_stores import ChromaVectorStore, VectorStoreDirector

In [2]:
# Display configuration for the wide, text-heavy frames shown in this notebook:
# allow long cell contents (up to 800 characters) and never hide columns.
# Alternatives tried earlier: cap 'display.max_columns' at 40, or set
# 'display.max_rows' to None to show every row.
display_options = {
    'max_colwidth': 800,
    'display.max_columns': None,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

## Load Data

In [3]:
# Data lives in a sibling `data` directory, resolved relative to the
# notebook's working directory.
base_data_path = os.path.join(notebook_dir, '../data')

# CSV of prediction sentences together with their extracted properties
# (Source / Target / Date / Outcome columns, as shown in the preview below).
extract_prediction_properties_path = os.path.join(
    base_data_path,
    "extract_prediction_properties/extracted_pps-v1.csv",
)

df = DataProcessing.load_from_file(
    extract_prediction_properties_path,
    'csv',
    sep=',',
)
df.head(3)

Unnamed: 0,Prediction Sentence,Raw Response,Model Name,No Property,Source,Target,Date,Outcome
0,Professor Thompson forecasts that the graduation rate at Harvard University will drop in 2027.,"{0: [""forecasts"", ""that"", ""the"", ""graduation"", ""rate"", ""at"", ""will"", ""in""], 1: [""Professor Thompson""], 2: [""Harvard University""], 3: [""2027""], 4: [""drop""]}",openai/gpt-oss-120b,"forecasts, that, the, graduation, rate, at, will, in",Professor Thompson,Harvard University,2027,drop
1,"Economist Dr. Sarah Lee predicts on 12/31/2027, the consumer confidence index may rise.","{0: [""predicts"", ""on"", ""the""], 1: [""Economist Dr. Sarah Lee""], 2: [""consumer confidence index""], 3: [""12/31/2027""], 4: [""may rise""]}",openai/gpt-oss-120b,"predicts, on, the",Economist Dr. Sarah Lee,consumer confidence index,12/31/2027,may rise
2,"According to a fitness expert, the nutritional intake at community centers would fall in 21 August 2024.","{0: [""According"", ""to"", ""a"", ""the"", ""in""], 1: [""fitness expert""], 2: [""nutritional intake at community centers""], 3: [""21 August 2024""], 4: [""would fall""]}",openai/gpt-oss-120b,"According, to, a, the, in",fitness expert,nutritional intake at community centers,21 August 2024,would fall


## Load Vector Store

In [4]:
# NOTE: "oberservations" is the spelling used by the persisted collection and
# its directory (the load log below reports documents under exactly these
# names), so these strings must not be "corrected" here.
collection_name = "prediction_collection-synthetic_data-oberservations"
persist_directory = "../data/chroma/chroma_langchain_db-oberservations"

# Wrapper around the persisted Chroma collection; the third argument names the
# text column ('Prediction Sentence') the wrapper works with.
chroma_loader = ChromaVectorStore(collection_name, persist_directory, 'Prediction Sentence')

chroma_director = VectorStoreDirector(loader=chroma_loader)
embedding_model_name = "Hugging Face"

# Passed to `chroma_director.query` as the top-k setting -- presumably the
# number of hits returned per query (the log prints a "TOP K" step).
k = 33

# Extracted prediction properties that make up the query text, in order.
query_property_columns = ['Target', 'Date', 'Outcome']

query_results = []
for idx, row in df.iterrows():
    # Build "<target> <date> <outcome>", skipping properties that are missing.
    # An empty CSV field is read back as NaN, and formatting NaN directly would
    # inject the literal word "nan" into the text that gets embedded.
    # Rows with all three properties present produce exactly the same string
    # as plain f-string interpolation would. If every property is missing the
    # query string is empty.
    query_parts = [
        str(row[column])
        for column in query_property_columns
        if pd.notna(row[column])
    ]
    query_string = " ".join(query_parts)

    query_result = chroma_director.query(embedding_model_name, query_string, k, query_search_type="similarity_with_score")
    query_results.append(query_result)

# Record provenance of the retrieval next to the results themselves.
df['Collection Name'] = collection_name
df['Persist Directory'] = persist_directory
df['Query Results'] = query_results


	Collection Name: prediction_collection-synthetic_data-oberservations
	Persist Directory: ../data/chroma/chroma_langchain_db-oberservations
	Vector Store: None
	Docments: []
	UUIDS: None
	Embedding Model: None
### LOADER ###
	<vector_stores.ChromaVectorStore object at 0x160dbf2d0>
### INITIALIZE CLIENT VECTOR STORE ###
	Vector Store (Prediction's Wrapper): None
### LOAD EMBEDDING MODEL ###
	Hugging Face
### LOAD VECTOR STORE ###
	Collection Name: prediction_collection-synthetic_data-oberservations
	Embedding Model: model_name='sentence-transformers/all-mpnet-base-v2' cache_folder=None model_kwargs={} encode_kwargs={} query_encode_kwargs={} multi_process=False show_progress=False
	Persist Directory: ../data/chroma/chroma_langchain_db-oberservations
	Vector Store (Original): <langchain_chroma.vectorstores.Chroma object at 0x32365e290>
	Vector Store (Prediction's Wrapper): <vector_stores.ChromaVectorStore object at 0x160dbf2d0>
	Documents (D) 1112
### TOP K ###
	2. Similarity with score
	

In [10]:
# For every prediction, keep only the text ('page_content') of each retrieved
# document, dropping metadata and scores. `sentences` ends up with one list of
# retrieved sentences per row of `df`, in row order.
sentences = []

for position, (idx, row) in enumerate(df.iterrows()):
    prediction_sentence = row['Prediction Sentence']
    # Use a dedicated name here: `query_results` is the notebook-level list of
    # all raw query results built in the query cell and must not be rebound to
    # a single row's hits.
    row_hits = row['Query Results']['similarity_with_score']
    new_row_sentences = [hit['page_content'] for hit in row_hits]

    # Preview the first row only. Gate on position rather than on the index
    # label so the preview still works if `df` does not carry a 0-based
    # integer index.
    if position < 1:
        print(f"Sentence: {prediction_sentence}\n\t{new_row_sentences}")
    sentences.append(new_row_sentences)

Sentence: Professor Thompson forecasts that the graduation rate at Harvard University will drop in 2027.
	['In 2029 of Q3, Professor James Davis envisioned that the graduation rates at Harvard University decreased.', 'In Aug 21, 2024, Professor Michael Davis envisioned that the graduation rates at Harvard University decreased.', 'According to Professor Alice Lee, the graduation rate at Harvard University rose in June 2022.', 'According to Professor Katherine Lee, the average GPA at Harvard University rose in 2022.', 'In 2029 of Q3, Professor Michael Brown envisioned that the graduation rates at Stanford University decreased.', 'In 2024, Professor James Wilson envisioned that the graduation rates at the University decreased.', 'In 2029 of Q3, Professor James Davis envisioned that the graduation rates at Stanford University decreased.', 'College student, Emily Patel, noted on January 10, 2023, the price of textbooks at the campus bookstore fell.', 'Professor John Taylor noted that the av

In [11]:
# Store the retrieved sentence texts (one list per prediction) alongside the
# raw query results, then preview the first rows.
sentences_column = 'Query Results: Sentences'
df[sentences_column] = sentences
df.head(3)

Unnamed: 0,Prediction Sentence,Raw Response,Model Name,No Property,Source,Target,Date,Outcome,Collection Name,Persist Directory,Query Results,Query Results: Sentences
0,Professor Thompson forecasts that the graduation rate at Harvard University will drop in 2027.,"{0: [""forecasts"", ""that"", ""the"", ""graduation"", ""rate"", ""at"", ""will"", ""in""], 1: [""Professor Thompson""], 2: [""Harvard University""], 3: [""2027""], 4: [""drop""]}",openai/gpt-oss-120b,"forecasts, that, the, graduation, rate, at, will, in",Professor Thompson,Harvard University,2027,drop,prediction_collection-synthetic_data-oberservations,../data/chroma/chroma_langchain_db-oberservations,"{'similarity_with_score': [{'page_content': 'In 2029 of Q3, Professor James Davis envisioned that the graduation rates at Harvard University decreased.', 'metadata': {'Template Number': 5, 'Sentence Label': 0, 'API Name': 'NAVI_GATOR', 'Model Name': 'llama-3.1-8b-instruct', 'Domain': 'miscellaneous', 'Batch ID': 0}, 'score': 0.8186123371124268}, {'page_content': 'In Aug 21, 2024, Professor Michael Davis envisioned that the graduation rates at Harvard University decreased.', 'metadata': {'Sentence Label': 0, 'Template Number': 5, 'Batch ID': 0, 'API Name': 'NAVI_GATOR', 'Model Name': 'llama-3.1-8b-instruct', 'Domain': 'miscellaneous'}, 'score': 0.8256052732467651}, {'page_content': 'According to Professor Alice Lee, the graduation rate at Harvard University rose in June 2022.', 'metadat...","[In 2029 of Q3, Professor James Davis envisioned that the graduation rates at Harvard University decreased., In Aug 21, 2024, Professor Michael Davis envisioned that the graduation rates at Harvard University decreased., According to Professor Alice Lee, the graduation rate at Harvard University rose in June 2022., According to Professor Katherine Lee, the average GPA at Harvard University rose in 2022., In 2029 of Q3, Professor Michael Brown envisioned that the graduation rates at Stanford University decreased., In 2024, Professor James Wilson envisioned that the graduation rates at the University decreased., In 2029 of Q3, Professor James Davis envisioned that the graduation rates 
at Stanford University decreased., College student, Emily Patel, noted on January 10, 2023, the price of..."
1,"Economist Dr. Sarah Lee predicts on 12/31/2027, the consumer confidence index may rise.","{0: [""predicts"", ""on"", ""the""], 1: [""Economist Dr. Sarah Lee""], 2: [""consumer confidence index""], 3: [""12/31/2027""], 4: [""may rise""]}",openai/gpt-oss-120b,"predicts, on, the",Economist Dr. Sarah Lee,consumer confidence index,12/31/2027,may rise,prediction_collection-synthetic_data-oberservations,../data/chroma/chroma_langchain_db-oberservations,"{'similarity_with_score': [{'page_content': 'On 11/10/2022, a market researcher at Global Trends observed that the consumer confidence index in the retail sector had shifted.', 'metadata': {'API Name': 'NAVI_GATOR', 'Model Name': 'mistral-small-3.1', 'Batch ID': 0, 'Domain': 'miscellaneous', 'Sentence Label': 0, 'Template Number': 2}, 'score': 0.6180521249771118}, {'page_content': 'In 2027, the National Institute of Economic Research envisioned that consumer confidence decreased.', 'metadata': {'Sentence Label': 0, 'Model Name': 'gemma2-9b-it', 'Domain': 'policy', 'Template Number': 5, 'Batch ID': 0, 'API Name': 'GROQ_CLOUD'}, 'score': 0.82148277759552}, {'page_content': 'The Cato Institute noted on 03/15/2028, the consumer price index fell.', 'metadata': {'Template Number': 3, 'API Na...","[On 11/10/2022, a market researcher at Global Trends observed that the consumer confidence index in the retail sector had shifted., In 2027, the National Institute of Economic Research envisioned that consumer confidence decreased., The Cato Institute noted on 03/15/2028, the consumer price index fell., In 09/2026, a financial advisor envisioned that the inflation rate at the US economy decreased., According to the Federal Reserve, the inflation rate in the US rose in Q2 2024., In June 2027, a financial researcher envisioned that the inflation rate at the Federal Reserve decreased., According to the National Bureau of Economic Research, the inflation rate in the United States rose in Q4 2035., The Cato Institute noted on 11/15/2029, 
the inflation rate fell., On 2027-03-10, the Congre..."
2,"According to a fitness expert, the nutritional intake at community centers would fall in 21 August 2024.","{0: [""According"", ""to"", ""a"", ""the"", ""in""], 1: [""fitness expert""], 2: [""nutritional intake at community centers""], 3: [""21 August 2024""], 4: [""would fall""]}",openai/gpt-oss-120b,"According, to, a, the, in",fitness expert,nutritional intake at community centers,21 August 2024,would fall,prediction_collection-synthetic_data-oberservations,../data/chroma/chroma_langchain_db-oberservations,"{'similarity_with_score': [{'page_content': 'Dr. Linda Green noted on 03/20/2021, the nutritional intake at local schools fell.', 'metadata': {'Template Number': 3, 'Sentence Label': 0, 'API Name': 'NAVI_GATOR', 'Domain': 'health', 'Model Name': 'mistral-small-3.1', 'Batch ID': 0}, 'score': 0.6335760354995728}, {'page_content': 'On 11th of October 2025, the World Health Organization monitored the obesity rates at rural high schools changed.', 'metadata': {'Sentence Label': 0, 'Model Name': 'llama-3.3-70b-versatile', 'Domain': 'health', 'API Name': 'GROQ_CLOUD', 'Template Number': 2, 'Batch ID': 0}, 'score': 0.7807698249816895}, {'page_content': 'On November 15, 2023, a health researcher monitored that the average body mass index at rural high schools changed.', 'metadata': {'Domain': '...","[Dr. 
Linda Green noted on 03/20/2021, the nutritional intake at local schools fell., On 11th of October 2025, the World Health Organization monitored the obesity rates at rural high schools changed., On November 15, 2023, a health researcher monitored that the average body mass index at rural high schools changed., On 1st January 2026, the Centers for Disease Control and Prevention (CDC) monitored the obesity rates in urban areas changed., In the summer of 2027, I envisioned that the rate of childhood obesity would decrease in urban areas., The average daily intake of fruits and vegetables increased in my patients in 2029, according to my records., In the fourth quarter of 2029, Professor Patel envisioned that the daily caloric intake at college cafeterias decreased., On 11th of Octob..."


In [12]:
# Persist the predictions together with their retrieved matches as CSV under
# <data>/rag/. The save helper reports a file number and writes a "-v<N>"
# suffixed file (see the output below), so re-running this cell appears to
# create a new version rather than overwrite -- confirm in DataProcessing.
synthetic_dataset_path = "rag/"
synthetic_dataset_full_path = os.path.join(base_data_path, synthetic_dataset_path)
output_file_stem = 'match_p_o-synthetic_dataset'
DataProcessing.save_to_file(
    df,
    synthetic_dataset_full_path,
    output_file_stem,
    'csv',
)

Using file number: 3
Saving CSV file to: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/notebook_experiments/../data/rag/match_p_o-synthetic_dataset-v3.csv
