In [17]:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex

# Build a Chroma-backed storage context on top of an ephemeral (non-persistent) client.
chroma_client = chromadb.EphemeralClient()
# get_or_create_collection keeps this cell re-runnable: create_collection raises
# when a collection named "quickstart" already exists in the client.
chroma_collection = chroma_client.get_or_create_collection("quickstart")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [4]:
# Construct the index directly from a freshly built index struct (id 'test')
# and the Chroma-backed storage context; no nodes are passed in here.
fresh_struct = VectorStoreIndex.index_struct_cls(index_id='test')
v = VectorStoreIndex(
    index_struct=fresh_struct,
    storage_context=storage_context,
    embed_model='skip',
)
# Retriever with default settings; later cells reach into its vector store.
retriever = v.as_retriever()

In [16]:

# Delete the collection behind the retriever's vector store: the object exposed
# as `client` supplies both the underlying `_client` and the collection `name`.
store_handle = retriever._vector_store.client
store_handle._client.delete_collection(store_handle.name)

In [19]:
# Re-point the vector store's private `_collection` attribute at the
# `chroma_collection` handle created in the Chroma setup cell.
# NOTE(review): the execution counts (setup In [17] ran after the delete In [16])
# suggest the collection was re-created before this line ran; in top-to-bottom
# order the handle would refer to the collection deleted just above — confirm.
retriever._vector_store._collection = chroma_collection

In [22]:
from llama_index.core.schema import MetadataMode, TextNode
# Write one hand-built node (text "test", fixed 4-dimensional embedding) straight
# into the vector store; the value returned by `add` is the cell output.
probe_node = TextNode(text="test", embedding=[1,0,0,0])
retriever._vector_store.add([probe_node])

['1ac4550a-a88c-4226-82e3-4804cab047d0']

In [None]:
import os
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore

# Open (or create) the on-disk Chroma databases `1_chroma` .. `4_chroma` and wrap
# each one's 'quickstart' collection in a ChromaVectorStore.
# The list is initialised here so the cell runs on a fresh kernel and re-running
# it does not append duplicate stores.
vector_stores = []
for i in range(1,5):
    db_path = f'{i}_chroma'
    chroma_client = chromadb.PersistentClient(path=db_path)
    chroma_collection = chroma_client.get_or_create_collection(name='quickstart')
    vector_stores.append(ChromaVectorStore(chroma_collection=chroma_collection))

### Visualize retrieval result

In [None]:
import os
import json
import matplotlib.pyplot as plt
from tqdm import tqdm

# Per-record statistics gathered from the JSONL cache file.
id_num = []          # one entry per line: number of distinct node ids
text_num = []        # one entry per line: number of distinct node texts
length_of_text = {}  # level -> list of summed text lengths (one entry per line containing that level)
file_path = './._cache/gpt-4o-batch-all-target_all-level_0_chroma.jsonl'
file_size = os.path.getsize(file_path)
# The progress total is a byte count, so the file is read in binary mode to make
# len(line) a byte count as well (in text mode it counts decoded characters and
# depends on the platform encoding). json.loads accepts bytes input directly.
with open(os.path.abspath(file_path), 'rb') as read_file:
    with tqdm(total=file_size, desc=f'Reading {os.path.basename(file_path)}', unit='B', unit_scale=True, unit_divisor=1024) as pbar:
        for line in read_file:
            data = json.loads(line)
            retrieved = data['retrieved_nodes']
            id_set = set()
            text_set = set()
            if isinstance(retrieved, dict):
                # Dict form: nodes are grouped under a level key.
                for level, nodes in retrieved.items():
                    for node in nodes:
                        id_set.add(node['id_'])
                        text_set.add(node['text'])

                    length = sum(len(node['text']) for node in nodes)
                    length_of_text.setdefault(level, []).append(length)
            else:
                # Non-dict form: iterate the nodes directly; no per-level length is recorded.
                for node in retrieved:
                    id_set.add(node['id_'])
                    text_set.add(node['text'])

            id_num.append(len(id_set))
            text_num.append(len(text_set))

            pbar.update(len(line))

# One box per series: distinct-id counts, distinct-text counts, then one
# text-length series for every level that was collected.
data_to_plot = [id_num, text_num, *length_of_text.values()]
labels = ['ID Numbers', 'Text Numbers']
labels += [f'Length of Text - {level}' for level in length_of_text]

# Draw the box plot on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
ax.boxplot(
    data_to_plot,
    labels=labels,
    patch_artist=True,
    boxprops=dict(facecolor='lightblue', color='blue'),
    medianprops=dict(color='red'),
)

# Annotate every box with the minimum of its series (box positions start at 1)
min_values = [min(series) for series in data_to_plot]
for position, min_val in enumerate(min_values, start=1):
    ax.text(position, min_val, f'{min_val}', color='blue', ha='center', va='bottom', fontsize=10)

# Title, axis label and a dashed horizontal grid
ax.set_title('Box Plot for ID Numbers, Text Numbers, and Length of Text', fontsize=14)
ax.set_ylabel('Values', fontsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Render the figure
plt.show()

Reading gpt-4o-batch-all-target_all-level_0_chroma.jsonl:   0%|          | 0.00/16.5G [00:00<?, ?B/s]

Reading gpt-4o-batch-all-target_all-level_0_chroma.jsonl:   5%|▍         | 842M/16.5G [00:13<03:50, 73.1MB/s] 

In [3]:
# Look up the position of the value 17320 via `.index` (the recorded output is 221).
# NOTE(review): the statistics cell above builds `length_of_text` as a dict
# (`length_of_text = {}`), and dicts have no `.index` method. This call presumably
# dates from when the name was bound to a list; confirm before re-running.
length_of_text.index(17320)

221

In [None]:
# Load a single record from the JSONL file by its 0-based line index.
line_number = 221
with open(file_path, 'r', encoding='utf-8') as file:
    for current_line_number, line in enumerate(file):
        if current_line_number == line_number:
            data = json.loads(line)  # the trailing newline is tolerated by json.loads
            break
    else:
        # Reached only when the file ends before `line_number`; failing loudly
        # avoids silently leaving `data` unset or bound to an earlier record.
        raise IndexError(f'{file_path} has no line {line_number}')

In [10]:
# For each level of the loaded record, print the summed length of its node texts.
for level, nodes in data['retrieved_nodes'].items():
    length = sum(len(node['text']) for node in nodes)
    print(length)

9167
4372
2793
2775


In [2]:
import os
import json
# Read only the first line of the retrieved-contexts file and parse it;
# the parsed record is the cell's displayed value.
contexts_path = os.path.abspath('./retrieved_contexts/gpt-4o-batch-all-target_one_retrieved_contexts.jsonl')
with open(contexts_path) as read_file:
    line = read_file.readline()
data = json.loads(line)
data

{'question_node_id': '3d751495-ce66-4254-a79f-ed8d740b9424',
 'question_id': 0,
 'retrieved_nodes_id': ['3d751495-ce66-4254-a79f-ed8d740b9424',
  'bb171619-265f-4f07-8261-558ed5301732',
  '9883acd0-621d-41cb-95d8-94353bd89de6',
  '95174e27-8fb4-4b7a-9c9f-0ddb3ab08a31',
  '71503056-c446-457a-91a4-7c1ab3ef6e97',
  '7b4b9a02-4867-4933-8edf-d16dcfb851b8',
  'f8f21811-b8d9-43bd-86d7-f5664ddcf251',
  'b61b9701-a5ab-41d9-8a10-7325d58a39e8',
  '13cdd47d-b0a8-414c-8410-cbe0deb6fb90',
  '65dd068d-b628-49f3-9433-92660d8bf648',
  'ed09992c-7852-4561-b8fb-16ed13ff9cfb',
  '67b520b7-0a4f-421d-9663-2da4f987f03d',
  'b668db5d-0e65-42fc-a3b5-38ae5ac659f1',
  '949d5ef3-d469-40a3-943f-fa7f8f31ef73',
  '3936d6a8-cd1b-43ae-9246-d620312cf7f1'],
 'retrieved_contexts': ['This research investigates the differences in water uptake rates among various soybean cultivars by analyzing their seed coat cutin structures. The findings show that cracked cuticles allow for easy water absorption while intact ones remain i

In [4]:
# Distinct node ids vs. distinct context strings in the loaded record; a smaller
# second number means at least two entries carry identical text.
id_set = {*data['retrieved_nodes_id']}
text_set = {*data['retrieved_contexts']}

print(len(id_set), len(text_set), sep='\n')

15
14


In [5]:
# Combined length of all retrieved context strings in the loaded record.
sum(map(len, data['retrieved_contexts']))

4530

In [12]:
# Walk the unfinished cache file to find how far it got: after the loop, `i` is
# the index of the last line and `data` is the last parsed record.
file_path = './._cache/gpt-4o-batch-all-target_all-level_1_chroma_not_finish.jsonl'
file_size = os.path.getsize(file_path)
# Binary mode so len(line) is a byte count matching the byte-based total;
# json.loads accepts bytes input directly.
with open(os.path.abspath(file_path), 'rb') as read_file:
    with tqdm(total=file_size, desc=f'Reading {os.path.basename(file_path)}', unit='B', unit_scale=True, unit_divisor=1024) as pbar:
        for i, line in enumerate(read_file):
            data = json.loads(line)
            # The bar was previously never advanced and stayed at 0% for the whole scan.
            pbar.update(len(line))
print(i)
print(data['question_node_id'])

Reading gpt-4o-batch-all-target_all-level_1_chroma_not_finish.jsonl:   0%|          | 0.00/5.49G [01:56<?, ?B/s]

290
f6439ca3-ecf0-45a6-ad07-1d8f7e852724



