In [1]:
# Installs
# %pip install pandas
# %pip install pyarrow
# %pip install fsspec
# %pip install huggingface-hub
# %pip install matplotlib
# %pip install tqdm
# %pip install ipywidgets
# %pip install fastapi uvicorn chromadb requests
# %pip install "uvicorn[standard]"

In [2]:
# !ollama pull nomic-embed-text

In [3]:
# tqdm.auto picks the notebook widget when ipywidgets is available and falls
# back to a plain-text bar otherwise, so this cell also works in a bare kernel.
from tqdm.auto import tqdm

# Registers `progress_apply` on pandas objects (used when computing embeddings).
tqdm.pandas()

In [4]:
from pathlib import Path

import pandas as pd

# Local cache of the raw passages, and the upstream copy it is downloaded from.
LOCAL_FNAME = "./LOCAL/raw.parquet"
REMOTE_FNAME = "hf://datasets/rag-datasets/rag-mini-wikipedia/data/passages.parquet/part.0.parquet"

raw_path = Path(LOCAL_FNAME)
if raw_path.exists():
    # An explicit existence check (instead of a bare `except:`) lets real read
    # errors, such as a corrupt file, surface instead of triggering a re-download.
    df = pd.read_parquet(raw_path)
else:
    print("File not found, downloading...")
    df = pd.read_parquet(REMOTE_FNAME)
    # to_parquet does not create parent directories, so make ./LOCAL on first run.
    raw_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(raw_path)

print(df.shape)

(3200, 1)


In [6]:
# len(df['passage'][0])
# df['passage'].apply(lambda x: len(x)).hist()

In [7]:
# Show the text of the first passage as a sanity check on the loaded data.
df["passage"].iloc[0]

'Uruguay (official full name in  ; pron.  , Eastern Republic of  Uruguay) is a country located in the southeastern part of South America.  It is home to 3.3 million people, of which 1.7 million live in the capital Montevideo and its metropolitan area.'

In [8]:
import requests

def get_embedding(text, model='nomic-embed-text',
                  url='http://localhost:11434/api/embeddings', timeout=120):
    """Request an embedding vector for ``text`` from an HTTP embeddings endpoint.

    Args:
        text: The text to embed; sent as the ``prompt`` field of the request.
        model: Name of the embedding model; sent as the ``model`` field.
        url: Endpoint to POST to. Defaults to the local server on port 11434.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely on a stalled server.

    Returns:
        The value of the ``embedding`` field of the JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    payload = {
        "model": model,
        "prompt": text
    }
    response = requests.post(url, json=payload, timeout=timeout)
    response.raise_for_status()
    return response.json()['embedding']

# Example usage: embed the first passage. The resulting `embedding` is reused
# further down as the query vector.
embedding = get_embedding(df['passage'].iloc[0])

In [9]:
from pathlib import Path

# Cache of the passages together with their embeddings.
PROCESSED_FNAME = Path("./LOCAL/processed.parquet")

if PROCESSED_FNAME.exists():
    # Reuse the cached embeddings. A real read error surfaces here instead of
    # silently triggering a full re-embedding, as the bare `except:` used to do.
    df = pd.read_parquet(PROCESSED_FNAME)
else:
    # One HTTP request per passage; progress_apply shows a progress bar.
    df['embedding'] = df['passage'].progress_apply(get_embedding)
    PROCESSED_FNAME.parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(PROCESSED_FNAME)

In [None]:
import chromadb

# On-disk Chroma store; data written here survives kernel restarts.
client = chromadb.PersistentClient(path="./LOCAL/chroma_db")

# get_or_create (not create) so re-running the cell does not fail once the
# collection already exists on disk.
collection = client.get_or_create_collection("chunks")

# Upsert vectors: unlike add, re-running overwrites existing ids instead of
# attempting to insert duplicates.
collection.upsert(
    # Embeddings read back from parquet may be array-like rather than plain
    # lists; normalise to lists of Python floats, which Chroma accepts either way.
    embeddings=[[float(x) for x in vec] for vec in df["embedding"]],
    documents=df["passage"].to_list(),
    ids=[str(val) for val in df.index],
)

In [11]:
# Query: fetch the 5 stored entries nearest to the example `embedding`
# computed earlier in the notebook.
results = collection.query(n_results=5, query_embeddings=embedding)

In [None]:
# // Example using fetch in React
# async function queryChroma(query) {
#   const res = await fetch('http://localhost:8000/query', {
#     method: 'POST',
#     headers: { 'Content-Type': 'application/json' },
#     body: JSON.stringify({ query }),
#   });
#   const data = await res.json();
#   return data; // contains the Chroma results
# }

In [None]:
import chromadb
from chromadb.config import Settings

# Settings(persist_directory=...) on its own does not switch persistence on in
# chromadb >= 0.4, so the original client kept everything in memory.
# PersistentClient is the supported way to get an on-disk store; the path is
# the same one used by the rest of the notebook.
client = chromadb.PersistentClient(path="./LOCAL/chroma_db")
collection = client.get_or_create_collection("test_collection")

# Single dummy record to check that writes work.
collection.add(
    embeddings=[[0.1, 0.2, 0.3]],
    documents=["test document"],
    ids=["1"]
)

In [None]:
import chromadb
from chromadb.config import Settings

# The storage directory is a property of the client, not of a collection:
# get_or_create_collection has no `persist_directory` argument, so the original
# call raised TypeError.
# NOTE(review): assumes "LOCAL/chroma" (the directory created at the end of the
# notebook) is the intended location rather than "chroma" -- confirm.
client = chromadb.PersistentClient(path="LOCAL/chroma")
collection = client.get_or_create_collection("test_collection")

In [None]:
import chromadb

# Open the on-disk Chroma store under ./LOCAL/chroma_db.
client = chromadb.PersistentClient(path="./LOCAL/chroma_db")

# Fetch the scratch collection, creating it on first use.
collection = client.get_or_create_collection("test_collection")

# Insert one dummy record: id, document text and a 3-dimensional vector.
collection.add(
    ids=["1"],
    documents=["test document"],
    embeddings=[[0.1, 0.2, 0.3]],
)

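# Save to disk
# No explicit save step is needed: PersistentClient writes changes to disk
# automatically (client.persist() was removed in chromadb 0.4).
# client.persist()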


In [None]:
# Inspect the active client configuration through the public accessor rather
# than the private `_settings` attribute, which not every client version has.
print(client.get_settings())

In [None]:
from pathlib import Path

# Create LOCAL/chroma, including LOCAL itself if missing. Unlike `!mkdir`, this
# does not fail when the directory already exists and does not depend on the shell.
Path("LOCAL/chroma").mkdir(parents=True, exist_ok=True)