# Install chromadb and openai

In [1]:
!pip install chromadb openai

Collecting chromadb
  Downloading chromadb-0.4.24-py3-none-any.whl (525 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m525.5/525.5 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openai
  Downloading openai-1.13.3-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.4/227.4 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Collecting chroma-hnswlib==0.7.3 (from chromadb)
  Downloading chroma_hnswlib-0.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.110.0-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.1/92.1 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting uvicorn[standard]>=0.18.3 (from chromadb)
  Downloading uvicorn-0.27.1-py3-none-any.whl (6

# Configure Client

In [3]:
import chromadb
from chromadb.config import Settings


# Older Chroma configuration style, kept for reference only:
#   chromadb.Client(Settings(chroma_db_impl="duckdb+parquet", persist_directory="db/"))
#
# Current style: a persistent client pointed at a local directory.
persist_path = "db/"
client = chromadb.PersistentClient(path=persist_path)

# Create Collection

In [4]:
# The client persists its data on disk, so "Students" may already exist from a
# previous run; get_or_create_collection keeps this cell safe to re-run, whereas
# create_collection raises when the name is already taken.
collection = client.get_or_create_collection(name="Students")

# Text to add to our collection

In [6]:
# Sample documents that are embedded and stored in the collections below.
# The literal text (including leading/trailing newlines) is what gets embedded,
# so it is left exactly as written.

# Profile of a single student.
student_info = """
Alexandra Thompson, a 19-year-old computer science sophomore with a 3.7 GPA,
is a member of the programming and chess clubs who enjoys pizza, swimming, and hiking
in her free time in hopes of working at a tech company after graduating from the University of Washington.
"""

# Description of the university chess club.
club_info = """
The university chess club provides an outlet for students to come together and enjoy playing
the classic strategy game of chess. Members of all skill levels are welcome, from beginners learning
the rules to experienced tournament players. The club typically meets a few times per week to play casual games,
participate in tournaments, analyze famous chess matches, and improve members' skills.
"""

# Overview of the university. Note: unlike the two strings above, this one has
# no trailing newline before the closing quotes.
university_info = """
The University of Washington, founded in 1861 in Seattle, is a public research university
with over 45,000 students across three campuses in Seattle, Tacoma, and Bothell.
As the flagship institution of the six public universities in Washington state,
UW encompasses over 500 buildings and 20 million square feet of space,
including one of the largest library systems in the world."""

# Convert the documents into embeddings and store them in the “Students” collection
* No embedding function is specified here, so Chroma uses its default model, all-MiniLM-L6-v2.


In [7]:
# One (id, source label, text) entry per document to store.
records = [
    ("id1", "student info", student_info),
    ("id2", "club info", club_info),
    ("id3", "university info", university_info),
]

# Chroma takes parallel lists, so split the records column by column.
collection.add(
    ids=[record_id for record_id, _, _ in records],
    metadatas=[{"source": label} for _, label, _ in records],
    documents=[text for _, _, text in records],
)

/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:01<00:00, 64.8MiB/s]


# Query our embeddings

In [8]:
# Ask for the two stored documents that best match the question.
question = "What is the student name?"
results = collection.query(query_texts=[question], n_results=2)

# Display the raw result dictionary (ids, distances, metadatas, documents).
results

{'ids': [['id1', 'id2']],
 'distances': [[1.2946666564424738, 1.3954030668049473]],
 'metadatas': [[{'source': 'student info'}, {'source': 'club info'}]],
 'embeddings': None,
 'documents': [['\nAlexandra Thompson, a 19-year-old computer science sophomore with a 3.7 GPA,\nis a member of the programming and chess clubs who enjoys pizza, swimming, and hiking\nin her free time in hopes of working at a tech company after graduating from the University of Washington.\n',
   "\nThe university chess club provides an outlet for students to come together and enjoy playing\nthe classic strategy game of chess. Members of all skill levels are welcome, from beginners learning\nthe rules to experienced tournament players. The club typically meets a few times per week to play casual games,\nparticipate in tournaments, analyze famous chess matches, and improve members' skills.\n"]],
 'uris': None,
 'data': None}

# Use the OpenAI API for Text Embeddings

In [10]:
import os

from chromadb.utils import embedding_functions

# Read the key from the environment so a real secret is never saved in the notebook.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this cell."
    )

# Embedding function backed by the OpenAI embeddings API.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=openai_api_key,
    model_name="text-embedding-ada-002",
)

# Calling the embedding function directly returns one embedding per input text.
students_embeddings = openai_ef([student_info, club_info, university_info])
print(students_embeddings)

[[-0.012598248198628426, 0.008161698468029499, 0.012378361076116562, -0.04327899217605591, -0.0012109968811273575, 0.024381620809435844, -0.03037031553685665, 0.0016483463114127517, -0.009571564383804798, -0.020540060475468636, 0.0286888238042593, -0.015198091976344585, -0.0037607194390147924, -0.01623285561800003, 0.01931127719581127, -0.022932950407266617, 0.01628459431231022, -0.019220735877752304, 0.01470657903701067, -0.02416173368692398, 0.0067712352611124516, -0.008853697218000889, 0.00743736419826746, -0.013542470522224903, -0.015327437780797482, -0.006341161206364632, 0.01866455003619194, -0.01796608418226242, 0.018121300265192986, -0.004911893978714943, 0.03939862921833992, -0.015456782653927803, -0.016051772981882095, -0.015301568433642387, -0.022570783272385597, -0.011136644519865513, -0.0024996011052280664, 0.004278101027011871, -0.0011099458206444979, 0.007288617081940174, 0.013555404730141163, 0.0005727579118683934, -0.006428469438105822, 0.00897657498717308, -0.00425546

In [13]:
# Second collection whose embeddings come from the OpenAI embedding function
# instead of Chroma's default model. get_or_create keeps the cell re-runnable.
collection2 = client.get_or_create_collection(
    name="Students2",
    embedding_function=openai_ef,
)

# upsert rather than add: add does not overwrite records whose ids already
# exist, so after a later cell has updated "id1" a re-run would keep the
# modified text. upsert restores the original three documents every time.
collection2.upsert(
    documents=[student_info, club_info, university_info],
    metadatas=[
        {"source": "student info"},
        {"source": "club info"},
        {"source": "university info"},
    ],
    ids=["id1", "id2", "id3"],
)



# Query using OpenAI embeddings

In [14]:
# Same question as before, now against the OpenAI-embedded collection.
query_args = {
    "query_texts": ["What is the student name?"],
    "n_results": 2,
}
results = collection2.query(**query_args)

results

{'ids': [['id1', 'id3']],
 'distances': [[0.43987091718661614, 0.5072687660763332]],
 'metadatas': [[{'source': 'student info'}, {'source': 'university info'}]],
 'embeddings': None,
 'documents': [['\nAlexandra Thompson, a 19-year-old computer science sophomore with a 3.7 GPA,\nis a member of the programming and chess clubs who enjoys pizza, swimming, and hiking\nin her free time in hopes of working at a tech company after graduating from the University of Washington.\n',
   '\nThe University of Washington, founded in 1861 in Seattle, is a public research university\nwith over 45,000 students across three campuses in Seattle, Tacoma, and Bothell.\nAs the flagship institution of the six public universities in Washington state,\nUW encompasses over 500 buildings and 20 million square feet of space,\nincluding one of the largest library systems in the world.']],
 'uris': None,
 'data': None}

# Update embedding data

In [15]:
# Replace the document stored under "id1"; its source label is unchanged.
new_student_text = "Kristiane Carina, a 19-year-old computer science sophomore with a 3.7 GPA"

collection2.update(
    ids=["id1"],
    metadatas=[{"source": "student info"}],
    documents=[new_student_text],
)

# Query the updated data

In [16]:
# Re-run the question to confirm that "id1" now returns the updated text.
student_name_question = ["What is the student name?"]
results = collection2.query(n_results=2, query_texts=student_name_question)

results

{'ids': [['id1', 'id3']],
 'distances': [[0.37525246564009884, 0.5072687660763332]],
 'metadatas': [[{'source': 'student info'}, {'source': 'university info'}]],
 'embeddings': None,
 'documents': [['Kristiane Carina, a 19-year-old computer science sophomore with a 3.7 GPA',
   '\nThe University of Washington, founded in 1861 in Seattle, is a public research university\nwith over 45,000 students across three campuses in Seattle, Tacoma, and Bothell.\nAs the flagship institution of the six public universities in Washington state,\nUW encompasses over 500 buildings and 20 million square feet of space,\nincluding one of the largest library systems in the world.']],
 'uris': None,
 'data': None}