In [1]:
'''
# 17/Aug/2023
# References:
# https://www.sbert.net/index.html
# https://trychroma.com

# Installation
$ pip install -U sentence-transformers
$ pip install chromadb

'''

'\n# 17/Aug/2023\n# References:\n# https://www.sbert.net/index.html\n# https://trychroma.com\n\n# Installation\n$ pip install -U sentence-transformers\n$ pip install chromadb\n\n'

In [2]:
import os

In [3]:
import numpy as np

In [4]:
from sentence_transformers import SentenceTransformer
# Build the sentence-embedding model from the model name 'all-MiniLM-L6-v2'.
# NOTE(review): presumably the weights are fetched on first use — confirm for offline runs.
model = SentenceTransformer('all-MiniLM-L6-v2')


In [5]:
# Show the class of the object the SentenceTransformer constructor returned.
type(model)

sentence_transformers.SentenceTransformer.SentenceTransformer

In [6]:
# Sample corpus for the demo: three short sentences that are embedded
# and stored in the vector database further down.
sentences = [
    "This framework generates embeddings for each input sentence",
    "Sentences are passed as a list of string.",
    "The quick brown fox jumps over the lazy dog.",
]
print(sentences)


['This framework generates embeddings for each input sentence', 'Sentences are passed as a list of string.', 'The quick brown fox jumps over the lazy dog.']


In [7]:
# Report how many sentences there are and the character length of each one.
print(f"Number of sentences : {len(sentences)}")
for i, sentence in enumerate(sentences):
    print(f"Sentence {i} has {len(sentence)} chars")

# Encode the whole list in one call, then report the shape of the result and
# the type/shape of a single embedding (the first one).
sentence_embeddings = model.encode(sentences)
print(f"Sentence embeddings shape : {sentence_embeddings.shape}")
print(
    f"Each embedding type is : {type(sentence_embeddings[0])}, has dimensions {sentence_embeddings[0].shape}"
)


Number of sentences : 3
Sentence 0 has 59 chars
Sentence 1 has 41 chars
Sentence 2 has 44 chars
Sentence embeddings shape : (3, 384)
Each embedding type is : <class 'numpy.ndarray'>, has dimensions (384,)


In [8]:
from chromadb.utils import embedding_functions

In [9]:
# Chroma's built-in default embedding function.
# NOTE(review): per the Chroma docs this default wraps the all-MiniLM-L6-v2
# Sentence Transformers model — confirm for the installed chromadb version.
embedding_func = embedding_functions.DefaultEmbeddingFunction()

In [10]:
# Change model_name if you want to use a different Sentence Transformers model:
# sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
#     model_name="all-MiniLM-L6-v2"
# )

In [11]:
#### Create an in-memory Chroma client.
# NOTE(review): presumably nothing is persisted once the kernel stops — confirm.
import chromadb
chroma_client = chromadb.Client()


In [12]:
# Fetch the collection if it already exists, otherwise create it.
# Plain create_collection() raises when "my_collection" is already present,
# so re-running this cell in a live kernel would fail; get_or_create_collection
# keeps the cell safe to re-run.
collection = chroma_client.get_or_create_collection(name="my_collection")

In [13]:
## First batch: embeddings computed directly with sentence-transformers.
## model.encode() returned a numpy ndarray rather than a list, and passing the
## ndarray straight to collection.add() ended in an error, so it is converted
## with .tolist() first.
## The number of metadatas and ids must match the number of documents:
## len(sentences) == len(metadatas) == len(ids).
collection.add(
    ids=["id1", "id2", "id3"],
    metadatas=[{"source": tag} for tag in ("id1", "id2", "id3")],
    embeddings=sentence_embeddings.tolist(),
    documents=sentences,
)

## Second batch: the same documents, embedded this time with the Chroma
## embedding function (which already returns a list), stored under separate ids.
collection.add(
    ids=["id11", "id22", "id33"],
    metadatas=[{"source": tag} for tag in ("id11", "id22", "id33")],
    embeddings=embedding_func(sentences),
    documents=sentences,
)

In [14]:
# Retrieve the collection's records with no filter applied. In the saved output
# all six ids come back and the 'embeddings' field is None.
collection.get()

{'ids': ['id1', 'id2', 'id3', 'id11', 'id22', 'id33'],
 'embeddings': None,
 'metadatas': [{'source': 'id1'},
  {'source': 'id2'},
  {'source': 'id3'},
  {'source': 'id11'},
  {'source': 'id22'},
  {'source': 'id33'}],
 'documents': ['This framework generates embeddings for each input sentence',
  'Sentences are passed as a list of string.',
  'The quick brown fox jumps over the lazy dog.',
  'This framework generates embeddings for each input sentence',
  'Sentences are passed as a list of string.',
  'The quick brown fox jumps over the lazy dog.']}

In [None]:
# Preview the collection (presumably a small sample of its records — confirm).
# This cell has no execution count or output in the saved notebook, i.e. it was not run.
collection.peek()

In [21]:
# Filter by metadata: return only the records whose "source" field equals "id1".
collection.get(where={"source":"id1"})


{'ids': ['id1'],
 'embeddings': None,
 'metadatas': [{'source': 'id1'}],
 'documents': ['This framework generates embeddings for each input sentence']}

In [31]:
# Filter by metadata: return only the records whose "source" field equals "id33".
collection.get(where={"source":"id33"})

{'ids': ['id33'],
 'embeddings': None,
 'metadatas': [{'source': 'id33'}],
 'documents': ['The quick brown fox jumps over the lazy dog.']}