### Using ChromaDB to store your Document Embeddings ###

In [11]:
import os
import chromadb

from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())

True

In [12]:
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

In [13]:
current_dir = os.getcwd()
data_folder_path = os.path.join(current_dir, "data")

client = chromadb.PersistentClient(path=data_folder_path)

In [14]:
openai_ef = OpenAIEmbeddingFunction(api_key=os.getenv("OPENAI_API_KEY"))

collection = client.create_collection(name="new-collection", 
                                      embedding_function=openai_ef)

UniqueConstraintError: Collection new-collection already exists

In [16]:
# list the collections
collections = client.list_collections()
print(collections)

[Collection(name=test-collection), Collection(name=new-collection)]


In [11]:
collection = client.get_collection(name="new-collection",embedding_function=openai_ef)

In [12]:
collection.add(ids=["id1","id2"],documents=["this is doc1", "doc2"])

In [13]:
collection.count()

2

In [15]:
collection.get(ids=["id1","id2"])

{'ids': ['id1', 'id2'],
 'embeddings': None,
 'metadatas': [None, None],
 'documents': ['this is doc1', 'doc2'],
 'uris': None,
 'data': None}

In [16]:
id_text = "id"
text = "this is text "

for i in range(0, 12):
    collection.add(ids=[id_text+str(i)],documents=[text+str(i)])

Add of existing embedding ID: id1
Insert of existing embedding ID: id1
Add of existing embedding ID: id2
Insert of existing embedding ID: id2


In [19]:
collection.get(["id4"])

{'ids': ['id4'],
 'embeddings': None,
 'metadatas': [None],
 'documents': ['this is text 4'],
 'uris': None,
 'data': None}

In [20]:

for i in range(0, 12):
    collection.upsert(ids=[id_text+str(i)],documents=[text+str(i)])

In [21]:
collection.get(ids=[id_text+str(i) for i in range(0, 12)])

{'ids': ['id0',
  'id1',
  'id10',
  'id11',
  'id2',
  'id3',
  'id4',
  'id5',
  'id6',
  'id7',
  'id8',
  'id9'],
 'embeddings': None,
 'metadatas': [None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None],
 'documents': ['this is text 0',
  'this is text 1',
  'this is text 10',
  'this is text 11',
  'this is text 2',
  'this is text 3',
  'this is text 4',
  'this is text 5',
  'this is text 6',
  'this is text 7',
  'this is text 8',
  'this is text 9'],
 'uris': None,
 'data': None}

In [22]:
collection.peek()

{'ids': ['id0',
  'id1',
  'id10',
  'id11',
  'id2',
  'id3',
  'id4',
  'id5',
  'id6',
  'id7'],
 'embeddings': [[-0.0064959959127008915,
   -0.013814887031912804,
   -0.005324414931237698,
   -0.012475937604904175,
   -0.011220673099160194,
   0.0223437137901783,
   -0.016527654603123665,
   -0.0034938205499202013,
   -0.03475688770413399,
   -0.034338466823101044,
   -0.001982272369787097,
   0.030210040509700775,
   -0.02034923806786537,
   -0.006537837907671928,
   -0.0013485379749909043,
   0.009100669994950294,
   0.008354485034942627,
   0.005024546291679144,
   0.00842422153800726,
   -0.005296520423144102,
   -0.0147981783375144,
   0.008626459166407585,
   -0.011360147036612034,
   0.0007841046899557114,
   -0.002465200610458851,
   -0.008891459554433823,
   0.016708970069885254,
   -0.015174757689237595,
   0.014010150916874409,
   -0.03210688754916191,
   0.006834219675511122,
   -0.014491335488855839,
   -0.0013049524277448654,
   -0.022287925705313683,
   -0.0076710633

### Understanding ChromaDB Operations using Netflix Data ###

**Scope for our session today**

1. Take 1000 rows from the Netflix dataset. The Netflix dataset is available at [Kaggle](https://www.kaggle.com/datasets/shivamb/netflix-shows).
2. Look through the dataset. We will use pandas for this.
3. Create an enriched information string which will capture title, type, category. We will use this enriched information string as a document.
4. Ingest documents.
5. Find out how many total documents are part of the collection using the count() method.
6. View some documents by passing in id’s to the get() method and also using the peek() method.
7. Update collection by adding metadata information for the documents. We will add release “type”, “country” and “release_year” as part of metadata. Metadata will allow us to do searches like “ bank heist movies released post 2019 which are from UK.”
8. Retrive collection and make query searches.
9. Delete the collection

In [2]:
import pandas as pd

df = pd.read_csv("netflix_titles.csv")

In [3]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [5]:
df.dropna(inplace=True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5332 entries, 7 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       5332 non-null   object
 1   type          5332 non-null   object
 2   title         5332 non-null   object
 3   director      5332 non-null   object
 4   cast          5332 non-null   object
 5   country       5332 non-null   object
 6   date_added    5332 non-null   object
 7   release_year  5332 non-null   int64 
 8   rating        5332 non-null   object
 9   duration      5332 non-null   object
 10  listed_in     5332 non-null   object
 11  description   5332 non-null   object
dtypes: int64(1), object(11)
memory usage: 541.5+ KB


In [7]:
df = df[:1000]

netflix_info = []

for index, row in df.iterrows():
    netflix_info.append(f"{row['title']} \n {row['description']}")

ids = df.index.astype(str).tolist()


In [8]:
netflix_info[:5]

['Sankofa \n On a photo shoot in Ghana, an American model slips back in time, becomes enslaved on a plantation and bears witness to the agony of her ancestral past.',
 "The Great British Baking Show \n A talented batch of amateur bakers face off in a 10-week competition, whipping up their best dishes in the hopes of being named the U.K.'s best.",
 "The Starling \n A woman adjusting to life after a loss contends with a feisty bird that's taken over her garden — and a husband who's struggling to find a way forward.",
 'Je Suis Karl \n After most of her family is murdered in a terrorist bombing, a young woman is unknowingly lured into joining the very group that killed them.',
 'Jeans \n When the father of the man she loves insists that his twin sons marry twin sisters, a woman creates an alter ego that might be a bit too convincing.']

In [9]:
ids[:5]

['7', '8', '9', '12', '24']

In [18]:
collection = client.get_collection(name="new-collection", embedding_function=openai_ef)

In [19]:
collection.add(ids=ids, documents=netflix_info)

Add of existing embedding ID: id1
Add of existing embedding ID: id2


In [21]:
collection.upsert(ids=ids, documents=netflix_info)

In [22]:
collection.get(ids=ids[:5])

{'ids': ['12', '24', '7', '8', '9'],
 'embeddings': None,
 'metadatas': [None, None, None, None, None],
 'documents': ['Je Suis Karl \n After most of her family is murdered in a terrorist bombing, a young woman is unknowingly lured into joining the very group that killed them.',
  'Jeans \n When the father of the man she loves insists that his twin sons marry twin sisters, a woman creates an alter ego that might be a bit too convincing.',
  'Sankofa \n On a photo shoot in Ghana, an American model slips back in time, becomes enslaved on a plantation and bears witness to the agony of her ancestral past.',
  "The Great British Baking Show \n A talented batch of amateur bakers face off in a 10-week competition, whipping up their best dishes in the hopes of being named the U.K.'s best.",
  "The Starling \n A woman adjusting to life after a loss contends with a feisty bird that's taken over her garden — and a husband who's struggling to find a way forward."],
 'uris': None,
 'data': None}

In [25]:
collection.count()

1012

In [27]:
query_texts = ["where the main character is a detective"]
result = collection.query(query_texts=query_texts, n_results=5)

print(result)

{'ids': [['383', '496', '866', '881', '122']], 'distances': [[0.360140860080719, 0.3627288341522217, 0.36536821722984314, 0.3725615441799164, 0.37443166971206665]], 'metadatas': [[None, None, None, None, None]], 'embeddings': None, 'documents': [['Department \n Two cops form a task force to take down two mobsters, but when a sinister politician enters the picture, dubious loyalties and motives come to light.', 'Brick Mansions \n An undercover police detective partners with an ex-convict to take down a drug kingpin amid grime and crime in a walled-off Detroit housing project.', "Small Town Crime \n When a disgraced ex-cop discovers a dying woman, he's compelled to track down her killer — an act of self-redemption that takes him down a dark path.", 'I Am All Girls \n A relentless detective finds common ground with a killer systematically targeting the perpetrators running a powerful child-trafficking ring.', 'In the Cut \n After embarking on an affair with the cop probing the murder of a

In [28]:
query_texts = ["where the main character is a detective", "where the story is about a young girl"]
result = collection.query(query_texts=query_texts, n_results=5)

print(result)

{'ids': [['383', '496', '866', '881', '122'], ['343', '1777', '344', '1111', '736']], 'distances': [[0.360140860080719, 0.3627288341522217, 0.36536821722984314, 0.3725615441799164, 0.37443166971206665], [0.2909969985485077, 0.3103710114955902, 0.31172746419906616, 0.32301008701324463, 0.3244174122810364]], 'metadatas': [[None, None, None, None, None], [None, None, None, None, None]], 'embeddings': None, 'documents': [['Department \n Two cops form a task force to take down two mobsters, but when a sinister politician enters the picture, dubious loyalties and motives come to light.', 'Brick Mansions \n An undercover police detective partners with an ex-convict to take down a drug kingpin amid grime and crime in a walled-off Detroit housing project.', "Small Town Crime \n When a disgraced ex-cop discovers a dying woman, he's compelled to track down her killer — an act of self-redemption that takes him down a dark path.", 'I Am All Girls \n A relentless detective finds common ground with a

In [32]:
selected_ids = ["343","383"]

reference_texts = collection.get(ids=selected_ids)['documents']


for text in reference_texts:
    print(text)
    print("\n")

My Girl 
 This coming-of-age charmer follows a summer in the life of an 11-year-old girl who learns about love and loss as she grapples with profound changes.


Department 
 Two cops form a task force to take down two mobsters, but when a sinister politician enters the picture, dubious loyalties and motives come to light.




In [33]:
result = collection.query(query_texts=reference_texts, n_results=3, 
                          where = {
                              "type":"Movie",
                          })

print(result)


{'ids': [[], []], 'distances': [[], []], 'metadatas': [[], []], 'embeddings': None, 'documents': [[], []], 'uris': None, 'data': None}


### Metadata Updates ###

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 7 to 1930
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       1000 non-null   object
 1   type          1000 non-null   object
 2   title         1000 non-null   object
 3   director      1000 non-null   object
 4   cast          1000 non-null   object
 5   country       1000 non-null   object
 6   date_added    1000 non-null   object
 7   release_year  1000 non-null   int64 
 8   rating        1000 non-null   object
 9   duration      1000 non-null   object
 10  listed_in     1000 non-null   object
 11  description   1000 non-null   object
dtypes: int64(1), object(11)
memory usage: 101.6+ KB


In [36]:
metadatas = []
for index, row in df.iterrows():
    metadatas.append({"type":row['type'], "country":row['country'], "release_year" :row['release_year']})

In [37]:
metadatas[:2]

[{'type': 'Movie',
  'country': 'United States, Ghana, Burkina Faso, United Kingdom, Germany, Ethiopia',
  'release_year': 1993},
 {'type': 'TV Show', 'country': 'United Kingdom', 'release_year': 2021}]

In [40]:
collection.update(ids=ids, metadatas=metadatas)

In [41]:
collection.get(ids=selected_ids)

{'ids': ['343', '383'],
 'embeddings': None,
 'metadatas': [{'country': 'United States',
   'release_year': 1991,
   'type': 'Movie'},
  {'country': 'India', 'release_year': 2012, 'type': 'Movie'}],
 'documents': ['My Girl \n This coming-of-age charmer follows a summer in the life of an 11-year-old girl who learns about love and loss as she grapples with profound changes.',
  'Department \n Two cops form a task force to take down two mobsters, but when a sinister politician enters the picture, dubious loyalties and motives come to light.'],
 'uris': None,
 'data': None}

In [44]:
result = collection.query(query_texts=reference_texts, n_results=3, 
                          where = {
                              "type":"Movie",
                          })

print(result)

{'ids': [['343', '344', '1602'], ['383', '1662', '630']], 'distances': [[2.4195796868298203e-06, 0.2266308069229126, 0.2695145606994629], [2.320930434507318e-06, 0.25388291478157043, 0.25658872723579407]], 'metadatas': [[{'country': 'United States', 'release_year': 1991, 'type': 'Movie'}, {'country': 'United States', 'release_year': 1994, 'type': 'Movie'}, {'country': 'Philippines', 'release_year': 2013, 'type': 'Movie'}], [{'country': 'India', 'release_year': 2012, 'type': 'Movie'}, {'country': 'Egypt', 'release_year': 2018, 'type': 'Movie'}, {'country': 'United States', 'release_year': 2012, 'type': 'Movie'}]], 'embeddings': None, 'documents': [['My Girl \n This coming-of-age charmer follows a summer in the life of an 11-year-old girl who learns about love and loss as she grapples with profound changes.', "My Girl 2 \n A teen makes surprising discoveries when she talks to her dead mother's high school friends for an essay about someone she admires but has never met.", 'Must Be... Lov

In [47]:
result = collection.query(query_texts=reference_texts, n_results=3,
                          where = {
                              "$and":[
                                  {"type":{"$eq":"Movie"}},
                                  {"country":{"$eq":"United States"}},
                                  {"release_year":{"$gte":2015}}
                            
                              ]
                          } )

print(result)

{'ids': [['1156', '1597', '231'], ['866', '1582', '1633']], 'distances': [[0.2707473039627075, 0.3018142580986023, 0.3231540024280548], [0.2599184513092041, 0.27355197072029114, 0.31071051955223083]], 'metadatas': [[{'country': 'United States', 'release_year': 2021, 'type': 'Movie'}, {'country': 'United States', 'release_year': 2020, 'type': 'Movie'}, {'country': 'United States', 'release_year': 2021, 'type': 'Movie'}], [{'country': 'United States', 'release_year': 2017, 'type': 'Movie'}, {'country': 'United States', 'release_year': 2016, 'type': 'Movie'}, {'country': 'United States', 'release_year': 2020, 'type': 'Movie'}]], 'embeddings': None, 'documents': [['A Week Away \n In this uplifting musical, a troubled teen takes a leap of faith by attending summer camp — and unexpectedly finds love, friends and a place to belong.', 'You Are My Home \n After becoming separated from her family, a young girl finds herself in the care of a heartbroken woman who faces her own struggles.', 'The W

In [48]:
result = collection.query(query_texts=reference_texts, n_results=3,
                          where = {
                              "$or":[
                                  {"type":{"$eq":"Movie"}},
                                  {"country":{"$eq":"United States"}},
                                  {"release_year":{"$gte":2015}}
                            
                              ]
                          } )

print(result)

{'ids': [['343', '344', '1602'], ['383', '1662', '630']], 'distances': [[2.4195796868298203e-06, 0.2266308069229126, 0.2695145606994629], [2.320930434507318e-06, 0.25388291478157043, 0.25658872723579407]], 'metadatas': [[{'country': 'United States', 'release_year': 1991, 'type': 'Movie'}, {'country': 'United States', 'release_year': 1994, 'type': 'Movie'}, {'country': 'Philippines', 'release_year': 2013, 'type': 'Movie'}], [{'country': 'India', 'release_year': 2012, 'type': 'Movie'}, {'country': 'Egypt', 'release_year': 2018, 'type': 'Movie'}, {'country': 'United States', 'release_year': 2012, 'type': 'Movie'}]], 'embeddings': None, 'documents': [['My Girl \n This coming-of-age charmer follows a summer in the life of an 11-year-old girl who learns about love and loss as she grapples with profound changes.', "My Girl 2 \n A teen makes surprising discoveries when she talks to her dead mother's high school friends for an essay about someone she admires but has never met.", 'Must Be... Lov

In [49]:
collection.delete(ids=['343', '344', '1602'])

In [50]:
client.delete_collection(name="new-collection")

In [52]:
collection.count()

StopIteration: 