In [None]:
#### This notebook demonstrates populating a vector database (ChromaDB)

The example uses text data from a WhatsApp news group: a read-only group that publishes Indian business news as one-line headlines
from different newspapers.


In [1]:
import os
import re
from datetime import datetime, timedelta

from tqdm.notebook import tqdm
from time import sleep

In [2]:
import chromadb
# Client through which every collection in this notebook is created.
# NOTE(review): chromadb.Client() without settings is presumably in-memory only,
# so stored data would not survive a kernel restart — confirm for the installed version.
chroma_client = chromadb.Client()

### Embedding function
By default, Chroma uses the Sentence Transformers all-MiniLM-L6-v2 model to create embeddings. This embedding model can create sentence and document embeddings that can be used for a wide variety of tasks. This embedding function runs locally on your machine, and may require you to download the model files (this will happen automatically).

In [3]:
## VDB related imports
from chromadb.utils import embedding_functions

# By default Chroma uses the Sentence Transformers all-MiniLM-L6-v2 model to create
# embeddings. The function is instantiated explicitly because it is reused later in
# this notebook to embed query text by hand.
embedding_func = embedding_functions.DefaultEmbeddingFunction()

# get_or_create_collection keeps this cell re-runnable: create_collection raises when a
# collection named "news" already exists on this client (e.g. on a second run of the cell).
news_collection = chroma_client.get_or_create_collection(name="news", embedding_function=embedding_func)

## news preprocessor
from util.text2json import get_news

In [4]:
def add_record(rec, collection):
    '''Add one pre-formatted record to a Chroma collection.

    Parameters
    ----------
    rec : dict
        Must contain the keys 'ids', 'documents' and 'metadatas'; their values
        are forwarded unchanged to ``collection.add``.
    collection
        Object exposing an ``add(ids=..., documents=..., metadatas=...)`` method.

    Returns
    -------
    bool
        True when the record was added. False when ``rec`` is not a dict, when a
        required key is missing, or when ``collection.add`` raised (the error is
        printed rather than re-raised).
    '''
    required_keys = ('ids', 'documents', 'metadatas')

    # isinstance also accepts dict subclasses, which the exact type() comparison rejected.
    if not isinstance(rec, dict):
        return False
    if not all(key in rec for key in required_keys):
        return False

    try:
        collection.add(
            ids=rec['ids'],
            documents=rec['documents'],
            metadatas=rec['metadatas'],
        )
    except Exception as e:
        # Best-effort insert: report the problem and signal failure so that a bulk
        # load can continue with the remaining records.
        print(f"Error : {e}")
        return False

    # Report success explicitly; falling off the end would return None, which is
    # indistinguishable from failure in a truthiness check.
    return True


#### PREPROCESSING TEXT FILE

`get_news` returns JSON-like records, making it easy to add them to ChromaDB. See the `text2json` notebook for details.

In [5]:
# Parse the exported chat file into records that add_record can insert.
# NOTE(review): the "Rejected lines" / "News Items" counts shown in the output are
# presumably printed by get_news itself — confirm in util.text2json.
news=get_news("./_chat.txt")

Rejected lines : 140
News Items : 12


#### ADD NEWS TO CHROMADB

Each news record is added using the helper function `add_record`.

In [8]:
# Insert every parsed record into the collection, showing a progress bar over the indices.
for idx in tqdm(range(len(news))):
    record = news[idx]
    add_record(record, news_collection)

  0%|          | 0/12 [00:00<?, ?it/s]

#### FETCH FROM AND QUERY CHROMADB

1. Fetch records using `.get` or `.peek`, with a parameter to limit the output.
2. Use `.query` to search and to filter on specific metadata or document content.

Examples follow

In [9]:
news_collection.get(limit=2)  # return at most 2 records
# peek() can be used instead of get(); it also shows the embeddings
# news_collection.peek()

{'ids': ['2023-05-08ET', '2023-05-08BS'],
 'embeddings': None,
 'metadatas': [{'date': '2023-05-08', 'source': 'Economic Times'},
  {'date': '2023-05-08', 'source': 'Business Standard'}],
 'documents': ['Aditya Birla Fashion to raise up to ₹800 crore for TCNS acquisition,Indus plans capex push this fiscal to make the most of 5G boom,Blackstone signs binding pact for controlling stake in care hospitals,Equitas Small Finance Bank reports Q4 net profit at Rs 190.03 cr,Coal India Q4 Results: Profit declines 18% YoY to Rs 5,528 crore, dividend declared at Rs 4/share,TPG-backed RR Kabel files IPO papers with Sebi,Gold imports dip 24% to $35 billion in 2022-23,GSTN defers by 3 months implementation of e-invoice reporting time limit,Saudi Arabia economy grew 3.9% in Q1 boosted by non-oil activities,ChrysCap diversifies into public market, launches special fund,Daikin India becomes billion-dollar company, expect to double business in next 3 years,Electronic wearables production in India reaches

In [17]:
# Similarity search from raw text: ask for the 2 nearest documents to "Blackstone".
news_collection.query(query_texts=["Blackstone"], n_results=2)

{'ids': [['2023-05-09ET', '2023-05-11BS']],
 'distances': [[1.7775036096572876, 1.8243114948272705]],
 'metadatas': [[{'date': '2023-05-09', 'source': 'Economic Times'},
   {'date': '2023-05-11', 'source': 'Business Standard'}]],
 'embeddings': None,
 'documents': [["Allianz-Shapoorji looks to exit Hyderabad's IT SEZ Waverock for Rs 2,000 cr,Energy PSUs eye 38,000 tonnes per annum green hydrogen capacity,Happiest Minds Technologies Q4 Results: Profit rises 11% YoY to Rs 58 crore; dividend declared at Rs 3.4/share,Kalpataru Power Q4 Results: Net profit up nearly 22% at Rs 140 crore,Alibaba logistics arm eyes up to $2 billion Hong Kong IPO: Reports,NLC India board approves raising up to Rs 5,000 crore via bonds,India, Canada trade ministers to review progress in talks on free trade agreement,Japan's Daiwa to pick up 15% stake in Ambit,Sajjan Jindal group planning JSW Paints stake sale to private equity funds,Fintech firm Fundly.ai bags $3 million in seed funding,Early\xad-stage VC 3one4 

In [18]:
# Same search as the query_texts call above, but with the text embedded by hand through
# embedding_func. The leading hits and distances are identical, which suggests that
# query_texts is turned into an embedding before the nearest results are looked up.
news_collection.query(query_embeddings=embedding_func(["Blackstone"]), n_results=3)

{'ids': [['2023-05-09ET', '2023-05-11BS', '2023-05-11ET']],
 'distances': [[1.7775036096572876, 1.8243114948272705, 1.8569608926773071]],
 'metadatas': [[{'date': '2023-05-09', 'source': 'Economic Times'},
   {'date': '2023-05-11', 'source': 'Business Standard'},
   {'date': '2023-05-11', 'source': 'Economic Times'}]],
 'embeddings': None,
 'documents': [["Allianz-Shapoorji looks to exit Hyderabad's IT SEZ Waverock for Rs 2,000 cr,Energy PSUs eye 38,000 tonnes per annum green hydrogen capacity,Happiest Minds Technologies Q4 Results: Profit rises 11% YoY to Rs 58 crore; dividend declared at Rs 3.4/share,Kalpataru Power Q4 Results: Net profit up nearly 22% at Rs 140 crore,Alibaba logistics arm eyes up to $2 billion Hong Kong IPO: Reports,NLC India board approves raising up to Rs 5,000 crore via bonds,India, Canada trade ministers to review progress in talks on free trade agreement,Japan's Daiwa to pick up 15% stake in Ambit,Sajjan Jindal group planning JSW Paints stake sale to private eq

In [19]:

# Similarity search restricted to documents whose text contains "Blackstone".
# n_results is only an upper bound: fewer rows come back when fewer documents match.
# NOTE(review): the embedded text is spelled "BlackStone" while the filter uses
# "Blackstone" — confirm the casing difference is intended.
news_collection.query(
    n_results=10,
    where_document={"$contains": "Blackstone"},
    query_embeddings=embedding_func(["BlackStone"]),
)


{'ids': [['2023-05-08M', '2023-05-08ET']],
 'distances': [[1.8841373920440674, 1.9864097833633423]],
 'metadatas': [[{'date': '2023-05-08', 'source': 'Mint'},
   {'date': '2023-05-08', 'source': 'Economic Times'}]],
 'embeddings': None,
 'documents': [["BetterPlace acquires fintech lending startup Bueno Finance,LIC, MFs plough $2 billion into IT firms in Q4 as shares tumble,India eyes clean energy sources to tackle tariffs,India conducts talks with UAE on pharma export pricing challenges,Blackstone, ADIA are likely bidders for HDFC’s Credila,NCLT to hear BoB plea in Rel Home case,Silver ETFs getting investors' traction; asset bases reach ₹1,800 crore,Connekkt Media Network signs ₹270 crore deal with AVS Studios for 3 films,Defence ministry approves posting women officers of Territorial Army along LoC,Grindwell Norton Q4 Earnings: PAT rise 10% YoY in Q4, net income up 20%, Board declares highest ever dividend.",
   'Aditya Birla Fashion to raise up to ₹800 crore for TCNS acquisition,Ind

In [22]:
# Project only the documents: `include` selects which result fields are populated,
# so the other optional fields come back as None.
news_collection.query(
    include=["documents"],
    n_results=10,
    where_document={"$contains": "BetterPlace"},
    query_embeddings=embedding_func(["BlackStone"]),
)


{'ids': [['2023-05-08M']],
 'distances': None,
 'metadatas': None,
 'embeddings': None,
 'documents': [["BetterPlace acquires fintech lending startup Bueno Finance,LIC, MFs plough $2 billion into IT firms in Q4 as shares tumble,India eyes clean energy sources to tackle tariffs,India conducts talks with UAE on pharma export pricing challenges,Blackstone, ADIA are likely bidders for HDFC’s Credila,NCLT to hear BoB plea in Rel Home case,Silver ETFs getting investors' traction; asset bases reach ₹1,800 crore,Connekkt Media Network signs ₹270 crore deal with AVS Studios for 3 films,Defence ministry approves posting women officers of Territorial Army along LoC,Grindwell Norton Q4 Earnings: PAT rise 10% YoY in Q4, net income up 20%, Board declares highest ever dividend."]]}