### Set up envs and libaries for ChromaDB

In [1]:
# Check the Jupyter Environment

import sys
print(sys.executable)

# Remember to restart Jupyter lab using 
# conda activate base
# if the env being used is conda

/Users/suneermehmood/miniconda3/bin/python


In [2]:
# run the below to insall chromadb if not already installed
# !pip install chromadb --user

import chromadb

In [3]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [4]:
# Set the below option if you would like to see the entire field length

pd.set_option('display.max_colwidth', None)

## Introduction to Kaggle DataSets

https://www.kaggle.com/datasets/nulldata/medium-post-titles

### Read data from file

In [5]:
# Read the sample data from Kaggle into a DF
df = pd.read_csv('medium_post_titles.csv')

In [6]:
df

Unnamed: 0,category,title,subtitle,subtitle_truncated_flag
0,work,"""21 Conversations"" - A fun (and easy) game for teams to get to know each other",A (new?) Icebreaker game to get your team to say all the interesting stuff,False
1,spirituality,"""Biblical Porn"" at Mars Hill",Author and UW lecturer Jessica Johnson talks about her new book on Mars Hill Church's and Mark Driscoll's evangelical masculinity,False
2,lgbtqia,"""CISGENDER?! Is That A Disease?!""","Or, a primer in gender vocabulary for the curious-minded",False
3,equality,"""Call me Nat Love"" :Black Cowboys and the Frontiers of American Freedom",,False
4,artificial-intelligence,"""Can I Train my Model on Your Computer?""",How we waste computational resources and how to stop it.,False
...,...,...,...,...
126413,javascript,🚀 Introducing Akita: A New State Management Pattern for Angular Applications,"Every developer knows state management is difficult. Continuously keeping track of what has been updated, why, and when, can become a",True
126414,cryptocurrency,🚀Sudden Bitcoin Price Breakout Sets New Bull Target at Over $5K,Short-term trend: Bullish (but overbought),False
126415,artificial-intelligence,🤖 Hack4th0n & RASA chatbot,"Past week I took part in an internal Hackathon with our team. We made a chatbot assistant, that will help you solve your everyday tasks. I",True
126416,artificial-intelligence,🤖AI Diary #4,"Topics in this issue include deep learning applied to radiology, language GANs falling short, integrating reasoning into AI models",True


### Perform TX

In [7]:
# Dropping null values
df = df.dropna()

# Dropping subtitle_truncated_flag 
df = df[-df['subtitle_truncated_flag']]


# Filter only the required topics_of_interest
topics_of_interest = ['artificial-intelligence','data-science','machine-learning']
df[df['category'].isin(topics_of_interest)]


# Combine two fields to 'text' column
df['text'] = df['title'] + df['subtitle']


# Create a dictionary column - Meta
df['meta'] = df.apply(lambda x: {
    'text': x['text'],
    'category': x['category']
}, axis = 1)


In [8]:
df.head()

# No vector embedding done yet

Unnamed: 0,category,title,subtitle,subtitle_truncated_flag,text,meta
0,work,"""21 Conversations"" - A fun (and easy) game for teams to get to know each other",A (new?) Icebreaker game to get your team to say all the interesting stuff,False,"""21 Conversations"" - A fun (and easy) game for teams to get to know each otherA (new?) Icebreaker game to get your team to say all the interesting stuff","{'text': '""21 Conversations"" - A fun (and easy) game for teams to get to know each otherA (new?) Icebreaker game to get your team to say all the interesting stuff', 'category': 'work'}"
1,spirituality,"""Biblical Porn"" at Mars Hill",Author and UW lecturer Jessica Johnson talks about her new book on Mars Hill Church's and Mark Driscoll's evangelical masculinity,False,"""Biblical Porn"" at Mars HillAuthor and UW lecturer Jessica Johnson talks about her new book on Mars Hill Church's and Mark Driscoll's evangelical masculinity","{'text': '""Biblical Porn"" at Mars HillAuthor and UW lecturer Jessica Johnson talks about her new book on Mars Hill Church's and Mark Driscoll's evangelical masculinity', 'category': 'spirituality'}"
2,lgbtqia,"""CISGENDER?! Is That A Disease?!""","Or, a primer in gender vocabulary for the curious-minded",False,"""CISGENDER?! Is That A Disease?!""Or, a primer in gender vocabulary for the curious-minded","{'text': '""CISGENDER?! Is That A Disease?!""Or, a primer in gender vocabulary for the curious-minded', 'category': 'lgbtqia'}"
4,artificial-intelligence,"""Can I Train my Model on Your Computer?""",How we waste computational resources and how to stop it.,False,"""Can I Train my Model on Your Computer?""How we waste computational resources and how to stop it.","{'text': '""Can I Train my Model on Your Computer?""How we waste computational resources and how to stop it.', 'category': 'artificial-intelligence'}"
5,cryptocurrency,"""Cypherpunks and Wall Street"": The Security Token Revolution & Regulation",Bruce Fenton presents at the World Blockchain Forum - and will cryptocurrency bring us to utopia or dystopia?,False,"""Cypherpunks and Wall Street"": The Security Token Revolution & RegulationBruce Fenton presents at the World Blockchain Forum - and will cryptocurrency bring us to utopia or dystopia?","{'text': '""Cypherpunks and Wall Street"": The Security Token Revolution & RegulationBruce Fenton presents at the World Blockchain Forum - and will cryptocurrency bring us to utopia or dystopia?', 'category': 'cryptocurrency'}"


## ChromaDB setup

### Connecting to ChromaDB

In [9]:
from chromadb.config import Settings

In [10]:
# ChromaDB set up - client
# If no parameter is provided within the paranthesis, 
# this uses the default in-memory DB

# chroma_client = chromadb.Client() # in-memory DB

# after migration to new version of ChromaDB
# client = chromadb.EphemeralClient()


# before migration / older version
# chroma_client = chromadb.Client(Settings(
#     persist_directory = 'medium-chroma-db',
#     chroma_db_impl = 'duckdb+parquet'
# )) #to persist the DB data


# after migration to new version of ChromaDB
chroma_client = chromadb.PersistentClient(path='medium-chroma-db')
 #to persist the DB data

In [11]:
# Define and create the collection for ChromaDB

# Drop the collection if required
# chroma_client.delete_collection(name='medium-article')


article_collection = chroma_client.create_collection(name = 'medium-article-2')

## Insert Data

In [23]:
# Inserting data
# ids from df index need to get converted to string
# all field values need to get converted to a list - that is how ChromaDB works

# Leaving to ChromaDB to use the default embedding method

# article_collection.add(
#     ids=[f'{x}' for x in df.index.tolist()],
#     documents=df['text'].tolist(),
#     metadatas=df['meta'].tolist()
# )

# Optionally, to truncate collections
article_collection.delete()

NameError: name 'article_collection' is not defined

## Inserting the data in batches
#### if there is any issue in writing any row

In [12]:
def insert_in_batches(collection, ids, documents, metadatas, batch_size=100):
    total = len(ids)
    for start in range(0, total, batch_size):
        end = start + batch_size
        batch_ids = ids[start:end]
        batch_documents = documents[start:end]
        batch_metadatas = metadatas[start:end]
        
        # Insert the current batch
        collection.add(
            ids=batch_ids,
            documents=batch_documents,
            metadatas=batch_metadatas
        )
        print(f"Inserted batch {start//batch_size + 1}/{(total + batch_size - 1)//batch_size}")

# Example usage
insert_in_batches(
    article_collection,
    ids=[f'{x}' for x in df.index.tolist()],
    documents=df['text'].tolist(),
    metadatas=df['meta'].tolist(),
    batch_size=1000  # Adjust the batch size based on your environment's capacity
)

[0;93m2024-02-12 15:25:41.028356 [W:onnxruntime:, helper.cc:67 IsInputSupported] CoreML does not support input dim > 16384. Input:embeddings.word_embeddings.weight, shape: {30522,384}[m
[0;93m2024-02-12 15:25:41.028761 [W:onnxruntime:, coreml_execution_provider.cc:81 GetCapability] CoreMLExecutionProvider::GetCapability, number of partitions supported by CoreML: 49 number of nodes in the graph: 323 number of nodes supported by CoreML: 231[m


Inserted batch 1/82
Inserted batch 2/82


KeyboardInterrupt: 

### Upsert the data (optional)

In [None]:
# Upserting the data
# If the record is already present, an Update would happen

article_collection.upsert(
    ids=[f'{x}' for x in df.index.tolist()],
    documents=df['text'].tolist(),
    metadatas=df['meta'].tolist()
)


## Vector Query


In [13]:
qry_str = 'best data science library?'

article_collection.query(query_texts=qry_str, n_results=1)


{'ids': [['2586']],
 'distances': [[0.677875280380249]],
 'metadatas': [[{'category': 'artificial-intelligence',
    'text': '5 Resources to Inspire Your Next Data Science ProjectDon’t worry — getting started is the hardest part'}]],
 'embeddings': None,
 'documents': [['5 Resources to Inspire Your Next Data Science ProjectDon’t worry — getting started is the hardest part']],
 'uris': None,
 'data': None}

In [14]:
qry_str = 'best data ai library?'

article_collection.query(query_texts=qry_str, n_results=1)

{'ids': [['289']],
 'distances': [[1.0606274604797363]],
 'metadatas': [[{'category': 'data-science',
    'text': '(Robot) data scientists as a serviceAutomating data science with symbolic regression and probabilistic programming.'}]],
 'embeddings': None,
 'documents': [['(Robot) data scientists as a serviceAutomating data science with symbolic regression and probabilistic programming.']],
 'uris': None,
 'data': None}