INSTALL DEPENDENCIES

In [None]:
!pip install langchain
!pip install langchain-google-genai
!pip install langchain-chroma

In [None]:
import pandas as pd

In [None]:
import pandas as pd

# Dataset sourced from https://www.kaggle.com/datasets/jacopoferretti/bbc-articles-dataset/data
RANDOM_SEED = 42        # fixed seed so a re-run draws the same sample
SAMPLES_PER_LABEL = 80  # rows drawn from every value of the "labels" column

data = pd.read_csv("datafiles/source_data/bbc_news_dataset.csv")

# Draw the same number of rows from each label. GroupBy.sample replaces the
# groupby(...).apply(lambda x: x.sample(80)) form and accepts a seed.
# Like the original, this raises if a label has fewer rows than requested.
sampled_data = data.groupby("labels").sample(
    n=SAMPLES_PER_LABEL, random_state=RANDOM_SEED
)

# index=False keeps the original row index out of the workbook, so reading the
# file back does not produce an extra unnamed index column.
sampled_data.to_excel("datafiles/source_data/sampled_data.xlsx", index=False)

In [None]:
# Reload the sampled articles from the workbook written by the sampling cell,
# so the rest of the notebook works from the saved file.
sampled_data = pd.read_excel(io="datafiles/source_data/sampled_data.xlsx")

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document as LangchainDocument


def getLangchainDocs(dfvals):
    """Convert every row of a DataFrame into a LangChain document.

    Args:
        dfvals: DataFrame with a "text" column (used as the document body)
            and a "labels" column (stored in the metadata as "category").

    Returns:
        A list with one LangchainDocument per row, in row order.
    """
    return [
        LangchainDocument(
            page_content=row["text"],
            metadata={'category': row["labels"]},
        )
        for _, row in dfvals.iterrows()
    ]

# Splitter configuration: chunk size 1000 with an overlap of 200, trying the
# separators in the listed order. add_start_index=True records each chunk's
# offset in its metadata under "start_index".
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " ", ""],
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
)

# One LangChain document per sampled article.
langchain_docs = getLangchainDocs(sampled_data)

# Split the articles one at a time and flatten the chunks into a single list.
docs_processed = [
    chunk
    for source_doc in langchain_docs
    for chunk in text_splitter.split_documents([source_doc])
]

In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import getpass
import os

# Use the key already present in the environment; otherwise prompt for it.
# This keeps the credential out of the notebook source and avoids overwriting
# a real key with a placeholder string.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Gemini API key: ")


# Embedding model used when indexing document chunks
# (task_type="RETRIEVAL_DOCUMENT").
try:
    doc_embeddings_model = GoogleGenerativeAIEmbeddings(
        model="models/text-embedding-004", task_type="RETRIEVAL_DOCUMENT"
    )
    print("Gemini Embeddings model initialized successfully.")
except Exception as e:
    print(f"Error initializing Gemini Embeddings model: {e}")
    # Re-raise instead of calling exit(): in a notebook, exit() shuts the
    # kernel down and discards all state, while a raised exception stops the
    # run at this cell and shows the traceback.
    raise

Gemini Embeddings model initialized successfully.


Embed the document chunks into a ChromaDB vector store

In [None]:
from langchain_chroma import Chroma
# Directory where the Chroma collection is persisted on disk.
PERSIST_DIRECTORY ="../assets/chroma_langchain_db"

# Embed every chunk with the document embedding model and store it in the
# "rag_collection" collection.
# NOTE(review): re-running this cell against an existing PERSIST_DIRECTORY
# presumably adds the same chunks again — confirm, and clear the directory
# first if duplicates matter.
try:
    vectorstore = Chroma.from_documents(
        documents=docs_processed,
        collection_name="rag_collection",
        embedding=doc_embeddings_model,
        persist_directory=PERSIST_DIRECTORY
    )
    print("New ChromaDB vector store created and documents embedded successfully.")
except Exception as e:
    print(f"Error creating new ChromaDB vector store: {e}")
    # Re-raise instead of calling exit(): exit() would shut the kernel down,
    # while a raised exception stops the run here and keeps the traceback.
    raise

New ChromaDB vector store created and documents embedded successfully.


Check that the vector DB was created

In [None]:
import chromadb
# Open the persisted Chroma directory directly with the chromadb client and
# report how many items the collection holds.
client = chromadb.PersistentClient(path=PERSIST_DIRECTORY)

collection_name = "rag_collection"
try:
    # Add your embedding function if it was set at collection creation
    collection = client.get_collection(name=collection_name)
    item_count = collection.count()
    print(f"Number of items in collection '{collection_name}': {item_count}")
    if item_count <= 0:
        print("The collection is empty!")
    else:
        print("Sample items (first few):", collection.peek(limit=5))
except Exception as err:
    # On any failure, print the error plus the collections that do exist.
    print(f"Error accessing collection: {err}")
    print("Make sure 'collection_name' is correct. Common LangChain default is 'langchain'.")
    print("Available collections:", client.list_collections())

Number of items in collection 'rag_collection': 1388
Sample items (first few): {'ids': ['61ff477a-d355-45de-b224-3683218274a1', 'b1502381-787f-4d45-b9e8-4f649927b8b9', 'ff955435-1b68-4a12-afce-f4c404386bec', '7a583235-991a-459d-a8f1-c1af8683722d', '6a573680-9517-4c54-bcb2-14314f598af8'], 'embeddings': array([[ 0.04338471, -0.0087636 , -0.0275056 , ...,  0.03518229,
         0.02648607, -0.04214432],
       [ 0.02587302, -0.02361513, -0.02861586, ..., -0.05110757,
         0.01055619, -0.01887239],
       [-0.02195599,  0.0062551 , -0.02853753, ..., -0.04722888,
         0.04992726, -0.03966647],
       [ 0.01791953, -0.04004534,  0.01699263, ..., -0.00812571,
         0.00557493, -0.01232196],
       [ 0.05340696, -0.01562292,  0.0276009 , ...,  0.00038536,
         0.01550805, -0.02316467]]), 'documents': ['Golden rule \'intact\' says ex-aide\n\nChancellor Gordon Brown will meet his golden economic rule "with a margin to spare", according to his former chief economic adviser.\n\nForme

Loading vector DB

In [None]:
from langchain_chroma import Chroma

# Reopen the persisted "rag_collection" from PERSIST_DIRECTORY, passing the
# document embedding model as the store's embedding function.
vector_db_load = Chroma(
    collection_name="rag_collection",
    embedding_function=doc_embeddings_model,
    persist_directory=PERSIST_DIRECTORY,
)

Embedding question/query for retrieval

In [None]:
# Second embedding client for the same model, configured for queries
# (task_type="RETRIEVAL_QUERY") rather than documents.
try:
    query_embeddings_model = GoogleGenerativeAIEmbeddings(
        model="models/text-embedding-004", task_type="RETRIEVAL_QUERY"
    )
    print("Gemini Embeddings model initialized successfully.")
except Exception as e:
    print(f"Error initializing Gemini Embeddings model: {e}")
    # Re-raise instead of calling exit(): exit() would shut the kernel down,
    # while a raised exception stops the run here and keeps the traceback.
    raise

Gemini Embeddings model initialized successfully.


Retrieval result

In [None]:
# Embed the question with the query model, then fetch the single nearest
# chunk (k=1) from the loaded vector store.
query_vector = query_embeddings_model.embed_query("Industrial output reported by Nikkie")
results = vector_db_load.similarity_search_by_vector(embedding=query_vector, k=1)

In [None]:
# Show the retrieved document(s) as this cell's output.
results

[Document(id='551ffe3b-6227-4f61-b8cb-d0bd62e7ad94', metadata={'category': 'business', 'start_index': 158}, page_content="Industrial output rose 2.1% - adjusted for the time of year - in January from a month earlier. At the same time, retail sales picked up faster than at any time since 1997. The news sent Tokyo shares to an eight-month high, as investors hoped for a recovery from the three quarters of contraction seen from April 2004 on. The Nikkei 225 index ended the day up 0.7% at 11,740.60 points, with the yen strengthening 0.7% against the dollar to 104.53 yen. Weaker exports, normally the engine for Japan's economy in the face of weak domestic demand, had helped trigger a 0.1% contraction in the final three months of last year after two previous quarters of shrinking GDP. Only an exceptionally strong performance in the early months of 2004 kept the year as a whole from showing a decline. The output figures brought a cautiously optimistic response from economic officials")]

In [None]:
# Display the raw embedding vector produced for the example query.
query_embedding = query_embeddings_model.embed_query(
    "Industrial output reported by Nikkie"
)
query_embedding

[-0.00942942500114441,
 0.01302474644035101,
 0.022613612934947014,
 0.048060595989227295,
 -0.013123263604938984,
 0.0006020242581143975,
 0.02731187455356121,
 0.0188149344176054,
 -0.004764469340443611,
 -0.0006822975119575858,
 0.00800072867423296,
 0.018360357731580734,
 -0.06082752346992493,
 0.0035852801520377398,
 0.05131326615810394,
 -0.030637994408607483,
 0.04175132140517235,
 -0.034026749432086945,
 -0.08727480471134186,
 -0.0002813088067341596,
 -0.001058264053426683,
 -0.047523315995931625,
 -0.00875457189977169,
 -0.016876257956027985,
 -0.01146579161286354,
 -0.03970031812787056,
 0.007052257657051086,
 0.017445463687181473,
 -0.06993888318538666,
 -0.06697050482034683,
 -0.01686633750796318,
 0.06769154220819473,
 0.01925119198858738,
 -0.02709672972559929,
 -0.0002644738997332752,
 -0.0006213763263076544,
 -0.006606621202081442,
 0.03815817832946777,
 0.04576234519481659,
 -0.02436142787337303,
 -0.05998983234167099,
 0.0339876189827919,
 -0.043124500662088394,
 0.07