In [24]:
"""
Copyright 2024 USF Australia

Licensed under the MIT License

Author: Umer Imtiaz - umer.imiaz@gmail.com
Date: 24/8/2024
Version: 0.00
Code type: Alpha

This file contains the implementation of talk2 algorithm
work with memory and conversation. This will be hosted on the
umerai.com website as a product demo.

reference is the link below
https://python.langchain.com/v0.2/docs/tutorials/qa_chat_history/
"""

'\nCopyright 2024 USF Australia\n\nLicensed under the MIT License\n\nAuthor: Umer Imtiaz - umer.imiaz@gmail.com\nDate: 24/8/2024\nVersion: 0.00\nCode type: Alpha\n\nThis file contains the implementation of talk2 algorithm\nwork with memory and conversation. This will be hosted on the\numerai.com website as a product demo.\n\nreference is the link below\nhttps://python.langchain.com/v0.2/docs/tutorials/qa_chat_history/\n'

### Step 1: Loading the keys from a "keys.config" file

In [25]:
# ensuring safe programming
try:
    with open('keys.env', 'r', encoding="utf-8") as keys_file:
        key_lines = keys_file.readlines()
        keys_dict = dict(line.strip().split('=') for line in key_lines)
except FileNotFoundError:
    print("Error: keys.config file not found")
    exit()
except Exception as e:
    print (f"Error: An error occured while reading keys.config file: {e}")
    exit()

#print(f"dict = {keys_dict}")

In [26]:
#open the config file to read openai and langchain keys
# with open('keys.config', 'r', encoding="utf-8") as keys_file:
#     key_lines = keys_file.readlines()
#     keys_dict = dict(line.strip().split('=') for line in key_lines)

#loading values in the variables
try:
    openapi_key = keys_dict['openapi_key']
    # print(f"openapi_key={openapi_key}")
except KeyError:
    # print(f"Error: '{openapi_key}' not found in keys.config")
    print("Error: OPENAI key not found")

try:
    langchain_key = keys_dict['langchain_key']
    # print(f"langchain_key={langchain_key}")
except KeyError:
    # print(f"Error: '{langchain_key}' not found in keys.config")
    print("Error: langchain key not found - Langsmith traces will not log")



In [27]:
import openai
import os
from langchain_openai import ChatOpenAI

#setting the enviornment variables with the openai key values
os.environ['OPENAI_API_KEY'] = openapi_key
openai.api_key  = os.getenv('OPENAI_API_KEY')

""" print(os.environ['OPENAI_API_KEY']) """

" print(os.environ['OPENAI_API_KEY']) "

In [28]:
# setting the enviornment variables with langchain key values
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = langchain_key

""" print(os.environ['LANGCHAIN_API_KEY']) """

" print(os.environ['LANGCHAIN_API_KEY']) "

In [29]:
"""
This code takes a list of langchain Document objects and print. 
The document has two components page_contect->str and metadata->dict 
"""
def print_list_of_documents(loader_docs):
    counter=0
    print(loader_docs.__class__)
    for data in loader_docs:
        counter += 1
        print(f"\n\n{counter} {data.__class__}: {data}")
        # for key, val in data.items():
        #     print(f"\nkey: {key} ; value: {val}")
    counter = 0

### Step 2: Load the data from the sources

In [30]:
# Data source 1/4
# load data from the website
# reference : https://python.langchain.com/v0.1/docs/use_cases/web_scraping/

from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
# important variables
url_uci = "https://archive.ics.uci.edu/dataset/16/breast+cancer+wisconsin+prognostic"

#Step 2.1: Web page loading using a url or list of urls

web_page = WebBaseLoader(url_uci)
web_page_loader = web_page.load()
# print(web_page_loader.__class__)
# print(web_loader)
#Step 2.2: converting html page to (a document base object type) 
# simple text page with meta as dict, and page_content
#It's best suited for scenarios where the goal is to extract human-readable text 
# without needing to manipulate specific HTML elements.
""" from langchain_community.document_transformers import Html2TextTransformer

html2text = Html2TextTransformer()
web_docs2text = html2text.transform_documents(web_loader)
print(f"page contents: {web_docs2text[0].page_content[0:500]}")
print(f" meta data: {web_docs2text[0].metadata}")

for key, value in web_docs2text[0].metadata.items():
    print(f"\n{key}:{value}")
 """
#spliting Document object to smaller document objects 
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap=100, length_function=len, is_separator_regex=False)
web_page_loader_docs = text_splitter.split_documents(web_page_loader)

#displaying Documents in the list of the documents
#print_list_of_documents(web_page_loader_docs)


### step 3: saving in data base

In [31]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI

#LLM modlel type - https://platform.openai.com/docs/models/gpt-4o-mini
llm = ChatOpenAI(model="gpt-4o-mini")
#Embedding model type
embdedding = OpenAIEmbeddings(model="text-embedding-3-large")
#setting up database store
vector_store = Chroma(collection_name="dpad_talk2data", embedding_function=embdedding, persist_directory="./chroma_langchain_db")

documents= web_page_loader_docs

In [33]:
# adding documents with unique ids of each
from uuid import uuid4

uuids = [str(uuid4()) for _ in range(len(documents))] # "_" can be "x", it does not matter. It is not used, anyway
print(uuids.__class__, len(uuids), uuids)

<class 'list'> 17 ['0d7062bf-6a55-4aa3-9fc7-3cf8b8b529ac', '7890ac3a-274e-4f8b-b770-58008e4b82d1', '8bbf2a1a-6fc6-4ebf-9ffb-0a8790cf84a2', '15448b60-4e2c-4dce-adc6-73455f216cea', 'dcc54eeb-2b00-4d4e-aa52-8b6fd6c8182f', 'a3006cb1-7486-403c-b0fd-4f6a64206cc1', 'e72c506e-a15f-41f0-8386-d121a34fab42', 'bbb78aa9-9233-4151-b1e8-105bbd64ee65', '9f6f1a2b-2089-485a-87db-9b47f9be109b', '289ab5cf-7d0a-4a07-a807-c2f194a19ed0', '0f24a7ad-8e46-462c-bc64-adb601c81a6b', '27d2c363-7908-4758-995a-f9c6867cb181', '2e9e92a0-86bb-43c9-b1b9-7cdf793d27c6', 'abe4a926-ef47-497a-8af8-4c52fcc34637', 'f144fdb9-582c-4430-b18b-7eb480f8f0ac', '0fc6b095-3c85-4562-9b7b-f8570df30ae3', 'ec37ff19-fdfe-4379-b7ae-814daa529095']


In [34]:
# step 2: Interacting - adding, storing, updating - playaround with the data
# storing data in chroma_langchain_db collection name is dpad_talk2data
# https://platform.openai.com/docs/models
#vector_store.from_documents(documents=documents,embedding=embdedding, ids=uuids)
vector_store.add_documents(documents=documents,embeddings=embdedding, ids=uuids)


['0d7062bf-6a55-4aa3-9fc7-3cf8b8b529ac',
 '7890ac3a-274e-4f8b-b770-58008e4b82d1',
 '8bbf2a1a-6fc6-4ebf-9ffb-0a8790cf84a2',
 '15448b60-4e2c-4dce-adc6-73455f216cea',
 'dcc54eeb-2b00-4d4e-aa52-8b6fd6c8182f',
 'a3006cb1-7486-403c-b0fd-4f6a64206cc1',
 'e72c506e-a15f-41f0-8386-d121a34fab42',
 'bbb78aa9-9233-4151-b1e8-105bbd64ee65',
 '9f6f1a2b-2089-485a-87db-9b47f9be109b',
 '289ab5cf-7d0a-4a07-a807-c2f194a19ed0',
 '0f24a7ad-8e46-462c-bc64-adb601c81a6b',
 '27d2c363-7908-4758-995a-f9c6867cb181',
 '2e9e92a0-86bb-43c9-b1b9-7cdf793d27c6',
 'abe4a926-ef47-497a-8af8-4c52fcc34637',
 'f144fdb9-582c-4430-b18b-7eb480f8f0ac',
 '0fc6b095-3c85-4562-9b7b-f8570df30ae3',
 'ec37ff19-fdfe-4379-b7ae-814daa529095']

In [35]:
retriever = vector_store.as_retriever()

In [37]:
# 2. Incorporate the retriever into a question-answering chain.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)
from langchain_core.prompts import ChatPromptTemplate
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [38]:
response = rag_chain.invoke({"input": "What is Task Decomposition?"})
response["answer"]
#print(response)

"I don't know."

In [40]:
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder

contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)

In [41]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)


question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [42]:
from langchain_core.messages import AIMessage, HumanMessage

chat_history = []

question = "provide me the list of parameters of breast cancer prognosis?"
ai_msg_1 = rag_chain.invoke({"input": question, "chat_history": chat_history})
print(ai_msg_1["answer"])
chat_history.extend(
    [
        HumanMessage(content=question),
        AIMessage(content=ai_msg_1["answer"]),
    ]
)

second_question = "Tell me more of R and how can we identify it?"
ai_msg_2 = rag_chain.invoke({"input": second_question, "chat_history": chat_history})

print(ai_msg_2["answer"])

The Breast Cancer Wisconsin (Prognostic) dataset contains 33 features, but specific parameters are not listed in the provided context. Generally, these parameters may include clinical measurements, tumor characteristics, and patient demographics. For an accurate list of parameters, you would need to refer to the dataset directly on the UCI Machine Learning Repository.
In the context of the provided information, "R" stands for recurrent cases of breast cancer. Recurrent cases can typically be identified through the analysis of clinical data, including the patient's history of tumor recurrence, time to recur, and other clinical features that may indicate the likelihood of recurrence. Specific methods, such as the Recurrence Surface Approximation (RSA) model, help predict time to recur using both recurrent and nonrecurrent cases.


In [43]:
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    if session_id not in store:
        store[session_id] = ChatMessageHistory()
    return store[session_id]


conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

In [44]:
conversational_rag_chain.invoke(
    {"input": "What is Task Decomposition?"},
    config={
        "configurable": {"session_id": "abc123"}
    },  # constructs a key "abc123" in `store`.
)["answer"]

"I don't know."

In [45]:
conversational_rag_chain.invoke(
    {"input": "What are common ways of doing it?"},
    config={"configurable": {"session_id": "abc123"}},
)["answer"]

"I don't know."

In [46]:
for message in store["abc123"].messages:
    if isinstance(message, AIMessage):
        prefix = "AI"
    else:
        prefix = "User"

    print(f"{prefix}: {message.content}\n")

User: What is Task Decomposition?

AI: I don't know.

User: What are common ways of doing it?

AI: I don't know.



In [47]:
print(type(response))
for key, val in response.items():
    print(key, val.__class__, val)
    if isinstance(val, list):
        for list_val in val:
            print(f"\nYES {list_val}\n")

<class 'dict'>
input <class 'str'> What is Task Decomposition?
context <class 'list'> [Document(metadata={'description': 'Discover datasets around the world!', 'language': 'en', 'source': 'https://archive.ics.uci.edu/dataset/16/breast+cancer+wisconsin+prognostic', 'title': 'UCI Machine Learning Repository'}, page_content='The separation described above was obtained using Multisurface Method-Tree (MSM-T) [K. P. Bennett, "Decision Tree Construction Via Linear Programming." Proceedings of the 4th Midwest Artificial Intelligence and Cognitive Science Society, pp. 97-101, 1992], a classification method which uses linear programming to construct a decision tree.  Relevant features were selected using an exhaustive search in the space of 1-4 features and 1-3 separating planes.'), Document(metadata={'description': 'Discover datasets around the world!', 'language': 'en', 'source': 'https://archive.ics.uci.edu/dataset/16/breast+cancer+wisconsin+prognostic', 'title': 'UCI Machine Learning Reposit

In [48]:
response = rag_chain.invoke({"input": "provide me a list of parameters used in breast cancer machine learning?"})
response["answer"]

KeyError: "Input to ChatPromptTemplate is missing variables {'chat_history'}.  Expected: ['chat_history', 'context', 'input'] Received: ['input', 'context']"

In [49]:
response = rag_chain.invoke({"input": "what was observed at the time of surgery?"})
response["answer"]

KeyError: "Input to ChatPromptTemplate is missing variables {'chat_history'}.  Expected: ['chat_history', 'context', 'input'] Received: ['input', 'context']"

In [50]:
response = rag_chain.invoke({"input": "list and explain class labels?"})
response["answer"]

KeyError: "Input to ChatPromptTemplate is missing variables {'chat_history'}.  Expected: ['chat_history', 'context', 'input'] Received: ['input', 'context']"

In [51]:
results = vector_store.similarity_search(query="discover data set",k=7)
for doc in results:
    print(f"* {doc.page_content} [{doc.metadata}]")

* Features 33   Dataset Information    Additional Information  Each record represents follow-up data for one breast cancer case.  These are consecutive patients seen by Dr. Wolberg since 1984, and include only those cases exhibiting invasive breast cancer and no evidence of distant metastases at the time of diagnosis. [{'description': 'Discover datasets around the world!', 'language': 'en', 'source': 'https://archive.ics.uci.edu/dataset/16/breast+cancer+wisconsin+prognostic', 'title': 'UCI Machine Learning Repository'}]
* Features 33   Dataset Information    Additional Information  Each record represents follow-up data for one breast cancer case.  These are consecutive patients seen by Dr. Wolberg since 1984, and include only those cases exhibiting invasive breast cancer and no evidence of distant metastases at the time of diagnosis. [{'description': 'Discover datasets around the world!', 'language': 'en', 'source': 'https://archive.ics.uci.edu/dataset/16/breast+cancer+wisconsin+progno

In [52]:
results = vector_store.similarity_search_with_score(query="discover data set",k=7)
for doc, score in results:
    print(f"* [SIM={score:3f}] {doc.page_content} [{doc.metadata}]")

* [SIM=1.108000] Features 33   Dataset Information    Additional Information  Each record represents follow-up data for one breast cancer case.  These are consecutive patients seen by Dr. Wolberg since 1984, and include only those cases exhibiting invasive breast cancer and no evidence of distant metastases at the time of diagnosis. [{'description': 'Discover datasets around the world!', 'language': 'en', 'source': 'https://archive.ics.uci.edu/dataset/16/breast+cancer+wisconsin+prognostic', 'title': 'UCI Machine Learning Repository'}]
* [SIM=1.108121] Features 33   Dataset Information    Additional Information  Each record represents follow-up data for one breast cancer case.  These are consecutive patients seen by Dr. Wolberg since 1984, and include only those cases exhibiting invasive breast cancer and no evidence of distant metastases at the time of diagnosis. [{'description': 'Discover datasets around the world!', 'language': 'en', 'source': 'https://archive.ics.uci.edu/dataset/16/

In [53]:
vector_store.reset_collection()

PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: './chroma_langchain_db\\768f2bb9-825a-4915-b724-62f6ecc2bee7\\data_level0.bin'

In [54]:
all_data = vector_store._collection.get()
all_ids = all_data['ids']
print(all_data['ids'])
print(len(all_data['ids']))
# retrieved_documents = vector_store.get_by_ids(all_ids)
# print(retrieved_documents)

['04bb801b-29f1-4ce0-916b-0f127d7ebb1b', '0d7062bf-6a55-4aa3-9fc7-3cf8b8b529ac', '0f24a7ad-8e46-462c-bc64-adb601c81a6b', '0fc6b095-3c85-4562-9b7b-f8570df30ae3', '0ffe75a1-ce7f-44fb-8841-7485dc04f4fe', '12fef564-7c05-4dac-8157-299ef08abc92', '1353e61a-dbb7-4b11-8efd-f68701d06b01', '15448b60-4e2c-4dce-adc6-73455f216cea', '171a5050-f23a-47af-839e-64f42e6b2c31', '1a1b8804-4072-4cd6-99a3-45fd3d9d3d74', '27d2c363-7908-4758-995a-f9c6867cb181', '289ab5cf-7d0a-4a07-a807-c2f194a19ed0', '2e9e92a0-86bb-43c9-b1b9-7cdf793d27c6', '33003111-bdd5-44b1-bfba-648528d11da2', '37813ae9-ae15-4aaa-84a3-347523d66767', '378e19c7-af07-4a72-98e0-c073b61c1415', '4601b115-995d-452f-a123-2fb2e028f66e', '53526b6b-3e78-4314-afc8-8b2a432f2d0d', '5683a8f6-18f3-438a-8782-1f3a196bd708', '584a3a90-0c15-4f58-9ceb-b269beed8e85', '5e4905ae-e0d7-4f8b-a09c-21ad137b926d', '616b620a-6049-4502-b7de-a020b2134f5e', '6753899e-9829-4d89-bc50-36e0f2930901', '690af81f-be0a-4ca8-817f-0483f39e2ab5', '71c7c5b5-ef1b-489a-88ae-c3ca936f5f2a',

In [55]:
#vector_store.delete_collection()
#vector_store.reset_collection()
def print_vector_stores(uuids, vector_store):
    counter = 0
    for x in range(len(uuids)):
        counter += 1
        record = vector_store.get(uuids[x])
        print(counter, record)
        for key, val in record.items():
            print(f"\nkey: {key} ; value: {val}")
        print(f"\n print records / total records: {counter} / {len(uuids)}\n")
    couner = 0
    #vector_store.delete_collection()

In [56]:
print_vector_stores(uuids, vector_store)
# ??add_document() suppose to keep adding/appending data with uuids but it is not happening. This needs to explored
# embeddings value is None - This should be 3072/1536 size vector embedding - this needs to explore as well

1 {'ids': ['0d7062bf-6a55-4aa3-9fc7-3cf8b8b529ac'], 'embeddings': None, 'metadatas': [{'description': 'Discover datasets around the world!', 'language': 'en', 'source': 'https://archive.ics.uci.edu/dataset/16/breast+cancer+wisconsin+prognostic', 'title': 'UCI Machine Learning Repository'}], 'documents': ['UCI Machine Learning Repository\n\n\n\n\nBreast Cancer Wisconsin (Prognostic) - UCI Machine Learning Repository'], 'uris': None, 'data': None, 'included': ['metadatas', 'documents']}

key: ids ; value: ['0d7062bf-6a55-4aa3-9fc7-3cf8b8b529ac']

key: embeddings ; value: None

key: metadatas ; value: [{'description': 'Discover datasets around the world!', 'language': 'en', 'source': 'https://archive.ics.uci.edu/dataset/16/breast+cancer+wisconsin+prognostic', 'title': 'UCI Machine Learning Repository'}]

key: documents ; value: ['UCI Machine Learning Repository\n\n\n\n\nBreast Cancer Wisconsin (Prognostic) - UCI Machine Learning Repository']

key: uris ; value: None

key: data ; value: No

In [57]:
# Data source 2/4
#load the data from the 'data_info_table.csv' 
from langchain_community.document_loaders import CSVLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
#instantiate
data_info_table = CSVLoader(file_path='./data_info_table.csv',
                                   csv_args={
                                       'delimiter': ';',
                                       'quotechar': '"',
                                       'fieldnames': ['name', 'role', 'type', 'demographic', 'description', 'units', 'missing_values']
                                   })
data_info_table_loader = data_info_table.load()
""" for doc in data_info_table_docs:
    print(f"{doc.page_content[:10000]}")
    print(f"{doc.metadata}") """

#spliting Document object to smaller document objects 
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap=100, length_function=len, is_separator_regex=False)
original_data_loader_docs = text_splitter.split_documents(data_info_table_loader)
print(f"{original_data_loader_docs.__class__}")
counter=0
for data in data_info_table_loader:
    counter += 1
    print(f"\n\n{counter}{data.__class__}: {data}")
counter = 0

<class 'list'>


1<class 'langchain_core.documents.base.Document'>: page_content='name: name
role: role
type: type
demographic: demographic
description: description
units: units
missing_values: missing_values' metadata={'source': './data_info_table.csv', 'row': 0}


2<class 'langchain_core.documents.base.Document'>: page_content='name: ID
role: ID
type: Integer
demographic: NA
description: NA
units: NA
missing_values: no' metadata={'source': './data_info_table.csv', 'row': 1}


3<class 'langchain_core.documents.base.Document'>: page_content='name: Time
role: Feature
type: Integer
demographic: NA
description: NA
units: NA
missing_values: no' metadata={'source': './data_info_table.csv', 'row': 2}


4<class 'langchain_core.documents.base.Document'>: page_content='name: radius1
role: Feature
type: Continuous
demographic: NA
description: NA
units: NA
missing_values: no' metadata={'source': './data_info_table.csv', 'row': 3}


5<class 'langchain_core.documents.base.Document'>: page_content='

In [58]:
# Data source 3/4
#load the tables data from the 'original_data.csv'
#instantiate
original_data = CSVLoader(file_path='./original_data.csv',
                                   csv_args={
                                       'delimiter': ';',
                                       'quotechar': '"',
                                       'fieldnames': ['Time',
                                                      'radius1','texture1','perimeter1','area1','smoothness1','compactness1','concavity1','concave_points1','symmetry1','fractal_dimension1',
                                                      'radius2','texture2','perimeter2','area2','smoothness2','compactness2','concavity2','concave_points2','symmetry2','fractal_dimension2',
                                                      'radius3','texture3','perimeter3','area3','smoothness3','compactness3','concavity3','concave_points3','symmetry3','fractal_dimension3',
                                                      'tumor_size','lymph_node_status','Outcome']
                                   })
original_data_loader = original_data.load()
""" for doc in original_data_loader:
    print(f"{doc.page_content[:10000]}")
    print(f"{doc.metadata}") """

#spliting Document object to smaller document objects 
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap=100, length_function=len, is_separator_regex=False)
original_data_loader_docs = text_splitter.split_documents(original_data_loader)
counter=0
for data in original_data_loader_docs:
    counter += 1
    print(f"\n\n{counter}{data.__class__}: {data}")
counter = 0



1<class 'langchain_core.documents.base.Document'>: page_content='Time: Time
radius1: radius1
texture1: texture1
perimeter1: perimeter1
area1: area1
smoothness1: smoothness1
compactness1: compactness1
concavity1: concavity1
concave_points1: concave_points1
symmetry1: symmetry1
fractal_dimension1: fractal_dimension1
radius2: radius2
texture2: texture2
perimeter2: perimeter2
area2: area2
smoothness2: smoothness2
compactness2: compactness2
concavity2: concavity2
concave_points2: concave_points2
symmetry2: symmetry2
fractal_dimension2: fractal_dimension2' metadata={'source': './original_data.csv', 'row': 0}


2<class 'langchain_core.documents.base.Document'>: page_content='concave_points2: concave_points2
symmetry2: symmetry2
fractal_dimension2: fractal_dimension2
radius3: radius3
texture3: texture3
perimeter3: perimeter3
area3: area3
smoothness3: smoothness3
compactness3: compactness3
concavity3: concavity3
concave_points3: concave_points3
symmetry3: symmetry3
fractal_dimension3: fractal

In [59]:
# Data source 4/4
#load the statistics from the algorithms from 'mldata.txt'

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

#loading text file the output is a langchain Document object
mldata = TextLoader('./mldata.txt')
mldata_loader= mldata.load()
print(mldata_loader.__class__)
#spliting Document object to smaller document objects 
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap=100, length_function=len, is_separator_regex=False)
mldata_loader_docs = text_splitter.split_documents(mldata_loader)
counter=0
for data in mldata_loader_docs:
    counter += 1
    print(f"\n\n{counter}{data.__class__}: {data}")
counter = 0

<class 'list'>


1<class 'langchain_core.documents.base.Document'>: page_content='The project aims to develop a conversational AI system to interact with a breast cancer prognosis repository, 
generate explanations and predictions based on the data, and provide insights to healthcare professionals and patients. 
The project will develop a conversational interface that allows users to interact with the LLM model and ask questions 
about breast cancer prognosis. The interface will use natural language processing (NLP) techniques to understand' metadata={'source': './mldata.txt'}


2<class 'langchain_core.documents.base.Document'>: page_content='user input and generate responses based on the LLM model's predictions. This project will be hosted at umerai.com 
at later stages of project completion,for general use.' metadata={'source': './mldata.txt'}


3<class 'langchain_core.documents.base.Document'>: page_content='Machine Learning Data
Meta Data
uci_id: 16
name: Breast Cancer Wisconsin (P

### Step 3: Chroma Vector Store and ensure data integrity

In [60]:
# step 1: initializing
# https://python.langchain.com/v0.2/docs/integrations/vectorstores/chroma/
# https://python.langchain.com/v0.2/api_reference/community/vectorstores/langchain_community.vectorstores.chroma.Chroma.html#chroma
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

vector_store = Chroma(collection_name="dpad_talk2data", embedding_function=OpenAIEmbeddings(), persist_directory="./chroma_langchain_db")

# step 2: Interacting - adding, storing, updating - playaround with the data
# adding documents with unique ids of each
from uuid import uuid4
from langchain_core.documents import Document

documents= web_page_loader_docs + data_info_table_loader + original_data_loader_docs + mldata_loader_docs
uuids = [str(uuid4()) for _ in range(len(documents))] # "_" can be "x", it does not matter. It is not used, anyway


In [61]:
print(uuids.__class__, len(uuids))

<class 'list'> 462


In [62]:

# storing data in chroma_langchain_db collection name is dpad_talk2data
vector_store.add_documents(documents=documents, ids=uuids)

['dc42d43b-5c35-4dfe-924a-7cc0331576fd',
 '3bdcffa6-67a9-4887-b61d-c18daf955d46',
 '86c1f4ff-76e1-4a09-a2ac-7a7c4e470079',
 '46477c4e-a617-4c50-95dc-b141d6cf0dd1',
 'e7137d8f-8562-4640-a9c5-29b2b7fb6193',
 'e0fe5791-8560-4d9c-a0b7-f239b598b44a',
 '17e3a743-b922-439f-95fa-3b64dc92449a',
 '1fc63272-6547-4e3d-8df9-402eacb04e92',
 '79e1f3e8-0876-43bb-95ec-4796b5dd5530',
 '75b480a3-3cd2-45b1-b572-58e85d80f081',
 'f6fa04f1-6502-4cda-b196-344225937c38',
 '7a1733f4-8830-4d9a-bc4a-12581de7c733',
 'e03a2d62-8169-4e2c-9c2f-5b5fb6bfb8cf',
 'c4d57ccc-1731-4103-a7bb-aebc7b02a728',
 '7683041d-c86a-49af-a12d-31ac2df372f8',
 'df0d99ed-5a4f-483c-adb2-a448e3906ce6',
 '9738bb37-f7bf-414b-b7f2-5c2ff9858fb0',
 '984477cd-884d-4288-b205-b649e09c5bb8',
 'b6c7360f-afcd-4cb6-aa29-5ab8f22b8536',
 'b500a7bb-bb47-4026-8586-c0e165a23603',
 'f1d7f1e5-6475-4353-a4ad-a53c910d9a38',
 '1304a498-7ce5-4448-a13f-a54090e8d9f3',
 'c98ef236-f989-41ac-bf34-d759acc7869a',
 '246dc0e5-5958-45b4-ba1e-01ae7fb7d023',
 '5f59db5b-ee11-

In [63]:
counter = 0
for x in range(len(uuids)):
    counter += 1
    print(counter, vector_store.get(uuids[x]))

1 {'ids': ['dc42d43b-5c35-4dfe-924a-7cc0331576fd'], 'embeddings': None, 'metadatas': [{'description': 'Discover datasets around the world!', 'language': 'en', 'source': 'https://archive.ics.uci.edu/dataset/16/breast+cancer+wisconsin+prognostic', 'title': 'UCI Machine Learning Repository'}], 'documents': ['UCI Machine Learning Repository\n\n\n\n\nBreast Cancer Wisconsin (Prognostic) - UCI Machine Learning Repository'], 'uris': None, 'data': None, 'included': ['metadatas', 'documents']}
2 {'ids': ['3bdcffa6-67a9-4887-b61d-c18daf955d46'], 'embeddings': None, 'metadatas': [{'description': 'Discover datasets around the world!', 'language': 'en', 'source': 'https://archive.ics.uci.edu/dataset/16/breast+cancer+wisconsin+prognostic', 'title': 'UCI Machine Learning Repository'}], 'documents': ['Datasets Contribute Dataset Donate New Link External About Us Who We Are Citation Metadata Contact Information           Login       Breast Cancer Wisconsin (Prognostic)  Donated on 11/30/1995     Progno

In [64]:
counter = 0
for x in range(len(uuids)):
    counter += 1
    print(counter, vector_store.get(uuids[x]))
vector_store.delete_collection()

1 {'ids': ['dc42d43b-5c35-4dfe-924a-7cc0331576fd'], 'embeddings': None, 'metadatas': [{'description': 'Discover datasets around the world!', 'language': 'en', 'source': 'https://archive.ics.uci.edu/dataset/16/breast+cancer+wisconsin+prognostic', 'title': 'UCI Machine Learning Repository'}], 'documents': ['UCI Machine Learning Repository\n\n\n\n\nBreast Cancer Wisconsin (Prognostic) - UCI Machine Learning Repository'], 'uris': None, 'data': None, 'included': ['metadatas', 'documents']}
2 {'ids': ['3bdcffa6-67a9-4887-b61d-c18daf955d46'], 'embeddings': None, 'metadatas': [{'description': 'Discover datasets around the world!', 'language': 'en', 'source': 'https://archive.ics.uci.edu/dataset/16/breast+cancer+wisconsin+prognostic', 'title': 'UCI Machine Learning Repository'}], 'documents': ['Datasets Contribute Dataset Donate New Link External About Us Who We Are Citation Metadata Contact Information           Login       Breast Cancer Wisconsin (Prognostic)  Donated on 11/30/1995     Progno

In [65]:
from uuid import uuid4

from langchain_core.documents import Document

document_1 = Document(
    page_content="I had chocalate chip pancakes and scrambled eggs for breakfast this morning.",
    metadata={"source": "tweet"},
    id=1,
)

document_2 = Document(
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
    metadata={"source": "news"},
    id=2,
)

document_3 = Document(
    page_content="Building an exciting new project with LangChain - come check it out!",
    metadata={"source": "tweet"},
    id=3,
)

document_4 = Document(
    page_content="Robbers broke into the city bank and stole $1 million in cash.",
    metadata={"source": "news"},
    id=4,
)

document_5 = Document(
    page_content="Wow! That was an amazing movie. I can't wait to see it again.",
    metadata={"source": "tweet"},
    id=5,
)

document_6 = Document(
    page_content="Is the new iPhone worth the price? Read this review to find out.",
    metadata={"source": "website"},
    id=6,
)

document_7 = Document(
    page_content="The top 10 soccer players in the world right now.",
    metadata={"source": "website"},
    id=7,
)

document_8 = Document(
    page_content="LangGraph is the best framework for building stateful, agentic applications!",
    metadata={"source": "tweet"},
    id=8,
)

document_9 = Document(
    page_content="The stock market is down 500 points today due to fears of a recession.",
    metadata={"source": "news"},
    id=9,
)

document_10 = Document(
    page_content="I have a bad feeling I am going to get deleted :(",
    metadata={"source": "tweet"},
    id=10,
)

documents = [
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
    document_6,
    document_7,
    document_8,
    document_9,
    document_10,
]
uuids = [str(uuid4()) for _ in range(len(documents))]

vector_store.add_documents(documents=documents, ids=uuids)

ValueError: Chroma collection not initialized. Use `reset_collection` to re-create and initialize the collection. 

In [66]:
updated_document_1 = Document(
    page_content="I had chocalate chip pancakes and fried eggs for breakfast this morning.",
    metadata={"source": "tweet"},
    id=1,
)

updated_document_2 = Document(
    page_content="The weather forecast for tomorrow is sunny and warm, with a high of 82 degrees.",
    metadata={"source": "news"},
    id=2,
)

vector_store.update_document(document_id=uuids[0], document=updated_document_1)
# You can also update multiple documents at once
vector_store.update_documents(
    ids=uuids[:2], documents=[updated_document_1, updated_document_1]
)

ValueError: Chroma collection not initialized. Use `reset_collection` to re-create and initialize the collection. 

In [67]:
counter = 0
for x in range(len(uuids)):
    counter += 1
    print(counter, vector_store.get(uuids[x]))
    for key, val in vector_store.get(uuids[x]).items():
        print(f"\nkey: {key} ; value: {val}")
#vector_store.delete_collection()

ValueError: Chroma collection not initialized. Use `reset_collection` to re-create and initialize the collection. 

In [68]:
from langchain_openai import OpenAIEmbeddings

embeddings_model = OpenAIEmbeddings()

In [69]:
embeddings = embeddings_model.embed_documents(
    [
        "Hi there!",
        "Oh, hello!",
        "What's your name?",
        "My friends call me World",
        "Hello World!"
    ]
)
len(embeddings), len(embeddings[0])

(5, 1536)

In [70]:
embedded_query = embeddings_model.embed_query("What was the name mentioned in the conversation?")
embedded_query[:50]

[0.005377273540943861,
 -0.0006527779041789472,
 0.038980286568403244,
 -0.002967397216707468,
 -0.008834563195705414,
 0.0211923960596323,
 -0.017154492437839508,
 -0.0017368923872709274,
 -0.003005335107445717,
 -0.010418055579066277,
 0.022393209859728813,
 0.009157859720289707,
 0.003971925005316734,
 -0.00932280719280243,
 -0.010154140181839466,
 0.002856882754713297,
 0.0364203080534935,
 0.0043216124176979065,
 0.020572194829583168,
 -0.03235601261258125,
 -0.0032214156817644835,
 -0.0056642815470695496,
 0.0015241106739267707,
 0.02623317763209343,
 -0.011434128507971764,
 0.019648492336273193,
 0.02815975993871689,
 -0.01842128485441208,
 -0.002541834022849798,
 -0.016824597492814064,
 0.011434128507971764,
 0.0010465889936313033,
 -0.01475286390632391,
 0.006822209805250168,
 -0.05090925469994545,
 -0.003925739787518978,
 0.005565313156694174,
 -0.017867064103484154,
 0.02668183296918869,
 0.004661403596401215,
 0.022775888442993164,
 0.001755036530084908,
 -0.005403664894402

In [71]:
# Build a sample vectorDB
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load blog post
loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
data = loader.load()

# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
splits = text_splitter.split_documents(data)

# VectorDB
embedding = OpenAIEmbeddings(model="text-embedding-3-large")
vectordb = Chroma.from_documents(documents=splits, embedding=embedding)

In [72]:
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_openai import ChatOpenAI

question = "What are the approaches to Task Decomposition?"
llm = ChatOpenAI(temperature=0)
retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=vectordb.as_retriever(), llm=llm
)

In [73]:
# Set logging for the queries
import logging

logging.basicConfig()
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

In [74]:
unique_docs = retriever_from_llm.invoke(question)
len(unique_docs)

INFO:langchain.retrievers.multi_query:Generated queries: ['1. How can Task Decomposition be achieved through different methods?', '2. What strategies are commonly used for Task Decomposition?', '3. In what ways can Task Decomposition be approached and implemented effectively?']


4

In [75]:
from typing import List

from langchain_core.output_parsers import BaseOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field


# Output parser will split the LLM result into a list of queries
class LineListOutputParser(BaseOutputParser[List[str]]):
    """Output parser for a list of lines."""

    def parse(self, text: str) -> List[str]:
        lines = text.strip().split("\n")
        return list(filter(None, lines))  # Remove empty lines


output_parser = LineListOutputParser()

QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five 
    different versions of the given user question to retrieve relevant documents from a vector 
    database. By generating multiple perspectives on the user question, your goal is to help
    the user overcome some of the limitations of the distance-based similarity search. 
    Provide these alternative questions separated by newlines.
    Original question: {question}""",
)
llm = ChatOpenAI(temperature=0)

# Chain
llm_chain = QUERY_PROMPT | llm | output_parser

# Other inputs
question = "What are the approaches to Task Decomposition?"

In [76]:
# Run
retriever = MultiQueryRetriever(
    retriever=vectordb.as_retriever(), llm_chain=llm_chain, parser_key="lines"
)  # "lines" is the key (attribute name) of the parsed output

# Results
unique_docs = retriever.invoke("What does the course say about regression?")
len(unique_docs)

INFO:langchain.retrievers.multi_query:Generated queries: ['1. Can you provide insights from the course on regression analysis?', '2. How is regression discussed in the course material?', '3. What topics related to regression are covered in the course?', '4. What information does the course offer about regression techniques?', '5. In what ways does the course address the topic of regression?']


9