In [233]:
"""
Copyright 2024 USF Australia

Licensed under the MIT License

Author: Umer Imtiaz - umer.imiaz@gmail.com
Date: 24/8/2024
Version: 0.00
Code type: Alpha

This file contains the implementation of talk2 algorithm
work with memory and conversation. This will be hosted on the
umerai.com website as a product demo.

reference is the link below
https://python.langchain.com/v0.2/docs/tutorials/qa_chat_history/
"""

'\nCopyright 2024 USF Australia\n\nLicensed under the MIT License\n\nAuthor: Umer Imtiaz - umer.imiaz@gmail.com\nDate: 24/8/2024\nVersion: 0.00\nCode type: Alpha\n\nThis file contains the implementation of talk2 algorithm\nwork with memory and conversation. This will be hosted on the\numerai.com website as a product demo.\n\nreference is the link below\nhttps://python.langchain.com/v0.2/docs/tutorials/qa_chat_history/\n'

### Step 1: Loading the keys from a "keys.env" file

In [234]:
# ensuring safe programming
try:
    with open('keys.env', 'r', encoding="utf-8") as keys_file:
        key_lines = keys_file.readlines()
        keys_dict = dict(line.strip().split('=') for line in key_lines)
except FileNotFoundError:
    print("Error: keys.config file not found")
    exit()
except Exception as e:
    print (f"Error: An error occured while reading keys.config file: {e}")
    exit()

#print(f"dict = {keys_dict}")

In [235]:
#open the config file to read openai and langchain keys

#loading values in the variables
try:
    openapi_key = keys_dict['openapi_key']
    # print(f"openapi_key={openapi_key}")
except KeyError:
    # print(f"Error: '{openapi_key}' not found in keys.config")
    print("Error: OPENAI key not found")

try:
    langchain_key = keys_dict['langchain_key']
    # print(f"langchain_key={langchain_key}")
except KeyError:
    # print(f"Error: '{langchain_key}' not found in keys.config")
    print("Error: langchain key not found - Langsmith traces will not log")



In [236]:
import openai
import os
from langchain_openai import ChatOpenAI

#setting the enviornment variables with the openai key values
os.environ['OPENAI_API_KEY'] = openapi_key
openai.api_key  = os.getenv('OPENAI_API_KEY')

""" print(os.environ['OPENAI_API_KEY']) """

" print(os.environ['OPENAI_API_KEY']) "

In [237]:
# setting the enviornment variables with langchain key values
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = langchain_key

""" print(os.environ['LANGCHAIN_API_KEY']) """

" print(os.environ['LANGCHAIN_API_KEY']) "

In [238]:
"""
This code takes a list of langchain Document objects and print. 
The document has two components page_contecnt->str and metadata->dict 
"""
def print_list_of_documents(loader_docs):
    counter=0
    print(loader_docs.__class__)
    for data in loader_docs:
        counter += 1
        print(f"\n\n{counter} {data.__class__}: {data}")
        ##
        if isinstance(data, dict): # if False this is a list only and not list of dictionaries
            for key, val in data.items():
                print(f"\nkey: {key} ; value: {val}")
    counter = 0

In [239]:
"""This displays the vector store information from a vector store using ids"""
def print_vector_stores(uuids, vector_store):
    counter = 0
    for x in range(len(uuids)):
        counter += 1
        record = vector_store.get(uuids[x])
        print(counter, record)
        for key, val in record.items():
            print(f"\nkey: {key} ; value: {val}")
        print(f"\n print records / total records: {counter} / {len(uuids)}\n")
    couner = 0

### Step 2: Load the data from the sources

In [240]:
# Data source 1/4
# load data from the website
# reference : https://python.langchain.com/v0.1/docs/use_cases/web_scraping/

from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
# important variables
url_uci = "https://archive.ics.uci.edu/dataset/16/breast+cancer+wisconsin+prognostic"

#Step 2.1: Web page loading using a url or list of urls

web_page = WebBaseLoader(url_uci)
web_page_loader = web_page.load()

#spliting Document object to smaller document objects 
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap=100, length_function=len, is_separator_regex=False)
web_page_loader_docs = text_splitter.split_documents(web_page_loader)

#displaying Documents in the list of the documents
print_list_of_documents(web_page_loader_docs)


<class 'list'>


1 <class 'langchain_core.documents.base.Document'>: page_content='UCI Machine Learning Repository




Breast Cancer Wisconsin (Prognostic) - UCI Machine Learning Repository' metadata={'source': 'https://archive.ics.uci.edu/dataset/16/breast+cancer+wisconsin+prognostic', 'title': 'UCI Machine Learning Repository', 'description': 'Discover datasets around the world!', 'language': 'en'}


2 <class 'langchain_core.documents.base.Document'>: page_content='Datasets Contribute Dataset Donate New Link External About Us Who We Are Citation Metadata Contact Information           Login       Breast Cancer Wisconsin (Prognostic)  Donated on 11/30/1995     Prognostic Wisconsin Breast Cancer Database  Dataset Characteristics Multivariate Subject Area Health and Medicine Associated Tasks Classification, Regression Feature Type Real # Instances 198 # Features 33   Dataset Information    Additional Information  Each record represents follow-up data' metadata={'source': 'https://archive

In [241]:
# Data source 2/4
#load the data from the 'data_info_table.csv' 
from langchain_community.document_loaders import CSVLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
#instantiate
data_info_table = CSVLoader(file_path='./data_info_table.csv',
                                   csv_args={
                                       'delimiter': ';',
                                       'quotechar': '"',
                                       'fieldnames': ['name', 'role', 'type', 'demographic', 'description', 'units', 'missing_values']
                                   })
data_info_table_loader = data_info_table.load()
""" for doc in data_info_table_docs:
    print(f"{doc.page_content[:10000]}")
    print(f"{doc.metadata}") """

#spliting Document object to smaller document objects 
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap=100, length_function=len, is_separator_regex=False)
original_data_loader_docs_data_info_table = text_splitter.split_documents(data_info_table_loader)

print_list_of_documents(original_data_loader_docs_data_info_table)


<class 'list'>


1 <class 'langchain_core.documents.base.Document'>: page_content='name: name
role: role
type: type
demographic: demographic
description: description
units: units
missing_values: missing_values' metadata={'source': './data_info_table.csv', 'row': 0}


2 <class 'langchain_core.documents.base.Document'>: page_content='name: ID
role: ID
type: Integer
demographic: NA
description: NA
units: NA
missing_values: no' metadata={'source': './data_info_table.csv', 'row': 1}


3 <class 'langchain_core.documents.base.Document'>: page_content='name: Time
role: Feature
type: Integer
demographic: NA
description: NA
units: NA
missing_values: no' metadata={'source': './data_info_table.csv', 'row': 2}


4 <class 'langchain_core.documents.base.Document'>: page_content='name: radius1
role: Feature
type: Continuous
demographic: NA
description: NA
units: NA
missing_values: no' metadata={'source': './data_info_table.csv', 'row': 3}


5 <class 'langchain_core.documents.base.Document'>: page_cont

In [242]:
# Data source 3/4
#load the tables data from the 'original_data.csv'
#instantiate
original_data = CSVLoader(file_path='./original_data.csv',
                                   csv_args={
                                       'delimiter': ';',
                                       'quotechar': '"',
                                       'fieldnames': ['Time',
                                                      'radius1','texture1','perimeter1','area1','smoothness1','compactness1','concavity1','concave_points1','symmetry1','fractal_dimension1',
                                                      'radius2','texture2','perimeter2','area2','smoothness2','compactness2','concavity2','concave_points2','symmetry2','fractal_dimension2',
                                                      'radius3','texture3','perimeter3','area3','smoothness3','compactness3','concavity3','concave_points3','symmetry3','fractal_dimension3',
                                                      'tumor_size','lymph_node_status','Outcome']
                                   })
original_data_loader = original_data.load()
""" for doc in original_data_loader:
    print(f"{doc.page_content[:10000]}")
    print(f"{doc.metadata}") """

#spliting Document object to smaller document objects 
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap=100, length_function=len, is_separator_regex=False)
original_data_loader_docs = text_splitter.split_documents(original_data_loader)

print_list_of_documents(original_data_loader_docs)
""" counter=0
for data in original_data_loader_docs:
    counter += 1
    print(f"\n\n{counter}{data.__class__}: {data}")
counter = 0 """

<class 'list'>


1 <class 'langchain_core.documents.base.Document'>: page_content='Time: Time
radius1: radius1
texture1: texture1
perimeter1: perimeter1
area1: area1
smoothness1: smoothness1
compactness1: compactness1
concavity1: concavity1
concave_points1: concave_points1
symmetry1: symmetry1
fractal_dimension1: fractal_dimension1
radius2: radius2
texture2: texture2
perimeter2: perimeter2
area2: area2
smoothness2: smoothness2
compactness2: compactness2
concavity2: concavity2
concave_points2: concave_points2
symmetry2: symmetry2
fractal_dimension2: fractal_dimension2' metadata={'source': './original_data.csv', 'row': 0}


2 <class 'langchain_core.documents.base.Document'>: page_content='concave_points2: concave_points2
symmetry2: symmetry2
fractal_dimension2: fractal_dimension2
radius3: radius3
texture3: texture3
perimeter3: perimeter3
area3: area3
smoothness3: smoothness3
compactness3: compactness3
concavity3: concavity3
concave_points3: concave_points3
symmetry3: symmetry3
fractal_di

' counter=0\nfor data in original_data_loader_docs:\n    counter += 1\n    print(f"\n\n{counter}{data.__class__}: {data}")\ncounter = 0 '

In [243]:
# Data source 4/4
#load the statistics from the algorithms from 'mldata.txt'

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

#loading text file the output is a langchain Document object
mldata = TextLoader('./mldata.txt')
mldata_loader= mldata.load()
print(mldata_loader.__class__)
#spliting Document object to smaller document objects 
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap=100, length_function=len, is_separator_regex=False)
mldata_loader_docs = text_splitter.split_documents(mldata_loader)

print_list_of_documents(mldata_loader_docs)

<class 'list'>
<class 'list'>


1 <class 'langchain_core.documents.base.Document'>: page_content='The project aims to develop a conversational AI system to interact with a breast cancer prognosis repository, 
generate explanations and predictions based on the data, and provide insights to healthcare professionals and patients. 
The project will develop a conversational interface that allows users to interact with the LLM model and ask questions 
about breast cancer prognosis. The interface will use natural language processing (NLP) techniques to understand' metadata={'source': './mldata.txt'}


2 <class 'langchain_core.documents.base.Document'>: page_content='user input and generate responses based on the LLM model's predictions. This project will be hosted at umerai.com 
at later stages of project completion,for general use.' metadata={'source': './mldata.txt'}


3 <class 'langchain_core.documents.base.Document'>: page_content='Machine Learning Data
Meta Data
uci_id: 16
name: Breast C

### step 3: saving in a data base

In [244]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

#LLM modlel type - https://platform.openai.com/docs/models/gpt-4o-mini
llm = ChatOpenAI(model="gpt-4o-mini")
#Embedding model type
embdedding = OpenAIEmbeddings(model="text-embedding-3-large")

#setting up database store
vector_store = Chroma(collection_name="dpad_talk2data", embedding_function=embdedding, persist_directory="./chroma_langchain_db")

documents = web_page_loader_docs + original_data_loader_docs_data_info_table + original_data_loader_docs + mldata_loader_docs

In [245]:
# adding documents with unique ids of each
from uuid import uuid4

uuids = [str(uuid4()) for _ in range(len(documents))] # "_" can be "x", it does not matter. It is not used, anyway
print(uuids.__class__, len(uuids), uuids)

<class 'list'> 462 ['60808676-b240-4c81-8084-79d781c88207', 'f92ba0e8-a528-425b-bf0f-9d567a2f7abf', 'e4abffca-7802-43f0-9430-cad0138c25a7', '656eaab8-e09e-4c39-8edf-aab91aea5c59', 'a4ee8aa6-5fc5-4552-a6f6-d4554ef485d2', '3a3f23d8-7159-4bd3-8661-bd7372b265f7', '1aa3470f-bd2f-4200-96d4-4bd46e15fea0', '5e4faf08-5b99-409e-81c5-11aa155a9263', 'eb7f6100-173b-4781-b88e-70a4c549071d', 'd1e9af52-8d6f-49d0-99ec-9eb668a3f457', '188eb6f3-128d-4001-88d1-98f77ccf1bb0', '4b61639d-feab-4b33-8606-fb99744abdaf', '5eac7a66-6b55-402a-95a3-26289e22d805', '02115814-db06-4d1d-b4a9-d81e8d68e297', '7fff9b0f-432c-45a4-ade9-42e02a81c909', 'a10cf0c3-4657-46e5-adf1-800514715994', 'b03a14aa-781c-469f-8962-f310cd4c61ad', 'd187a18c-19ec-48c7-b311-931b84e2b7de', '9b333157-d5c5-4c1f-99b9-cdb2e0d4dd11', '613c3083-a1e9-4039-b943-c3e90b7a76ba', '8934c18b-c6e3-4094-8e22-53ef3b496b9b', 'fbb12050-d2c1-4e6c-8601-5d1d15063f44', 'c83dfdf8-6efd-4ff6-b500-5d478654dd54', '649870df-a70e-4f67-bbe4-687237a06dbf', '8bccd4eb-976a-49fa-

In [246]:
# step 2: Interacting - adding, storing, updating - playaround with the data
# storing data in chroma_langchain_db collection name is dpad_talk2data
# https://platform.openai.com/docs/models
#vector_store.from_documents(documents=documents,embedding=embdedding, ids=uuids)
uids_full = vector_store.add_documents(documents=documents,embeddings=embdedding, ids=uuids)


In [247]:
retriever = vector_store.as_retriever()

### Histroy aware retrieval and Chat Question Answer

In [248]:
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder

contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)
# print(type(history_aware_retriever))
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)


question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    if session_id not in store:
        store[session_id] = ChatMessageHistory()
    return store[session_id]


conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

### Coversation - Talk2Data

In [249]:
conversational_rag_chain.invoke(
    {"input": "Provide me informaion about diameter parameter?"},
    config={
        "configurable": {"session_id": "abc123"}
    },  # constructs a key "abc123" in `store`.
)["answer"]

"The retrieved context does not provide information about the diameter parameter. Therefore, I don't know the answer."

In [250]:
conversational_rag_chain.invoke(
    {"input": "How does it is linked with outcome R recur?"},
    config={"configurable": {"session_id": "abc123"}},
)["answer"]

"The retrieved context does not specify how the diameter parameter is linked to the outcome R. Therefore, I don't know the answer."

In [251]:
conversational_rag_chain.invoke(
    {"input": "What other paramters are important in reoccurance?"},
    config={"configurable": {"session_id": "abc123"}},
)["answer"]

"The retrieved context mentions ten real-valued features computed for each cell nucleus, but it does not specify what those parameters are. Therefore, I don't know the answer."

In [252]:
conversational_rag_chain.invoke(
    {"input": "Provide the unit of measurement of each?"},
    config={"configurable": {"session_id": "abc123"}},
)["answer"]

"The retrieved context does not provide information about the units of measurement for the parameters mentioned. Therefore, I don't know the answer."

In [253]:
conversational_rag_chain.invoke(
    {"input": "proivde the information of the confusion matrix?"},
    config={"configurable": {"session_id": "abc123"}},
)["answer"]

"The retrieved context does not contain information about the confusion matrix. Therefore, I don't know the answer."

In [254]:
for message in store["abc123"].messages:
    if isinstance(message, AIMessage):
        prefix = "AI"
    else:
        prefix = "User"

    print(f"{prefix}: {message.content}\n")

User: Provide me informaion about diameter parameter?

AI: The retrieved context does not provide information about the diameter parameter. Therefore, I don't know the answer.

User: How does it is linked with outcome R recur?

AI: The retrieved context does not specify how the diameter parameter is linked to the outcome R. Therefore, I don't know the answer.

User: What other paramters are important in reoccurance?

AI: The retrieved context mentions ten real-valued features computed for each cell nucleus, but it does not specify what those parameters are. Therefore, I don't know the answer.

User: Provide the unit of measurement of each?

AI: The retrieved context does not provide information about the units of measurement for the parameters mentioned. Therefore, I don't know the answer.

User: proivde the information of the confusion matrix?

AI: The retrieved context does not contain information about the confusion matrix. Therefore, I don't know the answer.



In [214]:
vector_store.reset_collection()