In [350]:
"""
Copyright 2024 USF Australia

Licensed under the MIT License

Author: Umer Imtiaz - umer.imiaz@gmail.com
Date: 24/8/2024
Version: 0.00
Code type: Alpha

This file contains the implementation of talk2 algorithm
work with memory and conversation. This will be hosted on the
umerai.com website as a product demo.

reference is the link below
https://python.langchain.com/v0.2/docs/tutorials/qa_chat_history/
"""

'\nCopyright 2024 USF Australia\n\nLicensed under the MIT License\n\nAuthor: Umer Imtiaz - umer.imiaz@gmail.com\nDate: 24/8/2024\nVersion: 0.00\nCode type: Alpha\n\nThis file contains the implementation of talk2 algorithm\nwork with memory and conversation. This will be hosted on the\numerai.com website as a product demo.\n\nreference is the link below\nhttps://python.langchain.com/v0.2/docs/tutorials/qa_chat_history/\n'

### Step 1: Loading the keys from a "keys.env" file

In [351]:
# ensuring safe programming
try:
    with open('keys.env', 'r', encoding="utf-8") as keys_file:
        key_lines = keys_file.readlines()
        keys_dict = dict(line.strip().split('=') for line in key_lines)
except FileNotFoundError:
    print("Error: keys.config file not found")
    exit()
except Exception as e:
    print (f"Error: An error occured while reading keys.config file: {e}")
    exit()

#print(f"dict = {keys_dict}")

In [352]:
#open the config file to read openai and langchain keys

#loading values in the variables
try:
    openapi_key = keys_dict['openapi_key']
    # print(f"openapi_key={openapi_key}")
except KeyError:
    # print(f"Error: '{openapi_key}' not found in keys.config")
    print("Error: OPENAI key not found")

try:
    langchain_key = keys_dict['langchain_key']
    # print(f"langchain_key={langchain_key}")
except KeyError:
    # print(f"Error: '{langchain_key}' not found in keys.config")
    print("Error: langchain key not found - Langsmith traces will not log")



In [353]:
import openai
import os
from langchain_openai import ChatOpenAI

#setting the enviornment variables with the openai key values
os.environ['OPENAI_API_KEY'] = openapi_key
openai.api_key  = os.getenv('OPENAI_API_KEY')

""" print(os.environ['OPENAI_API_KEY']) """

" print(os.environ['OPENAI_API_KEY']) "

In [354]:
# setting the enviornment variables with langchain key values
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = langchain_key

""" print(os.environ['LANGCHAIN_API_KEY']) """

" print(os.environ['LANGCHAIN_API_KEY']) "

In [355]:
"""
This code takes a list of langchain Document objects and print. 
The document has two components page_contecnt->str and metadata->dict 
"""
def print_list_of_documents(loader_docs):
    counter=0
    print(loader_docs.__class__)
    for data in loader_docs:
        counter += 1
        print(f"\n\n{counter} {data.__class__}: {data}")
        ##
        if isinstance(data, dict): # if False this is a list only and not list of dictionaries
            for key, val in data.items():
                print(f"\nkey: {key} ; value: {val}")
    counter = 0

In [356]:
"""This displays the vector store information from a vector store using ids"""
def print_vector_stores(uuids, vector_store):
    counter = 0
    for x in range(len(uuids)):
        counter += 1
        record = vector_store.get(uuids[x])
        print(counter, record)
        for key, val in record.items():
            print(f"\nkey: {key} ; value: {val}")
        print(f"\n print records / total records: {counter} / {len(uuids)}\n")
    couner = 0

### Step 2: Load the data from the sources

In [357]:
# Data source 1/4
# load data from the website
# reference : https://python.langchain.com/v0.1/docs/use_cases/web_scraping/

from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
# important variables
url_uci = "https://archive.ics.uci.edu/dataset/16/breast+cancer+wisconsin+prognostic"

#Step 2.1: Web page loading using a url or list of urls

web_page = WebBaseLoader(url_uci)
web_page_loader = web_page.load()

#spliting Document object to smaller document objects 
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap=100, length_function=len, is_separator_regex=False)
web_page_loader_docs = text_splitter.split_documents(web_page_loader)

#displaying Documents in the list of the documents
print_list_of_documents(web_page_loader_docs)


<class 'list'>


1 <class 'langchain_core.documents.base.Document'>: page_content='UCI Machine Learning Repository




Breast Cancer Wisconsin (Prognostic) - UCI Machine Learning Repository' metadata={'source': 'https://archive.ics.uci.edu/dataset/16/breast+cancer+wisconsin+prognostic', 'title': 'UCI Machine Learning Repository', 'description': 'Discover datasets around the world!', 'language': 'en'}


2 <class 'langchain_core.documents.base.Document'>: page_content='Datasets Contribute Dataset Donate New Link External About Us Who We Are Citation Metadata Contact Information           Login       Breast Cancer Wisconsin (Prognostic)  Donated on 11/30/1995     Prognostic Wisconsin Breast Cancer Database  Dataset Characteristics Multivariate Subject Area Health and Medicine Associated Tasks Classification, Regression Feature Type Real # Instances 198 # Features 33   Dataset Information    Additional Information  Each record represents follow-up data' metadata={'source': 'https://archive

In [358]:
# Data source 2/4
#load the data from the 'data_info_table.csv' 
from langchain_community.document_loaders import CSVLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
#instantiate
data_info_table = CSVLoader(file_path='./data_info_table.csv',
                                   csv_args={
                                       'delimiter': ';',
                                       'quotechar': '"',
                                       'fieldnames': ['name', 'role', 'type', 'demographic', 'description', 'units', 'missing_values']
                                   })
data_info_table_loader = data_info_table.load()
""" for doc in data_info_table_docs:
    print(f"{doc.page_content[:10000]}")
    print(f"{doc.metadata}") """

#spliting Document object to smaller document objects 
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap=100, length_function=len, is_separator_regex=False)
original_data_loader_docs_data_info_table = text_splitter.split_documents(data_info_table_loader)

print_list_of_documents(original_data_loader_docs_data_info_table)


<class 'list'>


1 <class 'langchain_core.documents.base.Document'>: page_content='name: name
role: role
type: type
demographic: demographic
description: description
units: units
missing_values: missing_values' metadata={'source': './data_info_table.csv', 'row': 0}


2 <class 'langchain_core.documents.base.Document'>: page_content='name: ID
role: ID
type: Integer
demographic: NA
description: NA
units: NA
missing_values: no' metadata={'source': './data_info_table.csv', 'row': 1}


3 <class 'langchain_core.documents.base.Document'>: page_content='name: Time
role: Feature
type: Integer
demographic: NA
description: NA
units: NA
missing_values: no' metadata={'source': './data_info_table.csv', 'row': 2}


4 <class 'langchain_core.documents.base.Document'>: page_content='name: radius1
role: Feature
type: Continuous
demographic: NA
description: NA
units: NA
missing_values: no' metadata={'source': './data_info_table.csv', 'row': 3}


5 <class 'langchain_core.documents.base.Document'>: page_cont

In [359]:
# Data source 3/4
#load the tables data from the 'original_data.csv'
#instantiate
original_data = CSVLoader(file_path='./original_data.csv',
                                   csv_args={
                                       'delimiter': ';',
                                       'quotechar': '"',
                                       'fieldnames': ['Time',
                                                      'radius1','texture1','perimeter1','area1','smoothness1','compactness1','concavity1','concave_points1','symmetry1','fractal_dimension1',
                                                      'radius2','texture2','perimeter2','area2','smoothness2','compactness2','concavity2','concave_points2','symmetry2','fractal_dimension2',
                                                      'radius3','texture3','perimeter3','area3','smoothness3','compactness3','concavity3','concave_points3','symmetry3','fractal_dimension3',
                                                      'tumor_size','lymph_node_status','Outcome']
                                   })
original_data_loader = original_data.load()
""" for doc in original_data_loader:
    print(f"{doc.page_content[:10000]}")
    print(f"{doc.metadata}") """

#spliting Document object to smaller document objects 
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap=100, length_function=len, is_separator_regex=False)
original_data_loader_docs = text_splitter.split_documents(original_data_loader)

print_list_of_documents(original_data_loader_docs)
""" counter=0
for data in original_data_loader_docs:
    counter += 1
    print(f"\n\n{counter}{data.__class__}: {data}")
counter = 0 """

<class 'list'>


1 <class 'langchain_core.documents.base.Document'>: page_content='Time: Time
radius1: radius1
texture1: texture1
perimeter1: perimeter1
area1: area1
smoothness1: smoothness1
compactness1: compactness1
concavity1: concavity1
concave_points1: concave_points1
symmetry1: symmetry1
fractal_dimension1: fractal_dimension1
radius2: radius2
texture2: texture2
perimeter2: perimeter2
area2: area2
smoothness2: smoothness2
compactness2: compactness2
concavity2: concavity2
concave_points2: concave_points2
symmetry2: symmetry2
fractal_dimension2: fractal_dimension2' metadata={'source': './original_data.csv', 'row': 0}


2 <class 'langchain_core.documents.base.Document'>: page_content='concave_points2: concave_points2
symmetry2: symmetry2
fractal_dimension2: fractal_dimension2
radius3: radius3
texture3: texture3
perimeter3: perimeter3
area3: area3
smoothness3: smoothness3
compactness3: compactness3
concavity3: concavity3
concave_points3: concave_points3
symmetry3: symmetry3
fractal_di

' counter=0\nfor data in original_data_loader_docs:\n    counter += 1\n    print(f"\n\n{counter}{data.__class__}: {data}")\ncounter = 0 '

In [360]:
# Data source 4/4
#load the statistics from the algorithms from 'mldata.txt'

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

#loading text file the output is a langchain Document object
mldata = TextLoader('./mldata.txt')
mldata_loader= mldata.load()
print(mldata_loader.__class__)
#spliting Document object to smaller document objects 
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap=100, length_function=len, is_separator_regex=False)
mldata_loader_docs = text_splitter.split_documents(mldata_loader)

print_list_of_documents(mldata_loader_docs)

<class 'list'>
<class 'list'>


1 <class 'langchain_core.documents.base.Document'>: page_content='Project information Data
The project aims to develop a conversational AI system to interact with a breast cancer prognosis repository,
generate explanations and predictions based on the data, and provide insights to healthcare professionals and patients.
The project will develop a conversational interface that allows users to interact with the LLM model and ask questions
about breast cancer prognosis. The interface will use natural language processing (NLP) techniques to understand' metadata={'source': './mldata.txt'}


2 <class 'langchain_core.documents.base.Document'>: page_content='user input and generate responses based on the LLM model's predictions. This project will be hosted at umerai.com
at later stages of project completion,for general use.' metadata={'source': './mldata.txt'}


3 <class 'langchain_core.documents.base.Document'>: page_content='Machine Learning Data
Meta Data
uci_

### step 3: saving in a data base

In [361]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

#LLM modlel type - https://platform.openai.com/docs/models/gpt-4o-mini
llm = ChatOpenAI(model="gpt-4o-mini")
#Embedding model type
embdedding = OpenAIEmbeddings(model="text-embedding-3-large")

#setting up database store
vector_store = Chroma(collection_name="dpad_talk2data", embedding_function=embdedding, persist_directory="./chroma_langchain_db")

documents = web_page_loader_docs + original_data_loader_docs_data_info_table + original_data_loader_docs + mldata_loader_docs

In [362]:
# adding documents with unique ids of each
from uuid import uuid4

uuids = [str(uuid4()) for _ in range(len(documents))] # "_" can be "x", it does not matter. It is not used, anyway
print(uuids.__class__, len(uuids), uuids)

<class 'list'> 497 ['f632aeb7-29ab-44f0-9736-6da1c1a44458', 'f2086f42-b2ec-4c6e-9a0e-47477826ed8f', '82f922a6-31a0-40e1-b6c4-bb70dadb2e6b', 'ae22b0a0-e536-49b2-aa85-78af93f38a84', '094b869a-b143-4737-b718-16c186f3e6da', 'fbfeda3b-afad-4459-b25b-e14f1d1404e4', 'd1cf43f3-16aa-49fc-a93b-2cd558351ce8', '23a99247-1e00-4b68-960b-6107de7694cd', '6c32d75b-9f05-4fa0-b0dc-a4dbfadd7000', '8314ea66-a5f0-491d-bd15-49e31b9deae4', 'aba1ef98-74fb-451b-bac3-3a3bafa2df86', '1a540bfd-5ce8-49f7-b069-b2bcd4f4bb92', '0ba03a36-49ff-410a-b424-49c3c82e8d58', '91336fd8-19bd-48d1-a852-db9724f8004a', '46df0249-793a-4cd9-824c-042d67c2f2ed', '1e5b20e2-e0ba-4b44-8bb7-430309ebe494', '12b6741f-bbbf-430e-94fc-f50ff0a6a802', '78b16b58-6135-492a-a716-ebf0e5fb37e6', '1b28c613-7971-48ff-9fb2-8e5bfe7f7c7e', 'c0f12268-40ef-4242-9e5c-6a9e78aa48a7', '43110cb4-6805-446c-9edc-3931e66ea850', '4245f619-712d-4c4e-aed8-bb14d2b19a8b', 'cafb100f-d9e4-413c-a31d-cf5dde738f1b', 'b3d849ac-b43e-4d3a-9309-47b85353a4f5', '1b5771d6-8536-4e3f-

In [363]:
# step 2: Interacting - adding, storing, updating - playaround with the data
# storing data in chroma_langchain_db collection name is dpad_talk2data
# https://platform.openai.com/docs/models
#vector_store.from_documents(documents=documents,embedding=embdedding, ids=uuids)
uids_full = vector_store.add_documents(documents=documents,embeddings=embdedding, ids=uuids)


In [364]:
retriever = vector_store.as_retriever()

### Histroy aware retrieval and Chat Question Answer

In [365]:
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder

contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)
# print(type(history_aware_retriever))
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)


question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    if session_id not in store:
        store[session_id] = ChatMessageHistory()
    return store[session_id]


conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

### Coversation - Talk2Data

In [366]:
conversational_rag_chain.invoke(
    {"input": "Provide me informaion about diameter parameter?"},
    config={
        "configurable": {"session_id": "abc123"}
    },  # constructs a key "abc123" in `store`.
)["answer"]

'The diameter parameter is typically calculated as twice the radius of a shape. In the context of the provided data, the diameter can be derived from the radius values by multiplying each radius by two. However, specific diameter values are not directly provided in the retrieved context.'

In [367]:
conversational_rag_chain.invoke(
    {"input": "How does it is linked with outcome R recur?"},
    config={"configurable": {"session_id": "abc123"}},
)["answer"]

'The outcome "R" likely indicates a recurrence of a tumor or a specific classification related to tumor behavior. The parameters such as radius, texture, perimeter, and others may correlate with tumor characteristics that influence recurrence risk. However, without specific analysis or data linking these parameters to outcome "R," the exact relationship cannot be determined.'

In [368]:
conversational_rag_chain.invoke(
    {"input": "What other paramters are important in reoccurance?"},
    config={"configurable": {"session_id": "abc123"}},
)["answer"]

'Important parameters that may influence recurrence include tumor size, lymph node status, and various features of the cell nuclei, such as texture, perimeter, area, smoothness, compactness, concavity, concave points, symmetry, and fractal dimension. These factors can provide insights into the aggressiveness and behavior of the tumor. Each of these features can be analyzed to assess the risk of recurrence in patients.'

In [369]:
conversational_rag_chain.invoke(
    {"input": "Provide the unit of measurement of each?"},
    config={"configurable": {"session_id": "abc123"}},
)["answer"]

'The units of measurement for the parameters typically involved in tumor assessment are as follows:\n\na) Radius - typically measured in micrometers (µm) or millimeters (mm)  \nb) Texture - usually a unitless value (standard deviation of gray-scale values)  \nc) Perimeter - measured in micrometers (µm) or millimeters (mm)  \nd) Area - measured in square micrometers (µm²) or square millimeters (mm²)  \ne) Smoothness - unitless value  \nf) Compactness - unitless value  \ng) Concavity - unitless value  \nh) Concave points - unitless value (count)  \ni) Symmetry - unitless value  \nj) Fractal dimension - unitless value  \n\nNote that the exact units may vary based on specific studies or methodologies used.'

In [370]:
conversational_rag_chain.invoke(
    {"input": "proivde the information of the confusion matrix?"},
    config={"configurable": {"session_id": "abc123"}},
)["answer"]

"A confusion matrix is a table used to evaluate the performance of a classification model by comparing predicted classifications to actual outcomes. It typically contains four key components: \n\n1. True Positives (TP): The number of correct positive predictions.\n2. True Negatives (TN): The number of correct negative predictions.\n3. False Positives (FP): The number of incorrect positive predictions (Type I error).\n4. False Negatives (FN): The number of incorrect negative predictions (Type II error).\n\nFrom these values, various performance metrics, such as accuracy, precision, recall, and F1-score, can be derived to assess the model's effectiveness."

In [371]:
conversational_rag_chain.invoke(
    {"input": "When was this data updated? Also tell me source of your answer?"},
    config={"configurable": {"session_id": "abc123"}},
)["answer"]

'The data was last updated on Sunday, January 14, 2024. This information is sourced from the retrieved context provided.'

In [372]:
conversational_rag_chain.invoke(
    {"input": "Provide me anythin about KNN? Was KNN model loading was succesfull? Also provide me its parameter used. Write each in a speparate paragraph"},
    config={"configurable": {"session_id": "abc123"}},
)["answer"]

'KNN, or K-Nearest Neighbors, is a simple, non-parametric classification algorithm that classifies data points based on the classes of their nearest neighbors in the feature space. It is commonly used for classification and regression tasks and relies on distance metrics to determine the closest data points.\n\nYes, the KNN model loading was successful as indicated by the message "Working Successful >>>> KNN."\n\nThe parameters used for the KNeighborsClassifier are: \n{\'algorithm\': \'auto\', \'leaf_size\': 30, \'metric\': \'minkowski\', \'metric_params\': None, \'n_jobs\': None, \'n_neighbors\': 5, \'p\': 2, \'weights\': \'uniform\'}.'

In [373]:
conversational_rag_chain.invoke(
    {"input": "Provide me confusion matrix score of it. Whcih scroe is largest"},
    config={"configurable": {"session_id": "abc123"}},
)["answer"]

'The confusion matrix score for KNN is as follows:\n\n```\n[[27  5]\n [ 4  4]]\n```\n\nIn this matrix, the largest score is 27, which represents the True Positives (TP).'

In [374]:
conversational_rag_chain.invoke(
    {"input": "Provide and explain confusion matrix of all classifiers. Exaplina the results of each. Compare true positives of the classifiers. Discuss in less than 300 words"},
    config={"configurable": {"session_id": "abc123"}},
)["answer"]

'Here are the confusion matrices for each classifier along with their explanations:\n\n1. **Decision Tree Classifier**:\n   ```\n   [[28  4]\n    [ 4  4]]\n   ```\n   - True Positives (TP): 28, False Positives (FP): 4, False Negatives (FN): 4, True Negatives (TN): 4. The model correctly identified 28 positive cases, showing a decent performance but with 4 false negatives indicating some missed positive cases.\n\n2. **Random Forest Classifier**:\n   ```\n   [[31  1]\n    [ 5  3]]\n   ```\n   - TP: 31, FP: 1, FN: 5, TN: 3. This model achieved the highest true positive count of 31 and a low false positive count of 1, indicating strong predictive accuracy and fewer misclassifications.\n\n3. **KNN Classifier**:\n   ```\n   [[27  5]\n    [ 4  4]]\n   ```\n   - TP: 27, FP: 5, FN: 4, TN: 4. The KNN model had 27 true positives, which is slightly lower than the Decision Tree but higher than the number of false negatives, suggesting a balanced performance.\n\n4. **Logistic Regression**:\n   ```\n

In [375]:
for message in store["abc123"].messages:
    if isinstance(message, AIMessage):
        prefix = "AI"
    else:
        prefix = "User"

    print(f"{prefix}: {message.content}\n")

User: Provide me informaion about diameter parameter?

AI: The diameter parameter is typically calculated as twice the radius of a shape. In the context of the provided data, the diameter can be derived from the radius values by multiplying each radius by two. However, specific diameter values are not directly provided in the retrieved context.

User: How does it is linked with outcome R recur?

AI: The outcome "R" likely indicates a recurrence of a tumor or a specific classification related to tumor behavior. The parameters such as radius, texture, perimeter, and others may correlate with tumor characteristics that influence recurrence risk. However, without specific analysis or data linking these parameters to outcome "R," the exact relationship cannot be determined.

User: What other paramters are important in reoccurance?

AI: Important parameters that may influence recurrence include tumor size, lymph node status, and various features of the cell nuclei, such as texture, perimeter

In [294]:
vector_store.reset_collection()