In [2]:
import os
from operator import itemgetter
import pandas as pd
from dotenv import load_dotenv
from datetime import datetime
from docx import Document
import PyPDF2
import nltk
from nltk.tokenize import sent_tokenize
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores.chroma import Chroma
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.memory import ConversationBufferMemory
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda
from langchain.schema import StrOutputParser
from langchain.storage import InMemoryStore
from langchain.retrievers import ParentDocumentRetriever
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryByteStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OpenAIEmbeddings
# from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
from langchain_community.document_loaders import PyPDFLoader

In [3]:
# Load variables from a .env file into the process environment (python-dotenv).
# NOTE(review): presumably this supplies OPENAI_API_KEY for the OpenAI clients
# created in later cells - confirm against the project's .env file.
load_dotenv()

# Directory scanned for source documents. The path is relative, so it is
# resolved against the kernel's current working directory.
DATASOURCE_DIR = "./datasource/"

In [6]:
def file_loader(file_path):
    """Return a LangChain document loader suited to ``file_path``'s extension.

    The extension check is case-insensitive, so ``report.PDF`` is handled the
    same way as ``report.pdf``.

    Args:
        file_path: Path of the file to load, as a string.

    Returns:
        An ``UnstructuredWordDocumentLoader`` for ``.docx``, a ``PyPDFLoader``
        for ``.pdf`` or a ``TextLoader`` for ``.txt``. ``None`` when the
        extension is none of those, so callers must handle ``None``.
    """
    # Compare against a lower-cased copy only; the loader still receives the
    # path exactly as it was given.
    lowered = file_path.lower()
    if lowered.endswith(".docx"):
        return UnstructuredWordDocumentLoader(file_path)
    if lowered.endswith(".pdf"):
        return PyPDFLoader(file_path)
    if lowered.endswith(".txt"):
        return TextLoader(file_path)
    # Unsupported file type.
    return None

In [9]:
# Build one loader per supported file in the data source directory.
# sorted() makes the load order (and therefore the chunk order) reproducible,
# because os.listdir returns entries in arbitrary order.
loaders = []
for f in sorted(os.listdir(DATASOURCE_DIR)):
    file_path = os.path.join(DATASOURCE_DIR, f)
    # Skip sub-directories and anything else that is not a regular file.
    if not os.path.isfile(file_path):
        continue
    loader = file_loader(file_path)
    # file_loader returns None for unsupported extensions; appending it would
    # make loader.load() below fail with AttributeError.
    if loader is None:
        print(f"Skipping unsupported file: {file_path}")
        continue
    loaders.append(loader)

# Load every supported file into one flat list of documents.
docs = []
for loader in loaders:
    docs.extend(loader.load())

# This text splitter is used to create the child documents (chunk_size=400).
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
texts = child_splitter.split_documents(docs)

In [16]:
from langchain.vectorstores.pgvector import PGVector
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

# Do not keep database credentials in the notebook. Read the full SQLAlchemy
# URL from the environment (load_dotenv() has already loaded .env), e.g.
#   PGVECTOR_CONNECTION_STRING=postgresql+psycopg2://<user>:<password>@localhost:5432/doctor_vector_db
# The password that used to be committed here should be rotated.
CONNECTION_STRING = os.environ.get("PGVECTOR_CONNECTION_STRING")
if not CONNECTION_STRING:
    raise RuntimeError(
        "PGVECTOR_CONNECTION_STRING is not set; define it in the environment "
        "or in the .env file before running this cell."
    )
COLLECTION_NAME = 'doctor_lib_vectors'

# Embed every chunk in `texts` and store the vectors in the named collection.
# NOTE(review): each run of this cell calls from_documents again, which
# presumably inserts the same chunks a second time - confirm, and consider
# passing pre_delete_collection=True if the index should be rebuilt from scratch.
db = PGVector.from_documents(
    embedding=embeddings,
    documents=texts,
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
)

In [17]:
# Expose the vector store as a retriever; no search arguments are passed, so
# the library defaults apply.
retriever = db.as_retriever()

In [18]:
# Example query used to sanity-check retrieval against the vector index.
query = "I speak portugese, I need to visit a Orthodontist"
# Retrievers are Runnables (this notebook already pipes one into an LCEL
# chain), so call invoke(); get_relevant_documents() is deprecated in recent
# langchain-core releases.
docs_retrieved = retriever.invoke(query)
docs_retrieved

[Document(page_content='optimal oral health and a harmonious smile. If the time slots are not suitable, please call the practice.  \nSpoken languages  \nEnglish, Spanish, French and Portuguese  \n \nWebsite  \nSee the site  \n________________________________________  \nNational and university diplomas', metadata={'source': './datasource/Dr Catherine Lasvergnas Buffet Orthodontist.pdf', 'page': 0}),
 Document(page_content="mouth and to make patients' smiles more harmonious. The practi ce offers treatments for children, \nadolescents and adults (orthopedic appliances, vestibular technique, lingual technique and aligner \ntreatments).  \nNational and university diplomas  \nState diploma of doctor in dental surgery - UFR of odontology Garancière - Paris -Cité University  \nOthers formations", metadata={'source': './datasource/Dr Lassaad BEN HAOUIA, Orthodontist.pdf', 'page': 0}),
 Document(page_content="Doctor Lassaad Ben Haouia welcomes you to his office in Poissy. He is a qualified speci

In [22]:
# Number of chunks the retriever returned for the query.
len(docs_retrieved)

4

In [19]:
# Inspect the first returned chunk (text plus metadata).
docs_retrieved[0]

Document(page_content='optimal oral health and a harmonious smile. If the time slots are not suitable, please call the practice.  \nSpoken languages  \nEnglish, Spanish, French and Portuguese  \n \nWebsite  \nSee the site  \n________________________________________  \nNational and university diplomas', metadata={'source': './datasource/Dr Catherine Lasvergnas Buffet Orthodontist.pdf', 'page': 0})

In [20]:
# Text of the first returned chunk only.
docs_retrieved[0].page_content

'optimal oral health and a harmonious smile. If the time slots are not suitable, please call the practice.  \nSpoken languages  \nEnglish, Spanish, French and Portuguese  \n \nWebsite  \nSee the site  \n________________________________________  \nNational and university diplomas'

In [21]:
# Path of the file the first returned chunk came from (its 'source' metadata
# entry). Raises IndexError if the retriever returned no documents.
file_url = docs_retrieved[0].metadata['source']
file_url

'./datasource/Dr Catherine Lasvergnas Buffet Orthodontist.pdf'

In [24]:
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

# Build a second, in-memory FAISS index over just the file selected above.
# Use the notebook's file_loader helper instead of hard-coding PyPDFLoader:
# the selected file may just as well be a .docx or .txt document.
loader = file_loader(file_url)
if loader is None:
    raise ValueError(f"Unsupported file type, cannot load: {file_url}")
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=0)
# NOTE: texts, embeddings, db and retriever are rebound here; earlier cells
# bound the same names to the PGVector pipeline, so re-running those cells
# after this one will operate on / overwrite these objects.
texts = text_splitter.split_documents(documents)
embeddings = OpenAIEmbeddings()
db = FAISS.from_documents(texts, embeddings)
retriever = db.as_retriever()

In [25]:
# Initialize the OpenAI chat model; temperature=0 is passed to reduce
# run-to-run variation in the answers.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Define the prompt template for the chatbot. {context} receives the formatted
# retrieved chunks and {question} the user's query.
prompt_template = """
You are an healthcare assistant chatbot. Based on the retrieved context, answer the question. 
Explain why the information in the context can and, or cannot answer the question. 
The response is in the following format:
Recommended doctor is:
Why it's a good choice:
What's missing:
Source is:
If you don't know the answer, just say that you don't know. 
Context: {context}
Question: {question}  
Answer:
"""

# Create the prompt template for the chatbot
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)


def _format_docs(docs):
    """Render retrieved documents as plain text for the prompt.

    Each chunk is preceded by its source path so the model can still fill in
    the "Source is:" field. Without this step the prompt would receive the
    Python repr of a list of Document objects (escaped newlines, constructor
    noise) instead of readable text.

    Args:
        docs: Documents returned by the retriever.

    Returns:
        A single string with one "Source: ..." header plus content per chunk,
        chunks separated by a blank line.
    """
    return "\n\n".join(
        f"Source: {doc.metadata.get('source', 'unknown')}\n{doc.page_content}"
        for doc in docs
    )


# Create the LangChain: the incoming query is sent to the retriever (whose
# results are formatted as text) and also passed through unchanged as the
# question; the filled prompt goes to the model and the reply is parsed to str.
rag_chain = (
    {
        "context": retriever | RunnableLambda(_format_docs),
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Same example query as the earlier retrieval check, now sent through the full chain.
query = "I speak portugese, I need to visit a Orthodontist"
# Runs retrieval, prompt filling, the model call and string parsing in one step.
llm_response = rag_chain.invoke(query)

In [28]:
# print() renders the line breaks in the multi-line answer.
print(llm_response)

Recommended doctor is: Dr Catherine Lasvergnas Buffet

Why it's a good choice: Dr Catherine Lasvergnas Buffet is an orthodontist who offers a full range of orthodontic treatments for children, adolescents, and adults. She is fluent in Portuguese, so she will be able to communicate with you effectively.

What's missing: The context does not provide information about the availability of appointments or whether Dr Catherine Lasvergnas Buffet is currently accepting new patients.

Source is: ./datasource/Dr Catherine Lasvergnas Buffet Orthodontist.pdf


In [39]:
# Check which loader file_loader picks for the selected file.
# NOTE(review): the recorded output below shows a TextLoader, yet file_url was
# last displayed as a .pdf path, for which file_loader as defined in this
# notebook returns a PyPDFLoader. The output looks stale (out-of-order
# execution or a since-edited function) - restart the kernel and re-run to confirm.
file_loader(file_url)

<langchain_community.document_loaders.text.TextLoader at 0x16793f21c90>