# Create a Q&A Chatbot with LangChain Project

### Set the OpenAI API Key as an Environment Variable

In [None]:
%load_ext dotenv
%dotenv

### Import the Libraries

In [None]:
from langchain_community.document_loaders.pdf import PyPDFLoader

from langchain_text_splitters import (MarkdownHeaderTextSplitter, 
                                      TokenTextSplitter)

from langchain_core.output_parsers.string import StrOutputParser
from langchain_core.messages import SystemMessage
from langchain_core.prompts import (PromptTemplate,
                                    HumanMessagePromptTemplate, 
                                    ChatPromptTemplate)
from langchain_core.runnables import (RunnablePassthrough, 
                                      RunnableLambda, 
                                      chain)

from langchain_openai import (ChatOpenAI, 
                              OpenAIEmbeddings)

from langchain_chroma.vectorstores import Chroma

### Load the Course Transcript

In [None]:
# Load the course transcript PDF. `docs_list` is a collection of documents;
# later cells read the `page_content` attribute of each one.
loader_pdf = PyPDFLoader("Introduction_to_Tableau.pdf")
docs_list = loader_pdf.load()

In [None]:
len(docs_list)

In [None]:
# Glue the text of every loaded document together (no separator) so the
# transcript can be split on its Markdown headers as one string.
string_list_concat = "".join(page.page_content for page in docs_list)

In [None]:
string_list_concat

### Split the Course Transcript with MarkdownHeaderTextSplitter

In [None]:
# Split on Markdown headers: "#" headers are recorded under the
# "Section Title" metadata key and "##" headers under "Lecture Title"
# (format_context reads both keys further down).
md_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on = [("#", "Section Title"),
                           ("##", "Lecture Title")]
)

# Result: a list of documents, each exposing `page_content` and `metadata`.
docs_list_md_split = md_splitter.split_text(string_list_concat)

In [None]:
len(docs_list_md_split)

### Create a Chain to Correct the Course Transcript

In [None]:
# Keep only the raw text of each header-delimited document; this list is the
# input of the transcript-correction chain.
string_list_split = [document.page_content for document in docs_list_md_split]

In [None]:
string_list_split

In [None]:
# System prompt: instructs the model how to clean up a raw lecture transcript.
# (A stray double quote at the end of the last bullet was removed so it is no
# longer sent to the model as part of the instructions.)
PROMPT_FORMATTING_S = '''Improve the following Tableau lecture transcript by:
- Splitting the text into meaningful paragraphs
- Correcting any misplaced punctuation
- Fixing mistranscribed words (e.g., changing 'tableaux' to 'Tableau')
'''

# Human prompt template: `{lecture_transcript}` is replaced with the text of
# one transcript section.
PROMPT_TEMPLATE_FORMATTING_H = '''This is the transcript:
{lecture_transcript}
'''

In [None]:
# Fixed system message plus a human-message template carrying the
# `lecture_transcript` placeholder.
prompt_formatting_s = SystemMessage(content=PROMPT_FORMATTING_S)
prompt_template_formatting_h = HumanMessagePromptTemplate.from_template(template=PROMPT_TEMPLATE_FORMATTING_H)

# Chat prompt used as the first step of the transcript-correction chain.
chat_prompt_template_formatting = ChatPromptTemplate(messages=[prompt_formatting_s, 
                                                               prompt_template_formatting_h])

In [None]:
# Chat model shared by the transcript-correction chain and the Q&A chains.
# NOTE(review): temperature=0 together with a fixed seed is presumably meant
# to make responses as repeatable as possible — confirm this is the intent.
chat = ChatOpenAI(model_name='gpt-4o', 
                  seed=365,
                  temperature=0)

In [None]:
# Final step of every chain below; the chain results are treated as plain
# strings afterwards (printed, assigned to `page_content`).
str_output_parser = StrOutputParser()

In [None]:
# Transcript-correction chain: fill the prompt, call the chat model, then
# parse the reply with the string output parser.
chain_formatting = (chat_prompt_template_formatting 
                    | chat
                    | str_output_parser)

In [None]:
# Run the correction chain on every transcript section in one batch call.
# The results are later paired with `docs_list_md_split` by position.
string_list_formatted = chain_formatting.batch(string_list_split)

In [None]:
string_list_formatted

In [None]:
# Print each corrected transcript followed by a dashed divider.
for formatted_text in string_list_formatted:
    print(formatted_text, '''
-------------------
    ''', sep="\n")

In [None]:
# Replace the text of each split document with its corrected version,
# pairing documents and corrected strings by position.
for document, corrected_text in zip(docs_list_md_split, string_list_formatted):
    document.page_content = corrected_text

In [None]:
# Print the (now corrected) text of every document, each followed by a
# dashed divider, to verify the write-back above.
for document in docs_list_md_split:
    print(document.page_content, '''
-------------------
    ''', sep="\n")

In [None]:
len(docs_list_md_split)

### Split the Lectures with TokenTextSplitter

In [None]:
# Token-based splitter configured with the "cl100k_base" encoding,
# a chunk size of 500 and an overlap of 50 between consecutive chunks.
token_splitter = TokenTextSplitter(encoding_name="cl100k_base", 
                                   chunk_size=500, 
                                   chunk_overlap=50)

In [None]:
# Split the corrected lecture documents further with the token splitter;
# these chunks are what gets stored in the vector store.
docs_list_tokens_split = token_splitter.split_documents(docs_list_md_split)

In [None]:
len(docs_list_tokens_split)

### Create Embeddings, Vector Store, and Retriever

In [None]:
# Embedding model handed to the Chroma vector store as its embedding function.
embedding = OpenAIEmbeddings(model='text-embedding-3-small')

In [None]:
# Open the Chroma collection persisted under ./intro-to-tableau.
vectorstore = Chroma(persist_directory = "./intro-to-tableau", 
                     embedding_function = embedding)

# Populate the store only when it holds no entries yet. This replaces the
# previous "un-comment Chroma.from_documents on the first run" workflow:
# a fresh directory is filled automatically, and re-running the cell does not
# insert the same chunks a second time. When the store already has content the
# condition short-circuits, so `docs_list_tokens_split` is not needed at all.
if not vectorstore.get()["ids"]:
    vectorstore.add_documents(documents = docs_list_tokens_split)

In [None]:
len(vectorstore.get()["documents"])

In [None]:
# Retriever over the vector store using the 'mmr' search type with k=2.
# NOTE(review): per the LangChain docs, 'mmr' is maximal marginal relevance and
# `lambda_mult` balances relevance against diversity of the returned
# documents — confirm 0.7 is the intended trade-off.
retriever = vectorstore.as_retriever(search_type = 'mmr', 
                                     search_kwargs = {'k':2, 
                                                      'lambda_mult':0.7})

### Create Prompts and Prompt Templates for the Q&A Chatbot Chain

In [None]:
# Template that folds the three question fields (lecture, title, body) into a
# single text; that text is used both for retrieval and as the question
# shown to the model.
PROMPT_CREATING_QUESTION = '''Lecture: {question_lecture}
Title: {question_title}
Body: {question_body}'''

# System prompt for the Q&A chain: answer only from the supplied context and
# end the answer with the section/lecture titles the context came from.
PROMPT_RETRIEVING_S = '''You will receive a question from a student taking a Tableau course, which includes a title and a body. 
The corresponding lecture will also be provided.

Answer the question using only the provided context.

At the end of your response, include the section and lecture names where the context was drawn from, formatted as follows: 
Resources: 
Section: *Section Title*, Lecture: *Lecture Title* 
...
Replace *Section Title* and *Lecture Title* with the appropriate titles.'''

# Human prompt template: receives the composed question and the retrieved context.
PROMPT_TEMPLATE_RETRIEVING_H = '''This is the question:
{question}

This is the context:
{context}'''

# Prompt objects built from the strings above.
prompt_creating_question = PromptTemplate.from_template(template=PROMPT_CREATING_QUESTION)
prompt_retrieving_s = SystemMessage(content=PROMPT_RETRIEVING_S)
prompt_template_retrieving_h = HumanMessagePromptTemplate.from_template(template=PROMPT_TEMPLATE_RETRIEVING_H)

# Chat prompt for the Q&A chains: system instructions followed by the
# question/context message.
chat_prompt_template_retrieving = ChatPromptTemplate([prompt_retrieving_s, 
                                                      prompt_template_retrieving_h])

### Create the First Version of the Q&A Chatbot Chain

In [None]:
# First version of the Q&A chain:
#   1. render the question fields with `prompt_creating_question`,
#   2. take the `.text` of the rendered prompt,
#   3. feed that text to the retriever ('context') and pass it through
#      unchanged ('question'),
#   4. fill the chat prompt, call the model, and parse the reply.
# The retriever output goes into the prompt as-is here; the improved chain
# below formats it first.
chain_retrieving = (prompt_creating_question
                    | RunnableLambda(lambda x: x.text)
                    | {'context': retriever,
                       'question': RunnablePassthrough()}
                    | chat_prompt_template_retrieving 
                    | chat
                    | str_output_parser)

In [None]:
# Sample student question; the keys match the placeholders of
# PROMPT_CREATING_QUESTION.
result = chain_retrieving.invoke({"question_lecture": "Adding a custom calculation",
                                  "question_title": "Why are we using SUM here? It's unclear to me.",
                                  "question_body": "This question refers to calculating the GM%."})

In [None]:
result

### Create a Runnable Function to Format the Context

In [None]:
@chain
def format_context(dictionary):
    """Render the retrieved documents as one numbered, titled text block.

    Args:
        dictionary: Mapping with a "context" entry (the retrieved documents,
            each exposing ``metadata`` and ``page_content``) and a
            "question" entry, which is passed through untouched.

    Returns:
        A new dict whose "context" is the formatted string (empty when no
        documents were retrieved) and whose "question" is copied from the
        input.
    """
    formatted_documents = []

    # start=1 yields the human-friendly "Document 1, Document 2, ..." numbering.
    for number, document in enumerate(dictionary["context"], start=1):
        # A chunk may lack one of the header-derived metadata keys (e.g. text
        # that was not under a "#" or "##" header); use a placeholder instead
        # of failing the whole chain with a KeyError.
        section_title = document.metadata.get("Section Title", "N/A")
        lecture_title = document.metadata.get("Lecture Title", "N/A")

        formatted_documents.append(f'''
Document {number}
Section Title: {section_title}
Lecture Title: {lecture_title}
Content: {document.page_content}

-------------------
''')

    return {"context": "".join(formatted_documents),
            "question": dictionary["question"]}

In [None]:
# Improved Q&A chain: identical to `chain_retrieving` except that the
# retrieved documents go through `format_context` before the chat prompt is
# filled, so the model sees each document's section title, lecture title and
# content as labelled text.
chain_retrieving_improved = (prompt_creating_question 
                             | RunnableLambda(lambda x: x.text)
                             | {'context': retriever,
                                'question': RunnablePassthrough()} 
                             | format_context
                             | chat_prompt_template_retrieving
                             | chat
                             | str_output_parser)

In [None]:
# Same sample question as before, now answered by the improved chain.
result_improved = chain_retrieving_improved.invoke({"question_lecture": "Adding a custom calculation",
                                                    "question_title": "Why are we using SUM here? It's unclear to me.",
                                                    "question_body": "This question refers to calculating the GM%."})

In [None]:
result_improved

### Stream the Response

In [None]:
# Same sample question, requested via `.stream(...)` instead of `.invoke(...)`;
# the result is iterated over in the cell below.
result_streamed = chain_retrieving_improved.stream({"question_lecture": "Adding a custom calculation",
                                                    "question_title": "Why are we using SUM here? It's unclear to me.",
                                                    "question_body": "This question refers to calculating the GM%."})

In [None]:
result_streamed

In [None]:
# Print every streamed piece back-to-back (no newline added between pieces)
# so the answer appears as continuous text.
for piece in result_streamed:
    print(piece, end="")