In [2]:
import torch
import transformers
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig, pipeline

## Llama.cpp

In [3]:
from langchain.llms import LlamaCpp
from langchain import PromptTemplate, LLMChain
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [4]:
# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
# Verbose is required to pass to the callback manager

In [5]:
n_gpu_layers = 41 # Change this value based on your model and your GPU VRAM pool.
n_batch = 512 # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.


model_path={
    "wizard-vicuna-13B-q5_1" : "./models/wizard-vicuna-13B.ggmlv3.q5_1.bin",
    "wizardLM-13B.q5_1" : "./models/wizardLM-13B-Uncensored.ggmlv3.q5_1.bin",
    "wizardLM-13B.q4_1" : "./models/wizardLM-13B-Uncensored.ggmlv3.q4_1.bin",
    
}

# Make sure the model path is correct for your system!
local_llm = LlamaCpp(
    model_path=model_path["wizardLM-13B.q5_1"],
    n_gpu_layers=n_gpu_layers, n_batch=n_batch,
    callback_manager=None, 
    verbose=True,
    n_ctx=2048,
    temperature=0
)

ggml_init_cublas: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 3060
llama.cpp: loading model from ./models/wizardLM-13B-Uncensored.ggmlv3.q5_1.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32001
llama_model_load_internal: n_ctx      = 2048
llama_model_load_internal: n_embd     = 5120
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 40
llama_model_load_internal: n_layer    = 40
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: ftype      = 9 (mostly Q5_1)
llama_model_load_internal: n_ff       = 13824
llama_model_load_internal: n_parts    = 1
llama_model_load_internal: model size = 13B
llama_model_load_internal: ggml ctx size =    0.09 MB
llama_model_load_internal: using CUDA for GPU acceleration
llama_model_load_internal: mem required  = 2165.28 MB (+ 1608.00 MB per state)
llama_model_load_internal: allocating batch_size x 1 MB = 512 MB VRAM for the scratch buffer
llama

## HuggingFace

In [5]:
model_name = {
    "wizardLM-7B-HF" = "TheBloke/wizardLM-7B-HF",
    "wizard-vicuna-13B-GPTQ" = "TheBloke/wizard-vicuna-13B-GPTQ",
    "Wizard-Vicuna-13B-Uncensored" = "ehartford/Wizard-Vicuna-13B-Uncensored",
}

tokenizer = LlamaTokenizer.from_pretrained(model_name)

model = LlamaForCausalLM.from_pretrained(model_name["wizardLM-7B-HF"],
                                              load_in_8bit=True,
                                              device_map='auto',
                                              torch_dtype=torch.float16,
                                              low_cpu_mem_usage=True)

SyntaxError: invalid syntax (2159739359.py, line 2)

## Load data

In [6]:
from langchain.document_loaders import JSONLoader

import json
from pathlib import Path
from pprint import pprint


file_path='./data/faq_dataset.json'
data = json.loads(Path(file_path).read_text())

In [7]:
loader = JSONLoader(
    file_path='./data/faq_dataset.json',
    jq_schema='.questions[] | "Question: \(.question) \n Answer: \(.answer)\n"')

data = loader.load()

## Prepare embeddings

In [9]:
import os

from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.chains import RetrievalQA

from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

In [10]:
from langchain.embeddings import HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings

embedding_model_attr = {
    "all-MiniLM-L6-v2" : { "name" : "sentence-transformers/all-MiniLM-L6-v2" , 'kwargs' : {'device': 'cpu'} },
    "instructor-base" : { "name" : "hkunlp/instructor-base" , 'kwargs' : {"device": "cpu"} },
    "instructor-xl" : { "name" : "hkunlp/instructor-xl" , 'kwargs' : {'device': 'cuda'} },
}

embedding_model_name = "all-MiniLM-L6-v2"

instructor_embeddings =  HuggingFaceEmbeddings(model_name=embedding_model_attr[embedding_model_name]['name'], 
                                               model_kwargs=embedding_model_attr[embedding_model_name]['kwargs'])


In [11]:
## Here is the nmew embeddings being used
embedding = instructor_embeddings

vectordb = Chroma.from_documents(documents=data, 
                                 embedding=embedding)

In [10]:
query = "How can I open an account?"
vectordb.similarity_search_with_score(query)

[(Document(page_content="Question: How can I create an account? \n Answer: To create an account, click on the 'Sign Up' button on the top right corner of our website and follow the instructions to complete the registration process.\n", metadata={'source': '/home/tamizh/projects/llm-doc-retrieval-and-qa/data/faq_dataset.json', 'seq_num': 1}),
  0.6770732998847961),
 (Document(page_content='Question: Can I order without creating an account? \n Answer: Yes, you can place an order as a guest without creating an account. However, creating an account offers benefits such as order tracking and easier future purchases.\n', metadata={'source': '/home/tamizh/projects/llm-doc-retrieval-and-qa/data/faq_dataset.json', 'seq_num': 17}),
  1.2404284477233887),
 (Document(page_content='Question: Do you have a loyalty program? \n Answer: Yes, we have a loyalty program where you can earn points for every purchase. These points can be redeemed for discounts on future orders. Please visit our website to le

## Retriever

In [12]:
retriever = vectordb.as_retriever(search_kwargs={"k": 3})

## Stuff Chain

In [14]:
# create the chain to answer questions 
qa_chain = RetrievalQA.from_chain_type(llm=local_llm, 
                                  chain_type="stuff", 
                                  retriever=retriever, 
                                  return_source_documents=True)

In [15]:
## Cite sources

import textwrap

def wrap_text_preserve_newlines(text, width=110):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text

def process_llm_response(llm_response):
    print("Answer:")
    print(wrap_text_preserve_newlines(llm_response['result']))
    print('Reference:')
    for i, src_doc in enumerate(llm_response["source_documents"]):
        print(f'FAQ #{src_doc.metadata["seq_num"]}:')
        print(src_doc.page_content)

In [17]:
# full example
def get_answer(query):
    llm_response = qa_chain(query)
    process_llm_response(llm_response)

In [18]:
get_answer("What are the conditions for requesting a refund? Do I need to keep the receipt?")

Answer:
 To request a refund, you must have a receipt or proof of purchase. If you don't have a receipt, contact our
customer support team for assistance. Refunds are typically processed within 10-14 business days after we
receive your return.
Reference:
FAQ #33:
Question: Can I return a product without a receipt? 
 Answer: A receipt or proof of purchase is usually required for returns. Please refer to our return policy or contact our customer support team for assistance.

FAQ #61:
Question: Can I return a product if I no longer have the original receipt? 
 Answer: While a receipt is preferred for returns, we may be able to assist you without it. Please contact our customer support team for further guidance.

FAQ #4:
Question: What is your return policy? 
 Answer: Our return policy allows you to return products within 30 days of purchase for a full refund, provided they are in their original condition and packaging. Please refer to our Returns page for detailed instructions.

FAQ #64:



llama_print_timings:        load time =  3017.18 ms
llama_print_timings:      sample time =    23.57 ms /    57 runs   (    0.41 ms per token)
llama_print_timings: prompt eval time =  3017.08 ms /   341 tokens (    8.85 ms per token)
llama_print_timings:        eval time =  8270.14 ms /    56 runs   (  147.68 ms per token)
llama_print_timings:       total time = 11404.08 ms


In [17]:
query = "What are the conditions for requesting a refund? Do I need to keep the receipt?" 
vectordb.similarity_search_with_score(query)

[(Document(page_content='Question: Can I return a product without a receipt? \n Answer: A receipt or proof of purchase is usually required for returns. Please refer to our return policy or contact our customer support team for assistance.\n', metadata={'source': '/home/tamizh/projects/llm-doc-retrieval-and-qa/data/faq_dataset.json', 'seq_num': 33}),
  0.7929237484931946),
 (Document(page_content='Question: Can I return a product if I no longer have the original receipt? \n Answer: While a receipt is preferred for returns, we may be able to assist you without it. Please contact our customer support team for further guidance.\n', metadata={'source': '/home/tamizh/projects/llm-doc-retrieval-and-qa/data/faq_dataset.json', 'seq_num': 61}),
  0.8628055453300476),
 (Document(page_content='Question: What is your return policy? \n Answer: Our return policy allows you to return products within 30 days of purchase for a full refund, provided they are in their original condition and packaging. Ple

In [25]:
print(qa_chain.combine_documents_chain.llm_chain.prompt.template)

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:


In [26]:
llm_response = qa_chain(query)
llm_response

Llama.generate: prefix-match hit


 To open an account, please visit our website and click on the 'Sign Up' button located on the top right corner of the homepage. From there, you will be prompted to enter your personal and business information, as well as your payment details. Once you have completed the registration process, you will receive a confirmation email to activate your account.


llama_print_timings:        load time =  2952.18 ms
llama_print_timings:      sample time =    37.51 ms /    74 runs   (    0.51 ms per token)
llama_print_timings: prompt eval time =  3160.42 ms /   310 tokens (   10.19 ms per token)
llama_print_timings:        eval time = 11416.28 ms /    73 runs   (  156.39 ms per token)
llama_print_timings:       total time = 14830.66 ms


{'query': 'How can I open an account?',
 'result': " To open an account, please visit our website and click on the 'Sign Up' button located on the top right corner of the homepage. From there, you will be prompted to enter your personal and business information, as well as your payment details. Once you have completed the registration process, you will receive a confirmation email to activate your account.",
 'source_documents': [Document(page_content="Question: How can I create an account? \n Answer: To create an account, click on the 'Sign Up' button on the top right corner of our website and follow the instructions to complete the registration process.\n", metadata={'source': '/home/tamizh/projects/llm-doc-retrieval-and-qa/data/faq_dataset.json', 'seq_num': 1}),
  Document(page_content='Question: Can I order without creating an account? \n Answer: Yes, you can place an order as a guest without creating an account. However, creating an account offers benefits such as order tracking a

## Refine chain

In [13]:
# create the chain to answer questions 
qa_refine_chain = RetrievalQA.from_chain_type(
    llm=local_llm, 
                                  chain_type="refine", 
                                  retriever=retriever,
                                              verbose=True,
                                  return_source_documents=True)

In [14]:
llm_response = qa_refine_chain(query)



[1m> Entering new RetrievalQA chain...[0m



llama_print_timings:        load time =  1892.35 ms
llama_print_timings:      sample time =    14.55 ms /    34 runs   (    0.43 ms per token)
llama_print_timings: prompt eval time =  1892.29 ms /    82 tokens (   23.08 ms per token)
llama_print_timings:        eval time =  4695.60 ms /    33 runs   (  142.29 ms per token)
llama_print_timings:       total time =  6655.42 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time =  1892.35 ms
llama_print_timings:      sample time =    35.27 ms /    85 runs   (    0.41 ms per token)
llama_print_timings: prompt eval time =  1423.71 ms /   160 tokens (    8.90 ms per token)
llama_print_timings:        eval time = 12018.94 ms /    84 runs   (  143.08 ms per token)
llama_print_timings:       total time = 13629.71 ms
Llama.generate: prefix-match hit



[1m> Finished chain.[0m



llama_print_timings:        load time =  1892.35 ms
llama_print_timings:      sample time =    58.16 ms /   142 runs   (    0.41 ms per token)
llama_print_timings: prompt eval time =  1497.90 ms /   202 tokens (    7.42 ms per token)
llama_print_timings:        eval time = 20907.67 ms /   141 runs   (  148.28 ms per token)
llama_print_timings:       total time = 22897.98 ms


In [15]:
llm_response

{'query': 'How can I open an account?',
 'result': " \n------------\nQuestion: How can I open an account and also sign up for your loyalty program?\n Answer: To open an account and sign up for our loyalty program, click on the 'Sign Up' button on the top right corner of our website. During the registration process, you will have the option to join our loyalty program. Alternatively, you can also sign up through the 'Order as a Guest' option during checkout, but we recommend creating an account for easier future purchases and order tracking benefits. Once you have completed the registration process, you will be able to earn points for every purchase that can be redeemed for discounts on future orders.",
 'source_documents': [Document(page_content="Question: How can I create an account? \n Answer: To create an account, click on the 'Sign Up' button on the top right corner of our website and follow the instructions to complete the registration process.\n", metadata={'source': '/home/tamiz

## Alternative version

In [12]:
from langchain.chains import RetrievalQAWithSourcesChain

retriever = vectordb.as_retriever(search_kwargs={"k": 3})

chain = RetrievalQAWithSourcesChain.from_chain_type(
        llm=local_llm,
        chain_type="stuff",
        retriever=retriever, 
        return_source_documents=True)

In [40]:
chain(query)

Llama.generate: prefix-match hit

llama_print_timings:        load time =  4275.01 ms
llama_print_timings:      sample time =    13.50 ms /    32 runs   (    0.42 ms per token)
llama_print_timings: prompt eval time =   832.51 ms /    11 tokens (   75.68 ms per token)
llama_print_timings:        eval time =  6013.54 ms /    31 runs   (  193.99 ms per token)
llama_print_timings:       total time = 13634.38 ms


{'question': 'How can I open an account?',
 'answer': " To create an account, click on the 'Sign Up' button on the top right corner of our website and follow the instructions to complete the registration process.",
 'sources': '',
 'source_documents': [Document(page_content="Question: How can I create an account? \n Answer: To create an account, click on the 'Sign Up' button on the top right corner of our website and follow the instructions to complete the registration process.\n", metadata={'source': '/home/tamizh/projects/llm-doc-retrieval-and-qa/data/faq_dataset.json', 'seq_num': 1}),
  Document(page_content='Question: Can I order without creating an account? \n Answer: Yes, you can place an order as a guest without creating an account. However, creating an account offers benefits such as order tracking and easier future purchases.\n', metadata={'source': '/home/tamizh/projects/llm-doc-retrieval-and-qa/data/faq_dataset.json', 'seq_num': 17})]}

In [16]:
class Document:
    def __init__(self, page_content):
        self.page_content = page_content

def dummy_chain(query):
    return {
        'question': 'How can I open an account?', 
        'answer': " To create an account, click on the 'Sign Up' button on the top right corner of our website and follow the instructions to complete the registration process.", 
        'sources': '', 
        'source_documents': [
            Document(page_content="Question: How can I create an account? \n Answer: To create an account"), 
            Document(page_content="Question: Do you have a loyalty program? \n Answer: No, we have royalty program"), 
        ]}


In [14]:
import gradio as gr
import random
import time

try:
    faq_bot.close()
except:
    pass

css = """ 
    #output_container_0 div.eta-bar {
    display: none !important; transform: none !important;
    }
"""

def format_answer(answer_dict):
    sources = [(doc.page_content.split("\n")[0].replace("Question: ", "").strip(),  
                  doc.page_content.split("\n")[1].replace("Answer: ", "").strip())
                  for doc in answer_dict["source_documents"]]
    answer = answer_dict["answer"]
    references = "## References\n" + "\n\n".join(f"**{q}**\n\n > {a}" for q, a in sources)
    return (answer, references)
    
def generate_response(query):
    generated_text = chain(query)
    answer, references = format_answer(generated_text)
    return {answer_block: answer, references_block: references}

with gr.Blocks(css=css, theme=gr.themes.Monochrome()) as faq_bot:
    gr.Markdown("Talk to our FAQ bot")
    
    with gr.Row():
        with gr.Column():
            answer_block = gr.Textbox(label="Answers", lines=2)
        with gr.Column():
            references_block = gr.Markdown("## References")
    inputs = gr.Textbox(label="Type your question here")
    
    with gr.Row():
        submit_btn = gr.Button("Ask")
        clear_btn = gr.ClearButton([inputs, answer_block, references_block])
        
    submit_btn.click(fn=generate_response, 
                     inputs=inputs, 
                     outputs=[answer_block, references_block],
                     show_progress=False)
    examples_block = gr.Examples(
        ["How can I create an account?", 
         "What is the return policy?",
         "How can I contact customer support?"], inputs)
    
faq_bot.launch()

Closing server running on port: 7860
Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




Llama.generate: prefix-match hit

llama_print_timings:        load time =  3781.58 ms
llama_print_timings:      sample time =    24.05 ms /    54 runs   (    0.45 ms per token)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token)
llama_print_timings:        eval time = 11029.36 ms /    54 runs   (  204.25 ms per token)
llama_print_timings:       total time = 11288.25 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time =  3781.58 ms
llama_print_timings:      sample time =     5.15 ms /    12 runs   (    0.43 ms per token)
llama_print_timings: prompt eval time =  3530.32 ms /   302 tokens (   11.69 ms per token)
llama_print_timings:        eval time =  2125.30 ms /    11 runs   (  193.21 ms per token)
llama_print_timings:       total time =  5896.50 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time =  3781.58 ms
llama_print_timings:      sample time =    41.39 ms /    89 runs   (    0.47 ms per token)
l

In [24]:
import langchain
import gradio as gr

if interface:
    interface.close()


# Define Gradio interface
interface = gr.Interface(
    fn=generate_response,
    inputs=gr.inputs.Textbox(label="Type your message here..."),
    outputs=gr.outputs.HTML(label=""),
    title="LangChain Chatbot",
    description="Talk to the LangChain chatbot!",
    layout="vertical",
    examples=[
        ["How can I return a product?"],
        ["What is the return policy?"],
        ["How can I contact customer support?"],
    ],
    allow_flagging=False,
    show_input=True,
    show_output=True,
    progress_bar=False,
    placeholder="Ask your question",
)

interface.css  = """
    
    #output_container_0 div.eta-bar {
    display: none !important; transform: none !important;
    }
   """
   

# Launch interface
interface.launch()

  super().__init__(
  super().__init__(
  interface = gr.Interface(
  interface = gr.Interface(
  interface = gr.Interface(
  interface = gr.Interface(


Running on local URL:  http://127.0.0.1:7867

To create a public link, set `share=True` in `launch()`.




In [16]:
import langchain
import gradio as gr


class Document:
    def __init__(self, page_content):
        self.page_content = page_content

def dummy_chain(query):
    return {
        'question': 'How can I open an account?', 
        'answer': " To create an account, click on the 'Sign Up' button on the top right corner of our website and follow the instructions to complete the registration process.", 
        'sources': '', 
        'source_documents': [
            Document(page_content="Question: How can I create an account? \n Answer: To create an account"), 
            Document(page_content="Question: Do you have a loyalty program? \n Answer: No, we have royalty program"), 
        ]}

def format_answer(answer_dict):
    # Extract answer and references from dictionary
    answer = answer_dict["answer"]
    
    references = [(doc.page_content.split("\n")[0].replace("Question: ", "").strip(),  
                  doc.page_content.split("\n")[1].replace("Answer: ", "").strip())
                  for doc in answer_dict["source_documents"]]

    
    # Create HTML containers for answer and references
    answer_container = f"<div style='background-color: #141414; padding: 10px'>{answer}</div>"

    references_container = ""
    for (question, answer) in references:
        references_container += f"<div style='background-color: #040404; padding: 10px; margin-top: 10px'>{answer}</div>"
    
    # Combine containers and return
    return answer_container + references_container


def generate_response(query):
    generated_text = dummy_chain(query)
    response = format_answer(generated_text)
    return response


In [50]:
## Backup
import gradio as gr
import random
import time

try:
    faq_bot.close()
except:
    pass

css = """ 
    #output_container_0 div.eta-bar {
    display: none !important; transform: none !important;
    }
"""

def format_answer(answer_dict):
    answer = answer_dict["answer"]
    
    sources = [(doc.page_content.split("\n")[0].replace("Question: ", "").strip(),  
                  doc.page_content.split("\n")[1].replace("Answer: ", "").strip())
                  for doc in answer_dict["source_documents"]]

    answer = f"## {answer}\n"

    references = "## References\n"
    for (question, answer) in sources:
        references += f"**{question}**\n\n"
        references += f"> {answer}\n\n"
    return (answer, references)
    
def generate_response(query):
    generated_text = dummy_chain(query)
    answer, references = format_answer(generated_text)
    return {answer_block: answer, references_block: references}

with gr.Blocks(css=css, theme=gr.themes.Monochrome()) as faq_bot:
    gr.Markdown("Talk to our FAQ bot")
    with gr.Row():
        with gr.Column():
            answer_block = gr.Textbox(label="Answers", lines=2)
        with gr.Column():
            references_block = gr.Markdown("## References")
    inputs = gr.Textbox(label="Type your question here")
    with gr.Row():
        submit_btn = gr.Button("Ask")
        clear_btn = gr.ClearButton([inputs, answer_block, references_block])
    submit_btn.click(fn=generate_response, 
                     inputs=inputs, 
                     outputs=[answer_block, references_block],
                     show_progress=False)
    gr.Examples(
        [
        ["How can I return a product?", "How can I return a product?"],
        ["What is the return policy?", "What is the return policy?"],
        ["How can I contact customer support?", "How can I contact customer support?"],
        ], inputs)
    
faq_bot.launch()

Closing server running on port: 7860
Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


