## RAG example with LangChain, FAISS and OpenAI


#### Base parameters

In [11]:
# Imported here because this cell runs before the notebook's imports cell.
import os

# Name of the saved FAISS index to load.
index_name = 'pdf_docs'
# Read the OpenAI API key from the OPENAI_API_KEY environment variable so the
# real secret never has to be stored in the notebook. 'xxx' is only a
# placeholder fallback and must be replaced (or the variable set) before use.
openai_api_key = os.environ.get('OPENAI_API_KEY', 'xxx')

#### Imports

In [12]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.llms import OpenAI
import openai
import os
from tqdm import tqdm, trange
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from pprint import pprint


#### Initialize the connection

In [13]:
# Embedding model handed to the FAISS loader; constructed with library defaults.
embeddings = HuggingFaceEmbeddings()

# Load the saved index called `index_name` from the local "pdf" folder.
# NOTE(review): allow_dangerous_deserialization=True opts in to a loading path
# the library itself labels dangerous -- only use it on index files you trust.
faiss_index = FAISS.load_local(
    folder_path="pdf",
    embeddings=embeddings,
    index_name=index_name,
    allow_dangerous_deserialization=True,
)

# Similarity-search retriever configured with k=4 results per query.
retriever = faiss_index.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4},
)



#### Initialize query chain

In [14]:
# NOTE(review): the leading "<s>" marker is a Llama 2 style sequence-start token
# left over from an earlier template. The model configured below is gpt-4o, so
# the marker is sent as ordinary prompt text. The prompt is kept unchanged here.
system_prompt = """<s>
You are a helpful, respectful and honest assistant.
You will be given a question you need to answer, and a context to provide you with information. You must answer the question based as much as possible on this context.
Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.

Context: {context}
"""

# Chat model that generates the final answer; low temperature and a 512-token
# cap on the completion.
llm = ChatOpenAI(openai_api_key=openai_api_key, model="gpt-4o", max_tokens=512, temperature=0.1)

# Two-message prompt: the system message receives the retrieved documents via
# {context}; the human message receives the user's question via {input}.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

# Similarity-search retriever over the loaded FAISS index (k=4 results).
# `index_name` is not an option of `as_retriever` (its documented options are
# `search_type` and `search_kwargs`), so it is no longer passed here; the index
# was already selected when `faiss_index` was loaded.
retriever = faiss_index.as_retriever(search_type="similarity", search_kwargs={"k": 4})

# "Stuff" chain: inserts the retrieved documents into the prompt's {context} slot
# and calls the LLM; the retrieval chain runs the retriever first and feeds it.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
chain = create_retrieval_chain(retriever, question_answer_chain)

#### Query example

In [18]:
# Sample question; the chain reads the user query from the "input" key.
question = "Which plan does Telstra have?"
result = chain.invoke(dict(input=question))

# Pretty-print only the generated answer text from the chain's output.
pprint(result["answer"])

('Telstra offers several mobile plans, as outlined in the provided context. '
 'Here are the details of the available plans:\n'
 '\n'
 '1. **Basic Plan**\n'
 '   - Minimum Monthly Charge: $62/month\n'
 '   - Monthly Data Allowance: 50GB\n'
 '   - Network Access: 3G, 4G/4GX, 5G\n'
 '   - Plan Speeds: Capped Speed\n'
 '   - Calls + SMS + MMS + MessageBank® to standard Australian numbers: '
 'Unlimited\n'
 '\n'
 '2. **Essential Plan**\n'
 '   - Minimum Monthly Charge: $72/month\n'
 '   - Monthly Data Allowance: 180GB\n'
 '   - Network Access: 3G, 4G/4GX, 5G\n'
 '   - Plan Speeds: Ultimate 4G & 5G Speeds\n'
 '   - Calls + SMS + MMS + MessageBank® to standard Australian numbers: '
 'Unlimited\n'
 '\n'
 '3. **Premium Plan**\n'
 '   - Minimum Monthly Charge: $95/month\n'
 '   - Monthly Data Allowance: 300GB\n'
 '   - Network Access: 3G, 4G/4GX, 5G\n'
 '   - Plan Speeds: Ultimate 4G & 5G Speeds\n'
 '   - Calls + SMS + MMS + MessageBank® to standard Australian numbers: '
 'Unlimited\n'
 '\n'
 '

#### Retrieve source

In [None]:
def remove_duplicates(input_list):
    """Return the distinct ``metadata['source']`` values of the given items.

    Despite the name, the result contains the source values themselves, not
    the input items. The order of first appearance is preserved.

    Args:
        input_list: Iterable of objects exposing a ``metadata`` mapping with a
            ``'source'`` entry. Source values must be hashable (e.g. strings).

    Returns:
        list: Each distinct source exactly once, in first-seen order.

    Raises:
        KeyError: If an item's metadata has no ``'source'`` entry.
    """
    # dict keys keep insertion order, so this de-duplicates in a single pass
    # instead of re-scanning a growing list for every item.
    return list(dict.fromkeys(item.metadata['source'] for item in input_list))

# The chain built with create_retrieval_chain returns the retrieved documents
# under the "context" key; its output has no "source_documents" key, so reading
# that key raised KeyError.
results = remove_duplicates(result['context'])

# Print each distinct source on its own line.
for source in results:
    print(source)