# Retrieval Augmented Generation (RAG) for QA Bot
This notebook demonstrates how to build a question-answering (QA) bot using Retrieval-Augmented Generation (RAG) with the OpenAI API and a Pinecone vector index.

In [ ]:
# Install required libraries
!pip install openai pinecone-client

In [ ]:
# Import required libraries
import os
import re
from typing import List

import openai
import pinecone

In [ ]:
# Define Helper Functions
def preprocess_text(text: str) -> str:
    """
    Preprocess the input text by removing special characters and extra spaces.

    Special characters (anything that is neither a word character nor
    whitespace) are stripped first and whitespace is collapsed afterwards.
    Doing it in this order matters: removing a free-standing symbol such as
    the dash in "a - b" leaves two adjacent spaces behind, which the
    whitespace pass then folds into one.

    Returns the cleaned text with leading and trailing whitespace removed.
    """
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Collapse any whitespace run into a single space
    return text.strip()

In [ ]:
# Define your OpenAI API key.
# Prefer the OPENAI_API_KEY environment variable so the real secret never has
# to be saved inside the notebook; the literal is only a fallback placeholder.
openai_api_key = os.environ.get("OPENAI_API_KEY", "sk-...QqAA")  # Ensure this is a string

# Configure the OpenAI client with the API key
openai.api_key = openai_api_key

# Test the OpenAI API setup by making a simple request
try:
    response = openai.Completion.create(
        engine="davinci",
        prompt="Hello, world!",
        max_tokens=5
    )
    # Print the response to verify the setup
    print(response.choices[0].text.strip())
except Exception as e:
    # Best-effort smoke test: report the problem and let the notebook continue.
    print(f"Error with OpenAI API request: {e}")

In [ ]:
# Define your Pinecone API key.
# Prefer the PINECONE_API_KEY environment variable so the real secret never
# has to be saved inside the notebook; the literal is only a fallback
# placeholder.
pinecone_api_key = os.environ.get("PINECONE_API_KEY", "pcsk_3mvD2b_***********")

# Initialize the Pinecone client with the API key
try:
    pinecone.init(api_key=pinecone_api_key)
    print("Pinecone client initialized successfully.")
except Exception as e:
    print(f"Error initializing Pinecone client: {e}")

# Create a new index in Pinecone.
# Index names may only contain lowercase alphanumeric characters and hyphens,
# so the name uses hyphens rather than underscores.
index_name = "qa-bot-index"
try:
    # The dimension must equal the embedding size; the text-embedding-ada-002
    # model used for the embeddings produces 1536-dimensional vectors.
    pinecone.create_index(name=index_name, dimension=1536, metric="cosine")
    print(f"Index '{index_name}' created successfully.")
except Exception as e:
    # Also reached when the index already exists from an earlier run.
    print(f"Error creating index '{index_name}': {e}")

In [ ]:
# Define functions to generate embeddings, upsert embeddings, and query the index
def generate_embeddings(texts: List[str]) -> List[List[float]]:
    """Generate embeddings for a list of texts using OpenAI API.

    Args:
        texts: The strings to embed.

    Returns:
        The embedding vectors taken from the response's ``data`` entries, in
        the order the API returned them. An empty list is returned when
        ``texts`` is empty or when the request fails; in the failure case the
        error is printed rather than raised, so callers must check the
        result before indexing into it.
    """
    if not texts:
        # Nothing to embed, so skip the API round trip entirely.
        return []
    try:
        response = openai.Embedding.create(input=texts, engine="text-embedding-ada-002")
        return [item['embedding'] for item in response['data']]
    except Exception as e:
        print(f"Error generating embeddings: {e}")
        return []

def upsert_embeddings(index, texts: List[str], ids: List[str]):
    """Upsert embeddings into the Pinecone index.

    Each text is preprocessed, embedded, and stored under the id found at the
    same position in ``ids``.

    Args:
        index: Index handle exposing ``upsert(vectors)``.
        texts: The raw documents to embed.
        ids: One vector id per text, in the same order as ``texts``.

    Raises:
        ValueError: If ``texts`` and ``ids`` have different lengths.
    """
    if len(texts) != len(ids):
        # Pairing with zip() would silently drop the unmatched tail.
        raise ValueError(
            f"texts and ids must have the same length (got {len(texts)} and {len(ids)})"
        )
    if not texts:
        return

    preprocessed_texts = [preprocess_text(text) for text in texts]
    embeddings = generate_embeddings(preprocessed_texts)
    if len(embeddings) != len(ids):
        # generate_embeddings returns [] when the request fails; do not send
        # an empty or mis-paired batch to the index.
        print("Skipping upsert: embeddings could not be generated for every text.")
        return

    vectors = list(zip(ids, embeddings))
    index.upsert(vectors)

def query_index(index, query_text: str, top_k: int = 5) -> List[str]:
    """Query the Pinecone index with a text and return the ids of the top_k matches.

    Args:
        index: Index handle exposing ``query(vector=..., top_k=...)``.
        query_text: The question to search for; it is preprocessed and
            embedded the same way as the stored documents.
        top_k: Maximum number of matches to request.

    Returns:
        The ``id`` of each match, in the order given by the response. An empty
        list is returned when the query could not be embedded or the response
        contains no ``matches``.
    """
    preprocessed_query = preprocess_text(query_text)
    query_embeddings = generate_embeddings([preprocessed_query])
    if not query_embeddings:
        # Embedding failed (already reported by generate_embeddings).
        return []

    # Use the single-vector `vector=` form: with the deprecated `queries=`
    # batch form the client reports hits under `results` and omits the
    # top-level `matches` read below.
    query_response = index.query(vector=query_embeddings[0], top_k=top_k)
    if not query_response or 'matches' not in query_response:
        return []
    return [match['id'] for match in query_response['matches']]

In [ ]:
# Example usage
documents = [
    "Our return policy allows returns within 30 days of purchase.",
    "You can return items within 30 days for a full refund.",
    "Returns are accepted within 30 days of the purchase date.",
    "You have 30 days to return an item from the date you received it.",
    "Items can be returned within 30 days for a full refund."
]
document_ids = [f"doc_{i}" for i in range(len(documents))]
# Map every id back to its text so retrieved ids can be resolved directly
# instead of parsing the number out of the id string.
documents_by_id = dict(zip(document_ids, documents))

# Handle to the index named by `index_name`, used for the upsert and query below.
index = pinecone.Index(index_name)

upsert_embeddings(index, documents, document_ids)

query_text = "What is the return policy?"
retrieved_document_ids = query_index(index, query_text)
# Ignore ids that are not part of this example's corpus (for instance vectors
# left in the index by an earlier run) rather than failing on the lookup.
retrieved_documents = [
    documents_by_id[doc_id]
    for doc_id in retrieved_document_ids
    if doc_id in documents_by_id
]
# NOTE(review): generate_answer is not defined in the visible part of this
# notebook; it must be defined before this cell runs or this line raises
# NameError.
rag_answer = generate_answer(query_text, retrieved_documents)

print("RAG Model Answer:", rag_answer)