# Task 1 - RAG Model for QA Bot
This Colab notebook implements a Retrieval-Augmented Generation (RAG) model for a QA bot, leveraging the OpenAI API and Pinecone DB. The QA bot answers questions based on a set of business documents.

## Key Components:
- **Text Preprocessing**: Removes special characters, lemmatizes words, and filters out stopwords.
- **Pinecone Indexing**: Each document is split into sentences; every sentence is embedded with OpenAI and upserted into Pinecone, with its original and preprocessed text stored as metadata.
- **Retrieval**: Embeds the query and retrieves the most similar indexed sentences from Pinecone.
- **Generation**: Uses GPT-3.5-turbo to generate answers from the retrieved context.
- **Gradio Interface**: Provides a user interface to interact with the bot.

In [None]:
!pip install openai pinecone-client nltk gradio

In [None]:
import os
import openai
import pinecone
import nltk
from typing import List, Dict, Any
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import logging
import gradio as gr

# Download necessary NLTK data.
# 'punkt_tab' is the sentence-tokenizer data that newer NLTK releases load in
# sent_tokenize; 'punkt' is kept so older releases keep working.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Set up API keys (you'll need to set these in Colab).
# The Pinecone environment is read from the PINECONE_ENV environment variable
# when it is set; otherwise the placeholder below is used and must be edited.
openai.api_key = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENV = os.getenv("PINECONE_ENV", "your-pinecone-environment")  # e.g., 'us-west1-gcp'

# Point out missing credentials up front instead of leaving the user to
# decode a less obvious failure from the first request.
for key_name, key_value in (
    ("OPENAI_API_KEY", openai.api_key),
    ("PINECONE_API_KEY", PINECONE_API_KEY),
):
    if not key_value:
        logger.warning("%s is not set; requests to that service are expected to fail", key_name)

# Initialize Pinecone
try:
    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
except Exception as e:
    logger.error(f'Failed to initialize Pinecone: {str(e)}')
    raise

In [None]:
# Connect to the Pinecone index that backs the QA bot, creating it first if
# it does not exist yet.
index_name = "business-qa-index"
try:
    existing_indexes = pinecone.list_indexes()
    if index_name not in existing_indexes:
        # OpenAI embeddings are 1536 dimensions
        pinecone.create_index(index_name, dimension=1536)
    index = pinecone.Index(index_name)
except Exception as e:
    logger.error(f'Failed to create or connect to Pinecone index: {str(e)}')
    raise

In [None]:
def preprocess_text(text: str) -> str:
    '''Normalize text: strip non-letters, lowercase, drop stopwords, lemmatize.

    Args:
        text: Raw input text.

    Returns:
        The surviving lemmas joined by single spaces (may be an empty string).
    '''
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters and digits
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    tokens = []
    for word in text.lower().split():
        # Filter stopwords BEFORE lemmatizing: the WordNet lemmatizer mangles
        # some of them (e.g. "was" -> "wa", "has" -> "ha"), and the mangled
        # form would no longer match the stopword list.
        if word in stop_words:
            continue
        lemma = lemmatizer.lemmatize(word)
        # Keep the post-lemmatization check so a lemma that is itself a
        # stopword is still dropped.
        if lemma not in stop_words:
            tokens.append(lemma)
    return ' '.join(tokens)

In [None]:
def get_embedding(text: str) -> List[float]:
    '''Embed a piece of text with OpenAI's "text-embedding-ada-002" model.

    Args:
        text: The text to embed.

    Returns:
        The embedding taken from the first item of the API response.

    Raises:
        Exception: Any error from the API call or from reading the response
            is logged and re-raised unchanged.
    '''
    try:
        api_response = openai.Embedding.create(model="text-embedding-ada-002", input=text)
        first_item = api_response['data'][0]
        return first_item['embedding']
    except Exception as e:
        logger.error(f'Failed to get embedding: {str(e)}')
        raise

def add_documents_to_index(documents: List[str]) -> None:
    '''Split documents into sentences, embed each one, and upsert them to Pinecone.

    Each sentence is stored under the id "<document index>-<sentence index>"
    with its original text and its preprocessed text as metadata. The
    embedding is computed from the original sentence.

    Vectors are sent in batches rather than one request per sentence. If an
    error occurs, vectors still waiting in the current batch are not written;
    ids are deterministic, so re-running the call is safe.

    Args:
        documents: Raw document strings to index.

    Raises:
        Exception: Any embedding or Pinecone error is logged and re-raised.
    '''
    # Upper bound on vectors per upsert request, to keep each request small.
    batch_size = 100
    try:
        pending = []
        for i, doc in enumerate(documents):
            for j, sentence in enumerate(sent_tokenize(doc)):
                embedding = get_embedding(sentence)
                metadata = {"text": sentence, "preprocessed": preprocess_text(sentence)}
                pending.append((f"{i}-{j}", embedding, metadata))
                if len(pending) >= batch_size:
                    index.upsert(vectors=pending)
                    pending = []
        # Flush the final partial batch; skip the call entirely when there is
        # nothing to send (e.g. no documents or only empty documents).
        if pending:
            index.upsert(vectors=pending)
    except Exception as e:
        logger.error(f'Failed to add documents to index: {str(e)}')
        raise

In [None]:
def retrieve_relevant_docs(query: str, k: int = 3) -> List[Dict[str, Any]]:
    '''Look up the k indexed entries closest to the query embedding.

    Args:
        query: The user's question.
        k: Number of matches to request from the index.

    Returns:
        One dict per match, with the "text" and "preprocessed" values read
        from that match's metadata.

    Raises:
        Exception: Any embedding, query, or metadata error is logged and
            re-raised.
    '''
    try:
        query_vector = get_embedding(query)
        response = index.query(query_vector, top_k=k, include_metadata=True)
        docs = []
        for match in response['matches']:
            metadata = match['metadata']
            docs.append({
                "text": metadata['text'],
                "preprocessed": metadata['preprocessed'],
            })
        return docs
    except Exception as e:
        logger.error(f'Failed to retrieve relevant documents: {str(e)}')
        raise

In [None]:
def generate_answer(query: str, context: List[Dict[str, Any]]) -> str:
    '''Ask gpt-3.5-turbo to answer the query using only the supplied context.

    Args:
        query: The user's question.
        context: Retrieved entries; only each entry's "text" value is used.

    Returns:
        The content of the first completion choice, stripped of surrounding
        whitespace.

    Raises:
        Exception: Any error while building the prompt or calling the API is
            logged and re-raised.
    '''
    try:
        # One retrieved text per line in the prompt's context section.
        context_str = "\n".join(doc["text"] for doc in context)
        prompt = f'''You are a helpful AI assistant for a business. Use the following context to answer the user's question. 
        If the answer is not in the context, say "I don't have enough information to answer that question."

        Context:
        {context_str}

        User Question: {query}

        Please provide a concise and accurate answer based on the given context:'''

        messages = [
            {"role": "system", "content": "You are a helpful AI assistant for a business."},
            {"role": "user", "content": prompt},
        ]
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages,
            max_tokens=150,
            n=1,
            stop=None,
            temperature=0.7,
        )
        first_choice = completion.choices[0]
        return first_choice.message['content'].strip()
    except Exception as e:
        logger.error(f'Failed to generate answer: {str(e)}')
        raise

In [None]:
def qa_bot(query: str) -> str:
    '''Answer a user query: retrieve relevant entries, then generate a reply.

    This function never raises; failures are logged and turned into a
    user-facing apology so the UI always has text to show.

    Args:
        query: The user's question.

    Returns:
        The generated answer, a prompt to enter a question when the query is
        empty or whitespace-only, or an apology message if anything fails.
    '''
    # Nothing to embed or answer: skip the API calls for blank input.
    if not query or not query.strip():
        return "Please enter a question."

    try:
        relevant_docs = retrieve_relevant_docs(query)
        return generate_answer(query, relevant_docs)
    except Exception as e:
        # logger.exception records the traceback along with the message.
        logger.exception('QA bot encountered an error: %s', e)
        return "I'm sorry, but I encountered an error while processing your query. Please try again later."

# Example usage and Gradio Interface
# Sample knowledge base: each string is one document. add_documents_to_index
# splits every document into sentences and stores one vector per sentence.
documents = [
    "Our company, TechInnovate, was founded in 1995 by Dr. Jane Smith.",
    "We specialize in AI-driven solutions for businesses, focusing on natural language processing and computer vision.",
    "Our headquarters is located in San Francisco, California, with satellite offices in New York and London.",
    "TechInnovate's flagship product, AIAssist, has been adopted by over 500 Fortune 1000 companies.",
    "In 2022, we launched our new cloud-based AI platform, which has seen a 200% growth in user adoption within the first year.",
]

# Add documents to the index
# NOTE: this runs when the cell executes and requests one embedding per sentence.
add_documents_to_index(documents)

# Gradio UI
def gradio_interface(query):
    '''Gradio callback: pass the textbox contents to qa_bot and return its reply.'''
    answer = qa_bot(query)
    return answer

# Build the web UI: a two-line question box in, plain text out.
question_box = gr.Textbox(lines=2, placeholder="Enter your question here...")
iface = gr.Interface(
    fn=gradio_interface,
    inputs=question_box,
    outputs="text",
    title="TechInnovate AI Assistant",
    description="Ask me anything about TechInnovate!",
    theme="default",
)

# NOTE(review): share=True requests a public share link from Gradio, so the
# bot (and the API usage behind it) is reachable by anyone with the URL.
iface.launch(share=True)