# In [2]:  (Jupyter notebook cell marker; kept as a comment so the file parses as Python)
from pptx import Presentation
import pytesseract
from PIL import Image
import io
import google.generativeai as genai
import json
import PyPDF2
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.utils import embedding_functions
import re
import os

# Configure the Gemini client.
# SECURITY: the API key is read from the GOOGLE_API_KEY environment variable
# rather than being embedded in the source. Any key that was ever committed
# in this file should be treated as leaked and revoked.
_api_key = os.environ.get("GOOGLE_API_KEY")
if not _api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it before running this script."
    )
genai.configure(api_key=_api_key)
model = genai.GenerativeModel('gemini-2.0-flash')

# Path to the Tesseract executable used for OCR. Set TESSERACT_CMD to
# override it; the default is the Windows install path this file shipped with.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD", r'C:\Program Files\Tesseract-OCR\tesseract'
)

# Initialize Embedding Model and Chroma.
# The same model name is used both for the explicit encoder (embedding_model)
# and for the collection's embedding function, so keep it in one place.
EMBEDDING_MODEL_NAME = 'all-mpnet-base-v2'
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
chroma_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=EMBEDDING_MODEL_NAME)

# Create chroma directory if it doesn't exist
os.makedirs("./chroma_db", exist_ok=True)
client = chromadb.PersistentClient(path="./chroma_db")
collection = client.get_or_create_collection("chemistry_collection", embedding_function=chroma_ef)

def extract_page_data(page, page_type):
    """Extract text and OCR'd image text from one PPT slide or PDF page.

    Args:
        page: A slide object exposing ``shapes`` (when ``page_type`` is
            "ppt") or a page object exposing ``extract_text()`` (when
            ``page_type`` is "pdf").
        page_type: "ppt" or "pdf". Any other value yields an empty result.

    Returns:
        A dict with two lists of strings: "slide_text" and "image_captions".
    """
    if page_type == "ppt":
        slide_data = {"slide_text": [], "image_captions": []}
        for shape in page.shapes:
            if shape.has_text_frame:
                slide_data["slide_text"].extend(
                    paragraph.text for paragraph in shape.text_frame.paragraphs
                )
                continue

            # python-pptx shapes have no ``has_picture`` attribute, so detect
            # pictures by duck typing: only picture shapes expose ``image``.
            # The lookup sits inside the try because accessing ``image`` may
            # itself raise on some shapes.
            try:
                picture = getattr(shape, "image", None)
                if picture is None:
                    continue
                image = Image.open(io.BytesIO(picture.blob))
                slide_data["image_captions"].append(pytesseract.image_to_string(image))
            except Exception as e_image:
                # Best effort: one unreadable image must not lose the slide.
                print(f"Error processing image: {e_image}")
        return slide_data

    if page_type == "pdf":
        # Guard against extract_text() yielding None so downstream str.join
        # calls never see a non-string entry.
        page_text = page.extract_text() or ""
        return {"slide_text": [page_text], "image_captions": []}  # No image OCR is attempted for PDFs.

    return {"slide_text": [], "image_captions": []}

def process_ppt_pages(ppt_file_path):
    """Open a PowerPoint deck and gather the extracted content of each slide.

    Returns a list of dicts with keys "page_number" (1-based), "data" and
    "page_type" ("ppt"). On any failure the error is printed and an empty
    list is returned.
    """
    try:
        deck = Presentation(ppt_file_path)
        return [
            {
                "page_number": slide_no,
                "data": extract_page_data(slide, "ppt"),
                "page_type": "ppt",
            }
            for slide_no, slide in enumerate(deck.slides, start=1)
        ]
    except Exception as e:
        print(f"Error processing PPT: {e}")
        return []

def process_pdf_pages(pdf_file_path):
    """Open a PDF and gather the extracted content of each page.

    Returns a list of dicts with keys "page_number" (1-based), "data" and
    "page_type" ("pdf"). On any failure the error is printed and an empty
    list is returned.
    """
    try:
        with open(pdf_file_path, "rb") as handle:
            reader = PyPDF2.PdfReader(handle)
            # Build the full list while the file is still open; the pages
            # are read from the underlying handle.
            return [
                {
                    "page_number": page_no,
                    "data": extract_page_data(pdf_page, "pdf"),
                    "page_type": "pdf",
                }
                for page_no, pdf_page in enumerate(reader.pages, start=1)
            ]
    except Exception as e:
        print(f"Error processing PDF: {e}")
        return []

def process_files(file_path):
    """Dispatch a file to the PPTX or PDF processor by its extension.

    The extension check is case-insensitive. Unsupported extensions print a
    notice and produce an empty list.
    """
    lowered = file_path.lower()

    if lowered.endswith(".pptx"):
        return process_ppt_pages(file_path)

    if lowered.endswith(".pdf"):
        return process_pdf_pages(file_path)

    print("Unsupported file type.")
    return []

def _parse_model_json(response_text):
    """Best-effort decode of a model reply into a Python object.

    Tries, in order: the contents of the first fenced code block, the reply
    with fence markers stripped, and the outermost ``{...}`` span. Returns
    the first candidate that decodes, or None if none does.
    """
    candidates = []

    fenced = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', response_text)
    if fenced:
        candidates.append(fenced.group(1))

    candidates.append(re.sub(r'```json|```', '', response_text).strip())

    # Handles replies that wrap the JSON object in explanatory prose.
    start, end = response_text.find("{"), response_text.rfind("}")
    if 0 <= start < end:
        candidates.append(response_text[start:end + 1])

    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue
    return None


def extract_chemistry_qa_from_page(page_data):
    """Extract chemistry questions and answers from a page's text using Gemini.

    Args:
        page_data: dict with "slide_text" and "image_captions", each a list
            of strings (None entries are tolerated and treated as empty).

    Returns:
        None when the page has no non-whitespace text (the model is not
        called). Otherwise a dict decoded from the model's JSON reply, or
        ``{"questions": []}`` when the call fails or the reply cannot be
        decoded into a JSON object.
    """
    # ``or ""`` keeps str.join from raising if an extractor produced None.
    slide_text = "\n".join(text or "" for text in page_data["slide_text"])
    image_captions = "\n".join(text or "" for text in page_data["image_captions"])
    combined_text = slide_text + "\n" + image_captions

    if not combined_text.strip():
        return None

    prompt = f"""
    Extract chemistry questions and answers from the following text. 
    Format your response as a JSON object with the following structure:
    {{
        "questions": [
            {{
                "question_text": "Full text of the question",
                "options": {{
                    "A": "Text of option A",
                    "B": "Text of option B",
                    "C": "Text of option C",
                    "D": "Text of option D"
                }},
                "correct_answer": "Letter of the correct option (A, B, C, or D)",
                "explanation": "Explanation of the answer"
            }},
            ...
        ]
    }}
    
    If there are no clear questions and answers, return an empty questions array.
    If options are not lettered, assign them letters in order.
    
    Text: {combined_text}
    """

    # Top-level boundary for the remote call: any failure (network, blocked
    # response, unexpected reply type) degrades to an empty result so one bad
    # page does not abort the whole document.
    try:
        response = model.generate_content(prompt)
        response_text = response.text
        result = _parse_model_json(response_text)
    except Exception as e:
        print(f"Error extracting Q&A: {e}")
        return {"questions": []}

    # Callers index the result by "questions", so anything other than a JSON
    # object is treated as a parse failure.
    if not isinstance(result, dict):
        print(f"Failed to parse JSON from response: {response_text}")
        return {"questions": []}
    return result

def process_files_and_extract_qa(file_path):
    """Run Q&A extraction over every page of a file and pool the results.

    Each extracted question is tagged with the "page_number" and "page_type"
    of the page it came from. Returns the combined list of questions.
    """
    pages = process_files(file_path)
    total_pages = len(pages)
    collected = []

    for position, page in enumerate(pages, start=1):
        print(f"Processing page {position}/{total_pages}...")
        qa_data = extract_chemistry_qa_from_page(page["data"])

        # Skip pages that produced nothing usable.
        if not qa_data or "questions" not in qa_data:
            continue
        page_questions = qa_data["questions"]
        if not page_questions:
            continue

        for question in page_questions:
            question["page_number"] = page["page_number"]
            question["page_type"] = page["page_type"]
            collected.append(question)

    print(f"Extracted {len(collected)} questions total.")
    return collected

def store_qa_in_db(questions):
    """Store extracted Q&A entries in the Chroma vector collection.

    Each entry is rendered to a text block (question, options, correct
    answer, explanation), embedded, and upserted together with its page
    metadata.

    Args:
        questions: iterable of dicts with at least "question_text"; the keys
            "options" (dict), "correct_answer", "explanation", "page_number"
            and "page_type" are optional. Entries that are not dicts or have
            no question text are skipped with a notice.

    Returns:
        None.
    """
    import hashlib  # local import: only needed to derive stable IDs here

    for i, q in enumerate(questions):
        question_text = q.get("question_text") if isinstance(q, dict) else None
        if not question_text:
            print(f"Skipping question {i+1}: missing question_text")
            continue

        # Create full text representation for embedding
        parts = [f"Question: {question_text}\n"]

        options = q.get("options")
        if isinstance(options, dict):
            parts.extend(
                f"Option {option_key}: {option_text}\n"
                for option_key, option_text in options.items()
            )

        if "correct_answer" in q:
            parts.append(f"Correct Answer: {q['correct_answer']}\n")

        if "explanation" in q:
            parts.append(f"Explanation: {q['explanation']}\n")

        full_text = "".join(parts)

        # Positional IDs (q1, q2, ...) collide between runs because the
        # collection is persistent. A content-derived ID is unique per
        # distinct question and stable across re-ingestion of the same file.
        digest = hashlib.sha256(full_text.encode("utf-8")).hexdigest()[:16]
        question_id = f"q-{digest}"

        # Generate embeddings
        embedding = embedding_model.encode(full_text).tolist()

        # Upsert so that re-running ingestion updates rather than conflicts.
        collection.upsert(
            embeddings=[embedding],
            documents=[full_text],
            metadatas=[{
                "page_number": q.get("page_number", 0),
                "page_type": q.get("page_type", "unknown"),
                "question_id": question_id,
            }],
            ids=[question_id],
        )

        print(f"Stored question {i+1} in database with ID {question_id}")

def query_qa_db(query, n_results=5):
    """Embed the query text and look up the closest entries in the collection.

    Returns whatever ``collection.query`` returns for the top ``n_results``
    matches.
    """
    vector = embedding_model.encode(query).tolist()
    return collection.query(query_embeddings=[vector], n_results=n_results)

def generate_answer(user_query, retrieved_results):
    """Ask Gemini to answer the query using the retrieved documents as context.

    The first document list in ``retrieved_results["documents"]`` is joined
    with blank lines and embedded in the prompt; the model's text reply is
    returned.
    """
    documents = retrieved_results["documents"]
    if documents:
        context_docs = documents[0]
    else:
        context_docs = []
    context = "\n\n".join(context_docs)

    prompt = f"""
    Based on the following chemistry question and answer information, provide a detailed answer to the user's query.
    
    Retrieved Information:
    {context}
    
    User Query: {user_query}
    
    Provide a clear, detailed answer with explanations of the chemistry concepts involved. If the answer involves chemical reactions, explain the mechanism where appropriate.
    """

    return model.generate_content(prompt).text

def rag_pipeline(user_query, n_results=3):
    """Answer a query end to end: retrieve matching Q&A entries, then generate.

    Prints the query being processed and returns the generated answer text.
    """
    print(f"Processing query: {user_query}")
    hits = query_qa_db(user_query, n_results)
    return generate_answer(user_query, hits)

# Example usage
def main(file_path="chem.pdf", user_query="What is Grignard reagent?"):
    """Run the example pipeline: ingest a file, then answer one query.

    Args:
        file_path: PPTX or PDF file to extract questions from.
        user_query: Question to answer from the stored Q&A data. Passing
            "exit" (any case) skips the answering step.
    """
    # Step 1: Process the file and extract Q&A data
    print("Extracting questions and answers from PDF...")
    questions = process_files_and_extract_qa(file_path)

    # Step 2: Store the Q&A data in the vector database
    print("Storing Q&A data in the vector database...")
    store_qa_in_db(questions)

    # Step 3: Use the RAG pipeline to answer questions
    print("\nRAG pipeline ready! You can now ask chemistry questions.")
    print("Example queries:")
    print("- What is the mechanism for alkene bromination?")
    print("- How do Grignard reagents form?")
    print("- What is the difference between SN1 and SN2 reactions?")

    # The previous ``while True`` loop always broke after one iteration, so a
    # single guarded call is equivalent.
    if user_query.lower() == 'exit':
        return

    answer = rag_pipeline(user_query)
    print("\nAnswer:")
    print(answer)

# Run the example pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

# Notebook output: KeyboardInterrupt (the cell was interrupted while running)