In [8]:
import os
from dotenv import load_dotenv
import PyPDF2
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import HuggingFaceEmbeddings
import google.generativeai as genai
import pymongo
from langchain import hub
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from docx import Document
import numpy as np
from pathlib import Path

In [9]:
class SimpleRAGApp:
    """Small retrieval-augmented-generation (RAG) pipeline over a single PDF.

    The stages implemented so far are:

    1. ``extract_pdf_text``       - read the raw text of every PDF page.
    2. ``create_semantic_chunks`` - split that text with ``SemanticChunker``.

    ``process_pdf_and_query`` drives those stages. Embedding storage, answer
    generation and DOCX export are not implemented yet (see the TODO there).
    """

    def __init__(
        self,
        embedding_model="all-MiniLM-L6-v2",
        llm_model="gemini-1.5-flash",
        db_name="rag_database",
        collection_name="document_chunks",
    ):
        """Create the clients and models used by the pipeline.

        Args:
            embedding_model: HuggingFace model name used for semantic chunking.
            llm_model: Gemini model name passed to ``ChatGoogleGenerativeAI``.
            db_name: MongoDB database name.
            collection_name: MongoDB collection name inside ``db_name``.

        Environment:
            GOOGLE_API_KEY: key handed to ``genai.configure`` and the chat model.
            MONGODB_URI: connection string handed to ``pymongo.MongoClient``.

        All defaults reproduce the previously hard-coded values, so
        ``SimpleRAGApp()`` behaves as before.
        """
        print("🚀 Starting Simple RAG Application...")

        # Read a local .env file (if any) so the os.getenv() calls below can
        # see its values; variables already set in the process are kept.
        load_dotenv()

        # Surface missing configuration early instead of failing later with a
        # less obvious error from the client libraries.
        for required_var in ("GOOGLE_API_KEY", "MONGODB_URI"):
            if not os.getenv(required_var):
                print(f"⚠️ Environment variable {required_var} is not set")

        # Setup Google AI
        self.google_api_key = os.getenv("GOOGLE_API_KEY")
        genai.configure(api_key=self.google_api_key)

        # Setup MongoDB
        # NOTE(review): with MONGODB_URI unset this passes None to MongoClient;
        # presumably pymongo then uses its default host - confirm.
        self.mongo_uri = os.getenv("MONGODB_URI")
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[db_name]
        self.collection = self.db[collection_name]

        # Setup embeddings for semantic chunking
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

        # Setup semantic chunker
        self.text_splitter = SemanticChunker(embeddings=self.embeddings)

        # Setup Gemini LLM
        self.model = ChatGoogleGenerativeAI(
            model=llm_model,
            google_api_key=self.google_api_key
        )

        # Setup RAG prompt
        # NOTE(review): hub.pull prints a deprecation notice recommending the
        # langsmith SDK's pull_prompt; kept to avoid adding a new dependency.
        self.prompt = hub.pull("rlm/rag-prompt")

        print("✅ RAG Application initialized!")

    @staticmethod
    def _join_page_texts(pages):
        """Return the text of all ``pages`` joined with a newline per page break.

        Each element of ``pages`` must provide ``extract_text()``. A falsy
        result (for example a page without a text layer) contributes an empty
        string instead of breaking the concatenation.
        """
        page_texts = []
        for page_num, page in enumerate(pages, start=1):
            page_texts.append(page.extract_text() or "")
            print(f"   ✅ Page {page_num} processed")
        # The newline keeps the last word/sentence of one page from fusing
        # with the first word of the next, which would blur chunk boundaries.
        return "\n".join(page_texts)

    def extract_pdf_text(self, pdf_path):
        """Step 1: Extract text from PDF.

        Args:
            pdf_path: path of the PDF file to read.

        Returns:
            The text of all pages separated by newlines, or ``None`` when the
            file cannot be opened or parsed (the error is printed, not raised).
        """
        print(f"📄 Reading PDF: {pdf_path}")

        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                text = self._join_page_texts(pdf_reader.pages)

            print(f"📊 Total text extracted: {len(text)} characters")
            return text

        except Exception as e:
            # Best-effort by design: callers test the result for falsiness.
            print(f"❌ Error reading PDF: {e}")
            return None

    def create_semantic_chunks(self, text):
        """Step 2: Create semantic chunks.

        Args:
            text: the document text to split.

        Returns:
            The list produced by ``self.text_splitter.split_text(text)``, or an
            empty list when ``text`` is empty/blank/not a string or when the
            splitter raises (the error is printed, not raised).
        """
        print("🧠 Creating semantic chunks...")

        # Nothing to split: skip the embedding work entirely.
        if not isinstance(text, str) or not text.strip():
            print("⚠️ No text to chunk")
            return []

        try:
            chunks = self.text_splitter.split_text(text)
            print(f"✅ Created {len(chunks)} semantic chunks")

            # Show sample chunks
            for i, chunk in enumerate(chunks[:2]):
                print(f"   Sample Chunk {i+1}: {chunk[:100]}...")

            return chunks

        except Exception as e:
            print(f"❌ Error creating chunks: {e}")
            return []

    def process_pdf_and_query(self, pdf_path, query):
        """Complete RAG pipeline (currently: extraction and chunking only).

        Args:
            pdf_path: path of the PDF file to process.
            query: the user question; not used yet because answer generation
                is not implemented.

        Returns:
            Always ``None`` for now, whether the pipeline stops early or
            finishes the implemented stages.
        """
        print("🎯 Starting complete RAG pipeline...")

        # Step 1: Extract text from PDF
        text = self.extract_pdf_text(pdf_path)
        if not text:
            print("⚠️ No text extracted from the PDF; stopping pipeline.")
            return None

        # Step 2: Create semantic chunks
        chunks = self.create_semantic_chunks(text)
        if not chunks:
            print("⚠️ No chunks created; stopping pipeline.")
            return None

        # TODO: remaining stages, none of which exist on this class yet:
        #   Step 3: chunk_embeddings = self.generate_embeddings(chunks)
        #   Step 4: self.store_in_mongodb(chunk_embeddings)
        #   Step 5: answer = self.generate_answer(query)
        #   Step 6: self.save_to_docx(query, answer)
        # Once they are implemented, print the question/answer summary and
        # return ``answer`` instead of None.
        return None

    

In [15]:
# Initialize RAG app
rag_app = SimpleRAGApp()

# PDF to process. Set the RAG_PDF_PATH environment variable to point at a
# different file; the fallback is the original local path so existing runs
# keep working unchanged.
pdf_path = os.getenv("RAG_PDF_PATH", r"d:\AI\assginment_env\data\pythonai.pdf")

query = "What is the main topic of this document?"  # Change this to your question

# Run complete pipeline
# NOTE: process_pdf_and_query has no value-returning path yet, so `answer`
# is None until the answer-generation stage is implemented.
answer = rag_app.process_pdf_and_query(pdf_path, query)

🚀 Starting Simple RAG Application...


Please use the `langsmith sdk` instead:
  pip install langsmith
Use the `pull_prompt` method.
  res_dict = client.pull_repo(owner_repo_commit)


✅ RAG Application initialized!
🎯 Starting complete RAG pipeline...
📄 Reading PDF: d:\AI\assginment_env\data\pythonai.pdf
   ✅ Page 1 processed
   ✅ Page 2 processed
   ✅ Page 3 processed
   ✅ Page 4 processed
   ✅ Page 5 processed
   ✅ Page 6 processed
   ✅ Page 7 processed
   ✅ Page 8 processed
   ✅ Page 9 processed
   ✅ Page 10 processed
   ✅ Page 11 processed
   ✅ Page 12 processed
   ✅ Page 13 processed
   ✅ Page 14 processed
   ✅ Page 15 processed
   ✅ Page 16 processed
   ✅ Page 17 processed
   ✅ Page 18 processed
   ✅ Page 19 processed
   ✅ Page 20 processed
   ✅ Page 21 processed
   ✅ Page 22 processed
   ✅ Page 23 processed
   ✅ Page 24 processed
   ✅ Page 25 processed
   ✅ Page 26 processed
   ✅ Page 27 processed
   ✅ Page 28 processed
   ✅ Page 29 processed
   ✅ Page 30 processed
   ✅ Page 31 processed
   ✅ Page 32 processed
   ✅ Page 33 processed
   ✅ Page 34 processed
   ✅ Page 35 processed
   ✅ Page 36 processed
   ✅ Page 37 processed
   ✅ Page 38 processed
   ✅ Page 39 p