<a href="https://colab.research.google.com/github/kim90000/2-rag-Successful/blob/main/suc_2rag_pdf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

###############################################################

In [None]:
# Notebook shell commands: install the packages used by the RAG script below.
# NOTE(review): faiss-gpu is installed here and listed again in the last command.
!pip install faiss-gpu
!pip install bitsandbytes
# Log in to Hugging Face from the notebook (prompts for a token).
!huggingface-cli login
!pip install sentence-transformers langchain langchain-community tqdm pypdf faiss-gpu



شغال رائع

In [None]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import ConversationalRetrievalChain
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain.memory import ConversationBufferMemory

# Use CUDA when it is available, otherwise fall back to the CPU.
# The allocator option is exported before any CUDA call is made below
# (presumably so the caching allocator picks it up — TODO confirm).
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# Module-level device string shared by the classes and functions below.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Process-wide float32 matmul precision setting.
torch.set_float32_matmul_precision('high')

def get_gpu_memory():
    """Return the total memory of CUDA device 0 in GiB.

    Returns:
        The device's ``total_memory`` divided by 1024**3, or ``0`` when
        CUDA is not available.
    """
    if not torch.cuda.is_available():
        return 0
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    return total_bytes / 1024**3

class RAGSystem:
    """Conversational retrieval-augmented question answering over PDFs.

    Constructing an instance loads the text-generation model and the
    embedding model. A vector index must then be supplied through
    ``create_vector_db`` or ``load_vector_db``; both also build the QA
    chain, after which ``query`` can be called.
    """

    def __init__(self, model_path="mistralai/Mistral-7B-Instruct-v0.2",
                 embedding_model="BAAI/bge-large-en-v1.5"):
        """Load the language model and the embedding model.

        Args:
            model_path: Model id or path passed to ``from_pretrained``.
            embedding_model: Model name passed to ``HuggingFaceEmbeddings``.
        """
        self.device = device
        self.vector_db = None
        self.qa_chain = None
        self.setup_model(model_path)
        self.setup_embeddings(embedding_model)

    def _build_pipeline(self, model_path, load_kwargs, max_new_tokens,
                        **pipeline_kwargs):
        """Load the causal LM and wrap it in a text-generation pipeline."""
        model = AutoModelForCausalLM.from_pretrained(model_path, **load_kwargs)
        return pipeline(
            "text-generation",
            model=model,
            tokenizer=self.tokenizer,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.95,
            device_map="auto",
            batch_size=1,
            **pipeline_kwargs,
        )

    def setup_model(self, model_path):
        """Load the tokenizer and LLM and store them on the instance.

        The dtype and quantization settings are derived from the memory
        of CUDA device 0. If the first load attempt raises, a second
        attempt is made with 8-bit weights, float16 and a smaller
        generation budget.

        Args:
            model_path: Model id or path passed to ``from_pretrained``.
        """
        print("تحميل النموذج...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

        gpu_memory = get_gpu_memory()
        dtype = torch.float16 if gpu_memory < 16 else torch.bfloat16

        load_kwargs = {
            "torch_dtype": dtype,
            "device_map": "auto",
            "trust_remote_code": True,
        }
        # GPU-specific options only make sense when a GPU was detected;
        # without one the memory cap would be "0GB" for a device that
        # does not exist and 8-bit loading would be requested needlessly.
        if gpu_memory > 0:
            load_kwargs["load_in_8bit"] = gpu_memory < 8
            load_kwargs["max_memory"] = {0: f"{int(gpu_memory * 0.9)}GB"}

        try:
            pipe = self._build_pipeline(
                model_path,
                load_kwargs,
                max_new_tokens=2048,
                model_kwargs={"torch_dtype": dtype},
            )
        except Exception as e:
            # Deliberate best-effort fallback: report the failure and
            # retry once with a lighter configuration. A second failure
            # propagates to the caller.
            print(f"خطأ أثناء تحميل النموذج: {str(e)}")
            load_kwargs.update({
                "load_in_8bit": True,
                "torch_dtype": torch.float16,
            })
            pipe = self._build_pipeline(model_path, load_kwargs, max_new_tokens=1024)

        self.llm = HuggingFacePipeline(pipeline=pipe)
        print("تم تحميل النموذج بنجاح")

    def setup_embeddings(self, model_name):
        """Create the embedding model on ``self.device``.

        Args:
            model_name: Model name passed to ``HuggingFaceEmbeddings``.
        """
        print("تحميل نموذج التضمين...")
        self.embeddings = HuggingFaceEmbeddings(
            model_name=model_name,
            model_kwargs={'device': self.device},
            encode_kwargs={'device': self.device, 'batch_size': 32}
        )
        print("تم تحميل نموذج التضمين بنجاح")

    def load_documents(self, pdf_paths):
        """Load the given PDFs and split their pages into chunks.

        Args:
            pdf_paths: Iterable of PDF file paths.

        Returns:
            The list of split documents (1024-character chunks with a
            64-character overlap).
        """
        print("تحميل المستندات...")
        pages = []
        for path in pdf_paths:
            pages.extend(PyPDFLoader(path).load())

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1024,
            chunk_overlap=64
        )
        doc_splits = text_splitter.split_documents(pages)

        print(f"تم تقسيم المستندات إلى {len(doc_splits)} جزء")
        return doc_splits

    def _build_qa_chain(self):
        """Create the conversational chain on top of ``self.vector_db``."""
        memory = ConversationBufferMemory(
            memory_key="chat_history",
            output_key='answer',
            return_messages=True
        )
        self.qa_chain = ConversationalRetrievalChain.from_llm(
            self.llm,
            retriever=self.vector_db.as_retriever(),
            memory=memory,
            chain_type="stuff",
            return_source_documents=True,
            verbose=False
        )
        print("تم إعداد نظام الأسئلة والأجوبة بنجاح")

    def create_vector_db(self, pdf_paths):
        """Build a FAISS index from the PDFs and prepare the QA chain.

        Args:
            pdf_paths: Iterable of PDF file paths to index.
        """
        splits = self.load_documents(pdf_paths)
        print("إنشاء قاعدة البيانات المتجهة...")
        self.vector_db = FAISS.from_documents(splits, self.embeddings)
        print("تم إنشاء قاعدة البيانات المتجهة بنجاح")
        self._build_qa_chain()

    def save_vector_db(self, path="vector_db"):
        """Persist the current index to ``path``; do nothing without one."""
        if self.vector_db:
            self.vector_db.save_local(path)
            print(f"تم حفظ قاعدة البيانات المتجهة في {path}")

    def load_vector_db(self, path="vector_db", allow_unsafe=False):
        """Load a saved FAISS index and prepare the QA chain.

        Args:
            path: Directory previously written by ``save_vector_db``.
            allow_unsafe: Forwarded as ``allow_dangerous_deserialization``.
                SECURITY: enabling it lets the loader unpickle the stored
                index, so only do so for files you created yourself.

        Returns:
            ``True`` when the index was loaded and the QA chain built,
            ``False`` when ``path`` does not exist or loading raised
            ``ValueError``.
        """
        if not os.path.exists(path):
            return False

        print("تحميل قاعدة البيانات المتجهة...")
        try:
            self.vector_db = FAISS.load_local(
                path,
                self.embeddings,
                allow_dangerous_deserialization=allow_unsafe
            )
        except ValueError as e:
            print(f"تحذير: {str(e)}")
            print("\nتأكد من أمان الملف أو استخدم allow_unsafe=True")
            return False
        else:
            print("تم تحميل قاعدة البيانات المتجهة بنجاح")
            # Without the chain a loaded index is unusable: query() would
            # reject every call. Built outside the try so chain errors are
            # not misreported as an unsafe-file warning.
            self._build_qa_chain()
            return True

    def query(self, question, chat_history=None):
        """Answer a question using the retrieval chain.

        Args:
            question: The user's question.
            chat_history: Optional iterable of ``(user_msg, bot_msg)``
                pairs; ``None`` means no prior turns.
                NOTE(review): the chain also owns a
                ``ConversationBufferMemory`` keyed ``chat_history``;
                confirm which of the two histories the chain really uses.

        Returns:
            A tuple ``(answer, sources)`` where ``sources`` describes up
            to three retrieved documents as ``{"content", "page"}`` dicts.
            ``page`` is ``None`` when a document has no integer page
            metadata.

        Raises:
            ValueError: If no QA chain has been built yet.
        """
        if not self.qa_chain:
            raise ValueError("يجب إنشاء قاعدة البيانات المتجهة أولاً")

        formatted_history = []
        for user_msg, bot_msg in chat_history or ():
            formatted_history.append(f"User: {user_msg}")
            formatted_history.append(f"Assistant: {bot_msg}")

        response = self.qa_chain.invoke({
            "question": question,
            "chat_history": formatted_history
        })

        answer = response["answer"]
        # Keep only the text after the last prompt marker when the model
        # output still contains the prompt.
        if "Helpful Answer:" in answer:
            answer = answer.split("Helpful Answer:")[-1].strip()

        sources = []
        for doc in response["source_documents"][:3]:
            page = doc.metadata.get("page")
            sources.append({
                "content": doc.page_content.strip(),
                # +1 presumably turns the loader's 0-based page index into
                # a human-readable page number — TODO confirm.
                "page": page + 1 if isinstance(page, int) else None,
            })

        return answer, sources

def main():
    """Run an interactive question-answering session over the PDF files.

    A saved vector index is reused when it can be loaded; otherwise the
    index is built from the PDFs and saved. The loop then reads questions
    from standard input until the user types an exit word, printing each
    answer together with its sources.
    """
    rag = RAGSystem()
    pdf_files = ["/content/The_Lightning_Thief_-_Percy_Jackson_1-10.pdf"]  # put your PDF paths here
    vector_db_path = "vector_db"

    # SECURITY: allow_unsafe=True permits pickle deserialization of the
    # stored index; only acceptable for an index written by this script.
    index_loaded = rag.load_vector_db(vector_db_path, allow_unsafe=True)
    # A loaded index is only usable once the QA chain exists; otherwise
    # rebuild from the PDFs so that query() does not fail.
    if not index_loaded or rag.qa_chain is None:
        rag.create_vector_db(pdf_files)
        rag.save_vector_db(vector_db_path)

    chat_history = []

    while True:
        question = input("\nاكتب سؤالك (او 'خروج' للإنهاء): ").strip()
        if not question:
            # Do not spend an LLM call on an empty line.
            continue
        if question.lower() in ('exit', 'quit', 'خروج'):
            break

        answer, sources = rag.query(question, chat_history)
        chat_history.append((question, answer))

        print(f"\nالإجابة: {answer}")
        print("\nالمصادر:")
        for src in sources:
            print(f"المحتوى: {src['content']} - الصفحة: {src['page']}")


if __name__ == "__main__":
    main()


#####################################################################

شغال رائع

In [None]:



# Notebook shell command: upgrade the packages used by the Chroma-based script below.
!pip install --upgrade langchain transformers chromadb




from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.base import Embeddings
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.llms import HuggingFacePipeline
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
import torch

class SentenceTransformerEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter around a SentenceTransformer model."""

    def __init__(self, model_name):
        """Instantiate the SentenceTransformer identified by ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        """Encode ``texts`` (with a progress bar) and return nested lists."""
        vectors = self.model.encode(texts, show_progress_bar=True)
        return vectors.tolist()

    def embed_query(self, text):
        """Encode a single query ``text`` and return it as a list."""
        vector = self.model.encode(text, show_progress_bar=False)
        return vector.tolist()

def setup_chroma_db(pdf_path, embedding_model_name="hkunlp/instructor-large"):
    """Index a PDF in a Chroma vector store.

    Args:
        pdf_path: Path of the PDF to load and split.
        embedding_model_name: Model name for ``SentenceTransformerEmbeddings``.

    Returns:
        The Chroma store built from the split documents.
    """
    chunks = PyPDFLoader(pdf_path).load_and_split()
    embedder = SentenceTransformerEmbeddings(embedding_model_name)
    return Chroma.from_documents(chunks, embedder)

def setup_mistral_pipeline(model_name="mistralai/Mistral-7B-Instruct-v0.2", device="cuda"):
    """Set up the Mistral model as a LangChain ``HuggingFacePipeline``.

    Args:
        model_name: Model id or path passed to ``from_pretrained``.
        device: Kept for backward compatibility; it is not used, because
            the model is placed with ``device_map="auto"``.

    Returns:
        A ``HuggingFacePipeline`` wrapping a text-generation pipeline.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")
    pipeline_model = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        # Budget the generated tokens only. A total-length limit also
        # counts the prompt, and a "stuff" prompt filled with retrieved
        # context can use up that limit before any answer is produced.
        max_new_tokens=1024,
        temperature=0.7,
        top_p=0.9
    )
    # Pass the pipeline object as a keyword argument to HuggingFacePipeline
    return HuggingFacePipeline(pipeline=pipeline_model)

def build_qa_chain(retriever, llm):
    """Create a "stuff" RetrievalQA chain that also returns its sources.

    Args:
        retriever: Retriever supplying the context documents.
        llm: Language model used to answer.

    Returns:
        The configured ``RetrievalQA`` chain.
    """
    # Spelled out line by line so the prompt's whitespace is explicit.
    template_text = (
        "\n"
        "        Use the following context to answer the question:\n"
        "        Context: {context}\n"
        "        Question: {question}\n"
        "        Answer in detail.\n"
        "        "
    )
    prompt = PromptTemplate(
        input_variables=["context", "question"],
        template=template_text,
    )
    return RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        chain_type="stuff",
        chain_type_kwargs={"prompt": prompt},
        return_source_documents=True,
    )

if __name__ == "__main__":
    # Script entry point: index one PDF in Chroma, load Mistral, and
    # answer a single hard-coded question, printing the answer and the
    # retrieved source passages.

    # Define paths and model names
    pdf_path = "/content/The_Lightning_Thief_-_Percy_Jackson_1-10.pdf"  # Path to your PDF
    embedding_model_name = "hkunlp/instructor-large"
    mistral_model_name = "mistralai/Mistral-7B-Instruct-v0.2"

    # Step 1: Set up ChromaDB
    print("Setting up ChromaDB...")
    vector_db = setup_chroma_db(pdf_path, embedding_model_name)
    retriever = vector_db.as_retriever()

    # Step 2: Set up Mistral pipeline
    print("Setting up Mistral pipeline...")
    mistral_pipeline = setup_mistral_pipeline(mistral_model_name)

    # Step 3: Build QA Chain
    print("Building QA chain...")
    qa_chain = build_qa_chain(retriever, mistral_pipeline)

    # Step 4: Ask a question
    question = "What is the main topic of the document?"
    print(f"Question: {question}")

    # Step 5: Get the answer. invoke() replaces the deprecated direct
    # call on the chain object.
    result = qa_chain.invoke({"query": question})
    answer = result["result"]
    source_documents = result["source_documents"]

    # Step 6: Display the answer and sources
    print("\nAnswer:")
    print(answer)
    print("\nSource Documents:")
    for doc in source_documents:
        print(doc.page_content)
