# Make PDF files as Retrieval-Augmented Generation Embedding

## Data Preparation

### Packages

In [12]:
# required libraries

# !pip install pypdf PyPDF2 langchain transformers datasets 

In [13]:
import os
import PyPDF2

In [22]:
from langchain.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
# from langchain.llms import OpenAI 
# from langchain.embeddings import OpenAIEmbeddings

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sentence_transformers import SentenceTransformer


ModuleNotFoundError: No module named 'sentence_transformers'

### Load

In [5]:
# Setting the pdf file location
loader = PyPDFLoader('test_pdf/Walker_2010.pdf')

# Setting the pdf loader
docs = loader.load()

# Printing out the loaded document
print(docs[10].page_content[:300])

ImportError: pypdf package not found, please install it with `pip install pypdf`

In [9]:
# Step 1: Extract text and preprocess it
def extract_text_from_pdfs(pdf_folder):
    documents = []
    for file_name in os.listdir(pdf_folder):
        if file_name.endswith('.pdf'):
            file_path = os.path.join(pdf_folder, file_name)
            loader = PyPDFLoader(file_path)
            docs = loader.load()
            documents.extend(docs)
    return documents

In [8]:
documents = []
for file_name in os.listdir('test_pdf'):
    if file_name.endswith('.pdf'):
        file_path = os.path.join('test_pdf', file_name)
        loader = PyPDFLoader(file_path)
        docs = loader.load()
        documents.extend(docs)

In [10]:
# Step 2: Split text into manageable chunks
def split_documents(documents):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    )
    return text_splitter.split_documents(documents)

In [11]:
# Step 3: Generate embeddings and store in FAISS
def create_vector_store(documents, vectorstore_path, embedding_model_name="sentence-transformers/all-MiniLM-L6-v2"):
    embedding_model = SentenceTransformer(embedding_model_name)
    texts = [doc.page_content for doc in documents]
    metadatas = [doc.metadata for doc in documents]
    embeddings = embedding_model.encode(texts, show_progress_bar=True)
    vector_store = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
    vector_store.save_local(vectorstore_path)
    return vector_store

In [17]:
# Step 4: Load FAISS vector store
def load_vector_store(vectorstore_path, embedding_model_name="sentence-transformers/all-MiniLM-L6-v2"):
    embedding_model = SentenceTransformer(embedding_model_name)
    return FAISS.load_local(vectorstore_path, embedding_model)

In [18]:
# Step 5: Create a RAG pipeline
def create_rag_pipeline(vector_store, llm_model_name="decapoda-research/llama-7b-hf"):
    # Load the LLaMA model
    tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
    model = AutoModelForCausalLM.from_pretrained(llm_model_name, device_map="auto", torch_dtype="auto")

    # Use the LLaMA model in a pipeline
    llm = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)

    # Create a retriever and QA chain
    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 5})
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        chain_type="stuff",
        return_source_documents=True
    )
    return qa_chain

In [20]:
# Main function
def main(pdf_folder, vectorstore_path, llm_model_name="decapoda-research/llama-7b-hf"):
    if not os.path.exists(vectorstore_path):
        print("Extracting and embedding documents...")
        documents = extract_text_from_pdfs(pdf_folder)
        split_docs = split_documents(documents)
        create_vector_store(split_docs, vectorstore_path)
    else:
        print("Loading existing vector store...")
    
    vector_store = load_vector_store(vectorstore_path)
    qa_chain = create_rag_pipeline(vector_store, llm_model_name)

    # Interactive Q&A
    while True:
        query = input("\nEnter your query (or 'exit' to quit): ")
        if query.lower() == 'exit':
            break
        result = qa_chain.run(query)
        answer = result['answer']
        sources = result['source_documents']
        print(f"\nAnswer: {answer}")
        print("\nSources:")
        for doc in sources:
            print(f"- {doc.metadata['source']}")



In [21]:
if __name__ == "__main__":
    pdf_folder = "test_pdf"  # Replace with your folder path
    vectorstore_path = "vector_store"  # Directory to save FAISS vector store
    main(pdf_folder, vectorstore_path)

Extracting and embedding documents...


NameError: name 'SentenceTransformer' is not defined

### RAG

### Embed

### Save