# Make PDF files as Retrieval-Augmented Generation Embedding

## Data Preparation

### Packages

In [36]:
# required libraries

# pip install pypdf PyPDF2 langchain transformers datasets sentence-transformers langchain-community langchain[embeddings]

# pip install google-auth google-auth-oauthlib google-api-python-client


In [37]:
import os
import PyPDF2

In [38]:
# from langchain.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
# from langchain.llms import OpenAI 
# from langchain.embeddings import OpenAIEmbeddings

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sentence_transformers import SentenceTransformer


In [39]:
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from google.oauth2 import service_account
import io

### Google Drive

In [40]:
# Google Drive API credentials
SERVICE_ACCOUNT_FILE = 'google_service_key/confident-coda-260302-dd39ee8cec44.json'
SCOPES = ['https://www.googleapis.com/auth/drive']

In [41]:
def authenticate_google_drive():
    """Authenticate with Google Drive API using a service account."""
    credentials = service_account.Credentials.from_service_account_file(
        SERVICE_ACCOUNT_FILE, scopes=SCOPES
    )
    return build('drive', 'v3', credentials=credentials)

In [42]:
def get_pdfs_from_folder(drive_service, folder_id):
    """List and download PDF files from a specific Google Drive folder."""
    query = f"'{folder_id}' in parents and mimeType='application/pdf'"
    results = drive_service.files().list(q=query, fields="files(id, name)").execute()
    files = results.get('files', [])
    
    # Download the PDF files
    pdf_dir = "google_drive_pdfs"
    os.makedirs(pdf_dir, exist_ok=True)
    
    for file in files:
        file_id = file['id']
        file_name = file['name']
        request = drive_service.files().get_media(fileId=file_id)
        file_path = os.path.join(pdf_dir, file_name)
        
        with io.FileIO(file_path, 'wb') as fh:
            downloader = MediaIoBaseDownload(fh, request)
            done = False
            while not done:
                status, done = downloader.next_chunk()
                print(f"Downloaded {file_name} - {int(status.progress() * 100)}%")
    
    return pdf_dir


## RAG Embedding - Vector Store

- Work flow
1) Document loader
2) Text splitter
3) Embedding
4) Vector Store


### Test

In [43]:
# Setting the pdf file location
loader = PyPDFLoader('test_pdf/Walker_2010.pdf')

# Setting the pdf loader
docs = loader.load()

# Printing out the loaded document
print(docs[0].page_content[:300])

Disparities and access to healthy food in the United States: A review of food
deserts literature
Renee E. Walkera,b,n, Christopher R. Keanea, Jessica G. Burkea
a Department of Behavioral and Community Health Sciences, University of Pittsburgh Graduate School of Public Health, 130 DeSoto Street, Pitt


### Definition

In [44]:
# Step 1: Extract text and preprocess it
def extract_text_from_pdfs(pdf_folder):
    documents = []
    for file_name in os.listdir(pdf_folder):
        if file_name.endswith('.pdf'):
            file_path = os.path.join(pdf_folder, file_name)
            loader = PyPDFLoader(file_path)
            docs = loader.load()
            documents.extend(docs)
    return documents

In [46]:
# Step 2: Split text into manageable chunks
def split_documents(documents):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=200  # chunk size and overlap
    )
    return text_splitter.split_documents(documents)

In [47]:
# Step 3: Generate embeddings and store in FAISS
def create_vector_store(documents, vectorstore_path, embedding_model_name="sentence-transformers/all-MiniLM-L6-v2"):
    from langchain.embeddings import HuggingFaceEmbeddings

    # Use HuggingFaceEmbeddings wrapper for SentenceTransformer
    embedding_function = HuggingFaceEmbeddings(model_name=embedding_model_name)
    
    texts = [doc.page_content for doc in documents]
    metadatas = [doc.metadata for doc in documents]

    # Create FAISS vector store with the embedding function
    vector_store = FAISS.from_texts(texts, embedding=embedding_function, metadatas=metadatas)
    vector_store.save_local(vectorstore_path)
    return vector_store

In [48]:
# Step 4: Load FAISS vector store
def load_vector_store(vectorstore_path, embedding_model_name="sentence-transformers/all-MiniLM-L6-v2"):
    embedding_model = SentenceTransformer(embedding_model_name)
    return FAISS.load_local(vectorstore_path, embedding_model)

In [49]:
# Step 5: Create a RAG pipeline
def create_rag_pipeline(vector_store, llm_model_name="decapoda-research/llama-7b-hf"):
    # Load the LLaMA model
    tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
    model = AutoModelForCausalLM.from_pretrained(llm_model_name, device_map="auto", torch_dtype="auto")

    # Use the LLaMA model in a pipeline
    llm = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)

    # Create a retriever and QA chain
    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 5})
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        chain_type="stuff",
        return_source_documents=True
    )
    return qa_chain

In [50]:
# def main(google_folder_id, vectorstore_path):
#     drive_service = authenticate_google_drive()
#     pdf_folder = get_pdfs_from_folder(drive_service, google_folder_id)
#     documents = extract_text_from_pdfs(pdf_folder)
#     split_docs = split_documents(documents)
#     create_vector_store(split_docs, vectorstore_path)
#     print("Embedding creation complete!")

In [54]:
# def main(google_folder_id, vectorstore_path, embedding_model_name="sentence-transformers/all-MiniLM-L6-v2"):
#     """
#     Process all PDF files in the Google Drive folder, embedding and storing vectors.
#     Skips files with existing vector stores.
#     """
#     # Authenticate Google Drive API
#     drive_service = authenticate_google_drive()

#     # Query for PDF files in the Google Drive folder
#     query = f"'{google_folder_id}' in parents and mimeType='application/pdf'"
#     results = drive_service.files().list(q=query, fields="files(id, name)").execute()
#     files = results.get('files', [])

#     if not files:
#         print("No PDF files found in the Google Drive folder.")
#         return

#     # Ensure vector store path exists
#     os.makedirs(vectorstore_path, exist_ok=True)

#     # Temporary folder to store downloaded PDFs
#     temp_pdf_folder = "temp_pdfs"
#     os.makedirs(temp_pdf_folder, exist_ok=True)

#     # Iterate over all PDF files
#     for file in files:
#         file_id = file['id']
#         file_name = file['name']
#         temp_pdf_path = os.path.join(temp_pdf_folder, file_name)

#         # Check if vector store for the file already exists
#         vectorstore_file = os.path.join(vectorstore_path, f"{file_name}.faiss")
#         metadata_file = os.path.join(vectorstore_path, f"{file_name}.pkl")
#         if os.path.exists(vectorstore_file) and os.path.exists(metadata_file):
#             print(f"Vector store already exists for {file_name}, skipping embedding.")
#             continue

#         # Download the file
#         print(f"Downloading {file_name} from Google Drive...")
#         try:
#             request = drive_service.files().get_media(fileId=file_id)
#             with io.FileIO(temp_pdf_path, 'wb') as fh:
#                 downloader = MediaIoBaseDownload(fh, request)
#                 done = False
#                 while not done:
#                     status, done = downloader.next_chunk()
#                     print(f"Download progress: {int(status.progress() * 100)}%")
#         except Exception as e:
#             print(f"Failed to download {file_name}: {e}")
#             continue

#         # Process the downloaded PDF
#         try:
#             print(f"Processing {file_name}...")

#             # Extract text from the PDF
#             documents = extract_text_from_pdfs(temp_pdf_folder)

#             # Split documents into chunks
#             split_docs = split_documents(documents)

#             # Create and save the vector store
#             create_vector_store(split_docs, vectorstore_path, embedding_model_name)
#             print(f"Vector store saved for {file_name} at {vectorstore_path}")

#         except Exception as e:
#             print(f"Failed to process {file_name}: {e}")
#             continue

#     print("All files processed.")

In [58]:
def main(google_folder_id, vectorstore_path, batch_size=5, embedding_model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """
    Process PDF files in batches of 5 from the Google Drive folder, embedding and storing vectors.
    Skips files with existing vector stores.
    """
    # Authenticate Google Drive API
    drive_service = authenticate_google_drive()

    # Query for PDF files in the Google Drive folder
    query = f"'{google_folder_id}' in parents and mimeType='application/pdf'"
    results = drive_service.files().list(q=query, fields="files(id, name)").execute()
    files = results.get('files', [])

    if not files:
        print("No PDF files found in the Google Drive folder.")
        return

    # Ensure vector store path exists
    os.makedirs(vectorstore_path, exist_ok=True)

    # Temporary folder to store downloaded PDFs
    temp_pdf_folder = "temp_pdfs"
    os.makedirs(temp_pdf_folder, exist_ok=True)

    # Process files in batches
    total_files = len(files)
    for i in range(0, total_files, batch_size):
        batch_files = files[i:i + batch_size]
        print(f"\nProcessing batch {i // batch_size + 1}/{(total_files + batch_size - 1) // batch_size}...")

        for file in batch_files:
            file_id = file['id']
            file_name = file['name']
            temp_pdf_path = os.path.join(temp_pdf_folder, file_name)

            # Check if vector store for the file already exists
            vectorstore_file = os.path.join(vectorstore_path, f"{file_name}.faiss")
            metadata_file = os.path.join(vectorstore_path, f"{file_name}.pkl")
            if os.path.exists(vectorstore_file) and os.path.exists(metadata_file):
                print(f"Vector store already exists for {file_name}, skipping embedding.")
                continue

            # Download the file
            print(f"Downloading {file_name} from Google Drive...")
            try:
                request = drive_service.files().get_media(fileId=file_id)
                with io.FileIO(temp_pdf_path, 'wb') as fh:
                    downloader = MediaIoBaseDownload(fh, request)
                    done = False
                    while not done:
                        status, done = downloader.next_chunk()
                        print(f"Download progress: {int(status.progress() * 100)}%")
            except Exception as e:
                print(f"Failed to download {file_name}: {e}")
                continue

            # Process the downloaded PDF
            try:
                print(f"Processing {file_name}...")

                # Extract text from the PDF
                documents = extract_text_from_pdfs(temp_pdf_folder)

                # Split documents into chunks
                split_docs = split_documents(documents)

                # Create and save the vector store
                create_vector_store(split_docs, vectorstore_path, embedding_model_name)
                print(f"Vector store saved for {file_name} at {vectorstore_path}")

            except Exception as e:
                print(f"Failed to process {file_name}: {e}")
                continue

    print("All files processed.")

### Run

In [None]:
# if __name__ == "__main__":
#     GOOGLE_FOLDER_ID = "1rnq1P_IGpzZzXp8iHaKLKt6BlJXkT2FI"  # Replace with your Google Drive folder ID
#     VECTORSTORE_PATH = "vector_store"                        # Directory to save FAISS vector store
#     main(GOOGLE_FOLDER_ID, VECTORSTORE_PATH)

Downloading Smets_2022_IJERPH_The changing landscape of food deserts and swamps over more than a decade in Flanders Belgium.pdf from Google Drive...
Download progress: 100%
Processing Smets_2022_IJERPH_The changing landscape of food deserts and swamps over more than a decade in Flanders Belgium.pdf...
Vector store saved for Smets_2022_IJERPH_The changing landscape of food deserts and swamps over more than a decade in Flanders Belgium.pdf at vector_store
Downloading Amin_2021_Food policy_Predicting access to healthful food retailers with ML.pdf from Google Drive...
Download progress: 100%
Processing Amin_2021_Food policy_Predicting access to healthful food retailers with ML.pdf...
Vector store saved for Amin_2021_Food policy_Predicting access to healthful food retailers with ML.pdf at vector_store
Downloading Ares_2024_Health and Place_WEIRD and non-consensual food deserts and swamps - a scoping review of operational definitions.pdf from Google Drive...
Download progress: 100%
Processin

In [59]:
if __name__ == "__main__":
    GOOGLE_FOLDER_ID = "1rnq1P_IGpzZzXp8iHaKLKt6BlJXkT2FI"  # Replace with your Google Drive folder ID
    VECTORSTORE_PATH = "vector_store"                        # Directory to save FAISS vector store

    main(GOOGLE_FOLDER_ID, VECTORSTORE_PATH, batch_size=5)


Processing batch 1/6...
Downloading Smets_2022_IJERPH_The changing landscape of food deserts and swamps over more than a decade in Flanders Belgium.pdf from Google Drive...
Download progress: 100%
Processing Smets_2022_IJERPH_The changing landscape of food deserts and swamps over more than a decade in Flanders Belgium.pdf...
Vector store saved for Smets_2022_IJERPH_The changing landscape of food deserts and swamps over more than a decade in Flanders Belgium.pdf at vector_store
Downloading Amin_2021_Food policy_Predicting access to healthful food retailers with ML.pdf from Google Drive...
Download progress: 100%
Processing Amin_2021_Food policy_Predicting access to healthful food retailers with ML.pdf...
Vector store saved for Amin_2021_Food policy_Predicting access to healthful food retailers with ML.pdf at vector_store
Downloading Ares_2024_Health and Place_WEIRD and non-consensual food deserts and swamps - a scoping review of operational definitions.pdf from Google Drive...
Download