<a href="https://colab.research.google.com/github/Nitroblaster99/YannisGerontopoulos_MLE_Assignment/blob/main/RAG_pdf_extract_and_classify.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install Libraries

In [1]:
!pip install langchain --quiet
!pip install langchain-community --quiet
!pip install torch --quiet
!pip install pypdf --quiet
!pip install faiss-cpu --quiet
!pip install flash-attention --quiet
!pip install transformers --quiet
!pip install langchain-huggingface --quiet

In [2]:
# Import necessary libraries
import warnings
warnings.filterwarnings("ignore")  # Ignore warnings to keep the output clean
from langchain.document_loaders import PyPDFLoader  # To load PDFs
from langchain.text_splitter import RecursiveCharacterTextSplitter  # To split large documents into chunks
from langchain_core.prompts import PromptTemplate  # To define custom prompts for the language model
from langchain.chains import RetrievalQA  # For performing Question-Answer retrieval
import torch
from langchain.embeddings import HuggingFaceEmbeddings  # Embeddings for FAISS
from langchain import HuggingFaceHub  # HuggingFace model loading
from langchain.vectorstores import FAISS  # FAISS for document indexing and retrieval
import re  # Regex for cleaning text and extracting relevant info
import zipfile  # For extracting uploaded zip file
import json  # For saving final output in JSON format
from google.colab import files  # For file upload/download in Google Colab
import os  # To work with file paths

Upload Zipped Files

In [4]:
# Upload the zip file containing PDFs
uploaded = files.upload()  # This triggers file upload via Google Colab UI

Saving ICDAR2024_papers.zip to ICDAR2024_papers.zip


In [5]:
# Extract the uploaded zip file
zip_filename = list(uploaded.keys())[0]  # Get the uploaded zip file name
with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
    zip_ref.extractall('/content/ICDAR2024_papers/ICDAR2024_papers.zip')  # Extract the contents of the zip file into a folder

In [13]:
# Function to get all PDF file paths from the extracted folder
def get_pdf_paths_from_folder(folder_path):
    """Walk through the folder and collect all PDF file paths."""
    pdf_paths = []
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.lower().endswith('.pdf'):  # Only collect PDF files
                pdf_paths.append(os.path.join(root, file))
    return pdf_paths

In [6]:
# Function to open and clean PDFs
def open_pdfs(file_path):
    """Load and clean the text from the PDF."""
    try:
        pdf_loader = PyPDFLoader(file_path)  # Use PyPDFLoader to load the PDF
        pdf = pdf_loader.load()  # Load PDF content

        # Clean up the text content by removing excessive newlines
        cleaned_pdf = []
        if isinstance(pdf, list):  # If multiple documents are returned (edge case)
            for doc in pdf:
                doc.page_content = " ".join(doc.page_content.split())  # Remove excessive newlines
                cleaned_pdf.append(doc)
            return cleaned_pdf
        else:
            pdf.page_content = " ".join(pdf.page_content.split())  # Clean single document
            return [pdf]
    except Exception as e:
        return [f"Error opening {file_path}: {e}"]  # Handle errors

Split Documents

In [7]:
# Split the documetns
def text_splits(data, size: int, overlap: int):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=size, chunk_overlap=overlap)
    chunks = text_splitter.split_documents(data)
    return chunks

Download Embedding and LLM model

In [8]:
# Load the embedding model
embeddings = HuggingFaceEmbeddings(model_name='mixedbread-ai/mxbai-embed-large-v1')  # Load embedding model once

  embeddings = HuggingFaceEmbeddings(model_name='mixedbread-ai/mxbai-embed-large-v1')  # Load embedding model once


In [9]:
# Load the LLM
llm = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",  # Specify the LLM
    model_kwargs=dict(max_new_tokens=1024, temperature=0.1, verbose=False),  # LLM parameters
    huggingfacehub_api_token=""  # HuggingFace API token
)

  llm = HuggingFaceHub(


Initialize json format and helper functions

In [10]:
# Initialize the JSON structure
final_json = {
    "tables": [],
    "classification": [],
    "keyInformationExtraction": [],
    "opticalCharacterRecognition": [],
    "datasets": [],
    "layoutUnderstanding": [],
    "others": []
}

In [11]:
# Helper function to normalize strings (lowercase, no punctuation, no extra spaces)
def normalize_string(s):
    """Normalize a string by lowercasing, removing punctuation, and extra whitespace."""
    return ' '.join(re.sub(r'[^\w\s]', '', s.lower()).split())

# Normalize the list of authors (remove brackets and extra spaces)
def normalize_authors(authors):
    """Normalize the authors list by removing brackets and extra whitespace."""
    return [normalize_string(author.strip('[]')) for author in authors]

# Check if an entry already exists in the category list to avoid duplicates
def entry_exists(category_list, title, authors):
    """Check if an entry with the same normalized title and authors exists."""
    normalized_title = normalize_string(title)
    normalized_authors = normalize_authors(authors)
    for entry in category_list:
        if (normalize_string(entry["title"]) == normalized_title and
            normalize_authors(entry["authors"]) == normalized_authors):
            return True  # Entry exists
    return False  # Entry doesn't exist

In [14]:
# Get the paths of all PDFs from the extracted folder
folder_path = '/content/ICDAR2024_papers/ICDAR2024_papers.zip/ICDAR2024_proceedings_pdfs'  # Path to extracted folder
file_paths = get_pdf_paths_from_folder(folder_path)

Initialize Retriever, Prompt Template and run RetrivalQA chain

In [15]:
# Initialize a set to keep track of processed entries to avoid duplicates
processed_entries = set()

# Process each PDF file
for file_path in file_paths:
    # Load and clean the PDF
    opened_pdfs = open_pdfs(file_path)
    splits = text_splits(opened_pdfs, 2048, 512)  # Split the PDF into chunks

    # Create a vector store and retriever for the current document
    vector_store = FAISS.from_documents(splits, embeddings)  # Create FAISS index for this document
    retriever = vector_store.as_retriever()  # Create a retriever from the FAISS index

    # QA chain setup for this document
    qa_stuff = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",  # Using the "stuff" chain type for simple Q&A
        retriever=retriever,
        verbose=False,
        return_source_documents=False,
    )

    # Define the prompt for extracting metadata from the paper
    prompt_template = """Your task is to read the content of a scientific paper.
    First, extract the title of the paper which is usually on the top.
    After, list all the authors from the paper.
    Then, based on the content and focus of the paper, categorize it into one of the following categories: Tables, Classification,
    Key Information Extraction, Optical Character Recognition (OCR), Datasets, Document Layout Understanding, or Others.

    After processing the paper, return only the following in a structured format:
    Title: The title of the paper.
    Authors: A list of authors.
    Category: One of the above categories based on the paper's primary subject matter.
    Ensure your categorization is based on the main focus of the paper, even if it overlaps with multiple categories.
    """

    # Get the response from the model
    response = qa_stuff.run(prompt_template)

    # Clean the response
    cleaned_response = re.sub(r'Use the following pieces of context to answer the question.*?Helpful Answer:', '', response, flags=re.DOTALL).strip()
    final_response = cleaned_response.replace('\n\n', ' ')  # Replace double newlines with a space

    # Extract the title, authors, and category using regex patterns

    # Extract the title by searching for "Title:" followed by any characters (.*)
    title_match = re.search(r"Title:\s*(.*)", final_response)
    # Extract the authors by searching for "Authors:" followed by any characters (.*)
    authors_match = re.search(r"Authors:\s*(.*)", final_response)
    # Extract the category by searching for "Category:" followed by any characters (.*)
    category_match = re.search(r"Category:\s*(.*)", final_response)

    # Get the file name from the file path
    file_name = os.path.basename(file_path)
    # If a title match is found, strip extra spaces; otherwise, return "Title not found"
    title = title_match.group(1).strip() if title_match else "Title not found"
    # If authors are found, split them by comma and space, otherwise return an empty list
    authors = authors_match.group(1).strip().split(", ") if authors_match else []
    # If a category match is found, strip extra spaces; otherwise, return "Other"
    category = category_match.group(1).strip() if category_match else "Other"

    # Create a unique identifier for this entry
    entry_id = f"{normalize_string(title)}|{'|'.join(normalize_authors(authors))}"

    # Check if this entry has already been processed to avoid duplicates
    if entry_id in processed_entries:
        print(f"Skipping duplicate entry: {title}")
        continue  # Skip adding the duplicate entry

    # Add the entry to the set of processed entries
    processed_entries.add(entry_id)

    # Append to the appropriate category in the final JSON structure
    category_map = {
        "Tables": "tables",
        "Classification": "classification",
        "Key Information Extraction": "keyInformationExtraction",
        "Optical Character Recognition": "opticalCharacterRecognition",
        "Datasets": "datasets",
        "Document Layout Understanding": "layoutUnderstanding"
    }

    # Retrieve the target category for the paper; if it's not recognized, assign it to "others"
    target_category = category_map.get(category, "others")

    # If the entry doesn't already exist in the target category, add it to the final JSON structure
    if not entry_exists(final_json[target_category], title, authors):
        final_json[target_category].append({
            "originalFileName": file_name, # Include the original file name for reference
            "title": title, # Add the extracted title
            "authors": authors # Add the list of authors
        })

  response = qa_stuff.run(prompt_template)


KeyboardInterrupt: 

Save files in json format

In [None]:
# Save the JSON to a file
output_file = '/content/test14.json'
with open(output_file, 'w') as f:
    json.dump(final_json, f, indent=4)

# Optional: Download the file to your local system
files.download(output_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>