# classify_pdf

In [9]:
# !pip install PyPDF2 pdfplumber

In [12]:
import os
import shutil
from typing import List, Dict
import PyPDF2
import pdfplumber
from pathlib import Path

def setup_folders(base_path: str):
    """
    Set up working folders within the specified base path.

    Changes the process working directory to ``base_path`` and makes sure the
    'pdf' and 'scan' subfolders exist inside it.

    Args:
        base_path (str): Path to the root directory containing the PDF files.
            May be absolute or relative, with or without a trailing slash.

    Returns:
        Path: Absolute path of the base directory.

    Raises:
        OSError: If ``base_path`` does not exist or is not a directory
            (raised by ``os.chdir``).
    """
    # Make the path absolute *before* changing directory: a relative path
    # would otherwise be re-interpreted against the new working directory.
    # abspath also drops a trailing separator without touching a leading one,
    # so absolute POSIX paths such as '/data/raw' stay absolute.
    full_path = Path(os.path.abspath(base_path))

    # Navigate to the base directory
    os.chdir(full_path)
    print(f"Working directory set to: {os.getcwd()}")

    # Create 'pdf' and 'scan' folders if they don't exist
    for folder_name in ('pdf', 'scan'):
        (full_path / folder_name).mkdir(exist_ok=True)

    return full_path

class PDFOrganizer:
    """
    Sort the PDF files of a working directory into two subfolders.

    Files whose first page yields extractable text are moved to 'pdf'
    (searchable); files whose first page yields no text are moved to 'scan'.
    Files that cannot be read or moved stay where they are and are reported
    under 'errors'.
    """

    def __init__(self, working_dir: str = None):
        """
        Initialize PDFOrganizer with a working directory.

        Args:
            working_dir (str, optional): Path to the working directory (a
                ``Path`` is accepted as well). Defaults to the current
                working directory when omitted or empty.
        """
        self.working_dir = Path(working_dir) if working_dir else Path.cwd()

        self.pdf_folder = self.working_dir / 'pdf'
        self.scan_folder = self.working_dir / 'scan'

        # Ensure folders exist
        self.pdf_folder.mkdir(exist_ok=True)
        self.scan_folder.mkdir(exist_ok=True)

    @staticmethod
    def _has_text(text) -> bool:
        """Return True if ``text`` is a non-empty, non-whitespace string."""
        return bool(text) and not text.isspace()

    def _first_page_lacks_text(self, pdf_path) -> bool:
        """
        Return True if neither pdfplumber nor PyPDF2 extracts text from page 1.

        Unlike ``is_scanned_pdf`` this helper lets exceptions propagate so the
        caller can tell "no text" apart from "could not read the file".

        Raises:
            ValueError: If the document has no pages.
            Exception: Whatever the PDF libraries raise for unreadable files.
        """
        with pdfplumber.open(pdf_path) as pdf:
            if not pdf.pages:
                raise ValueError("PDF has no pages")
            if self._has_text(pdf.pages[0].extract_text()):
                return False

        # pdfplumber found nothing; give PyPDF2 a chance before deciding.
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            if len(reader.pages) == 0:
                raise ValueError("PDF has no pages")
            return not self._has_text(reader.pages[0].extract_text())

    @staticmethod
    def _unique_destination(folder: Path, file_name: str) -> Path:
        """Return a path in ``folder`` for ``file_name`` that does not exist yet.

        If the plain name is taken, '_1', '_2', ... is inserted before the
        extension until a free name is found.
        """
        destination = folder / file_name
        base, ext = os.path.splitext(file_name)
        counter = 1
        while destination.exists():
            destination = folder / f"{base}_{counter}{ext}"
            counter += 1
        return destination

    def is_scanned_pdf(self, pdf_path: str) -> bool:
        """
        Determine if the PDF is scanned by checking for extractable text.

        Only the first page is inspected, first with pdfplumber and then with
        PyPDF2 as a fallback.

        Args:
            pdf_path (str): Path to the PDF file.

        Returns:
            bool: True if the PDF is scanned, False if searchable. If the file
            cannot be read, the error is printed and False is returned.
        """
        try:
            return self._first_page_lacks_text(pdf_path)
        except Exception as e:
            print(f"Error checking file {pdf_path}: {str(e)}")
            return False

    def organize_pdfs(self) -> Dict[str, List[str]]:
        """
        Classify and move PDF files into the appropriate folders.

        A file name is recorded under 'scanned' or 'searchable' only after the
        file has actually been moved. Files that cannot be classified or moved
        are left in place and recorded under 'errors'.

        Returns:
            Dict[str, List[str]]: Lists of scanned and searchable PDFs, and errors.
        """
        results = {
            'scanned': [],
            'searchable': [],
            'errors': []
        }

        # Collect the PDF files directly inside the working directory.
        # The extension is matched case-insensitively so '.PDF' files are not
        # skipped on case-sensitive file systems; sorting keeps runs repeatable.
        pdf_files = sorted(
            entry for entry in self.working_dir.iterdir()
            if entry.is_file() and entry.name.lower().endswith('.pdf')
        )

        print(f"Starting classification of {len(pdf_files)} PDF files...")

        for file_path in pdf_files:
            file_name = file_path.name
            try:
                # Use the raising helper so unreadable files end up in
                # 'errors' instead of being filed as searchable.
                if self._first_page_lacks_text(file_path):
                    category, folder = 'scanned', self.scan_folder
                else:
                    category, folder = 'searchable', self.pdf_folder

                # Move the file without overwriting an existing one
                destination = self._unique_destination(folder, file_name)
                shutil.move(str(file_path), str(destination))
            except Exception as e:
                print(f"Error processing file {file_name}: {str(e)}")
                results['errors'].append(file_name)
            else:
                results[category].append(file_name)

        return results

    @staticmethod
    def _print_file_list(title: str, files: List[str]) -> None:
        """Print ``title`` followed by one bullet line per file name."""
        print(title)
        for file in files:
            print(f"  - {file}")

    def print_summary(self, results: Dict[str, List[str]]):
        """
        Print a summary of the PDF classification results.

        Args:
            results (Dict[str, List[str]]): Results of the classification, as
                returned by ``organize_pdfs``.
        """
        print("\n=== PDF Classification Report ===")
        print(f"Working directory: {self.working_dir}")

        print(f"\nTotal scanned PDFs: {len(results['scanned'])}")
        if results['scanned']:
            self._print_file_list("Scanned PDFs:", results['scanned'])

        print(f"\nTotal searchable PDFs: {len(results['searchable'])}")
        if results['searchable']:
            self._print_file_list("Searchable PDFs:", results['searchable'])

        # The error section is omitted entirely when nothing failed.
        if results['errors']:
            print(f"\nTotal errors: {len(results['errors'])}")
            self._print_file_list("Files with errors:", results['errors'])

def main(folder_path: str):
    """
    Run the full classify-and-report workflow for one folder.

    Prepares the folder via ``setup_folders``, lets a ``PDFOrganizer`` sort
    the PDFs, and prints the resulting summary. Any exception raised along
    the way is caught and printed instead of propagating.

    Args:
        folder_path (str): Path to the base folder containing the PDFs.
    """
    try:
        # Prepare the directory, then sort its PDFs and report the outcome
        root = setup_folders(folder_path)
        sorter = PDFOrganizer(root)
        sorter.print_summary(sorter.organize_pdfs())
    except Exception as exc:
        print(f"Error: {exc!s}")

# Usage: classify the PDFs of one folder when this cell/script is run directly.
if __name__ == "__main__":
    # Hard-coded base folder; edit it to point at the directory holding the PDFs.
    folder_path = "C:\\Users\\trungpt\\Documents\\trungpt\\project_1\\test_llama\\raw_data"
    main(folder_path)

Working directory set to: c:\Users\trungpt\Documents\trungpt\project_1\test_llama\raw_data
Starting classification of 5 PDF files...

=== PDF Classification Report ===
Working directory: C:\Users\trungpt\Documents\trungpt\project_1\test_llama\raw_data

Total scanned PDFs: 2
Scanned PDFs:
  - TULD01.pdf
  - TULD03.pdf

Total searchable PDFs: 3
Searchable PDFs:
  - NQLD01.pdf
  - NQLD02.pdf
  - TULD02.pdf


# Read and parse PDF document

In [2]:
# !pip install pytesseract pillow pymupdf

In [14]:
import os
import fitz  # PyMuPDF for handling scanned PDFs with OCR
import PyPDF2
import pdfplumber
import pytesseract
from PIL import Image
from pathlib import Path
import io

# Set Tesseract executable path if needed
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

def parse_pdf(file_path, ocr_lang=None, ocr_dpi=300):
    """
    Parses a PDF document and extracts text content.

    Text is first extracted page by page with pdfplumber. If that yields
    nothing but whitespace, every page is rendered to an image with PyMuPDF
    and passed through Tesseract OCR.

    Args:
        file_path (str): Path to the PDF file.
        ocr_lang (str, optional): Tesseract language code(s) for the OCR
            fallback, e.g. "vie" or "vie+eng". None keeps pytesseract's
            default language. The matching language data must be installed.
        ocr_dpi (int): Resolution used to render pages for OCR.

    Returns:
        str: Extracted text from the PDF, one newline appended per page.
        If an error occurs it is printed and whatever text was collected up
        to that point (possibly an empty string) is returned.
    """
    pieces = []
    try:
        # First, try to extract text from a searchable PDF
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                if text:
                    pieces.append(text + "\n")

        # If no text is found, try OCR for scanned PDFs
        if not "".join(pieces).strip():
            print(f"Performing OCR on scanned PDF: {file_path}")
            # PyMuPDF renders at 72 dpi by default, which is too coarse for
            # reliable OCR; scale the page up to the requested resolution.
            zoom = ocr_dpi / 72
            matrix = fitz.Matrix(zoom, zoom)
            with fitz.open(file_path) as pdf:
                for page in pdf:
                    pix = page.get_pixmap(matrix=matrix)
                    img = Image.open(io.BytesIO(pix.tobytes("png")))
                    pieces.append(
                        pytesseract.image_to_string(img, lang=ocr_lang) + "\n"
                    )

    except Exception as e:
        # Best effort: report the problem and return what was gathered so far.
        print(f"Error reading {file_path}: {e}")

    return "".join(pieces)

def read_and_parse_pdfs(base_folder):
    """
    Print the parsed text of every PDF found in the 'pdf' and 'scan' folders.

    If either subfolder is missing under ``base_folder``, a message is
    printed and nothing is parsed.

    Args:
        base_folder (str): Path to the folder containing 'pdf' and 'scan' folders.
    """
    root = Path(base_folder)
    targets = [root / sub for sub in ('pdf', 'scan')]

    # Both subfolders are required; bail out early otherwise
    if not all(target.exists() for target in targets):
        print("Folders 'pdf' and 'scan' are not found in the specified base path.")
        return

    # Walk the searchable folder first, then the scanned one
    for target in targets:
        print(f"\n--- Parsing PDFs in {target} ---")

        for document in target.glob("*.pdf"):
            print(f"\nReading file: {document.name}")
            print(parse_pdf(document))

# Parse every PDF under the 'pdf' and 'scan' subfolders of 'raw_data' when run directly.
if __name__ == "__main__":
    # Hard-coded base folder; it must already contain the 'pdf' and 'scan' subfolders.
    base_folder = "C:\\Users\\trungpt\\Documents\\trungpt\\project_1\\test_llama\\raw_data"
    read_and_parse_pdfs(base_folder)



--- Parsing PDFs in C:\Users\trungpt\Documents\trungpt\project_1\test_llama\raw_data\pdf ---

Reading file: NQLD01.pdf
CÔNG TY CẤP NƯỚC SÀI GÒN CỘNG HÒA XÃ HỘI CHỦ NGHĨA VIỆT NAM
TRÁCH NHIỆM HỮU HẠN MỘT THÀNH VIÊN Độc lập – Tự do – Hạnh phúc
CÔNG TY CỔ PHẦN CẤP NƯỚC TRUNG AN
NỘI QUY LAO ĐỘNG
(Ban hành kèm theo Quyết định số 099 /QĐ-TA-TCHC ngày 07 / 8 /2018)
CHƯƠNG I
NHỮNG QUY ĐỊNH CHUNG
Điều 1. Mục đích ban hành Nội quy lao động
Nội quy lao động là văn bản định chế được ban hành quy định kỷ luật lao động mà
người lao động phải chấp hành khi làm việc tại Công ty Cổ phần Cấp nước Trung An (gọi tắt
là Công ty) và chế tài áp dụng để xử lý các trường hợp vi phạm kỷ luật lao động .
Điều 2. Phạm vi và đối tượng áp dụng
Nội quy lao động này được áp dụng đối với người lao động Việt Nam làm việc tại Công
ty theo hình thức và các loại hợp đồng lao động.
CHƯƠNG II
QUY ĐỊNH VỀ THỜI GIỜ LÀM VIỆC, THỜI GIỜ NGHỈ NGƠI
Điều 3. Thời giờ làm việc
1. Thời giờ làm việc tại Công ty là: 08 giờ trong một ngà