In [None]:
import os
import PyPDF2
from pdf2image import convert_from_path
import pytesseract

def clean_text(text):
    """Return ``text`` with every character that cannot be encoded as UTF-8 dropped."""
    # Round-trip through UTF-8 bytes; the 'ignore' handler silently discards
    # anything that fails to encode (or, on the way back, to decode).
    utf8_bytes = text.encode('utf-8', errors='ignore')
    return utf8_bytes.decode('utf-8', errors='ignore')

def pdf_to_text(pdf_path, output_txt, poppler_path=None):
    """Write the text content of ``pdf_path`` to ``output_txt``.

    The embedded text layer is read with PyPDF2 first. If that raises, or
    yields nothing but whitespace, the document is handed to
    ``pdf_to_text_ocr`` instead. The result is passed through ``clean_text``
    and written out as UTF-8.

    Args:
        pdf_path: Path of the PDF to read.
        output_txt: Path of the text file to create or overwrite.
        poppler_path: Forwarded unchanged to ``pdf_to_text_ocr``.
    """
    try:
        with open(pdf_path, 'rb') as handle:
            reader = PyPDF2.PdfReader(handle)
            page_texts = [page.extract_text() for page in reader.pages]
            # Pages without extractable text (None / empty string) are skipped.
            text = ''.join(chunk + '\n' for chunk in page_texts if chunk)
    except Exception as e:
        # Any failure discards whatever was read so that the OCR path runs.
        print(f"Error reading {pdf_path} with PyPDF2: {e}")
        text = ''

    # Whitespace-only output counts as "no text layer".
    has_text_layer = bool(text.strip())
    if not has_text_layer:
        print(f"Using OCR for {pdf_path} (no selectable text found).")
        text = pdf_to_text_ocr(pdf_path, poppler_path)

    # Strip characters that cannot be written as UTF-8 before saving.
    cleaned = clean_text(text)
    with open(output_txt, 'w', encoding='utf-8') as out_file:
        out_file.write(cleaned)

def pdf_to_text_ocr(pdf_path, poppler_path=None):
    """Rasterise ``pdf_path`` and return the Tesseract OCR text of its pages.

    Each page's text is followed by a single newline. Any exception raised
    along the way is reported on stdout and an empty string is returned.

    Args:
        pdf_path: Path of the PDF to OCR.
        poppler_path: Passed to ``convert_from_path`` as ``poppler_path``.
    """
    try:
        # Convert PDF pages to images, then OCR them in page order.
        page_images = convert_from_path(pdf_path, poppler_path=poppler_path)
        return ''.join(
            pytesseract.image_to_string(page_image) + '\n'
            for page_image in page_images
        )
    except Exception as e:
        print(f"Error processing {pdf_path} with OCR: {e}")
        return ''

if __name__ == "__main__":
    # Batch driver: convert every PDF in pdf_dir into a same-named .txt file
    # inside output_dir.
    pdf_dir = "/mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/BOOKS/Network_Security" # Directory containing PDFs
    output_dir = "/mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/BOOKS/NetworkTEXTdir"  # Directory to save text files
    os.makedirs(output_dir, exist_ok=True)

    # Adjust Poppler path if needed
    poppler_path = "/home/your_username/my_conda_env/bin/"  # Update if required
    # Check if the Poppler path exists
    if not os.path.exists(poppler_path):
        print(f"Warning: Poppler path {poppler_path} does not exist. Update the path if necessary.")
        # Handing a non-existent directory to pdf2image makes every OCR
        # fallback fail; None lets it look for the Poppler tools on PATH.
        poppler_path = None

    pdf_suffix = '.pdf'
    # Sorted so the processing order (and the log) is deterministic.
    for filename in sorted(os.listdir(pdf_dir)):
        if not filename.lower().endswith(pdf_suffix):
            continue
        # Slice the extension off instead of str.replace(): replace() is
        # case-sensitive (so "BOOK.PDF" kept its .PDF name) and also rewrote
        # any ".pdf" occurring in the middle of the file name.
        stem = filename[:-len(pdf_suffix)]
        pdf_path = os.path.join(pdf_dir, filename)
        output_txt = os.path.join(output_dir, stem + '.txt')
        pdf_to_text(pdf_path, output_txt, poppler_path)
        print(f"Converted {filename} to text successfully!")

Converted 270__Cyber_Security_Interview_Questions_.pdf to text successfully!
Converted Applied_Math_for_Security.pdf to text successfully!
Converted Azure_Security.pdf to text successfully!
Converted A_Brief_Introduction_to_neaural_networks.pdf to text successfully!
Converted A_Hybrid_Neural_Network_Architecture_to_.pdf to text successfully!
Converted Cheat_Sheets_for_AI__Neural_Networks__Ma.pdf to text successfully!
Converted Cloud_Native_Software_Security_Handbook.pdf to text successfully!
Converted Computer_Networks__2023_.pdf to text successfully!
Converted Cybersecurity_All_in_One_For_Dummies.pdf to text successfully!
Converted Cybersecurity_Architect_s_Handbook.pdf to text successfully!
Converted CyberSecurity_in_a_DevOps_Environment.pdf to text successfully!
Converted Cyber_Security_Interview_Questions.pdf to text successfully!
Converted Data_Communication_and_Computer_Networks.pdf to text successfully!
Converted Deep_Learning_Networks.pdf to text successfully!
Converted Discove

: 

In [None]:
import os
import gc
from pdf2image import convert_from_path
import pytesseract

# Tell pytesseract which Tesseract executable to launch.
# This is a hard-coded, machine-specific location.
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'  # Update this path as needed

# Set the TESSDATA_PREFIX environment variable for this process; this
# overwrites any value that was already present in os.environ.
# NOTE(review): the "4.00" path component looks version-specific — confirm it
# matches the Tesseract installation on the machine running the notebook.
os.environ['TESSDATA_PREFIX'] = '/usr/share/tesseract-ocr/4.00/tessdata'  # Update this path as needed

def pdf_to_text_ocr(pdf_path, output_txt, image_output_dir, pages_per_batch=10):
    """OCR a PDF into a text file, rasterising only a few pages at a time.

    The previous implementation called ``convert_from_path(pdf_path)`` once,
    which renders every page of the document into memory before the first
    page is OCR'd. Pages are now rendered in batches through the
    ``first_page`` / ``last_page`` arguments, so at most ``pages_per_batch``
    page images are alive at once. The files produced are unchanged.

    Args:
        pdf_path: Path of the PDF to OCR.
        output_txt: Path of the UTF-8 text file to create or overwrite. Each
            page's text is followed by a blank line.
        image_output_dir: Directory (created if missing) that receives one
            ``page_<n>.png`` per page, numbered from 1.
        pages_per_batch: Maximum number of pages rendered per
            ``convert_from_path`` call; values below 1 are treated as 1.
    """
    os.makedirs(image_output_dir, exist_ok=True)
    pages_per_batch = max(1, int(pages_per_batch))
    page_texts = []
    first_page = 1

    while True:
        last_page = first_page + pages_per_batch - 1
        # pdf2image clamps last_page to the real page count and returns an
        # empty list once first_page lies beyond the end of the document.
        images = convert_from_path(pdf_path, first_page=first_page, last_page=last_page)
        if not images:
            break

        for page_number, img in enumerate(images, start=first_page):
            image_path = os.path.join(image_output_dir, f"page_{page_number}.png")
            img.save(image_path, "PNG")

            # Perform OCR on the saved page image
            page_texts.append(pytesseract.image_to_string(image_path))

        # A short batch means the end of the document was reached; stopping
        # here avoids one extra (empty) conversion call.
        if len(images) < pages_per_batch:
            break
        # Rebinding ``images`` on the next pass releases this batch.
        first_page = last_page + 1

    # Save the extracted text to a file, pages separated by blank lines.
    # Written only after every page succeeded, as before.
    with open(output_txt, "w", encoding="utf-8") as txt_file:
        txt_file.write("".join(page_text + "\n\n" for page_text in page_texts))

if __name__ == "__main__":
    # Batch driver: OCR every PDF in pdf_dir into output_dir, keeping the
    # rendered page images under image_dir/<pdf name without extension>/.
    pdf_dir = "/mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/BOOKS/Network_Security"  # Directory containing PDFs
    output_dir = "/mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/BOOKS/NetworkTEXTdir"  # Directory to save text files
    image_dir = "output_images"  # Directory to save images of pages
    os.makedirs(output_dir, exist_ok=True)

    pdf_suffix = ".pdf"
    # Sorted so the processing order (and the log) is deterministic.
    for filename in sorted(os.listdir(pdf_dir)):
        if not filename.lower().endswith(pdf_suffix):
            continue
        # Slice the extension off instead of str.replace(): replace() is
        # case-sensitive (so "BOOK.PDF" kept its .PDF name) and also rewrote
        # any ".pdf" occurring in the middle of the file name.
        stem = filename[:-len(pdf_suffix)]
        pdf_path = os.path.join(pdf_dir, filename)
        output_txt = os.path.join(output_dir, stem + ".txt")
        image_output_dir = os.path.join(image_dir, stem)

        # Process the PDF file
        pdf_to_text_ocr(pdf_path, output_txt, image_output_dir)
        print(f"OCR completed: {filename} -> {output_txt}")
        # No del / gc.collect() here: the loop variables are short strings
        # that are simply rebound on the next iteration.

: 

: 

In [None]:
import os
import gc
from pdf2image import convert_from_path
import pytesseract

# Tell pytesseract which Tesseract executable to launch.
# This is a hard-coded, machine-specific location.
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'  # Update this path as needed

# Set the TESSDATA_PREFIX environment variable for this process; this
# overwrites any value that was already present in os.environ.
# NOTE(review): the "4.00" path component looks version-specific — confirm it
# matches the Tesseract installation on the machine running the notebook.
os.environ['TESSDATA_PREFIX'] = '/usr/share/tesseract-ocr/4.00/tessdata'  # Update this path as needed

def pdf_to_text_ocr(pdf_path, output_txt, image_output_dir, pages_per_batch=10):
    """OCR a PDF into a text file, rasterising only a few pages at a time.

    The previous implementation called ``convert_from_path(pdf_path)`` once,
    which renders every page of the document into memory before the first
    page is OCR'd. Pages are now rendered in batches through the
    ``first_page`` / ``last_page`` arguments, so at most ``pages_per_batch``
    page images are alive at once. The files produced are unchanged.

    Args:
        pdf_path: Path of the PDF to OCR.
        output_txt: Path of the UTF-8 text file to create or overwrite. Each
            page's text is followed by a blank line.
        image_output_dir: Directory (created if missing) that receives one
            ``page_<n>.png`` per page, numbered from 1.
        pages_per_batch: Maximum number of pages rendered per
            ``convert_from_path`` call; values below 1 are treated as 1.
    """
    os.makedirs(image_output_dir, exist_ok=True)
    pages_per_batch = max(1, int(pages_per_batch))
    page_texts = []
    first_page = 1

    while True:
        last_page = first_page + pages_per_batch - 1
        # pdf2image clamps last_page to the real page count and returns an
        # empty list once first_page lies beyond the end of the document.
        images = convert_from_path(pdf_path, first_page=first_page, last_page=last_page)
        if not images:
            break

        for page_number, img in enumerate(images, start=first_page):
            image_path = os.path.join(image_output_dir, f"page_{page_number}.png")
            img.save(image_path, "PNG")

            # Perform OCR on the saved page image
            page_texts.append(pytesseract.image_to_string(image_path))

        # A short batch means the end of the document was reached; stopping
        # here avoids one extra (empty) conversion call.
        if len(images) < pages_per_batch:
            break
        # Rebinding ``images`` on the next pass releases this batch.
        first_page = last_page + 1

    # Save the extracted text to a file, pages separated by blank lines.
    # Written only after every page succeeded, as before.
    with open(output_txt, "w", encoding="utf-8") as txt_file:
        txt_file.write("".join(page_text + "\n\n" for page_text in page_texts))

if __name__ == "__main__":
    # Batch driver: OCR every PDF in pdf_dir into output_dir, keeping the
    # rendered page images under image_dir/<pdf name without extension>/.
    pdf_dir = "/mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/BOOKS/Network_Security"  # Directory containing PDFs
    output_dir = "/mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/BOOKS/NetworkTEXTdir"  # Directory to save text files
    image_dir = "output_images"  # Directory to save images of pages
    os.makedirs(output_dir, exist_ok=True)

    pdf_suffix = ".pdf"
    # Sorted so the processing order (and the log) is deterministic.
    for filename in sorted(os.listdir(pdf_dir)):
        if not filename.lower().endswith(pdf_suffix):
            continue
        # Slice the extension off instead of str.replace(): replace() is
        # case-sensitive (so "BOOK.PDF" kept its .PDF name) and also rewrote
        # any ".pdf" occurring in the middle of the file name.
        stem = filename[:-len(pdf_suffix)]
        pdf_path = os.path.join(pdf_dir, filename)
        output_txt = os.path.join(output_dir, stem + ".txt")
        image_output_dir = os.path.join(image_dir, stem)

        # Process the PDF file
        pdf_to_text_ocr(pdf_path, output_txt, image_output_dir)
        print(f"OCR completed: {filename} -> {output_txt}")
        # No del / gc.collect() here: the loop variables are short strings
        # that are simply rebound on the next iteration.

OCR completed: 270__Cyber_Security_Interview_Questions_.pdf -> /mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/BOOKS/NetworkTEXTdir/270__Cyber_Security_Interview_Questions_.txt


: 

: 

: 

: 

: 

: 