In [None]:
import os
import shutil
import fitz  # PyMuPDF
import concurrent.futures

# Source directory scanned for PDFs, and the subdirectory beneath it that
# receives the short documents.
folder1 = "/mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/BOOKS"
folder2 = f"{folder1}/less_than_200_pages"

# Create the destination up front (no error if it is already there).
os.makedirs(name=folder2, exist_ok=True)

# Worker: inspect one file name and relocate the PDF if it is short enough.
def process_pdf(filename):
    """Move *filename* from ``folder1`` to ``folder2`` if it has fewer than 200 pages.

    Names that do not end in ``.pdf`` (case-insensitive) are ignored.
    Any exception raised while opening the document, counting its pages or
    moving it is caught and reported with ``print``; nothing is re-raised.
    Always returns ``None``.
    """
    if filename.lower().endswith(".pdf"):
        source = os.path.join(folder1, filename)
        try:
            # The document is closed again before any move is attempted.
            with fitz.open(source) as document:
                page_total = len(document)

            # Documents with 200 pages or more stay where they are.
            if page_total >= 200:
                return

            target = os.path.join(folder2, filename)
            shutil.move(source, target)
            print(f"Moved: {filename} ({page_total} pages)")
        except Exception as err:
            print(f"Error processing {filename}: {err}")

# PyMuPDF (fitz) does not support being called from several threads at once,
# so the per-file work is fanned out to worker processes rather than a thread
# pool. The __main__ guard keeps worker processes from re-running this driver
# when the start method re-imports the module.
if __name__ == "__main__":
    # Get all PDF files
    pdf_files = [f for f in os.listdir(folder1) if f.lower().endswith(".pdf")]

    # One worker per CPU by default; an empty file list is handled fine.
    with concurrent.futures.ProcessPoolExecutor() as executor:
        # Drain the result iterator so a failure in a worker process
        # (e.g. a crashed worker) is raised here instead of being dropped.
        list(executor.map(process_pdf, pdf_files))


Moved: 004_AllCourseSlides.pdf (192 pages)
Moved: 150__Python_Pattern_Programs__2024_.pdf (199 pages)
Moved: 120_Advanced_JavaScript_Interview_Questi.pdf (143 pages)
Moved: 150__JavaScript_Pattern_Programs.pdf (184 pages)
Moved: 100_SQL_Server_Mistakes_and_How_to_Avoid.pdf (131 pages)
Moved: 001_365_Data_Science_Data_Science_Interv.pdf (111 pages)
Moved: 2_HCIA_AI_V3_0_Lab_Guide.pdf (169 pages)
Moved: 2014_Book_ThePythonWorkbook.pdf (169 pages)
Moved: 501_Challenging_Logic_and_Reasoning_Prob.pdf (160 pages)
Moved: 100_Page_Python_Intro.pdf (117 pages)
Moved: Abella_Hernando_120_Advanced_Python_Inte.pdf (131 pages)
Moved: 50_Python_Concepts_Every_Developer_Shoul_1.pdf (182 pages)
Moved: 3___Advanced_MySQL_Analysis.pdf (180 pages)
Moved: 11_1_gpt_4.pdf (100 pages)
Moved: 005_PL_300_Exam_Prep.pdf (165 pages)
Moved: 80__Python_Coding_Challenges_for_Beginne.pdf (128 pages)
Moved: 190__Python_Interview_Questions_and_Answ.pdf (192 pages)
Moved: Ai7ba57.pdf (132 pages)
Moved: 5_611164849552805

In [None]:
import os
import shutil
import fitz  # PyMuPDF
import pdfplumber  # Alternative PDF parser
import multiprocessing

# Source directory scanned for PDFs, and the subdirectory beneath it that
# receives the long documents.
# NOTE(review): the directory name says "more_than_100_pages", but the
# process_pdf in this cell only moves files with more than 500 pages —
# confirm which of the two is intended.
folder1 = "/mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/BOOKS"
folder2 = f"{folder1}/more_than_100_pages"

# Create the destination up front (no error if it is already there).
os.makedirs(name=folder2, exist_ok=True)

# Function to process a single PDF
def process_pdf(filename):
    """Move *filename* from ``folder1`` to ``folder2`` if it has more than 500 pages.

    The page count is read with PyMuPDF; if that raises, pdfplumber is tried
    as a fallback. Files that are not ``.pdf`` (case-insensitive), no longer
    exist, or cannot be read by either parser are reported and skipped.
    A failed move is reported and skipped as well, so that one problematic
    file cannot abort the whole ``pool.map`` batch.

    Args:
        filename: Base name of a file inside ``folder1``.

    Returns:
        None. Progress and problems are reported via ``print``.
    """
    # Check the extension before touching the filesystem.
    if not filename.lower().endswith(".pdf"):
        print(f"Skipping (not found or invalid): {filename}")
        return

    file_path = os.path.join(folder1, filename)

    # The file may have disappeared since the directory was listed.
    if not os.path.isfile(file_path):
        print(f"Skipping (not found or invalid): {filename}")
        return

    num_pages = 0  # Default value in case of an error

    # Attempt to read using PyMuPDF
    try:
        with fitz.open(file_path) as doc:
            num_pages = len(doc)
    except Exception as e:
        print(f"Error with PyMuPDF: {filename} - {e}")
        # Try using pdfplumber as a backup
        try:
            with pdfplumber.open(file_path) as pdf:
                num_pages = len(pdf.pages)
        except Exception as e2:
            print(f"Skipping (unreadable): {filename} - {e2}")
            return  # Skip unreadable files

    # Move PDF if it has more than 500 pages
    if num_pages > 500:
        try:
            shutil.move(file_path, os.path.join(folder2, filename))
        except OSError as e:
            # shutil.Error is an OSError subclass, so this also covers
            # failures raised by shutil itself. Report and carry on.
            print(f"Error processing {filename}: {e}")
            return
        print(f"Moved: {filename} ({num_pages} pages)")

if __name__ == "__main__":
    # Get all PDF files
    pdf_files = [f for f in os.listdir(folder1) if f.lower().endswith(".pdf")]

    if pdf_files:
        # Use multiprocessing Pool for parallel execution.
        # Never start more workers than there are files or CPU cores.
        num_workers = min(multiprocessing.cpu_count(), len(pdf_files))
        with multiprocessing.Pool(processes=num_workers) as pool:
            pool.map(process_pdf, pdf_files)
    else:
        # With no files the worker count would be 0, and
        # multiprocessing.Pool rejects a process count below 1.
        print("No PDF files found.")


Moved: 003_All_slides.pdf (626 pages)
Moved: Daily_Dose_Of_Data_Science_Full_Archive.pdf (531 pages)
Moved: Algorithms_for_image_processing_and_comp.pdf (506 pages)
Moved: Begin_to_Code.pdf (595 pages)
Moved: Coding_Games_in_Scratch.pdf (676 pages)
Moved: Architecting_for_Scale.pdf (507 pages)
Moved: 1111534128.pdf (530 pages)
Moved: Advanced_Data_Mining_and_Applications.pdf (848 pages)
Moved: 2_Practical_Data_Science.pdf (821 pages)
Moved: Database_System_Concepts.pdf (1373 pages)
Moved: Better_Python_Code__2_.pdf (841 pages)
Moved: Cracking_Coding_Interview.pdf (712 pages)
Moved: Database_System_Concepts_etc_.pdf (1519 pages)
Moved: DataFrame_Manipulation__2024_.pdf (696 pages)
Moved: 15_Math_Concepts_Every_Data_Scientist_Sh.pdf (577 pages)
Moved: Beyond_AI.pdf (532 pages)
Moved: Cracking_the_Coding_Interview__6th_Editi.pdf (712 pages)
Moved: A_Course_in_Natural_Language_Processing.pdf (543 pages)
Moved: Cracking_the_Coding_Interview__6th_Editi_1.pdf (712 pages)
Moved: Arduino_for_Ar