<h1>Project 2</h1>
Traverse a folder tree, find the PDF files in it, and extract their text
<br/>
<h3>Requirements</h3>
1.Add sub-folders called “One”, “Two”, “Three” under the folder called “/content”
<br/>
2.Add PDF files under each of the sub-folders
<br/>
3. Find all PDF files under the sub-folders and extract the text content of each PDF
<br/>
4.Write the content to a text file called “output.txt” under each sub-folder respectively
<br/>
<h3>Error Handling</h3>
1. Handle the case where a folder does not exist
<br/>
2. Handle the case where a sub-folder contains no PDF file
<br/>
3. Handle the case where the output.txt file does not exist yet
<br/>


In [8]:
import os
import PyPDF2

def create_dir_subdirs(working_dir, sub_dir_list):
    """Create *working_dir* and each sub-folder named in *sub_dir_list*.

    Folders that already exist are left untouched. The function is
    best-effort: any failure is reported on stdout and never raised.

    Args:
        working_dir: Path of the parent folder.
        sub_dir_list: Iterable of sub-folder names to create under
            *working_dir*.
    """
    try:
        # makedirs creates missing parents and, with exist_ok=True, is a
        # no-op for folders that are already present, so no separate
        # existence check is needed.
        os.makedirs(working_dir, exist_ok=True)

        for sub_dir in sub_dir_list:
            os.makedirs(os.path.join(working_dir, sub_dir), exist_ok=True)
    except Exception as e:
        # Deliberately broad: callers rely on this function never raising.
        print(f"Error while creating subfolders: {e}")

def extract_pdf_file_content(file_name):
    """Return the text of every page of the PDF at *file_name*, concatenated.

    Pages are read in document order with ``PyPDF2.PdfReader``. Pages that
    yield no text are skipped. The function never raises: if the file
    cannot be opened or parsed, the error is printed and whatever text was
    collected before the failure (possibly an empty string) is returned.

    Args:
        file_name: Path to the PDF file.

    Returns:
        The extracted text as a single string.
    """
    page_texts = []
    try:
        with open(file_name, "rb") as f:
            pdf_reader = PyPDF2.PdfReader(f)
            # Iterate the pages directly instead of indexing by position.
            for page in pdf_reader.pages:
                text = page.extract_text()
                if text:
                    page_texts.append(text)
    except Exception as e:
        # Deliberately broad: callers treat an empty result as "no text".
        print(f"Error while extracting pdf file : {e}")

    # Join once at the end rather than growing a string page by page.
    return "".join(page_texts)

def process_pdfs_in_subfolders(working_dir, subfolders):
    """Append the text of each sub-folder's PDFs to its own ``output.txt``.

    For every name in *subfolders* the function:

    1. makes sure ``working_dir/<name>`` exists, creating it if missing;
    2. collects the files whose name ends in ``.pdf`` (case-insensitive);
    3. appends the extracted text of each PDF, in sorted file-name order,
       to ``output.txt`` inside that sub-folder (the file is created when
       it does not exist yet).

    Problems are reported on stdout and the function moves on to the next
    sub-folder; it does not raise.

    Args:
        working_dir: Path of the parent folder.
        subfolders: Iterable of sub-folder names under *working_dir*.
    """
    for sub_dir in subfolders:
        sub_dir_path = os.path.join(working_dir, sub_dir)

        # Create the folder when it is missing, then list it. Both steps
        # can fail (permissions, path is a regular file, ...), so failures
        # are reported and this sub-folder is skipped.
        try:
            os.makedirs(sub_dir_path, exist_ok=True)
            entries = os.listdir(sub_dir_path)
        except OSError as e:
            print(f"Error accessing folder '{sub_dir_path}': {e}")
            continue

        # Sorted so the order of sections in output.txt is deterministic;
        # os.listdir returns entries in arbitrary order.
        pdf_files = sorted(f for f in entries if f.lower().endswith('.pdf'))
        if not pdf_files:
            print(f"Warning: No PDF files found in '{sub_dir_path}'.")
            continue

        # Output file path
        output_text_path = os.path.join(sub_dir_path, 'output.txt')

        try:
            # Append mode creates output.txt if it is not there yet.
            # UTF-8 is forced so non-ASCII PDF text does not depend on the
            # platform's default encoding.
            with open(output_text_path, "a", encoding="utf-8") as f:
                for pdf_file in pdf_files:
                    pdf_file_path = os.path.join(sub_dir_path, pdf_file)
                    pdf_content = extract_pdf_file_content(pdf_file_path)
                    if pdf_content:
                        f.write(f"Content from {pdf_file}:\n")
                        f.write(pdf_content)
                        f.write("\n\n")
                    else:
                        print("No text extracted!")
        except Exception as e:
            # Deliberately broad: one bad folder must not stop the others.
            print(f"Error writing to {output_text_path}: {e}")

if __name__ == "__main__":
    # Parent folder and the sub-folders expected beneath it.
    subfolders = ['One', 'Two', 'Three']
    content_folder = './content'

    # Step 1: make sure the folder tree exists.
    create_dir_subdirs(working_dir=content_folder, sub_dir_list=subfolders)

    # Step 2: extract each sub-folder's PDF text into its output.txt.
    process_pdfs_in_subfolders(
        working_dir=content_folder,
        subfolders=subfolders,
    )