In [None]:
# Install the PDF-rendering, image-processing, and OCR packages used by this notebook.
# NOTE(review): langchain is installed here but is not imported in the visible cells - confirm it is needed.
# NOTE(review): versions are unpinned; pin them (pkg==x.y.z) for reproducible runs.
%pip install langchain pytesseract opencv-python numpy PyMuPDF Pillow



In [None]:
# NOTE(review): PyMuPDF is already listed in the first install cell of this notebook,
# so this cell looks redundant - confirm and remove if so.
%pip install PyMuPDF



In [None]:
import os
import pytesseract
import cv2
import numpy as np
import json
import io
import fitz
from PIL import Image

In [23]:
def pdf_to_images(pdf_path, dpi=300):
    """Render every page of a PDF file to a PIL image.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).
        dpi: Target rendering resolution. Each page is scaled by
            ``dpi / 72`` on both axes before rasterisation.

    Returns:
        list: One PIL image per page, in page order.
    """
    # The scale factor is the same for every page, so build the matrix once.
    zoom = dpi / 72
    matrix = fitz.Matrix(zoom, zoom)

    images = []
    # The context manager closes the document (and its file handle) even if
    # rendering one of the pages raises; the original never closed it.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pix = page.get_pixmap(matrix=matrix)
            # Serialise the pixmap to an in-memory PPM so PIL can decode it;
            # the resulting image only depends on the BytesIO buffer, not on doc.
            images.append(Image.open(io.BytesIO(pix.tobytes("ppm"))))
    return images

In [None]:
def preprocess_pdf(pdf_dir, output_dir="/kaggle/working"):
    """OCR every PDF in a directory and save the text to a JSON file.

    Each PDF is rendered to page images with ``pdf_to_images``; every page is
    converted to grayscale, denoised, binarised with an adaptive threshold and
    passed to Tesseract. The page texts of one PDF are joined with a single
    space and stored under the PDF's file name.

    Args:
        pdf_dir: Directory that is scanned (non-recursively) for PDF files.
            The ``.pdf`` extension is matched case-insensitively.
        output_dir: Directory that receives ``extracted_text.json``; it is
            created if missing. Defaults to the Kaggle working directory.

    Returns:
        None. The result is written to ``<output_dir>/extracted_text.json``
        and the path is printed.

    Notes:
        A PDF that fails to process is reported with a printed message and
        left out of the output; the remaining files are still processed.
    """
    # Sort so the JSON key order is reproducible (os.listdir order is
    # arbitrary), and match the extension case-insensitively so files such as
    # "REPORT.PDF" are not silently skipped.
    pdf_files = sorted(
        name for name in os.listdir(pdf_dir) if name.lower().endswith('.pdf')
    )
    extracted_data = {}

    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_dir, pdf_file)
        try:
            # Convert pdf to images, one per page
            images = pdf_to_images(pdf_path)
            full_text = []

            for image in images:
                # Convert PIL to OpenCV image format
                open_cv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

                # Gray-scale the image
                gray = cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2GRAY)

                # Denoising (h=10, template window 7, search window 21)
                denoised_img = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)

                # Adaptive thresholding (block size 11, constant 2)
                adaptive_thresh = cv2.adaptiveThreshold(
                    denoised_img,
                    255,
                    cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                    cv2.THRESH_BINARY,
                    11,
                    2,
                )

                # Extract text using Tesseract (--psm 6: single uniform block of text)
                text = pytesseract.image_to_string(adaptive_thresh, config='--psm 6')
                full_text.append(text)

            # Save the extracted text into the dictionary
            extracted_data[pdf_file] = ' '.join(full_text)

        except Exception as e:
            # Best-effort batch run: report the failure and continue with the next file.
            print(f"Error processing {pdf_file}: {str(e)}")

    # Ensure the directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Save extracted text to a json file
    output_file = os.path.join(output_dir, 'extracted_text.json')
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(extracted_data, f, ensure_ascii=False, indent=4)

    print(f"Extracted text saved to {output_file}")

In [25]:
# Run the OCR extraction over the Kaggle input folder of disease fact-sheet PDFs.
# preprocess_pdf writes extracted_text.json and prints the path it saved to.
pdf_dir = "/kaggle/input/disease-fact-sheet/Disease_Fact_Sheet"
preprocess_pdf(pdf_dir)

Extracted text saved to /kaggle/working/extracted_text.json


In [None]:
# Sanity check: confirm that the OCR step wrote its JSON output
import os
output_file = "/kaggle/working/extracted_text.json"
message = (
    f"File successfully created at: {output_file}"
    if os.path.exists(output_file)
    else f"File not found at: {output_file}"
)
print(message)

File successfully created at: /kaggle/working/extracted_text.json


In [None]:
# Sanity check: list the dataset root and the folder that holds the PDFs
import os
for folder in (
    "/kaggle/input/disease-fact-sheet",
    "/kaggle/input/disease-fact-sheet/Disease_Fact_Sheet",
):
    print(os.listdir(folder))

['Disease_Fact_Sheet']
['Rocky_Mountain_Spotted_Fever.pdf', 'Avian_Influenza.pdf', 'Leptospirosis.pdf', 'Lyme_Disease.pdf', 'HIB.pdf', 'Bed_Bugs.pdf', 'Roseola.pdf', 'Rotavirus.pdf', 'Influenza.pdf', 'Impetigo.pdf', 'Strep_Throat.pdf', 'Kawasaki_Disease.pdf', 'Malaria.pdf', 'Tularemia.pdf', 'H1N1.pdf', 'Hepatitis_B.pdf', 'Ringworm.pdf', 'Ciguatera_Fish_Poisoning.pdf', 'Group_A_Streptococcal_Disease.pdf', 'Brucellosis.pdf', 'Molluscum_Contagiosum.pdf', 'Chickenpox.pdf', 'CMV.pdf', 'Shigella.pdf', 'Mumps.pdf', 'Scarlet_Fever.pdf', 'Polio.pdf', 'MRSA.pdf', 'Pneumonia.pdf', 'Cyclosporiasis.pdf', 'Diphtheria.pdf', 'WNV.pdf', 'HIV.pdf', 'Ebola.pdf', 'Gastroenteritis.pdf', 'Cat_Scratch_Disease.pdf', 'Pinworm.pdf', 'Otitis_Media.pdf', 'Mononucleosis.pdf', 'Scabies.pdf', 'Psittacosis.pdf', 'Trichinosis.pdf', 'Hemolytic_Uremic_Syndrome.pdf', 'Staph.pdf', 'Plague.pdf', 'Head_Lice.pdf', 'Measles.pdf', 'Carbapenem_Resistant_Acinetobacter.pdf', 'Syphilis.pdf', 'Histoplasmosis.pdf', 'Listeriosis.pdf'