In [None]:
# Install required packages
!pip install pdfplumber google-generativeai

Collecting pdfplumber
  Downloading pdfplumber-0.11.8-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20251107 (from pdfplumber)
  Downloading pdfminer_six-20251107-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.0.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.9/67.9 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.8-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20251107-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Import libraries
import getpass
import glob
import json
import os
import re
from datetime import datetime

import pdfplumber
from google import genai

In [None]:
# Mount Google Drive so the notebook can read and write files under the mount point
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Create the Gemini client without storing the API key in the notebook:
# use the GEMINI_API_KEY environment variable when it is set, otherwise
# prompt for the key (getpass does not echo the typed value).
client = genai.Client(
    api_key=os.environ.get("GEMINI_API_KEY") or getpass.getpass("Gemini API key: ")
)

In [None]:
# Extract text
def extract_text_from_pdf(pdf_path):
    """Extract text from a two-column PDF with pdfplumber, left column first.

    Every page is split at its horizontal midpoint; the left half is
    extracted, then the right half. For each page the result contributes the
    left text, a newline, the right text and a trailing newline, so the
    returned string contains newlines even when a page has no extractable text.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The concatenated text of all pages, or None if opening or reading the
        PDF raised an exception (the error is printed).
    """
    page_texts = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # Build the crop boxes from the page's own bounding box instead
                # of assuming the page starts at (0, 0): crop() rejects boxes
                # that fall outside the page, which would fail the whole file
                # for PDFs whose pages have an offset origin.
                x0, top, x1, bottom = page.bbox
                mid_x = (x0 + x1) / 2

                # extract_text() may return None for an empty region
                left_text = page.crop((x0, top, mid_x, bottom)).extract_text() or ""
                right_text = page.crop((mid_x, top, x1, bottom)).extract_text() or ""

                # Combine left then right
                page_texts.append(left_text + "\n" + right_text + "\n")

    except Exception as e:
        print(f"Error reading PDF: {e}")
        return None

    return "".join(page_texts)

In [None]:
# Clean text with Gemini
def clean_text_with_gemini(raw_text, model="gemini-2.5-flash"):
    """Ask Gemini to reduce raw extracted text to the main body text.

    The fixed instruction prompt below is sent with ``raw_text`` appended after
    the "TEXT TO CLEAN:" marker, using the module-level ``client``.

    Args:
        raw_text: Text extracted from the PDF.
        model: Name of the Gemini model to call. Defaults to the model this
            notebook has always used.

    Returns:
        The text of the model response.

    Raises:
        ValueError: If the response carries no text (``response.text`` is None,
            empty or whitespace only), so callers never save an empty result.
    """
    prompt = """You are a text cleaning assistant. Your task is to clean medical textbook content.

INSTRUCTIONS:
1. Extract ONLY the main body text (paragraphs of content)
2. REMOVE all text related to:
   - Figure captions and references (e.g., "Figure 1.2", "See Figure 3")
   - Table captions and references (e.g., "Table 2.1", "as shown in Table")
   - Graph descriptions and references
   - Image descriptions and references
   - Box/sidebar content labels
   - Page numbers
   - Headers and footers
   - References to visual elements (e.g., "shown in the diagram", "illustrated in")
3. Keep all medical terminology, drug names, and clinical information
4. Preserve paragraph structure with line breaks between paragraphs
5. Keep section headings if they are part of the main text flow
6. Remove any OCR artifacts or garbled text
7. Start with KEY FEATURES, INTRODUCTION, EPIDEMIOLOGY, etc.

---
SAMPLE OUTPUT's first few lines:

KEY FEATURES
• Trichomoniasis is a common, sexually transmitted disease caused by a protozoan parasite that infects the urogenital tract of men and women.
• The vagina is the most common site of infection in women.
• Many infected women are asymptomatic, but clinical features include vaginal discharge, often yellow or green, often frothy; vulvovaginal irritation; and dysuria.
• The urethra is the most common site of infection in men.
• Most men with trichomoniasis do not have signs or symptoms; however, some men may exhibit urethral irritation and discharge or mild burning after urination or ejaculation.
• Traditionally, diagnosis and differential diagnosis may be made by wet mount of vaginal material; nucleic acid amplification techniques are available.
• Trichomoniasis is treated with oral 5′-nitroimidazoles, although resistance is developing.

INTRODUCTION
Trichomoniasis is a common, worldwide, urogenital infection with Trichomonas vaginalis. It is a frequent cause of symptomatic vaginitis and a less common cause of nongonococcal urethritis (NGU).

EPIDEMIOLOGY
Trichomoniasis is transmitted primarily by penile–vaginal and possibly by penile–anal coitus. Because it is sexually transmitted, it is strikingly associated with higher risk for other sexually transmitted infections (STIs), and coincident STIs should be sought.
---

Return ONLY the cleaned text. Do not add any commentary or explanations.

---
TEXT TO CLEAN:

""" + raw_text

    response = client.models.generate_content(
        model=model,
        contents=prompt
    )

    # response.text can be None or empty when the model returns no text;
    # fail loudly here instead of handing a non-result to the caller.
    cleaned_text = response.text
    if not cleaned_text or not cleaned_text.strip():
        raise ValueError("Gemini returned an empty response")
    return cleaned_text

In [None]:
# Save text to file
def save_text_to_file(text, pdf_path, output_dir="/content/drive/MyDrive/LMM/demo_kz/disease_chapters_text"):
    """Save cleaned text to ``<output_dir>/<pdf base name>.txt`` (UTF-8).

    The text is written to a temporary ``.part`` file and renamed onto the
    final path only after the write succeeded. A failed write therefore never
    leaves an empty or partial ``.txt`` behind, which matters because the
    existence of that file is what marks a PDF as already processed.

    Args:
        text: Text to write.
        pdf_path: Path of the source PDF; only its base name is used.
        output_dir: Directory for the ``.txt`` file; created if missing.

    Returns:
        The path of the written file, or None if writing failed (the error is
        printed).
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Generate text filename based on PDF name
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    txt_path = os.path.join(output_dir, f"{pdf_name}.txt")
    tmp_path = txt_path + ".part"

    try:
        with open(tmp_path, 'w', encoding='utf-8') as txt_file:
            txt_file.write(text)
        # Publish the finished file in one step
        os.replace(tmp_path, txt_path)
    except Exception as e:
        print(f"Error saving text file: {e}")
        # Best-effort cleanup of the temporary file; the original error has
        # already been reported, so a failure to delete is deliberately ignored.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        return None

    print(f"✓ Text saved to: {txt_path}")
    return txt_path

In [None]:
# Process all PDFs
def process_all_pdfs_to_text(folder_path, selected_indices=None, output_dir="/content/drive/MyDrive/LMM/demo_kz/disease_chapters_text"):
    """Extract, clean and save the text of every not-yet-processed PDF in a folder.

    PDFs are discovered with ``*.pdf`` directly inside ``folder_path`` and
    sorted by path. A PDF counts as already processed when
    ``<output_dir>/<pdf base name>.txt`` exists; such files are skipped.
    Progress and a final summary are printed; failures of a single file are
    counted and reported without stopping the run.

    Args:
        folder_path: Folder containing the PDF files.
        selected_indices: Optional iterable of 1-based positions in the sorted
            file list. Out-of-range positions produce a warning. When empty or
            None, all files are considered.
        output_dir: Directory where the ``.txt`` files are stored.

    Returns:
        None.
    """
    # Find all PDF files in the folder
    pdf_files = sorted(glob.glob(os.path.join(folder_path, "*.pdf")))

    if not pdf_files:
        print(f"No PDF files found in folder: {folder_path}")
        return

    # Filter by selected indices if provided
    if selected_indices:
        selected_files = []
        for idx in selected_indices:
            if 1 <= idx <= len(pdf_files):
                selected_files.append(pdf_files[idx - 1])
            else:
                print(f"Warning: Index {idx} is out of range (1-{len(pdf_files)})")
        pdf_files = selected_files

    if not pdf_files:
        print("No valid files selected for processing")
        return

    # Filter out already processed files
    unprocessed_files = []
    already_processed = []

    for pdf_path in pdf_files:
        pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
        txt_path = os.path.join(output_dir, f"{pdf_name}.txt")

        if os.path.exists(txt_path):
            already_processed.append(os.path.basename(pdf_path))
        else:
            unprocessed_files.append(pdf_path)

    # Print status
    print(f"Total PDF files found: {len(pdf_files)}")
    print(f"Already processed: {len(already_processed)}")
    print(f"Need to process: {len(unprocessed_files)}")

    if already_processed:
        print(f"\nAlready processed files ({len(already_processed)}):")
        for filename in already_processed[:10]:  # Show first 10
            print(f"  ✓ {filename}")
        if len(already_processed) > 10:
            print(f"  ... and {len(already_processed) - 10} more")

    if not unprocessed_files:
        print("\nAll files have already been processed!")
        return

    print(f"\nFiles to process ({len(unprocessed_files)}):")
    for i, pdf_file in enumerate(unprocessed_files, 1):
        print(f"  {i}. {os.path.basename(pdf_file)}")

    processed_count = 0
    failed_count = 0

    for pdf_path in unprocessed_files:
        file_name = os.path.basename(pdf_path)

        print(f"\n{'='*60}")
        print(f"Processing: {file_name}")
        print('='*60)

        try:
            # Extract raw text from PDF
            raw_text = extract_text_from_pdf(pdf_path)

            # The extractor emits newline separators even for pages without
            # text, so test for non-whitespace content rather than truthiness;
            # otherwise text-less PDFs would be sent to the model.
            if not raw_text or not raw_text.strip():
                failed_count += 1
                print(f"✗ Failed to extract text from: {file_name}")
                continue

            print(f"✓ Extracted raw text ({len(raw_text)} characters)")

            # Clean text using Gemini AI
            cleaned_text = clean_text_with_gemini(raw_text)

            # Never save an empty result: the saved file is what marks this
            # PDF as done on the next run.
            if not cleaned_text or not cleaned_text.strip():
                failed_count += 1
                print(f"✗ Gemini returned no text for: {file_name}")
                continue

            print(f"✓ Cleaned text ({len(cleaned_text)} characters)")

            # Save to text file
            txt_path = save_text_to_file(cleaned_text, pdf_path, output_dir)

            if txt_path:
                processed_count += 1
                print(f"✓ Successfully processed: {file_name}")
            else:
                failed_count += 1
                print(f"✗ Failed to save text for: {file_name}")

        except Exception as e:
            # Keep going with the remaining files; the failure is counted and reported
            failed_count += 1
            print(f"✗ Error processing {file_name}: {e}")

    print(f"\n{'='*60}")
    print("PROCESSING SUMMARY")
    print('='*60)
    print(f"Total files in folder: {len(pdf_files)}")
    print(f"Already processed (skipped): {len(already_processed)}")
    print(f"Newly processed: {processed_count}")
    print(f"Failed: {failed_count}")
    print('='*60)

In [None]:
# Input folder with the chapter PDFs and output folder for the cleaned text
folder_path = "/content/drive/MyDrive/LMM/demo_kz/disease_chapters_demo"
output_text_dir = "/content/drive/MyDrive/LMM/demo_kz/disease_chapters_text"

# Report a missing input folder instead of running the pipeline
if not os.path.exists(folder_path):
    print(f"Folder not found: {folder_path}")
else:
    process_all_pdfs_to_text(folder_path, output_dir=output_text_dir)

Total PDF files found: 1
Already processed: 0
Need to process: 1

Files to process (1):
  1. 1_Tropical_Lung_Diseases.pdf

Processing: 1_Tropical_Lung_Diseases.pdf
✓ Extracted raw text (28906 characters)
✓ Cleaned text (20417 characters)
✓ Text saved to: /content/drive/MyDrive/LMM/demo_kz/disease_chapters_text/1_Tropical_Lung_Diseases.txt
✓ Successfully processed: 1_Tropical_Lung_Diseases.pdf

PROCESSING SUMMARY
Total files in folder: 1
Already processed (skipped): 0
Newly processed: 1
Failed: 0
