# In [None]:
# --- STEP 1: CLEAN ENVIRONMENT & INSTALL ---
import os
import subprocess
import sys
import re

print("üßπ Cleaning environment (removing conflicting libraries)...")
# Best-effort removal of libraries this script treats as conflicting.
# Output is captured to keep the console quiet and the exit status is
# deliberately not checked.
subprocess.run(
    [
        sys.executable, "-m", "pip", "uninstall", "-y",
        "jax", "jaxlib", "tensorflow", "chex", "flax",
    ],
    capture_output=True,
)

print("üì¶ Installing dependencies...")
# Install marker (PDF -> markdown converter) and boto3 (AWS SDK).
# Using sys.executable guarantees pip targets the interpreter running this script.
install_result = subprocess.run(
    [sys.executable, "-m", "pip", "install", "marker-pdf", "boto3"],
    capture_output=True,
    text=True,
)
if install_result.returncode == 0:
    print("‚úì Environment ready.\n")
else:
    # Do not claim success when pip failed; show the tail of its error output.
    # Execution continues because the packages may already be installed, in
    # which case the imports that follow still succeed.
    print(
        f"WARNING: pip install failed (exit code {install_result.returncode}); "
        "continuing in case the packages are already installed."
    )
    print(install_result.stderr[-500:] if install_result.stderr else "No error output captured.")


# --- STEP 2: MAIN PROCESSING SCRIPT ---
import boto3
import shutil
import time
from botocore.exceptions import NoCredentialsError

# Force CPU settings globally.
# An empty CUDA_VISIBLE_DEVICES hides every GPU from CUDA-based libraries.
os.environ["CUDA_VISIBLE_DEVICES"] = ""
os.environ["TORCH_DEVICE"] = "cpu"

# AWS Configuration: bucket plus the key prefixes for source PDFs and
# generated markdown.
BUCKET_NAME = 'tarannumpdf'
INPUT_FOLDER = 'finra/'
OUTPUT_FOLDER = 'output/'

# --- CREDENTIALS ---
# Replace the placeholders to use explicit keys. Prefer NOT to commit real
# secrets: while the placeholders are left untouched, the script falls back
# to boto3's default credential chain (environment variables, shared
# credentials file, instance/role credentials).
aws_access_key = "add your own"
aws_secret_key = "ADD YOUR OWN"
aws_region = "ap-south-1"

_CREDENTIAL_PLACEHOLDER = "add your own"
if (
    aws_access_key.strip().lower() == _CREDENTIAL_PLACEHOLDER
    or aws_secret_key.strip().lower() == _CREDENTIAL_PLACEHOLDER
):
    # Passing None for both values lets boto3 resolve credentials itself.
    # Both are cleared together because boto3 rejects a partial key pair.
    aws_access_key = None
    aws_secret_key = None
    print("No explicit AWS keys configured; using boto3's default credential chain.")

# Initialize S3 client.
# NOTE: creating the client does not contact AWS; credential or permission
# problems only surface on the first real S3 request.
print(f"Connecting to AWS S3 ({aws_region})...")
s3_client = boto3.client(
    's3',
    aws_access_key_id=aws_access_key,
    aws_secret_access_key=aws_secret_key,
    region_name=aws_region,
)
print("‚úì Connected to AWS S3")

# Create local working directories: downloaded PDFs and converted markdown.
os.makedirs('pdfs', exist_ok=True)
os.makedirs('markdown_output', exist_ok=True)

# --- SMART RESUME LOGIC ---
print(f"\nChecking existing progress in s3://{BUCKET_NAME}/{OUTPUT_FOLDER}...")

# 1. Collect the names of markdown files already present under the output
#    prefix so PDFs that were converted in an earlier run can be skipped.
existing_md_files = set()
try:
    paginator = s3_client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=BUCKET_NAME, Prefix=OUTPUT_FOLDER):
        # Pages for an empty prefix carry no 'Contents' key.
        for obj in page.get('Contents', []):
            # Store just the filename, e.g. "Report.md".
            md_name = os.path.basename(obj['Key'])
            # Keep only markdown outputs: folder-marker keys (empty basename)
            # and unrelated objects would otherwise inflate the count below.
            # Names not ending in ".md" can never equal the "<name>.md"
            # lookup key, so skip decisions are unaffected.
            if md_name.endswith('.md'):
                existing_md_files.add(md_name)
    print(f"‚úì Found {len(existing_md_files)} already processed files.")
except Exception as e:
    # Best-effort: if the listing fails, carry on with whatever was collected
    # (possibly nothing), which means those PDFs are simply converted again.
    print(f"‚ö†Ô∏è Could not list output folder (assuming empty): {e}")

# 2. Build the work list: every PDF under the input prefix whose markdown
#    counterpart is not already recorded in existing_md_files.
print(f"Listing PDFs from s3://{BUCKET_NAME}/{INPUT_FOLDER}...")
pdf_files_to_process = []
try:
    input_pages = s3_client.get_paginator('list_objects_v2').paginate(
        Bucket=BUCKET_NAME, Prefix=INPUT_FOLDER
    )
    for listing in input_pages:
        for entry in listing.get('Contents', []):
            s3_key = entry['Key']
            # Only PDF objects are candidates (extension check is case-insensitive).
            if not s3_key.lower().endswith('.pdf'):
                continue

            # The output for "dir/Report.pdf" is expected to be named "Report.md".
            base_name = os.path.splitext(os.path.basename(s3_key))[0]
            if f"{base_name}.md" in existing_md_files:
                print(f"  ‚è≠Ô∏è  Skipping {base_name} (Already exists in output)")
                continue

            pdf_files_to_process.append(s3_key)

    print(f"\nüìã Pending files: {len(pdf_files_to_process)}")

except Exception as e:
    # Any listing failure discards partial results so nothing is processed.
    print(f"‚ùå Error accessing S3 bucket: {e}")
    pdf_files_to_process = []


# --- PROCESSING LOOP ---
def _locate_markdown(search_paths, base_name):
    """Return the first usable markdown file for *base_name*, or None.

    Each directory in *search_paths* is walked in order. A file qualifies
    when its name ends in ".md", contains *base_name* (case-insensitive) and
    holds more than 10 characters of UTF-8 text; shorter files are treated
    as empty conversions and skipped.

    NOTE(review): the substring match can pick up another document's output
    when one base name is contained in another (e.g. "report" vs
    "report_2023") and the directory is shared between runs.

    Raises:
        OSError / UnicodeDecodeError: if a candidate file cannot be read.
    """
    wanted = base_name.lower()
    for search_path in search_paths:
        if not os.path.exists(search_path):
            continue
        for root, _dirs, files in os.walk(search_path):
            for file_name in files:
                lowered = file_name.lower()
                if not (lowered.endswith('.md') and wanted in lowered):
                    continue
                candidate = os.path.join(root, file_name)
                with open(candidate, 'r', encoding='utf-8') as f:
                    content = f.read()
                if len(content) > 10:
                    return candidate
    return None


if pdf_files_to_process:
    print("\nStarting Batch Processing...")
    print('='*60)

    start_time = time.time()
    processed_count = 0
    failed_files = []
    total_files = len(pdf_files_to_process)

    # Environment for the converter subprocess, with GPUs hidden.
    # Built once because it does not change between files.
    clean_env = os.environ.copy()
    clean_env["CUDA_VISIBLE_DEVICES"] = ""

    # Download, convert and upload the pending files one at a time.
    for idx, s3_key in enumerate(pdf_files_to_process, 1):
        filename = os.path.basename(s3_key)
        base_name = os.path.splitext(filename)[0]
        local_path = os.path.join('pdfs', filename)
        # Unique scratch directory per file; defined before the try so the
        # finally clause can always clean it up.
        temp_dir = f"temp_conversion_{idx}"

        print(f"\n[{idx}/{total_files}] Processing: {filename}")

        try:
            # Download (inside the try so one bad object cannot abort the batch).
            if not os.path.exists(local_path):
                print(f"  Downloading...")
                s3_client.download_file(BUCKET_NAME, s3_key, local_path)

            # Start from an empty scratch directory.
            if os.path.exists(temp_dir):
                shutil.rmtree(temp_dir)
            os.makedirs(temp_dir)

            # Copy PDF to temp; the converter runs with temp_dir as its cwd.
            shutil.copy2(local_path, os.path.join(temp_dir, filename))

            print(f"  Converting...")

            # Run marker
            result = subprocess.run(
                ['marker_single', filename],
                capture_output=True,
                text=True,
                cwd=temp_dir,
                env=clean_env,
            )

            # --- OUTPUT DETECTION LOGIC ---
            # 1. The scratch directory, then any path the tool reported in its logs.
            search_paths = [temp_dir]
            log_output = result.stdout + result.stderr
            match = re.search(r"Saved markdown to\s+(.*)", log_output)
            if match:
                search_paths.append(match.group(1).strip())

            # 2. Fallback location.
            # NOTE(review): hard-coded to a Python 3.12 dist-packages layout;
            # confirm it matches the runtime this script is deployed on.
            fallback_path = "/usr/local/lib/python3.12/dist-packages/conversion_results"
            if os.path.exists(fallback_path):
                search_paths.append(fallback_path)

            # 3. Search for the file
            source_md = _locate_markdown(search_paths, base_name)

            if source_md is not None:
                dest_md = os.path.join('markdown_output', f"{base_name}.md")
                shutil.copy2(source_md, dest_md)

                # Upload to S3
                s3_key_out = f"{OUTPUT_FOLDER}{base_name}.md"
                s3_client.upload_file(dest_md, BUCKET_NAME, s3_key_out)
                print(f"  ‚úì Success! Uploaded: {s3_key_out}")
                processed_count += 1
            else:
                print(f"  ‚úó Failed. Error logs:")
                print(result.stderr[-500:] if result.stderr else "No error output captured.")
                failed_files.append(filename)

        except Exception as e:
            # Per-file boundary: record the failure and move on to the next PDF.
            print(f"  ‚úó Error: {e}")
            failed_files.append(filename)

        finally:
            # Cleanup runs on success and failure alike so scratch data never
            # accumulates. Cleanup problems are ignored on purpose: they must
            # not turn a successful upload into a reported failure.
            shutil.rmtree(temp_dir, ignore_errors=True)
            # Remove local PDF to save space
            try:
                os.remove(local_path)
            except OSError:
                pass

        # ETA based on the average wall-clock time per file so far.
        elapsed = time.time() - start_time
        avg_time = elapsed / idx
        remaining = (total_files - idx) * avg_time
        print(f"  ‚è±Ô∏è  ETA: {remaining/60:.1f}min")

    print(f"\n{'='*60}")
    print(f"Finished. Success: {processed_count} | Failed: {len(failed_files)}")
    if failed_files:
        print("Failed files: " + ", ".join(failed_files))
else:
    # NOTE(review): this branch is also reached when listing the input prefix
    # failed, because the work list is reset to empty in that case.
    print("üéâ All files are already processed!")