In [None]:
# Install dependencies
# These are IPython shell escapes (`!`), so they only work inside a notebook cell.
# NOTE(review): `google-colab` is normally preinstalled on Colab runtimes and
# `transformers` is not imported in the visible code — confirm both are needed.
!pip install pytesseract transformers datasets torch diffusers google-colab
# System package providing the Tesseract engine used by `pytesseract`.
# NOTE(review): no `-y` flag, so apt-get may stop for confirmation — verify this runs unattended.
!apt-get install tesseract-ocr

# Import libraries
import io
import os

import pandas as pd
import pytesseract
import torch
from datasets import load_dataset
from diffusers import StableDiffusionPipeline
from google.colab import drive
from PIL import Image

# Directory on Google Drive that receives every artifact written by this notebook.
OUTPUT_DIR = '/content/drive/MyDrive/MultiModalDocAI/processed'

# Mount Google Drive first, then make sure the output directory exists
# (exist_ok=True keeps re-running the cell harmless).
drive.mount('/content/drive')
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load the first 1000 training samples of the dataset.
# NOTE: a split slice such as 'train[:1000]' is not streaming; pass
# `streaming=True` (and take the first 1000 items) if lazy loading is wanted.
# NOTE(review): confirm that 'rvl-cdip' resolves on the Hugging Face Hub — the
# RVL-CDIP dataset appears to be published as 'aharley/rvl_cdip'.
try:
    dataset = load_dataset('rvl-cdip', split='train[:1000]')
except Exception as e:
    print(f'Error loading dataset: {e}')
    # A bare `raise SystemExit` exits with status 0; report the failure with a
    # non-zero status and keep the original error as the cause.
    raise SystemExit(1) from e
else:
    print('Dataset loaded successfully.')

# OCR helper
def extract_text(image):
    """Run OCR on ``image`` via pytesseract and return the recognised text.

    This is best-effort and never raises:

    * returns 'No text extracted' when the OCR output is empty or
      whitespace-only;
    * prints the error and returns 'OCR failed' when any exception occurs.
    """
    try:
        recognised = pytesseract.image_to_string(image)
        if not recognised.strip():
            return 'No text extracted'
        return recognised
    except Exception as err:
        print(f'OCR error: {err}')
        return 'OCR failed'

# Generate synthetic document images

# Loaded Stable Diffusion pipelines keyed by model id, so the weights are
# loaded and moved to the GPU only once instead of on every call.
_SD_PIPELINES = {}


def _get_sd_pipeline(model_id='stabilityai/stable-diffusion-2-1'):
    """Return the cached CUDA pipeline for ``model_id``, loading it on first use."""
    pipe = _SD_PIPELINES.get(model_id)
    if pipe is None:
        pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
        pipe = pipe.to('cuda')
        _SD_PIPELINES[model_id] = pipe
    return pipe


def generate_synthetic_image(prompt='invoice document with text'):
    """Generate one image for ``prompt`` and save it as a PNG under OUTPUT_DIR.

    The file name is ``synthetic_<prompt with spaces replaced by '_'>.png``, so
    repeated calls with the same prompt overwrite the same file.

    Args:
        prompt: Text prompt passed to the Stable Diffusion pipeline.

    Returns:
        ``(path, image)`` on success, or ``(None, None)`` if anything fails
        (the error is printed, never raised).
    """
    try:
        pipe = _get_sd_pipeline()
        image = pipe(prompt).images[0]
        # Build the name outside the f-string: reusing the enclosing quote
        # character inside a replacement field is a SyntaxError before 3.12.
        file_stem = prompt.replace(' ', '_')
        synthetic_path = os.path.join(OUTPUT_DIR, f'synthetic_{file_stem}.png')
        image.save(synthetic_path)
        return synthetic_path, image
    except Exception as e:
        print(f'Synthetic image generation error: {e}')
        return None, None

# Preprocess the loaded samples
def _as_pil_image(raw):
    """Return ``raw`` as a PIL image.

    Accepts an already-decoded PIL image (what the `datasets` Image feature
    yields by default), raw encoded bytes, or a mapping with a 'bytes' and/or
    'path' entry (the undecoded form of that feature).
    """
    if isinstance(raw, Image.Image):
        return raw
    if isinstance(raw, dict):
        if raw.get('bytes') is not None:
            return Image.open(io.BytesIO(raw['bytes']))
        return Image.open(raw['path'])
    return Image.open(io.BytesIO(raw))


# The prompt (and therefore the output path) is the same for every sample, so
# generate and OCR the synthetic image once instead of once per sample.
synthetic_path, synthetic_image = generate_synthetic_image()
synthetic_text = extract_text(synthetic_image) if synthetic_image is not None else 'No synthetic text'

processed_data = []
for idx, item in enumerate(dataset):
    try:
        image = _as_pil_image(item['image'])
        label = item['label']

        # Extract text
        text = extract_text(image)

        # Store in memory
        processed_data.append({
            'image_id': idx,
            'text': text,
            'label': label,
            'synthetic_image_path': synthetic_path if synthetic_path else 'None',
            'synthetic_text': synthetic_text
        })
    except Exception as e:
        # Best-effort: report the bad sample and move on to the next one.
        print(f'Error processing sample {idx}: {e}')

# Write the collected rows to a CSV file in the Google Drive output directory
if not processed_data:
    # Nothing was collected, so there is nothing to write.
    print('No data processed. Check dataset access or preprocessing steps.')
else:
    csv_path = os.path.join(OUTPUT_DIR, 'processed_data.csv')
    df = pd.DataFrame(processed_data)
    df.to_csv(csv_path, index=False)
    print(f'Preprocessing complete. Data saved to {OUTPUT_DIR}')