In [None]:
# The EasyOCR reader is initialized once, globally, below; pass gpu=False there to run on the CPU and avoid CUDA issues
import easyocr  # For Optical Character Recognition (OCR)
import requests  # To fetch images from URLs
from PIL import Image  # For handling image processing
from io import BytesIO  # To convert the response content into bytes
import numpy as np  # For handling image arrays
import cv2  # For image processing (e.g., converting to grayscale)
import concurrent.futures  # For multi-threaded processing
import gc  # To manually manage memory and run garbage collection
import time  # To add delays between batch processing
import pandas as pd  # To work with DataFrames

# One shared English-language EasyOCR reader for the whole module, created
# once up front and reused by every call; gpu=True requests GPU inference.
reader = easyocr.Reader(lang_list=['en'], gpu=True)

def preprocess_image(image, max_size=(500, 500)):
    """Shrink a PIL image and return it as a grayscale NumPy array.

    Args:
        image: A ``PIL.Image.Image`` in any mode. Images that are already RGB
            are resized in place (``thumbnail`` mutates its receiver); images
            in other modes are first converted to a new RGB image.
        max_size: ``(width, height)`` bounding box passed to ``thumbnail``.
            Smaller boxes reduce memory usage.

    Returns:
        The grayscale array produced by ``cv2.cvtColor`` with
        ``cv2.COLOR_RGB2GRAY``.
    """
    # cv2.COLOR_RGB2GRAY only accepts 3-channel input, so normalise palette,
    # grayscale, RGBA, CMYK, ... images to RGB first (any alpha channel is
    # discarded by the conversion).
    if image.mode != "RGB":
        image = image.convert("RGB")

    image.thumbnail(max_size, Image.Resampling.LANCZOS)

    # Convert the image to a numpy array and then to grayscale
    image_np = np.array(image)
    return cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)

def extract_text(image_url, timeout=10):
    """Download an image and return the text EasyOCR finds in it.

    Args:
        image_url: URL of the image to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``, so a
            stalled server cannot block a worker thread indefinitely.

    Returns:
        The recognised text fragments joined with single spaces. Failures are
        not raised: a string starting with ``"Request error: "`` (download
        problems) or ``"Processing error: "`` (anything else) is returned
        instead, so a single bad URL does not abort a whole batch.
    """
    try:
        # Fetch the image from the URL
        response = requests.get(image_url, timeout=timeout)
        response.raise_for_status()  # Check if request was successful

        # The context manager closes the PIL image once preprocessing is done.
        with Image.open(BytesIO(response.content)) as image:
            image_np = preprocess_image(image)

        # Extract text using EasyOCR; element 1 of each result is the text
        result = reader.readtext(image_np)
        return ' '.join(res[1] for res in result)
    except requests.RequestException as e:
        return f"Request error: {e}"
    except Exception as e:
        # Deliberately broad: any decode/OCR failure is reported as a string.
        return f"Processing error: {e}"

def process_images_in_batches(image_urls, batch_size=1000, num_threads=2,
                              pause_seconds=2, frame=None):
    """Run ``extract_text`` over the URLs in batches and store the results.

    The text for the URL at position ``k`` is written to the ``'description'``
    column of the row at position ``k`` of the target DataFrame.

    Args:
        image_urls: Sequence of image URLs, in the same order as the rows of
            the target DataFrame.
        batch_size: Number of URLs handled per batch; must be positive.
        num_threads: Worker threads used to run ``extract_text``. Kept small
            to avoid memory overload.
        pause_seconds: Sleep between consecutive batches to reduce system load.
        frame: DataFrame to write into. Defaults to the module-level ``df``.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    target = df if frame is None else frame
    total_images = len(image_urls)
    print(f"Starting to process {total_images} images in batches of {batch_size}...")

    # One pool is reused for every batch instead of being rebuilt each time.
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        batch_starts = range(0, total_images, batch_size)
        for batch_number, start in enumerate(batch_starts, start=1):
            batch_urls = image_urls[start:start + batch_size]
            print(f"Processing batch {batch_number} with {len(batch_urls)} images...")

            results = list(executor.map(extract_text, batch_urls))

            # Pick the rows by position and address them by their own labels,
            # so the write lands on the right rows even when the index is not
            # the default 0..n-1 range (assumes index labels are unique).
            row_labels = target.index[start:start + len(batch_urls)]
            target.loc[row_labels, 'description'] = results

            # Clear memory
            del results
            gc.collect()  # Call garbage collector to free up memory

            print(f"Batch {batch_number} completed.")
            if start + batch_size < total_images:
                time.sleep(pause_seconds)  # Pause only when another batch follows

    print("Completed processing all images.")

# Driver: `df` is expected to already exist as a DataFrame whose
# 'image_link' column holds the image URLs.
image_urls = df['image_link'].to_list()
process_images_in_batches(image_urls)

# Persist the DataFrame to CSV (row index omitted) once every batch is done
df.to_csv(path_or_buf='image_descriptions_train.csv', index=False)

# Final confirmation message
print("All images processed, and descriptions saved.")