In [None]:
import easyocr
import cv2
import pandas as pd
from pathlib import Path
import os
from tqdm import tqdm

# --- OCR engine -------------------------------------------------------
# One EasyOCR reader is built for the 'bn' (Bengali) and 'en' (English)
# language codes; gpu=False is passed explicitly.
print("Initializing EasyOCR reader for Bengali and English...")
reader = easyocr.Reader(
    ['bn', 'en'],
    gpu=False,
)
print("✓ Reader initialized")

# --- Training-split inputs --------------------------------------------
# image_folder is the directory later joined with each CSV image name;
# csv_path is the table that lists those names.
image_folder = '/data/raw/Train/Image'
csv_path = '/data/raw/Train/Train.csv'

# Load the table into a DataFrame and report its row count.
df = pd.read_csv(csv_path)
print(f"Loaded {len(df)} images from CSV")

# Extract text from each image
#
# extracted_texts stays positionally aligned with df: exactly one entry is
# appended per row (a single append at the bottom of the loop body), and the
# entry is "" whenever no text could be obtained for that row.
extracted_texts = []

# Only the Image_name column is read, so iterate that column directly rather
# than building a full row Series per image with iterrows().
for idx, image_name in tqdm(
    df['Image_name'].items(), total=len(df), desc="Processing images"
):
    text = ""

    if not isinstance(image_name, str):
        # A blank CSV cell is loaded by pandas as NaN (a float), and
        # os.path.join raises TypeError on it. Report the row and move on
        # instead of aborting the whole run and losing earlier results.
        print(f"Invalid image name at row {idx}: {image_name!r}")
    else:
        image_path = os.path.join(image_folder, image_name)

        if not os.path.isfile(image_path):
            # isfile (not exists) so that a directory -- e.g. the image
            # folder itself when the name is empty -- counts as missing.
            print(f"Image not found: {image_path}")
        else:
            try:
                # Perform OCR
                result = reader.readtext(image_path)

                # Combine all detected text (index 1 of each detection)
                text = ' '.join(detection[1] for detection in result)
            except Exception as e:
                # Deliberate best effort: one unreadable image must not stop
                # the batch; the row keeps the empty-string placeholder.
                print(f"Error processing {image_path}: {e}")

    extracted_texts.append(text)

# Add extracted text as a new column to dataframe
df['Extracted_Text'] = extracted_texts

# Save to CSV with the new column
output_path = '/data/extract/meme_train_data_with_text.csv'

# Create the destination directory first: without it the write below fails
# when the directory is missing, and that failure would only surface after
# the whole OCR pass has already been run.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

df.to_csv(output_path, index=False)

# Summary: where the file went, the resulting columns, a preview of the first
# ten rows, and how many rows ended up with / without any extracted text
# (rows without text hold the empty string).
print(f"\n✓ Saved results to {output_path}")
print(f"\nDataFrame columns: {df.columns.tolist()}")
print("\nSample extracted texts:")
print(df[['Image_name', 'Label', 'Extracted_Text']].head(10))
print("\nStatistics:")
print(f"  Total images processed: {len(df)}")
print(f"  Images with extracted text: {(df['Extracted_Text'] != '').sum()}")
print(f"  Images with no text: {(df['Extracted_Text'] == '').sum()}")

In [None]:
import easyocr
import cv2
import pandas as pd
from pathlib import Path
import os
from tqdm import tqdm

# --- OCR engine -------------------------------------------------------
# One EasyOCR reader is built for the 'bn' (Bengali) and 'en' (English)
# language codes; gpu=False is passed explicitly.
print("Initializing EasyOCR reader for Bengali and English...")
reader = easyocr.Reader(
    ['bn', 'en'],
    gpu=False,
)
print("✓ Reader initialized")

# --- Test-split inputs ------------------------------------------------
# image_folder is the directory later joined with each CSV image name;
# csv_path is the table that lists those names.
image_folder = '/data/raw/Test/Image'
csv_path = '/data/raw/Test/Test.csv'

# Load the table into a DataFrame and report its row count.
df = pd.read_csv(csv_path)
print(f"Loaded {len(df)} images from CSV")

# Extract text from each image
#
# extracted_texts stays positionally aligned with df: exactly one entry is
# appended per row (a single append at the bottom of the loop body), and the
# entry is "" whenever no text could be obtained for that row.
extracted_texts = []

# Only the Image_name column is read, so iterate that column directly rather
# than building a full row Series per image with iterrows().
for idx, image_name in tqdm(
    df['Image_name'].items(), total=len(df), desc="Processing images"
):
    text = ""

    if not isinstance(image_name, str):
        # A blank CSV cell is loaded by pandas as NaN (a float), and
        # os.path.join raises TypeError on it. Report the row and move on
        # instead of aborting the whole run and losing earlier results.
        print(f"Invalid image name at row {idx}: {image_name!r}")
    else:
        image_path = os.path.join(image_folder, image_name)

        if not os.path.isfile(image_path):
            # isfile (not exists) so that a directory -- e.g. the image
            # folder itself when the name is empty -- counts as missing.
            print(f"Image not found: {image_path}")
        else:
            try:
                # Perform OCR
                result = reader.readtext(image_path)

                # Combine all detected text (index 1 of each detection)
                text = ' '.join(detection[1] for detection in result)
            except Exception as e:
                # Deliberate best effort: one unreadable image must not stop
                # the batch; the row keeps the empty-string placeholder.
                print(f"Error processing {image_path}: {e}")

    extracted_texts.append(text)

# Add extracted text as a new column to dataframe
df['Extracted_Text'] = extracted_texts

# Save to CSV with the new column
output_path = '/data/extract/meme_test_data_with_text.csv'

# Create the destination directory first: without it the write below fails
# when the directory is missing, and that failure would only surface after
# the whole OCR pass has already been run.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

df.to_csv(output_path, index=False)

# Summary: where the file went, the resulting columns, a preview of the first
# ten rows, and how many rows ended up with / without any extracted text
# (rows without text hold the empty string).
print(f"\n✓ Saved results to {output_path}")
print(f"\nDataFrame columns: {df.columns.tolist()}")
print("\nSample extracted texts:")
print(df[['Image_name', 'Extracted_Text']].head(10))
print("\nStatistics:")
print(f"  Total images processed: {len(df)}")
print(f"  Images with extracted text: {(df['Extracted_Text'] != '').sum()}")
print(f"  Images with no text: {(df['Extracted_Text'] == '').sum()}")