## Using Tesseract OCR to Extract Captions from Meme Images

### Step 1: Importing the required libraries and setting up the paths

In [1]:
import cv2
import pytesseract
import string
import os
# Path to the Tesseract executable used by pytesseract.
# A raw string already keeps backslashes literal, so they must not be doubled.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Folder containing the meme images, and the text file the OCR results go to.
image_folder = 'C:\\Users\\soumi\\Desktop\\precog\\img\\train_data\\hateful_memes'
output_file = "ocr_results_for_hateful_memes.txt"

### Step 2: Defining the preprocessing function that helps isolate the text

In [None]:
def preprocess_image(image):
    """Prepare an image for OCR and return the binarized result.

    The input is smoothed with a bilateral filter, converted from BGR to
    grayscale, and then thresholded so that pixels brighter than 240 become
    255 and all other pixels become 0.
    """
    smoothed = cv2.bilateralFilter(image, 5, 55, 60)
    grayscale = cv2.cvtColor(smoothed, cv2.COLOR_BGR2GRAY)
    # cv2.threshold returns (threshold_used, image); only the image is needed.
    binary = cv2.threshold(grayscale, 240, 255, cv2.THRESH_BINARY)[1]
    return binary

### Step 3: Running the model on a dataset of hateful meme images

In [2]:
# Characters kept in the OCR output; every other character (punctuation,
# newlines, non-ASCII symbols) is replaced with a space.
allowed_chars = string.ascii_letters + string.digits + " "

# Entries in image_folder that OpenCV could not decode as an image.
skipped_files = []

# Explicit UTF-8 so that file names containing non-ASCII characters cannot
# fail on the platform's default encoding.
with open(output_file, "w", encoding="utf-8") as f:
    # os.listdir returns entries in arbitrary order; sorting keeps the output
    # file identical between runs.
    for filename in sorted(os.listdir(image_folder)):
        image_path = os.path.join(image_folder, filename)

        # cv2.imread returns None (it does not raise) when a file cannot be
        # read, so skip such entries instead of crashing in preprocess_image.
        image = cv2.imread(image_path)
        if image is None:
            skipped_files.append(filename)
            continue

        # Binarize the image, then run OCR with pytesseract's default settings.
        image = preprocess_image(image)
        text = pytesseract.image_to_string(image)

        # Replace every character outside allowed_chars with a space. Newlines
        # are not in allowed_chars, so each result stays on a single line.
        filtered_text = "".join(
            char if char in allowed_chars else " " for char in text
        )

        # One line per image in the results file
        f.write(f"Image: {filename}, Text: {filtered_text}\n")

# Make skipped inputs visible rather than dropping them silently
if skipped_files:
    print(f"Skipped {len(skipped_files)} unreadable file(s): {skipped_files}")

# Report where the results were saved
print(f"OCR results saved to: {output_file}")

OCR results saved to: ocr_results_for_hateful_memes.txt
