In [2]:
import zipfile
from PIL import Image, ImageDraw
import numpy as np
import pytesseract
import math
import os
from mtcnn import MTCNN


# Path to the pre-trained MTCNN weights file
mtcnn_model_path = '/Users/yatinpatel/Desktop/Python3-Programming-Course/mtcnn_weights.npy'

# Module-level face detector used by extract_faces()
face_detector = MTCNN(weights_file=mtcnn_model_path)

# Folder that receives the annotated images written by the search functions
output_folder = '/Users/yatinpatel/Desktop/Python3-Programming-Course/Deeplearning_face_detection/Boundingbox_and_cnfiddencescore_on_face_text'

# exist_ok=True creates the folder when missing and is a no-op when it is
# already there, without the check-then-create race of an exists() test
os.makedirs(output_folder, exist_ok=True)


# Define a function to extract (name, image, text) from a ZIP file
def zip_images_extraction(name, keywords=("Christopher", "Mark")):
    """
    Collect (name, image, text) records for the images in a ZIP file whose OCR text contains a keyword.
    :input name: The name (path) of a ZIP file.
    :input keywords: Strings searched for in the OCR text; an image is kept when at least one of them occurs.
                     Defaults to the pair that used to be hard-coded.
    :output: A list of dictionaries. Each dictionary has the keys "name", "img" and "text" for one matching image.
    """
    out = []

    with zipfile.ZipFile(name) as myzip:
        for info in myzip.infolist():
            # Directory entries carry no image data and cannot be opened as images
            if info.is_dir():
                continue

            img = Image.open(myzip.open(info))

            # Tell pytesseract where the tesseract binary lives, then OCR a
            # grayscale version of the image
            pytesseract.pytesseract.tesseract_cmd = r'/usr/local/bin/tesseract'
            img_strs = pytesseract.image_to_string(img.convert('L'))

            # Keep the image only when one of the keywords shows up in its text
            if any(keyword in img_strs for keyword in keywords):
                out.append({"name": info.filename, "img": img, "text": img_strs})

    return out


# Define a function to extract faces using MTCNN and create a contact sheet with bounding boxes
def extract_faces(img):
    """
    Detect faces in an image and draw their bounding boxes and confidence scores on a copy of it.
    :input img: A PIL image.
    :output: A copy of the image annotated with one pink box and "Confidence: x.xx" label per detected face,
             or None when the detector reports no faces. The input image itself is not modified.
    """
    # Hand the detector a 3-channel RGB array regardless of the mode the image
    # was opened in (grayscale, palette and RGBA images have a different layout)
    img_np = np.array(img.convert('RGB'))

    faces = face_detector.detect_faces(img_np)
    if not faces:
        return None

    # Annotate a copy so the caller's image stays untouched
    contact_sheet = img.copy()
    draw = ImageDraw.Draw(contact_sheet)

    for face in faces:
        x, y, w, h = face['box']
        confidence = face['confidence']

        draw.rectangle([(x, y), (x + w, y + h)], outline="pink")

        # The label sits 15 px above the box; clamp it so it stays on the
        # canvas for faces that touch the top or left edge
        draw.text((max(x, 0), max(y - 15, 0)), f"Confidence: {confidence:.2f}", fill="pink")

    return contact_sheet


# Define the search function
def value_search(value, zip_name):
    """
    Search the images of a ZIP file for a text value and save a face contact sheet for every match.
    :input value: The string to look for in the OCR text of each image.
    :input zip_name: The name (path) of a ZIP file.
    :output: None. For each matching image a line is printed; when faces are found the annotated image is
             saved as a PNG in output_folder and shown, otherwise a "no faces" line is printed.
    """
    with zipfile.ZipFile(zip_name) as myzip:
        for info in myzip.infolist():
            # Directory entries carry no image data and cannot be opened as images
            if info.is_dir():
                continue

            name = info.filename
            img = Image.open(myzip.open(info))

            # Tell pytesseract where the tesseract binary lives, then OCR a
            # grayscale version of the image
            pytesseract.pytesseract.tesseract_cmd = r'/usr/local/bin/tesseract'
            img_strs = pytesseract.image_to_string(img.convert('L'))

            if value not in img_strs:
                continue

            print("Results found in file {}".format(name))

            contact_sheet = extract_faces(img)
            if contact_sheet is None:
                print("But there were no faces in that file")
                continue

            # Use only the final path component, minus its last extension, so
            # members stored in sub-folders (or with dots in their name) still
            # produce a valid file directly inside output_folder
            stem = os.path.splitext(os.path.basename(name))[0]
            contact_sheet.save(os.path.join(output_folder, stem + '.png'))
            contact_sheet.show()


# Define a function to highlight text extraction with confidence score
def highlight_text(img, text, confidence):
    """
    Outline every occurrence of a text value that tesseract recognizes in an image.
    :input img: A PIL image. It is drawn on in place.
    :input text: The string to highlight. Whitespace in it is ignored when matching.
    :input confidence: The number printed in the "Confidence: x.xx" label above each box.
    :output: The same image object, with one green box and label per occurrence. It is returned unchanged
             when text is empty or is not found.
    """
    # Character boxes exist only for glyphs, never for whitespace, so the
    # comparison is done with whitespace removed
    needle = "".join(text.split())
    if not needle:
        return img

    draw = ImageDraw.Draw(img)

    # image_to_boxes returns ONE string with one line per recognized glyph:
    #   "<char> <left> <bottom> <right> <top> <page>"
    glyphs = []
    for line in pytesseract.image_to_boxes(img.convert('L')).splitlines():
        fields = line.rsplit(' ', 5)
        if len(fields) != 6:
            continue
        try:
            left, bottom, right, top = map(int, fields[1:5])
        except ValueError:
            continue
        # One entry per character, so offsets in the joined string below map
        # one-to-one onto boxes
        glyphs.extend((ch, left, bottom, right, top) for ch in fields[0])

    recognized = "".join(glyph[0] for glyph in glyphs)

    start = recognized.find(needle)
    while start != -1:
        span = glyphs[start:start + len(needle)]

        # Union of the character boxes that make up this occurrence
        x0 = min(glyph[1] for glyph in span)
        x1 = max(glyph[3] for glyph in span)
        # Box coordinates are measured from the bottom-left corner while
        # Pillow draws from the top-left, so flip the vertical axis
        y0 = img.height - max(glyph[4] for glyph in span)
        y1 = img.height - min(glyph[2] for glyph in span)

        draw.rectangle([(x0, y0), (x1, y1)], outline="green")
        # Label goes 15 px above the box, clamped to stay on the canvas
        draw.text((x0, max(y0 - 15, 0)), f"Confidence: {confidence:.2f}", fill="green")

        start = recognized.find(needle, start + len(needle))

    return img


# Define the search function with highlighted text
def value_search_highlighted(value, zip_name):
    """
    Search the images of a ZIP file for a text value and save a copy with the value highlighted.
    :input value: The string to look for in the OCR text of each image.
    :input zip_name: The name (path) of a ZIP file.
    :output: None. For each matching image a line is printed and the highlighted image is saved as
             "<name>_highlighted.png" in output_folder and shown.
    """
    with zipfile.ZipFile(zip_name) as myzip:
        for info in myzip.infolist():
            # Directory entries carry no image data and cannot be opened as images
            if info.is_dir():
                continue

            name = info.filename
            img = Image.open(myzip.open(info))

            # Set the tesseract binary location here as the other search
            # functions do, instead of relying on one of them having run first
            pytesseract.pytesseract.tesseract_cmd = r'/usr/local/bin/tesseract'
            img_strs = pytesseract.image_to_string(img.convert('L'))

            if value not in img_strs:
                continue

            print("Results found in file {}".format(name))

            # 1.0 is a fixed label value passed through to the drawing helper
            img_highlighted = highlight_text(img, value, confidence=1.0)

            # Use only the final path component, minus its last extension, so
            # members stored in sub-folders (or with dots in their name) still
            # produce a valid file directly inside output_folder
            stem = os.path.splitext(os.path.basename(name))[0]
            img_highlighted.save(os.path.join(output_folder, stem + '_highlighted.png'))
            img_highlighted.show()


# OCR both archives and keep the records whose text matched the extraction filter
small_imgs = zip_images_extraction("/Users/yatinpatel/Desktop/Python3-Programming-Course/small_img.zip")
big_imgs = zip_images_extraction("/Users/yatinpatel/Desktop/Python3-Programming-Course/images.zip")

# Highlighted searches: "Christopher" in the small archive, then "Mark" in the
# full one. The loop variables stay bound to the last pair afterwards.
for value, zip_name in (
    ("Christopher", "/Users/yatinpatel/Desktop/Python3-Programming-Course/small_img.zip"),
    ("Mark", "/Users/yatinpatel/Desktop/Python3-Programming-Course/images.zip"),
):
    value_search_highlighted(value, zip_name)


Results found in file a-0.png
Results found in file a-3.png
Results found in file a-0.png
Results found in file a-1.png
Results found in file a-10.png
Results found in file a-13.png
Results found in file a-2.png
Results found in file a-3.png
Results found in file a-8.png
