In [None]:
%%time
from zipfile import ZipFile
from math import ceil

from PIL import Image, ImageDraw, ImageEnhance
from IPython.display import display
import pytesseract
import numpy as np
import cv2 as cv
import string

def refine(text):
    """
    Split text into a set of lowercase words with surrounding punctuation removed

    Only leading and trailing punctuation is stripped, so inner characters such
    as the apostrophe in "don't" are kept. Tokens made up of punctuation alone
    (e.g. "--") are dropped instead of being reported as an empty word.
    :param text: string to refine
    :return: set of unique, non-empty, lowercase words
    """
    stripped_tokens = (
        token.strip(string.punctuation).lower() for token in text.split()
    )
    # An empty string here means the token was pure punctuation
    return {word for word in stripped_tokens if word}

def make_better(img_lst):
    """
    Bring every image of the list to one common size

    The common size is the rounded average width and the rounded average
    height of the given images. The list is updated in place and also returned.
    :param: list of images(PIL.Image.Image object), must not be empty
    :return: the same list, now holding the resized images(PIL.Image.Image object)
    """
    count = len(img_lst)
    target_size = (
        round(sum(picture.width for picture in img_lst) / count),
        round(sum(picture.height for picture in img_lst) / count),
    )

    for position, picture in enumerate(img_lst):
        img_lst[position] = picture.resize(target_size, resample=Image.LANCZOS)

    return img_lst

def create_contact_sheet(img_lst, columns=5):
    """
    Creates a contact sheet (grid) of the given images at half size

    Images are pasted left to right, top to bottom, into cells that have the
    size of the first image. Cells of the last row that receive no image are
    left unpainted.
    :param img_lst: non-empty list of images(PIL.Image.Image objects); the first
        image defines the cell size and the mode of the sheet
    :param columns: number of images per row, defaults to 5
    :return: contact sheet of images, resized to half of the full grid size
    :rtype: PIL.Image.Image object
    :raises ValueError: if columns is less than 1
    :raises IndexError: if img_lst is empty
    """
    if columns < 1:
        raise ValueError("columns must be at least 1")

    first_image = img_lst[0]
    cell_width = first_image.width
    cell_height = first_image.height
    rows = ceil(len(img_lst) / columns)
    contact_sheet = Image.new(
        first_image.mode, (cell_width * columns, cell_height * rows)
    )

    for idx, img in enumerate(img_lst):
        # Position in the grid follows directly from the running index
        row, col = divmod(idx, columns)
        contact_sheet.paste(img, (col * cell_width, row * cell_height))

    # Halve both dimensions of the finished sheet before handing it back
    return contact_sheet.resize(
        (round(contact_sheet.width / 2), round(contact_sheet.height / 2))
    )


def process(newspaper, word):
    """
    Searches the text of a newspaper image for a word and collects the faces on a match

    The text is read with pytesseract. Faces are only looked for when the word
    is present, using the Haar cascade stored at
    readonly/haarcascade_frontalface_default.xml.
    :param newspaper: file like object related to image or path(string) to image
    :param word: word to look for, compared case-insensitively with the words of the image
    :return: tuple (found, contact_sheet) where found is True when the word
        occurs in the image text and contact_sheet is the contact sheet of the
        detected faces, or None when the word is absent or no face was detected
    """
    image = Image.open(newspaper)

    # Run detection on an RGB copy: COLOR_RGB2GRAY rejects the single-channel
    # arrays that grayscale or palette images turn into
    gray = cv.cvtColor(np.array(image.convert("RGB")), cv.COLOR_RGB2GRAY)

    # Detect text
    text = pytesseract.image_to_string(gray)

    # refine lowercases every word, so the query must be lowercased as well
    if word.lower() not in refine(text):
        return False, None

    # Detect faces
    face_cascade = cv.CascadeClassifier("readonly/haarcascade_frontalface_default.xml")
    # Positional arguments are scaleFactor=1.3 and minNeighbors=5
    face_boxes = face_cascade.detectMultiScale(gray, 1.3, 5)

    # Crop from the original image so that the faces keep its mode
    cropped_img_lst = [
        image.crop((x, y, x + w, y + h)) for x, y, w, h in face_boxes
    ]
    if not cropped_img_lst:
        # Word is present but there is nothing to put on a contact sheet
        return True, None

    return True, create_contact_sheet(make_better(cropped_img_lst))


def main():
    """
    Begins the execution of code

    Asks for the path to a zip archive of newspaper images and for a word.
    For every image whose text contains the word, the file name is printed,
    followed by the contact sheet of the faces found in it or by a note that
    the image has no faces.
    """

    path_to_zip = input("Path to zip: ")

    # Asks user to input a word to find
    word = input("Word: ").lower()

    # Opens zip file in ZipFile object
    with ZipFile(path_to_zip) as zip_newspapers:

        # Process each image in zip file
        for newspaper_name in zip_newspapers.namelist():

            # Directory entries carry no image data and cannot be opened as images
            if newspaper_name.endswith("/"):
                continue

            # Opens image in zipfile as file object
            with zip_newspapers.open(newspaper_name) as newspaper:
                # found tells whether the word is in the newspaper, contact_sheet
                # holds the faces of that newspaper (None when there are none)
                found, contact_sheet = process(newspaper, word)

            if not found:
                continue

            # Name the file first so each result can be told apart
            print("Results found in file {}".format(newspaper_name))
            if contact_sheet is None:
                # Word is found but there are no faces in image
                print("But there were no faces in that file!")
            else:
                display(contact_sheet)
                

# Calling the main function only when this code is executed directly, so that
# importing it does not start the interactive prompts
if __name__ == "__main__":
    main()