In [18]:
from __future__ import print_function

import zipfile
import os
import csv
import sys
import imagehash
from io import BytesIO
from PIL import Image

def create_image_hashes(zipfilename, csvfilename):
    """Hash every ``.jpg`` entry of a zip archive and save the hashes as CSV.

    Each archive entry whose name ends in ``.jpg`` (case-insensitive) is
    opened with PIL and hashed with ``imagehash.average_hash``.  Entries that
    cannot be read or decoded are reported on stdout and skipped.  The hashes
    are then written to ``csvfilename``, one ``key,hash`` row per image, where
    ``key`` is the entry's base name without its extension.

    Note that keys are base names only, so entries that share a base name in
    different directories of the archive overwrite each other.

    Args:
        zipfilename: Path of the zip archive to read.
        csvfilename: Path of the CSV file to write (truncated if it exists).

    Returns:
        None.
    """
    map_hashes = {}

    # The context manager guarantees the archive handle is released, even when
    # the loop is interrupted or an unexpected error propagates.
    with zipfile.ZipFile(zipfilename, 'r') as archive:
        for filename in archive.namelist():
            if not filename.lower().endswith(".jpg"):
                continue

            try:
                # Reading is inside the try block too: a corrupt entry should
                # be skipped like an undecodable image, not abort the archive.
                imgdata = archive.read(filename)
                im = Image.open(BytesIO(imgdata))
                img_hash = imagehash.average_hash(im)
            except Exception as e:
                # Catch Exception rather than BaseException so that
                # KeyboardInterrupt / SystemExit still stop a long run.
                print('Cannot process image: %s' % filename, e)
                continue

            img_key = os.path.basename(os.path.splitext(filename)[0])
            map_hashes[img_key] = img_hash

    print('From %s %d images loaded' % (zipfilename, len(map_hashes)))

    with open(csvfilename, 'w+') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='"')

        # csv.writer stringifies each hash object when writing the row.
        for key, img_hash in map_hashes.items():
            writer.writerow([key, img_hash])

In [19]:
# Hash each of the ten image archives (indices 0-9) into its own CSV file.
# Names are kept at module level because notebook cells share kernel state.
for i in range(10):
    zipfilename = ''.join(['data/Images_', str(i), '.zip'])
    csvfilename = ''.join(['processed_data/hashes', str(i), '.csv'])
    print('Processing file %s' % zipfilename)
    create_image_hashes(zipfilename, csvfilename)

Processing file data/Images_0.zip
Cannot process image: Images_0/6/9268406.jpg cannot identify image file <_io.BytesIO object at 0x7fef1170b348>
From Images_0/9/9999909.jpg 1082012 images loaded
Processing file data/Images_1.zip
Cannot process image: Images_1/13/4515613.jpg cannot identify image file <_io.BytesIO object at 0x7fef41621dc8>
Cannot process image: Images_1/14/9322814.jpg cannot identify image file <_io.BytesIO object at 0x7fef41621dc8>
From Images_1/19/9999919.jpg 1082331 images loaded
Processing file data/Images_2.zip
From Images_2/29/9999829.jpg 1082285 images loaded
Processing file data/Images_3.zip
Cannot process image: Images_3/36/10684636.jpg cannot identify image file <_io.BytesIO object at 0x7fef31761048>
From Images_3/39/9999939.jpg 1082022 images loaded
Processing file data/Images_4.zip
Cannot process image: Images_4/41/12953041.jpg cannot identify image file <_io.BytesIO object at 0x7fef3d2bedc8>
Cannot process image: Images_4/41/13717141.jpg cannot identify ima

## Example of usage

In [28]:
from PIL import Image
import imagehash
# Hash two sample images and compare them.
# The first result is deliberately not bound to the name `hash`: doing so
# would shadow the built-in hash() for every later cell run in this kernel.
test_hash = imagehash.average_hash(Image.open('test.png'))
print(test_hash)
otherhash = imagehash.average_hash(Image.open('other.png'))
print(otherhash)
print(test_hash == otherhash)
# Round-trip the first hash through its hex string form, then subtract the
# second hash; per the imagehash docs the difference is the number of
# differing bits between the two hashes.
print(imagehash.hex_to_hash(str(test_hash)) - otherhash)

e7e7ff21b1b131ff
ff00ff0080ff00ff
False
20
