### Image data
The goal of this notebook is to detail how to interact with the images associated to the set of ads provided for the CP1 during the MEMEX Winter QPR 2017.

### Data to download
- Training images:
- Training images info:
- Test images:
- Test images info:

Plus the data available on the Wiki

### Info files
1. adjusted_images.json
2. image_url_sha1.csv
3. faces.jl
4. images_faces_stats.jl

### Outputs
1. image_documents.jl   
These are the children documents (images) related to all of the ads int the input.    
2. image_url_to_valid_sha1.csv
This is a mapping of the URLs images to their SHA1, the images in this file have been downloaded and their SHA1 computed properly.

In [4]:
import os
import csv
import json

In [8]:
# set some parameters
data_dir = "../data"
prefix = "test"
if prefix=="train":
    input_file = "train_adjusted.json"
else:
    input_file = "test_adjusted_unlabelled.json"

In [9]:
images_dir = os.path.join(data_dir,prefix+"_images")
url_sha1_file = os.path.join(data_dir,prefix+"_image_url_sha1.csv")
faces_file = os.path.join(data_dir,prefix+"_faces.jl")
stats_file = os.path.join(data_dir,prefix+"_images_faces_stats.jl")
images_file = os.path.join(data_dir,prefix+"_adjusted_images.json")

In [16]:
# parse faces_file
def parse_faces(faces_file):
    faces_dict = {}
    with open(faces_file, "rt") as faces:
        for line in faces:
            one_face_dict = json.loads(line)
            img_sha1 = one_face_dict.keys()[0]
            nb_faces = len(one_face_dict[img_sha1].keys())
            #print nb_faces
            faces_dict[img_sha1] = dict()
            faces_dict[img_sha1]['count'] = nb_faces
            faces_dict[img_sha1]['detections'] = one_face_dict[img_sha1]
    return faces_dict

In [17]:
faces_dict = parse_faces(faces_file)

In [30]:
print len(faces_dict)
i = 3
print faces_dict.keys()[i], faces_dict[faces_dict.keys()[i]]

20401
3140d44a1e4f2a84f10265fb03d0472c89681ed0 {'count': 1, 'detections': {u'3140d44a1e4f2a84f10265fb03d0472c89681ed0_160-31-227-122': {u'score': u'0.997924685478', u'bbox': u'160.391361728,31.2674056292,227.213230789,122.488013085'}}}


In [27]:
# parse images_file
def parse_images_file(images_file):
    ads_images_dict = {}
    with open(images_file, "rt") as images:
        for line in images:
            one_image_dict = json.loads(line)
            ad_id_list = one_image_dict['obj_parent']
            img_url = one_image_dict['obj_stored_url']
            if type(ad_id_list) is not list:
                ad_id_list = [ad_id_list]
            for ad_id in ad_id_list:
                if ad_id not in ads_images_dict:
                    ads_images_dict[ad_id] = [img_url]
                else:
                    ads_images_dict[ad_id].append(img_url)
    return ads_images_dict

In [28]:
ads_images_dict = parse_images_file(images_file)

In [38]:
print len(ads_images_dict)
print ads_images_dict.keys()[0],ads_images_dict[ads_images_dict.keys()[0]]

21984
28DE96FEE11CB9182D318FC40599E579C69E99D25DB233F64C98F13081DBC018 [u'https://s3.amazonaws.com/roxyimages/e6285c42db6f7332f0e842a4d10d2c1bcaca8d58.jpg', u'https://s3.amazonaws.com/roxyimages/238cb337defb4ae99e2e357d585a0053e5072fcc.jpg', u'https://s3.amazonaws.com/roxyimages/439d654bc365f36daccff602849ba7bb3a608836.jpg', u'https://s3.amazonaws.com/roxyimages/420957839e1eb4a41ec19c283130f633006cbf56.jpg', u'https://s3.amazonaws.com/roxyimages/c86cea7543c8f855dd77c0037e146aef1fd81d1a.jpg', u'https://s3.amazonaws.com/roxyimages/32ebde5b3cabb7773346d41b333c04e0d89ff4e3.jpg', u'https://s3.amazonaws.com/roxyimages/6fea3fe79e0e2eef18a09f0daf9c998e796ce7cd.jpg', u'https://s3.amazonaws.com/roxyimages/5163ef2e476107b4c7a31d22021a6e200bc8c03e.jpg', u'https://s3.amazonaws.com/roxyimages/c9af9f955efce2533ee0df843eaba666aff7cb28.jpg', u'https://s3.amazonaws.com/roxyimages/83845b26e5a25e00dd7c5c7c8894cdddc2493285.jpg']


In [33]:
# parse image_url_sha1_file
def parse_url_sha1_file(url_sha1_file):
    url_sha1_dict = {}
    with open(url_sha1_file,"rt") as img_url_sha1:
        for line in img_url_sha1:
            url, sha1 = line.split(',')
            url_sha1_dict[url] = sha1
    return url_sha1_dict

In [34]:
 url_sha1_dict = parse_url_sha1_file(url_sha1_file)

In [37]:
print len(url_sha1_dict)
print url_sha1_dict.keys()[0],url_sha1_dict[url_sha1_dict.keys()[0]]

54569
https://s3.amazonaws.com/memex-images/full/85d9a4e5f4927f22218f79f496a14757ee8b3413.jpg f912b73662a9a50e42cb5d7726beea69c4ba0b56



### Show images and faces of one ad

In [1]:
from PIL import Image

In [43]:
def get_ad_images(ad_id, ads_images_dict, url_sha1_dict):
    images_url_list = ads_images_dict[ad_id]
    images_sha1s = []
    for image_url in images_url_list:
        if image_url is None or not image_url:
            continue
        try:
            images_sha1s.append(url_sha1_dict[image_url.strip()].strip())
        except:
            print 'Cannot find sha1 for: {}.'.format(image_url)
    return images_sha1s

In [44]:
ad_id = "28DE96FEE11CB9182D318FC40599E579C69E99D25DB233F64C98F13081DBC018"
images_sha1s = get_ad_images(ad_id, ads_images_dict, url_sha1_dict)

In [45]:
print images_sha1s

['6097a3ecb7b3421562a67b04c77370a02db9b025', '400e47619ed62a93c94c47a979b9c86c89c1c272', 'c0f5bc5c6ca7c1f9350b0bd9a3c9af6785b83127', '3ddd075702acb00dc5a313320c496de4d638fac5', '02a11b6047cea0f2bd634e565d71ada6442e82a3', '0ced961073951ccc415c8e51a90060a275a6628d', 'e6d5a5a0a996d4da18ef7d150766fc8ca9cc4f8e', '402f797ff471f609144aa38019c61c658945dbbf', '08a459d7587f7053c0d843949279d80ee9cb8841', 'c3380eda734c16d4d65381067f6b49475ab899e7']


In [71]:
def get_faces_images(images_sha1s, faces_dict):
    faces_out = {}
    for sha1 in images_sha1s:
        tmp_faces = faces_dict[sha1]
        if tmp_faces['count']==0:
            faces_out[sha1] = []
            continue
        bboxes = []
        for face in tmp_faces['detections']:
            bbox = [float(x) for x in tmp_faces['detections'][face]['bbox'].split(',')]
            bbox.append(float(tmp_faces['detections'][face]['score']))
            bboxes.append(bbox)
        #print bboxes
        faces_out[sha1] = bboxes
    return faces_out

In [76]:
def draw_face_bbox(img, bboxes, width=4):
    from PIL import ImageDraw
    draw = ImageDraw.Draw(img)
    for bbox in bboxes:
        for i in range(width):
            rect_start = (int(np.round(bbox[0] + width/2 - i)), int(np.round(bbox[1] + width/2 - i)))
            rect_end = (int(np.round(bbox[2] - width/2 + i)), int(np.round(bbox[3] - width/2 + i)))
            draw.rectangle((rect_start, rect_end), outline=(0, 255, 0))
        # print score?
        if len(bbox)==5:
            draw.text((np.round((bbox[0]+bbox[2])/2),np.round(bbox[1])), str(bbox[4]), fill=(255,255,255,128))


In [77]:
def open_image(sha1, images_dir):
    from PIL import Image
    img = Image.open(os.path.join(images_dir, sha1[:3], sha1))
    return img

In [79]:
faces = get_faces_images(images_sha1s, faces_dict)
print faces

{'c3380eda734c16d4d65381067f6b49475ab899e7': [], '400e47619ed62a93c94c47a979b9c86c89c1c272': [], '6097a3ecb7b3421562a67b04c77370a02db9b025': [[347.294961773, 155.247852325, 498.546011746, 370.930160522, 0.844537436962]], '02a11b6047cea0f2bd634e565d71ada6442e82a3': [], 'c0f5bc5c6ca7c1f9350b0bd9a3c9af6785b83127': [], '402f797ff471f609144aa38019c61c658945dbbf': [], '0ced961073951ccc415c8e51a90060a275a6628d': [], '08a459d7587f7053c0d843949279d80ee9cb8841': [], 'e6d5a5a0a996d4da18ef7d150766fc8ca9cc4f8e': [[91.988831386, 463.030171201, 388.110469349, 783.963114321, 0.711548626423]], '3ddd075702acb00dc5a313320c496de4d638fac5': []}


In [81]:
for face in faces:
    if faces[face]:
        img = open_image(face, images_dir)
        draw_face_bbox(img, faces[face])
        img.show()

IOError: [Errno 2] No such file or directory: '../data/test_images/609/6097a3ecb7b3421562a67b04c77370a02db9b025'