## Data Preparation   
The goal of this notebook is to take the inputs described below and generate a single CSV with only the information required to perform image-level processing and ad/cluster level aggregating, while performing basic sanity checks. 

### Inputs
1. CP1_train_ads_labelled_fall2016.jsonl   
This is a json lines file of ads that contain _id, class, cluster_id
2. es_child_documents.jl   
This is a json lines file of image objects which contain obj_stored_url and obj_parent
3. image_url_to_valid_sha1.csv    
This is a csv containing 2 columns: an image url (obj_stored_url), and a sha1 checksum of the file

### Outputs
1. map.cp1_data.csv    
This is a csv file containing 4 columns: cluster_id, ad_id, image_sha, class

In [None]:
# Input files this notebook reads (same paths as the "Input file paths"
# constants defined in the next cell).
# NOTE(review): __depends__/__dest__ look like dependency declarations for a
# notebook workflow/build tool -- confirm which tool, if any, consumes them.
__depends__ = ['CP1_train_ads_labelled_fall2016.jsonl',
               'es_child_documents.jl',
               'image_url_to_valid_sha1.csv']
# Files this notebook writes: the pickled lookup maps (raw and "cleaned")
# followed by the final association CSV.
__dest__ = ['map.clusterId_to_adId.pickle',
            'map.adId_to_clusterId.pickle',
            'map.clusterId_to_class.pickle',
            'map.class_to_clusterId.pickle',
            'map.adId_to_sha1s.pickle',
            'map.sha1_to_adIds.pickle',
            'map.cleaned.clusterId_to_adId.pickle',
            'map.cleaned.adId_to_clusterId.pickle',
            'map.cleaned.adId_to_sha1s.pickle',
            'map.cleaned.sha1_to_adIds.pickle',
            'map.cp1_data.csv']

In [None]:
# Input file paths (all relative to the notebook's working directory)
# JSON-lines file of labelled ads; each record is read for _id, cluster_id, class
OFFICIAL_DATA_FILE = 'CP1_train_ads_labelled_fall2016.jsonl'
# JSON-lines file of image documents; each record is read for obj_parent, obj_stored_url
ES_QUERY_RESULTS = 'es_child_documents.jl'
# Two-column CSV: image URL, SHA1 of the image file
IMAGE_URL_SHA1_MAP = 'image_url_to_valid_sha1.csv'
# Mapping cache files (pickled dicts of sets). When they already exist the
# notebook loads them instead of re-parsing the input files.
CLUSTER_ID_TO_AD_ID = 'map.clusterId_to_adId.pickle'
AD_ID_TO_CLUSTER_ID = 'map.adId_to_clusterId.pickle'
CLUSTER_ID_TO_CLASS = 'map.clusterId_to_class.pickle'
CLASS_TO_CLUSTER_ID = 'map.class_to_clusterId.pickle'
AD_ID_TO_SHA1S = 'map.adId_to_sha1s.pickle'
SHA1_TO_AD_IDS = 'map.sha1_to_adIds.pickle'
# These have been stripped of image SHA1s that were labeled as both positive and negative
CLUSTER_ID_TO_AD_ID_CLEANED = 'map.cleaned.clusterId_to_adId.pickle'
AD_ID_TO_CLUSTER_ID_CLEANED = 'map.cleaned.adId_to_clusterId.pickle'
AD_ID_TO_SHA1S_CLEANED = 'map.cleaned.adId_to_sha1s.pickle'
SHA1_TO_AD_IDS_CLEANED = 'map.cleaned.sha1_to_adIds.pickle'
# Full association mapping CSV file (rows: cluster_id, ad_id, image_sha, class)
CP1_DATA_FILE = 'map.cp1_data.csv'

In [None]:
import copy
import cPickle as pickle
import csv
import json
import numpy as np
import os

from collections import Counter, defaultdict

### Sanity checking the official data

Assumptions:   
1) The relationship between ad_id and cluster_id is many -> 1    
2) The relationship between cluster_id and class is 1 -> 1

Abbreviate the data to what we need: ad ids, cluster ids, and classes.

In [None]:
cluster_id_to_ad_ids = defaultdict(set)
ad_id_to_cluster_ids = defaultdict(set)

cluster_id_to_class = defaultdict(set)
class_to_cluster_id = defaultdict(set)

if (os.path.isfile(CLUSTER_ID_TO_AD_ID) and 
        os.path.isfile(AD_ID_TO_CLUSTER_ID) and
        os.path.isfile(CLUSTER_ID_TO_CLASS) and
        os.path.isfile(CLASS_TO_CLUSTER_ID)):
    print "Loading existing intermediate results"
    with open(CLUSTER_ID_TO_AD_ID) as f:
        cluster_id_to_ad_ids = pickle.load(f)
    with open(AD_ID_TO_CLUSTER_ID) as f:
        ad_id_to_cluster_ids = pickle.load(f)
    with open(CLUSTER_ID_TO_CLASS) as f:
        cluster_id_to_class = pickle.load(f)
    with open(CLASS_TO_CLUSTER_ID) as f:
        class_to_cluster_id = pickle.load(f)
else:
    print "Extracting relevant info from JSONL"
    with open(OFFICIAL_DATA_FILE) as infile:
        for line in infile:
            document = json.loads(line.strip())

            cluster_id_to_ad_ids[document['cluster_id']].add(document['_id'])
            ad_id_to_cluster_ids[document['_id']].add(document['cluster_id'])
            cluster_id_to_class[document['cluster_id']].add(document['class'])
            class_to_cluster_id[document['class']].add(document['cluster_id'])
            
    print "Saving intermediate results"
    with open(CLUSTER_ID_TO_AD_ID, 'wb') as f:
        pickle.dump(cluster_id_to_ad_ids, f, -1)
    with open(AD_ID_TO_CLUSTER_ID, 'wb') as f:
        pickle.dump(ad_id_to_cluster_ids, f, -1)
    with open(CLUSTER_ID_TO_CLASS, 'wb') as f:
        pickle.dump(cluster_id_to_class, f, -1)
    with open(CLASS_TO_CLUSTER_ID, 'wb') as f:
        pickle.dump(class_to_cluster_id, f, -1)

In [None]:
# Sanity check that each cluster has at least one ad
for cluster_id, ad_ids in cluster_id_to_ad_ids.iteritems():
    assert len(ad_ids) > 0, "Cluster with no ads: %s" % (cluster_id,)

# Sanity check no ad falls in more than one cluster (assumption 1).
# Concatenate the per-cluster ad sets and compare against the number of
# distinct ad ids: the two only match when no ad id appears in two clusters.
# (Summing the per-cluster set sizes instead would always equal the length of
# the concatenated list, so that comparison could never fail.)
all_ad_ids = []
for ad_ids in cluster_id_to_ad_ids.itervalues():
    all_ad_ids.extend(ad_ids)
num_unique_ad_ids = len(set(all_ad_ids))

assert len(all_ad_ids) == num_unique_ad_ids, \
    "%d ad(s) appear in more than one cluster" % (len(all_ad_ids) - num_unique_ad_ids)

# Sanity check that each cluster only belongs to one class (assumption 2)
for cluster_id, cls in cluster_id_to_class.iteritems():
    assert len(cls) == 1, "Cluster %s has classes %r" % (cluster_id, sorted(cls))

### Official data descriptions

In [None]:
print '%d clusters (%d positive, %d negative)' % (len(cluster_id_to_class),
                                                  len([x for x in cluster_id_to_class.values() if x == {1}]),
                                                  len([x for x in cluster_id_to_class.values() if x == {0}]))

In [None]:
ads_per_positive_cluster = [len(ad_ids) for cid, ad_ids in cluster_id_to_ad_ids.iteritems() \
                            if cluster_id_to_class[cid] == {1}]
print 'min/med/avg/max/total ads per positive cluster: %d/%d/%d/%d/%d' % (min(ads_per_positive_cluster),
                                                                          np.median(ads_per_positive_cluster),
                                                                          np.average(ads_per_positive_cluster),
                                                                          max(ads_per_positive_cluster),
                                                                          sum(ads_per_positive_cluster))

In [None]:
ads_per_negative_cluster = [len(ad_ids) for cid, ad_ids in cluster_id_to_ad_ids.iteritems() \
                            if cluster_id_to_class[cid] == {0}]
print 'min/med/avg/max/total ads per negative cluster: %d/%d/%d/%d/%d' % (min(ads_per_negative_cluster),
                                                                          np.median(ads_per_negative_cluster),
                                                                          np.average(ads_per_negative_cluster),
                                                                          max(ads_per_negative_cluster),
                                                                          sum(ads_per_negative_cluster))

### Associating Imagery

The SHA1s listed in image_url_to_valid_sha1.csv have already been vetted by SMQTK.

In [None]:
if os.path.isfile(AD_ID_TO_SHA1S) and os.path.isfile(SHA1_TO_AD_IDS):
    print "Loading cached maps"
    ad_id_to_shas = pickle.load(open(AD_ID_TO_SHA1S))
    sha_to_ad_ids = pickle.load(open(SHA1_TO_AD_IDS))
else:
    ad_id_to_shas = defaultdict(set)
    sha_to_ad_ids = defaultdict(set)
    ad_id_to_image_urls = defaultdict(set)
    image_url_to_ad_ids = defaultdict(set)

    print 'Read adId->imageSha relationships from ElasticSearch query results'
    with open(ES_QUERY_RESULTS) as infile:
        for line in infile:
            document = json.loads(line.strip())

            if isinstance(document['obj_parent'], list):
                ad_ids = document['obj_parent']
            else:
                ad_ids = [document['obj_parent']]

            for ad_id in ad_ids:
                if document['obj_stored_url']:
                    ad_id_to_image_urls[ad_id].add(document['obj_stored_url'])
                    image_url_to_ad_ids[document['obj_stored_url']].add(ad_id)

    print 'Read in [URL,SHA1] pairs for computable images'
    image_url_to_sha = {}
    image_sha_to_url = {}
    with open(IMAGE_URL_SHA1_MAP) as infile:
        for (image_url, sha1) in csv.reader(infile):
            image_url_to_sha[image_url] = sha1
            image_sha_to_url[sha1] = image_url

    print 'Construct map of ad ID to image SHA1s in that ad'
    for (ad_id, image_urls) in ad_id_to_image_urls.iteritems():
        try:
            ad_image_shas = set([image_url_to_sha[url] for url in image_urls])
            ad_id_to_shas[ad_id] = ad_image_shas
            for sha in ad_image_shas:
                sha_to_ad_ids[sha].add(ad_id)
        except KeyError:
            # There might not be a sha1 for the image url since some shas were invalid (from SMQTK) 
            pass
    
    print 'Save mappings'
    with open(AD_ID_TO_SHA1S, 'wb') as f:
        pickle.dump(ad_id_to_shas, f, -1)
    with open(SHA1_TO_AD_IDS, 'wb') as f:
        pickle.dump(sha_to_ad_ids, f, -1)
        
    del ad_id_to_image_urls, image_url_to_ad_ids

In [None]:
# Sanity check: every ad in the map is associated with one or more SHA1s
assert all(len(shas) > 0 for shas in ad_id_to_shas.values())

In [None]:
shas_per_ad = map(len, ad_id_to_shas.values())
print 'min/med/avg/max/total images per ad: %d/%d/%d/%d/%d' % (min(shas_per_ad),
                                                               np.median(shas_per_ad),
                                                               np.average(shas_per_ad),
                                                               max(shas_per_ad),
                                                               sum(shas_per_ad))

In [None]:
ad_per_sha = map(len, ad_id_to_shas.values())
print 'min/med/avg/max/total ads per SHA1: %d/%d/%d/%d/%d' % (min(ad_per_sha),
                                                               np.median(ad_per_sha),
                                                               np.average(ad_per_sha),
                                                               max(ad_per_sha),
                                                               sum(ad_per_sha))

In [None]:
# Sanity check: All ads have at least one image.
# The condition goes in the assert itself and the ad id in the message;
# asserting a non-empty string literal is always true and checks nothing.
for ad, shas in ad_id_to_shas.iteritems():
    assert len(shas) > 0, "Ad with no images: %s" % (ad,)

In [None]:
# Find shas that are marked positive and negative
sha_to_class = defaultdict(set)
        
for sha, parent_ads in sha_to_ad_ids.iteritems():
    sha_classes = set()
    for ad_id in parent_ads:
        for cluster_id in ad_id_to_cluster_ids[ad_id]:
            sha_classes.update(cluster_id_to_class[cluster_id])
    sha_to_class[sha] = sha_classes

bad_shas = set([sha for sha, classes in sha_to_class.iteritems() if len(classes) > 1])

print len(bad_shas)

In [None]:
print "copy original mappings"
cluster_id_to_ad_ids_cleaned = pickle.load(open(CLUSTER_ID_TO_AD_ID))
ad_id_to_cluster_ids_cleaned = pickle.load(open(AD_ID_TO_CLUSTER_ID))
ad_id_to_shas_cleaned = pickle.load(open(AD_ID_TO_SHA1S))
sha_to_ad_ids_cleaned = pickle.load(open(SHA1_TO_AD_IDS))

In [None]:
empty_ads = set()
empty_clusters = set()

print "Clearing SHA1 from referent ads"
for bad_sha in bad_shas:
    parent_ads = sha_to_ad_ids_cleaned[bad_sha]
    for p_ad in parent_ads:
        ad_id_to_shas_cleaned[p_ad].remove(bad_sha)
        # check for empty adIds
        if len(ad_id_to_shas_cleaned[p_ad]) == 0:
            empty_ads.add(p_ad)
            del ad_id_to_shas_cleaned[p_ad]
    del sha_to_ad_ids_cleaned[bad_sha]
    
print "Clearing empty ads from referent clusters"
for e_ad in empty_ads:
    parent_clusters = ad_id_to_cluster_ids_cleaned[e_ad]
    for p_cluster in parent_clusters:
        cluster_id_to_ad_ids_cleaned[p_cluster].remove(e_ad)
        # Check for empty clusters
        if len(cluster_id_to_ad_ids_cleaned) == 0:
            empty_clusters.add(p_cluster)
            del cluster_id_to_ad_ids_cleaned[p_cluster]
    del ad_id_to_cluster_ids_cleaned[e_ad]

print
print "New empty ads:", len(empty_ads)
print "New empty clusters:", len(empty_clusters)
print
print 'original ads:', len(ad_id_to_shas), len(ad_id_to_cluster_ids)
print 'cleaned ads :', len(ad_id_to_shas_cleaned), len(ad_id_to_cluster_ids_cleaned)
print
print 'original clusters:', len(cluster_id_to_ad_ids)
print 'cleaned clusters :', len(cluster_id_to_ad_ids_cleaned)
    
with open(AD_ID_TO_SHA1S_CLEANED, 'wb') as f:
    pickle.dump(ad_id_to_shas_cleaned, f, -1)
with open(SHA1_TO_AD_IDS_CLEANED, 'wb') as f:
    pickle.dump(sha_to_ad_ids_cleaned, f, -1)
with open(CLUSTER_ID_TO_AD_ID_CLEANED, 'wb') as f:
    pickle.dump(cluster_id_to_ad_ids_cleaned, f, -1)
with open(AD_ID_TO_CLUSTER_ID_CLEANED, 'wb') as f:
    pickle.dump(ad_id_to_cluster_ids_cleaned, f, -1)

Create one CSV with all the relevant information, in the format of:    
cluster_id, ad_id, image_sha, class

In [None]:
# Flatten the cleaned cluster -> ad -> sha hierarchy into one CSV row per
# (cluster, ad, image) triple, tagged with the cluster's single class label.
with open(CP1_DATA_FILE, 'w') as outfile:
    writer = csv.writer(outfile, lineterminator='\n')
    writer.writerows(
        [cid, ad, sha, list(cluster_id_to_class[cid])[0]]
        for (cid, ads) in cluster_id_to_ad_ids_cleaned.iteritems()
        for ad in ads
        for sha in ad_id_to_shas_cleaned[ad]
    )

In [None]:
# Print clusters ordered by number of images
clusters = defaultdict(set)

with open(CP1_DATA_FILE) as infile:
    for (cid, ad_id, sha, cls) in csv.reader(infile):
        clusters[cid].add(sha)

clusters_by_size = sorted(clusters.items(), key=lambda x: len(x[1]), reverse=True)

for (cluster_id, shas) in clusters_by_size:
    print '%s %s %d' % (cluster_id, cluster_id_to_class[cluster_id], len(shas))