# Generate cloud computer vision logs

**Input: Dataframe of embeddings that has an 'id' column**

**Output: Log of a specific size with a specific number of unique faces**

----------
misc
- Used a Zipfian distribution to produce logs of various sizes from the encodings
- There are repeated encodings in the logs
- Each experiment was carried out 5x and the results averaged
- We generate log files (usage traces) where each entry
includes a vector encoding representing a face; i.e. the
logs resemble those generated as a byproduct of using
an AIaaS face service. Encodings are computed using
the TensorFlow implementation of FaceNet (Sandberg 2020).

-  Two types of log are generated: fixed trace uses
a specified number (5000) of different faces for each version of the log; while varied trace uses a Zipfian distribution to select a varied number of different faces for
each log size. Some encodings in the log are repeated to
generate the required number of entries.

In [1]:
import pandas as pd
import numpy as np

import random

In [2]:
def get_data(datasets, data_path='../data/'):
    """
        Load several embedding datasets and merge them into one shuffled dataframe.

        For every dataset name two files are read from data_path:
          <name>_embeddings.npz  the array stored under key 'arr_0', one row per image
          <name>-names.txt       whitespace-separated ids, assigned row by row to 'id'

        datasets:  iterable of dataset names
        data_path: directory that holds the files (default is the previously
                   hard-coded '../data/')

        returns: dataframe with one column per embedding dimension plus an 'id'
                 column; rows are shuffled and re-indexed from 0
    """
    import os

    # Return ids of images in a dataset as a list, in file order
    def read_ids(filepath):
        with open(filepath, 'r') as file:
            return file.read().split()

    data_dfs = []
    for name in datasets:
        embeddings_file = os.path.join(data_path, '%s_embeddings.npz' % name)
        names_file = os.path.join(data_path, '%s-names.txt' % name)

        # np.load on an .npz keeps the archive open until it is closed, so
        # read the array inside a context manager
        with np.load(embeddings_file) as archive:
            data_df = pd.DataFrame(archive['arr_0'])
        data_df['id'] = read_ids(names_file)

        data_dfs.append(data_df)

    all_data_df = pd.concat(data_dfs)
    # Shuffle so images of one dataset / person are not contiguous
    all_data_df = all_data_df.sample(frac=1).reset_index(drop=True)
    print('Dataset contains %s images of %s different people \n' % (len(all_data_df), all_data_df['id'].nunique()))

    return all_data_df

In [11]:
# Return the rows of df that belong to n randomly chosen ids
def get_subset_n_unique_ids(df, n):
    """
        df: dataframe with an 'id' column
        n:  number of distinct ids to keep

        returns: every row of df whose 'id' is one of n ids drawn at random
                 (without replacement) from the distinct ids in df
    """
    distinct_ids = pd.Series(df['id'].unique())
    chosen_ids = distinct_ids.sample(n)
    keep_mask = df['id'].isin(chosen_ids)
    return df[keep_mask]

# Return log of right size
def resize_and_maintain_distribution(df, n, exact):
    """
        df: dataframe with id column whose size we want to change
        n:  specified log length
        exact: keep exactly the ids present in df; if false, simply resample n
               rows with replacement (much faster, but ids may go missing)

        exact mode:
        if      len(df) > n: one pass per round over the ids (in order of first
                             appearance), removing a random image from each id that
                             still has more than one, until len == n
        else if len(df) < n: one pass per round over the ids, repeating a random
                             image of each id, until len == n
        else    log already of correct length, returned unchanged

        returns: dataframe of length n. The grown log is re-indexed from 0; the
                 shrunk log keeps the index labels and order of the surviving rows.
        raises:  ValueError in exact mode when n is smaller than the number of
                 unique ids (each id must keep at least one row) or when an empty
                 df has to be grown
    """
    # Save a LOT of time
    if not exact:
        return df.sample(n=n, replace=True)

    current_length = len(df)
    if current_length == n:
        return df

    # Row positions per id, keyed in order of first appearance. Positions (not
    # index labels) are used so duplicated index labels cannot affect other rows.
    pools = {}
    for position, person_id in enumerate(df['id'].tolist()):
        pools.setdefault(person_id, []).append(position)

    if current_length < n:
        return _repeat_rows_round_robin(df, pools, n - current_length)
    return _drop_rows_round_robin(df, pools, current_length - n)


# Append n_extra repeated rows to df, cycling over the ids one row at a time
def _repeat_rows_round_robin(df, pools, n_extra):
    if not pools:
        raise ValueError('cannot grow an empty log to the requested length')

    repeated = []
    while len(repeated) < n_extra:
        for pool in pools.values():
            # Putting the pick back into the pool means later picks choose among
            # the id's current rows, repeats included
            pick = pool[np.random.randint(len(pool))]
            pool.append(pick)
            repeated.append(pick)

            # Stop mid-pass once the desired log length is reached
            if len(repeated) == n_extra:
                break

    # Single concat of real rows keeps the column dtypes intact
    return pd.concat([df, df.iloc[repeated]], ignore_index=True)


# Remove n_surplus rows from df, cycling over the ids and never emptying an id
def _drop_rows_round_robin(df, pools, n_surplus):
    if len(df) - n_surplus < len(pools):
        raise ValueError('log length %d is smaller than the %d unique ids that must be kept'
                         % (len(df) - n_surplus, len(pools)))

    keep = np.ones(len(df), dtype=bool)
    dropped = 0
    while dropped < n_surplus:
        for pool in pools.values():
            # Ids with a single remaining row are skipped
            if len(pool) > 1:
                keep[pool.pop(np.random.randint(len(pool)))] = False
                dropped += 1

                # Stop mid-pass once the desired log length is reached
                if dropped == n_surplus:
                    break

    return df[keep]

def generate_log(df, log_length, n_faces, exact=False):
    """
        df: dataframe where each column is a dimension in the embedding space,
            each row a different image and the final column contains the id of the person in the image
        log_length:    control the length of the log
        n_faces:       control the number of unique individuals in each log
        exact:         passed on to resize_and_maintain_distribution; when true the
                       log must contain exactly n_faces individuals, when false the
                       log is resampled and may contain fewer

        returns: dataframe with the same structure as df, may have some repeated images
        raises:  AssertionError if the log length is wrong, or (exact only) if the
                 number of unique faces differs from n_faces
    """
    # Pick n_faces random individuals and keep every image of them
    log = get_subset_n_unique_ids(df, n=n_faces)

    # Resize the log, maintaining the distribution of images of each person
    log = resize_and_maintain_distribution(log, n=log_length, exact=exact)

    # Checks
    assert len(log) == log_length, 'log length incorrect %d != %d' % (len(log), log_length)

    n_unique = log['id'].nunique()
    if exact:
        assert n_unique == n_faces, 'Incorrect number of unique faces %d != %d' % (n_unique, n_faces)
    elif n_unique != n_faces:
        # Only worth a warning when resampling actually lost some individuals
        print("WARNING: Given n_people=%d, we have n_people=%d" % (n_faces, n_unique))

    return log

In [12]:
def generate_biased_log(df, log_length, proportion):
    """
        df: original df youre sampling from
        log_length: specified log length
        proportion: approximately make this proportion of the log be one person 0.1=10%

        returns: dataframe of exactly log_length rows; the first
                 int(log_length * proportion) rows are resampled images of one
                 person (the victim), the rest are resampled from the whole of df
                 (so they may include further images of the victim)
    """
    # pick from top people (the 25 ids with the most images)
    top_ids = df['id'].value_counts().keys()[:25].to_list()
    victim_id = random.choice(top_ids)

    # Resample the victim images until you have a df of size == log_length * proportion
    n_victim = int(log_length * proportion)
    victim_images = df[df['id'] == victim_id].sample(n=n_victim, replace=True)

    # Extract remainder of the log normally. The count is taken by subtraction:
    # truncating both parts independently can leave the log one row short
    # (e.g. 15 * 0.1 -> 1 and 15 * 0.9 -> 13).
    n_others = int(log_length) - n_victim
    non_victim_images = df.sample(n=n_others, replace=True)

    log = pd.concat([victim_images, non_victim_images])

    print('Log of length %d with %d unique people and %d images of one person' % (len(log), log['id'].nunique(),
                                                                                  len(log[log['id'] == victim_id])))
    return log

In [47]:
def generate_log_equal_proportion_each_person(df, log_length, proportion):
    """
        df: original df youre sampling from
        log_length: specified log length
        proportion: approximately make this proportion of the log for each person 0.1=10%

        int(1/proportion) people are drawn, one at a time, from the 40 ids with
        the most images. Draws are independent, so the same person can be picked
        more than once and then takes a correspondingly larger share of the log.

        returns: dataframe of log_length rows
    """
    # Loop invariants: the candidate people and the number of images per draw
    candidate_ids = df['id'].value_counts().keys()[:40].to_list()
    images_per_person = int(log_length * proportion)

    log_dfs = []
    for _ in range(int(1/proportion)):
        # Resample the one persons images until you have a df of size == log_length * proportion
        person_id = random.choice(candidate_ids)
        person_images = df[df['id'] == person_id].sample(n=images_per_person, replace=True)
        log_dfs.append(person_images)

    log = pd.concat(log_dfs)

    # Get correct log length (resamples the whole log with replacement)
    log = resize_and_maintain_distribution(log, log_length, exact=False)

    id_counts = log['id'].value_counts()
    print('Log of length %d with %d unique people and approx %d images of each person %s' % (len(log), len(id_counts.keys()),
                                                                                  images_per_person, id_counts.values.tolist()))
    return log

In [1]:
import networkx as nx
import math

from chinese_whispers import chinese_whispers, aggregate_clusters

def face_distance(face_encodings, face_to_compare):
    """
    Given a list of face encodings, score each one against a known face encoding using the dot product.
    Despite the function name this is a similarity, not a euclidean distance: callers in this file
    treat a larger value as a closer match. (If the encodings are L2-normalised -- not checked
    here -- the value is the cosine similarity.)
    :param face_encodings: List of face encodings to compare, one encoding per row
    :param face_to_compare: A face encoding to compare against
    :return: A numpy ndarray with the dot product for each face in the same order as 'face_encodings'
    """
    import numpy as np
    if len(face_encodings) == 0:
        return np.empty((0))

    # Coerce both sides so lists/tuples of encodings behave like a 2-D array
    face_encodings = np.asarray(face_encodings)
    face_to_compare = np.asarray(face_to_compare)
    # Row-wise dot product: multiply element-wise, then sum over the embedding axis
    return np.sum(face_encodings * face_to_compare, axis=1)

def get_encoding_graph(encoding_list, threshold=0.75):
    """ Build the similarity graph used as input for Chinese Whispers clustering
    Modified from Alex Loveless' implementation,
    http://alexloveless.co.uk/data/chinese-whispers-graph-clustering-in-python/
    Inputs:
        encoding_list: an iterable of (image_path, encoding) pairs
        threshold: an edge is added between two faces when their face_distance
            score is strictly greater than this value, default 0.75
    Outputs:
        G: a networkx Graph with one node per encoding, numbered from 1 in input
            order; every node carries 'cluster' and 'path' attributes, both set to
            its image path; every edge carries the score as 'weight'.
            If fewer than two encodings are given, a message is printed and an
            empty list is returned instead of a graph.
    """
    encoding_list = list(encoding_list)

    # Checked before unpacking: zip(*[]) yields nothing and the unpack would raise
    if len(encoding_list) <= 1:
        print ("No enough encodings to cluster!")
        return []

    import networkx as nx

    image_paths, encodings = zip(*encoding_list)

    # Initialize 'cluster' to unique value (cluster of itself)
    nodes = [(position + 1, {'cluster': path, 'path': path})
             for position, path in enumerate(image_paths)]

    edges = []
    # The last encoding has nothing after it to compare with, so it is skipped
    for idx, face_encoding_to_check in enumerate(encodings[:-1]):
        node_id = idx + 1

        # Compare only against later encodings so each pair is scored once
        scores = face_distance(encodings[idx+1:], face_encoding_to_check)

        # Add edge if facial match; offset 0 is the encoding at idx+1, i.e. node idx+2
        edges.extend((node_id, idx + offset + 2, {'weight': score})
                     for offset, score in enumerate(scores)
                     if score > threshold)

    G = nx.Graph()
    G.add_nodes_from(nodes)
    G.add_edges_from(edges)

    return G

def do_chinese_whispers(G, weighting='top', iterations=20):
    """
    Run the chinese_whispers clustering on graph G and return its aggregated clusters.
    :param G: graph to cluster, e.g. the one built by get_encoding_graph
    :param weighting: forwarded to chinese_whispers (default 'top', the value used previously)
    :param iterations: forwarded to chinese_whispers (default 20, the value used previously)
    :return: whatever aggregate_clusters(G) returns after the run
        (presumably a mapping of cluster label to nodes -- confirm against the package docs)
    """
    # NOTE(review): chinese_whispers appears to label G in place, since its return
    # value is not used here -- confirm against the package docs
    chinese_whispers(G, weighting=weighting, iterations=iterations)

    return aggregate_clusters(G)

In [3]:
def estimation_error(n_clusters, n_people):
    """
        n_clusters: number of clusters found
        n_people:   number of people the clusters are compared against

        returns: absolute difference between the two, as a percentage of n_people
    """
    absolute_gap = abs(n_clusters - n_people)
    relative_gap = absolute_gap / n_people
    return relative_gap * 100