# Generate cloud computer vision logs

**Input: Dataframe of embeddings that has an 'id' column**

**Output: Log of a specific size with a specific number of unique faces**

----------
misc
- Used a Zipfian distribution to produce logs of various sizes from the encodings
- There are repeated encodings in the logs
- Each experiment was carried out 5x and the results averaged
- We generate log files (usage traces) where each entry
includes a vector encoding representing a face; i.e. the
logs represent those that would be generated as a byproduct
of using an AIaaS face service. Encodings are computed using
the TensorFlow implementation of FaceNet (Sandberg 2020).

- Two types of log are generated: the fixed trace uses
a specified number (5000) of different faces for each version of the log, while the varied trace uses a Zipfian distribution to select a varied number of different faces for
each log size. In both cases, some encodings in the log are repeated to
generate the required number of entries.

In [39]:
import pandas as pd
import numpy as np

In [49]:
# Return a subset of df where 'id' has exactly n unique values
def get_subset_n_unique_ids(df, n):
    """
        df: dataframe with an 'id' column
        n:  number of distinct ids to keep

        Randomly pick n of the distinct values in df['id'] and keep every row
        whose id is one of them.

        returns: the rows of df (all columns) belonging to the n chosen ids
    """
    distinct_ids = pd.Series(df['id'].unique())
    # Series.sample draws without replacement by default, so the n picks are distinct
    chosen_ids = distinct_ids.sample(n)
    belongs_to_chosen = df['id'].isin(chosen_ids)
    return df[belongs_to_chosen]

# Return log of right size
def resize_and_maintain_distribution(df, n):
    """
        df: dataframe with an 'id' column whose length we want to reduce
        n:  specified log length

        Shrink the log to exactly n rows. Ids are visited in order of first
        appearance; on each visit one randomly chosen row of that id is removed,
        provided the id still has more than one row. Passes over the ids repeat
        until the log length equals n, so every id keeps at least one row.

        returns: df itself when len(df) <= n, otherwise a new dataframe with
                 exactly n rows (row order and index labels are preserved)
        raises:  ValueError when n cannot be reached because every id is already
                 down to a single row
    """
    if len(df) <= n:
        return df

    # Track rows by position rather than by index label: dropping by label would
    # remove several rows at once if the index contains duplicates.
    id_column = df['id']
    rows_per_id = [
        np.flatnonzero((id_column == unique_id).to_numpy()).tolist()
        for unique_id in id_column.unique()
    ]
    keep = np.ones(len(df), dtype=bool)
    remaining = len(df)

    while remaining > n:
        removed_this_pass = False
        for rows in rows_per_id:
            # Never remove the last row of an id, so the set of ids is unchanged
            if len(rows) > 1:
                victim = rows.pop(np.random.randint(len(rows)))
                keep[victim] = False
                remaining -= 1
                removed_this_pass = True

                # Stop mid-pass as soon as the desired log length is reached
                if remaining == n:
                    break

        # A full pass that removed nothing means no id has a spare row left;
        # without this check the loop would spin forever.
        if not removed_this_pass:
            raise ValueError(
                'cannot shrink log to %d rows: every id is down to a single row '
                '(%d rows remain)' % (n, remaining)
            )

    return df.iloc[np.flatnonzero(keep)]

def generate_log(df, log_length, n_faces):
    """
        df: dataframe where each column is a dimension in the embedding space,
            each row a different image and the final column ('id') contains the id
            of the person in the image
        log_length:    control the length of the log
        n_faces:       control the number of unique individuals in each log

        Randomly pick n_faces ids until their rows number at least log_length,
        then trim that subset down to exactly log_length rows while keeping
        every chosen id.

        returns: dataframe with the same columns as df, exactly log_length rows
                 and exactly n_faces distinct ids
        raises:  ValueError if the request cannot be satisfied from df (more
                 faces than log entries, more faces than ids in df, or too few
                 rows available for any choice of n_faces ids)
    """
    # TODO can extend this so that if len(log) < log_length can instead just repeat images?

    # Reject impossible requests up front; otherwise the sampling loop below
    # (or the trimming step) would never terminate.
    if n_faces > log_length:
        raise ValueError(
            'n_faces (%d) cannot exceed log_length (%d): each face needs at least one entry'
            % (n_faces, log_length)
        )
    id_counts = df['id'].value_counts()
    if n_faces > len(id_counts):
        raise ValueError(
            'n_faces (%d) exceeds the number of unique ids in df (%d)'
            % (n_faces, len(id_counts))
        )
    # The best case is choosing the n_faces ids with the most rows
    max_rows = id_counts.nlargest(n_faces).sum()
    if max_rows < log_length:
        raise ValueError(
            'log_length (%d) is unreachable: the %d largest ids only provide %d rows'
            % (log_length, n_faces, max_rows)
        )

    # Repeatedly randomly sample df until you have n_faces unique faces and a len(log) >= log_length
    counter = 1
    log = get_subset_n_unique_ids(df, n=n_faces)
    while len(log) < log_length:
        log = get_subset_n_unique_ids(df, n=n_faces)
        counter += 1
    print('Took %d iterations to find a df subset' % counter)

    # Resize the log, maintaining the distribution of images of each person
    log = resize_and_maintain_distribution(log, n=log_length)

    # Checks
    assert len(log) == log_length, 'log length incorrect %d != %d' % (len(log), log_length)
    assert log.id.nunique() == n_faces, 'Incorrect number of unique faces %d != %d' % (log.id.nunique(), n_faces)

    return log