# Generate cloud computer vision logs

**Input: Dataframe of embeddings that has an 'id' column**

**Output: Log of a specific size with a specific number of unique faces**

----------
misc
- Used a Zipfian distribution to produce logs of various sizes from the encodings
- There are repeated encodings in the logs
- Each experiment was carried out 5x and the results averaged
- We generate log files (usage traces) where each entry
includes a vector encoding representing a face; i.e. the
logs represent those generated as a byproduct of using
an AIaaS face service. Encodings are computed using
the TensorFlow implementation of FaceNet (Sandberg 2020).

- Two types of log are generated: the fixed trace uses
a specified number (5000) of different faces for each version of the log, while the varied trace uses a Zipfian distribution to select a varied number of different faces for
each log size. Some encodings in the log are repeated to
generate the required number of entries.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read, from a text file, the list of ids saying who is in each image
def get_name_list(names_path):
    """
        names_path: path of a text file whose lines start with two
                    whitespace-separated fields: a name and an integer count

        returns: list in which each name is repeated `count` times, in file order

        Blank (or whitespace-only) lines are skipped. A non-blank line with
        fewer than two fields raises IndexError; a non-integer count raises
        ValueError.
    """
    name_list = []
    with open(names_path, "r") as names:
        for row in names:
            fields = row.split()

            # Tolerate blank lines, e.g. an empty line at the end of the file
            if not fields:
                continue

            name, count = fields[0], int(fields[1])
            name_list.extend([name] * count)

    return name_list

In [4]:
# np.load on an .npz archive returns an NpzFile that keeps the file open;
# the context manager closes it once the array has been read into memory.
with np.load('faces_embeddings.npz') as embeddings_archive:
    lfw_a_embeddings = embeddings_archive['arr_0']
lfw_a_embeddings_df = pd.DataFrame(lfw_a_embeddings)

# Identity of each embedding in the list
name_list = get_name_list('a-names.txt')
lfw_a_embeddings_df['id'] = name_list

# Bare expression so the notebook displays the resulting dataframe
lfw_a_embeddings_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,119,120,121,122,123,124,125,126,127,id
0,-1.123352,-0.434801,-1.047122,-1.269473,-0.195218,0.813537,-0.842875,-0.479084,0.988208,-2.462518,...,0.445605,1.575916,1.951356,0.704910,1.111240,0.700760,-0.000323,-0.413812,-0.177034,AJ_Cook
1,-0.276660,-0.929376,-0.871309,1.887984,0.294494,-0.392032,1.193839,-1.332300,1.016225,1.183325,...,1.601537,0.041679,-1.321221,-0.362160,-0.613001,-0.731198,-0.024927,0.814170,-0.565774,AJ_Lamas
2,0.046266,-1.365577,0.904331,0.308679,1.453258,-1.847158,0.107858,-1.022902,1.149489,-0.438184,...,-0.445793,2.051965,-0.288605,0.396998,-1.583032,-0.912693,-0.927178,-0.585495,0.399059,Aaron_Eckhart
3,-0.786084,-1.590150,0.908262,-1.015456,-1.524350,0.417413,0.085213,-0.371191,-0.536306,0.107306,...,-0.289110,0.225846,-0.163956,-0.299800,0.376664,-1.527830,-0.331605,0.459349,0.111617,Aaron_Guiel
4,-0.954761,-1.453103,0.508415,-0.667681,-0.688487,0.624586,0.298282,-0.703009,0.110573,0.169510,...,0.260893,0.438698,-0.199018,-0.830035,0.604224,-1.823791,-0.872411,0.425842,0.717366,Aaron_Patterson
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1049,-0.337823,-2.008454,2.566872,-1.616262,-0.866825,0.296573,-0.472138,0.880347,-0.986221,0.391021,...,-1.665424,0.645080,-2.333163,-0.836760,-0.509632,0.334259,-1.011701,-0.261623,-0.570788,Azmi_Bishara
1050,0.662095,0.721160,0.417328,0.983635,0.491448,1.149256,0.752710,-2.286117,0.599011,1.982112,...,0.824698,0.305375,0.209043,0.082835,-1.195638,2.019420,0.257630,0.152687,-1.281225,Azra_Akin
1051,-1.404683,0.861827,-0.729335,0.452092,0.224669,-1.049955,-0.007121,-0.258552,1.032617,-0.901522,...,0.338379,-1.166980,-1.983911,0.284591,1.283130,0.122160,-1.484203,1.641596,-0.799490,Azra_Akin
1052,1.036005,-0.057296,0.226287,0.198919,0.825064,0.986293,0.623526,-1.962369,-0.563830,2.164161,...,0.704248,1.288848,-0.312801,-0.183548,-0.598150,1.389290,0.472267,0.588352,-0.975882,Azra_Akin


In [73]:
# Return a subset of df where 'id' has exactly n unique values
def get_subset_n_unique_ids(df, n):
    """
        df: dataframe with an 'id' column
        n:  number of distinct ids to keep

        returns: every row of df whose id is one of n ids drawn at random
                 from the distinct values of df['id']
    """
    distinct_ids = pd.Series(df['id'].unique())
    chosen_ids = distinct_ids.sample(n)
    keep_mask = df['id'].isin(chosen_ids)
    return df[keep_mask]

# Return log of right size
def resize_and_maintain_distribution(df, n):
    """
        df: dataframe with an 'id' column whose length we want to reduce
        n:  specified log length

        returns: df itself if it already has n rows or fewer, otherwise a
                 reduced copy

        Repeatedly sweeps over the ids, removing one random row from every id
        that still has more than one row, until the log length reaches n.
        Every id therefore keeps at least one row.

        raises: ValueError if a full sweep removes nothing while the log is
                still longer than n (every id is down to a single row), since
                the target length cannot be reached without losing an id.

        NOTE: rows are dropped by index label, so the index labels of df are
        assumed to be unique.
    """
    while len(df) > n:
        length_before_sweep = len(df)

        for unique_id in df['id'].unique():
            df_subset = df[df['id'] == unique_id]

            # If more than one row randomly drop one row else skip
            if len(df_subset) > 1:
                df = df.drop(labels=df_subset.sample(n=1).index, axis=0)

            # Stop as soon as the desired log length is reached
            if len(df) <= n:
                break

        # A sweep that removed nothing can never make progress; without this
        # check the outer loop would spin forever.
        if len(df) == length_before_sweep:
            raise ValueError(
                f"cannot reduce log to {n} rows without removing an id: "
                f"{len(df)} rows remain and no id has more than one row"
            )

    return df

def generate_log(df, log_length, n_faces):
    """
        df: dataframe where each column is a dimension in the embedding space,
            each row a different image and the final column ('id') contains
            the id of the person in the image
        log_length:    control the length of the log
        n_faces:       control the number of unique individuals in each log

        returns: dataframe with the same columns as df, made of log_length
                 rows of df covering exactly n_faces distinct ids

        raises: AssertionError if the result does not have log_length rows or
                n_faces distinct ids (e.g. the randomly chosen ids have fewer
                than log_length rows between them)
    """
    # Pick n_faces ids at random, then thin their rows down to log_length
    df = get_subset_n_unique_ids(df, n=n_faces)
    df = resize_and_maintain_distribution(df, n=log_length)

    # Checks
    assert len(df) == log_length
    assert df['id'].nunique() == n_faces

    return df

In [74]:
# Build one log: 500 rows covering 300 randomly chosen ids from the embeddings
log = generate_log(lfw_a_embeddings_df, log_length=500, n_faces=300)