# Adult Dataset Analysis

This notebook uses the self-influence results generated by **adult_training.ipynb** to perform analysis and investigate the nature of self-influence.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import random
import seaborn as sns
import pickle
import pandas as pd

### Load Results
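Results are loaded from a pickle file that holds one entry per model checkpoint. In this notebook the averaged self-influence score is indexed as the last entry (the `-1` sentinel at the end of `model_cps`). The `model_cps` list of checkpoints is used to look up the position of a specific model in the results list.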

In [None]:
# Checkpoints (epochs) that were scanned; -1 is the sentinel used below for the checkpoint-averaged ("Ave. CP") result
model_cps = [0, 1, 2, 3, 4, 5, 10, 20, 30, 40, 50, -1]

# List of dictionaries, each holding a "memorisation" array of self-influence scores.
# Presumably one dictionary per entry of model_cps, in the same order — TODO confirm against adult_training.ipynb
# NOTE: pickle.load can execute arbitrary code; only load files produced by a trusted run.
with open('results/scan_results_shuffled_500.pickle', 'rb') as handle:
    scan_results = pickle.load(handle)


### Results

Method defined to apply cuts on self-influence score

In [None]:
def results_with_cut(results, min_infl = 0, max_infl=1000):
    """Filter every model's self-influence scores to an open interval.

    - results is an iterable of dictionaries with a "memorisation" entry
    - min_infl / max_infl are exclusive bounds: a score equal to either one is dropped
    - returns one list of surviving scores per input dictionary, in the same order
    """
    return [
        [score for score in entry.get("memorisation") if min_infl < score < max_infl]
        for entry in results
    ]

### Plotting influence distributions and boxplots of distribution

In [None]:
# Cut window applied before plotting: only scores strictly between MIN and MAX are kept
MIN = 1
MAX = 400

# One list of surviving scores per entry of scan_results, in the same order
processed_results = results_with_cut(scan_results, MIN, MAX)

In [None]:
# plot options
BINS = 100
ALPHA = 0.3
X_MAX = MAX
EPOCH_INDEX = model_cps.index(50)

# Overlay the epoch-50 distribution with the last entry, which is labelled as the averaged checkpoint
f, ax = plt.subplots(figsize=(10, 5), ncols=1)

for scores, series_label in (
    (processed_results[EPOCH_INDEX], 'Epoch 50'),
    (processed_results[-1], 'Ave. Checkpoint'),
):
    ax.hist(scores, bins=BINS, histtype='step', linewidth=2, label=series_label)

# log-scaled counts so the sparsely populated high-influence tail stays visible
ax.set_yscale('log')
ax.set_xlim([0, X_MAX])
ax.set_title("Self-influence distribution for training dataset", fontsize=14)
ax.set_xlabel("Self Influence Score", fontsize=12)
ax.set_ylabel("Counts", fontsize=12)
ax.legend(frameon=False, fontsize=12)

f.savefig("plots/self_influence_two_distributions.pdf")

In [None]:
# plot options
BINS = 100
ALPHA = 0.3
EPOCH = 50
X_MAX = MAX
EPOCH_INDEX = model_cps.index(EPOCH)

# One step histogram per checkpoint, all drawn on the same axes
f, ax = plt.subplots(figsize=(10, 5), ncols=1)
for i, scores in enumerate(processed_results):
    # the -1 sentinel in model_cps marks the averaged checkpoint
    if model_cps[i] == -1:
        LABEL = "Ave. CP"
    else:
        LABEL = "Epoch {}".format(model_cps[i])
    ax.hist(scores, bins=BINS, histtype='step', linewidth=2, label=LABEL)

ax.set_yscale('log')
ax.set_xlim([0, X_MAX])
ax.set_title("Self-influence distribution for training dataset", fontsize=14)
ax.set_xlabel("Self Influence Score", fontsize=12)
ax.set_ylabel("Counts", fontsize=12)
ax.legend(frameon=False, fontsize=12)

f.savefig("plots/self_influence_adult.pdf")

In [None]:
# plot options
boxprops = dict(linestyle='-', linewidth=2, color='black')
flierprops = dict(marker='o', markerfacecolor='white', markersize=10,
                  linestyle='none')
medianprops = dict(linestyle='-', linewidth=2, color='firebrick')

# create box plots for different CPs
f_1, ax_1 = plt.subplots(figsize=(8, 5), ncols=1)

# One tick label per box. The label must use model_cps[i], the same entry the
# condition tests; using model_cps[i-1] shifted every label by one checkpoint
# and produced "CP--1" for the first box.
LABELS = [
    "Ave. CP" if model_cps[i] == -1 else "CP-{}".format(model_cps[i])
    for i in range(len(processed_results))
]

ax_1.boxplot(processed_results, widths=0.9, medianprops=medianprops, boxprops=boxprops, flierprops=flierprops)

# Label the ticks only after drawing: boxplot installs its own tick positions
# and 1..N labels, which would replace anything set beforehand.
ax_1.set_xticklabels(LABELS)
ax_1.set_xlabel("Checkpoint Snapshots", fontsize=12)
ax_1.set_ylabel("Self-Influence Score", fontsize=12)

plt.show()

### Studying the evolution of individual influence examples across training

A helper that returns the index of a randomly chosen example whose self-influence score falls within a chosen percentile range.

In [None]:
def return_example_index(p_start, p_width, reduced_width, m):
    """
    This function returns the index of a random example between a desired percentile range.
    - p_start is the lower edge of the percentile window, as a fraction between 0 and 1
    - p_width is the width of the percentile window, as a fraction between 0 and 1
    - reduced_width is the fraction of that window, centred on its middle, from which
      the example is drawn, so that the pick lies close to the window's median
    - m is the sequence of influence scores to rank
    - e.g. p_start = 0.2 and p_width = 0.2 will return a random example between the 20th-40th percentile

    Returns an index into m, or None (after printing a hint) when p_start or p_width
    lies outside 0-1. Raises IndexError when the reduced window contains no example.
    """
    # Check each fraction on its own. The previous `p_start and p_width > 1` never
    # tested p_start against 1 and skipped the check entirely when p_start was 0.
    if not (0 <= p_start <= 1 and 0 <= p_width <= 1):
        print ("Set percentile and width between 0-1.")
        return None

    n = len(m) # length of array
    sorted_m = np.argsort(m) # indices ordered from smallest to largest influence
    min_i, max_i = p_start*n, p_start*n + p_width*n # rank range covered by the percentile window
    median_width = (1 - reduced_width)/2. # fraction trimmed from each side of the window
    min_i_median = round(min_i + (p_width*n*median_width))
    max_i_median = round(max_i - (p_width*n*median_width))

    # draw uniformly from the trimmed window and return the example's index in m
    return random.choice(sorted_m[min_i_median:max_i_median])

In [None]:
def return_influence_of_index(scan_results, index):
    """Collect the influence score of one example from every model.

    - scan_results is an iterable of dictionaries with a "memorisation" entry
    - index is the position of the example inside each "memorisation" sequence
    - returns a list with one score per model, in the order of scan_results
    """
    return [model["memorisation"][index] for model in scan_results]

Now we're computing the influence scores from examples across different deciles and collecting results for all CP models into a list

In [None]:
EPOCH = 1
epoch_index = model_cps.index(EPOCH)
m_selected = scan_results[epoch_index]["memorisation"] # influence scores of the checkpoint chosen by EPOCH

# lower edges of the ten deciles: 0.0, 0.1, ..., 0.9
p_starts = np.linspace(0, 0.9, 10)

# one randomly drawn example index per decile, ordered from lowest to highest decile
# NOTE(review): no random seed is set, so the selected examples differ between runs
indices = [
    return_example_index(float(p_start), 0.1, 0.1, m_selected)
    for p_start in p_starts
]

Finding the influence score for all the given indices. This generates a list of lists, where each inner list represents the evolution of one example's influence score across checkpoints.

In [None]:
# For every selected example, gather its score at each checkpoint in model_cps order,
# leaving out the averaged model (marked by -1)
results_for_10_examples = [
    [
        scan_results[i]["memorisation"][index]
        for i, model in enumerate(model_cps)
        if model != -1
    ]
    for index in indices
]

Generating a plot of influence score versus epoch for the 10 selected examples. This plot shows how the influence score of a given example evolves over training. Note that the examples are selected at the checkpoint set by `EPOCH` in the selection cell above (currently epoch 1, not epoch 50) and then tracked across all other checkpoints.

In [None]:
f, ax = plt.subplots(figsize=(10, 5), ncols=1)

# x positions: every scanned checkpoint except the averaged model (-1), the same
# filter that was used when results_for_10_examples was collected
epochs = [cp for cp in model_cps if cp != -1]

# one line per selected example, labelled 1..N in the order the examples were selected
for i, value in enumerate(results_for_10_examples):
    LABEL = "{}".format(i+1)
    ax.plot(epochs, value, "-o", markersize=5, label=LABEL, linewidth=2)

# extra room on the right of the last epoch so the legend does not cover the curves
ax.set_xlim([-1, 57])
handles, labels = ax.get_legend_handles_labels()
# reversed so the legend lists the last-plotted example first
ax.legend(handles[::-1], labels[::-1], loc='right', frameon=True, fontsize=10)
ax.set_yscale('log')
ax.set_title("Evolution of influence for individual examples", fontsize=14)

plt.xlabel("Epoch", fontsize=12)
plt.ylabel("Self Influence Score", fontsize=12)

plt.savefig("plots/self_influence_evolution_epoch_1.pdf")

### Studying the distributional evolution of influence in tails

In [None]:
# return upper X percentile indices
def return_upper_percentile_indices(p, m):
    """
    > Returns the indices of the top pth fraction of m, i.e. the most influential examples
    > p is a fraction between 0 and 1; values above 1 print a hint and yield None
    > The indices come back ordered from lower to higher influence
    """
    if p > 1:
        print ("Set percentile between 0-1.")
        return None

    n = len(m)
    ranked = np.argsort(m) # ascending influence
    first_kept = round(n - p*n) # rank at which the upper tail begins
    return ranked[first_kept:n]

In [None]:
# return lower X percentile indices
def return_lower_percentile_indices(p, m):
    """
    > This function returns the indices of the lowest pth fraction of m, i.e. the least influential examples
    > p represents the lower "pth" fraction of the dataset and is set between 0 and 1
    > The indices come back ordered from lower to higher influence
    > Returns None (after printing a hint) when p lies outside 0-1
    """
    # Reject negative p as well: it would give a negative slice bound and
    # silently return almost every index instead of a small lower tail.
    if not 0 <= p <= 1:
        print ("Set percentile between 0-1.")
        return None

    n = len(m)
    sorted_m = np.argsort(m) # ascending influence
    max_i = round(p*n) # number of examples in the lower tail
    return sorted_m[:max_i]

In [None]:
# returns, for every model, the rank positions of the selected model's tail examples
def return_match_index(scan_results, selected_model, p_width):
    """
    > Takes the top p_width fraction of examples of selected_model (by "memorisation" score)
      and looks up where each of them sits in the ascending influence ordering of every model
    > scan_results is a sequence of dictionaries with a "memorisation" entry
    > Returns one list per entry of scan_results; element k is the rank
      (0 = least influential) of the k-th selected example in that model
    """
    sub_list = return_upper_percentile_indices(p_width, selected_model["memorisation"])
    all_cp_index = []
    # Iterate over the argument itself rather than the notebook-global model_cps,
    # so the result always has exactly one entry per model that was passed in.
    for model in scan_results:
        sorted_model = np.argsort(model["memorisation"]) # example indices in ascending influence
        # Invert the permutation once: rank_of[example] is that example's position in
        # sorted_model. This replaces one full np.where scan per selected example.
        rank_of = np.empty(len(sorted_model), dtype=np.intp)
        rank_of[sorted_model] = np.arange(len(sorted_model))
        all_cp_index.append(list(rank_of[sub_list]))
    return all_cp_index

Using **return_match_index** to find, for every epoch/model, the rank positions (in that model's ascending self-influence ordering) of the upper ***p_width*** fraction of highest-influence examples from a selected epoch/model.

In [None]:
EPOCH = 50
epoch_index = model_cps.index(EPOCH)
selected_model = scan_results[epoch_index]
p_width=0.05 # selecting top 5% (NOTE(review): this was annotated as 10% — confirm the intended fraction)
all_results = return_match_index(scan_results, selected_model, p_width)

Reformatting data structure to feed into **seaborn** plots.

In [None]:
# creating a data dictionary to feed into seaborn plot
# Column labels are derived from model_cps so they cannot drift out of step with the
# order of all_results; the -1 sentinel becomes the averaged-checkpoint column.
data = {
    ("Ave. CP" if cp == -1 else "CP {}".format(cp)): all_results[i]
    for i, cp in enumerate(model_cps)
}
# orient='index' followed by a transpose gives one column per checkpoint
df = pd.DataFrame.from_dict(data, orient='index')
df = df.transpose()
df[:5]

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

sns.violinplot(data=df, ax=ax)

# Horizontal reference: with k selected examples and k ~ p_width * n, the value
# k * (1/p_width - 1) is approximately n - k, the rank where the upper tail starts.
ax.axhline(y=len(all_results[0])*(1/p_width-1), color='r', linestyle='-')
# Vertical divider drawn between the last two violins
ax.axvline(x=10.5, color='black', linestyle='--')

ax.set_xlabel("Epoch", fontsize=12)
ax.set_ylabel("Ordered Self Influence Index", fontsize=12)

fig.savefig("plots/self_influence_distribution_evolution_epoch_1.pdf")

## Create pruned datasets
This section is used to create and save pruned datasets. The indices are selected by removing a percentage of the most influential examples from the last epoch (50th). The indices are saved as lists into a pickle file to be used in training.

In [None]:
def upper_indices_to_remove(data, epoch_index, prune_frac):
    """Return, in ascending order, the indices of the most influential examples to prune.

    - data is a sequence of dictionaries with a "memorisation" entry
    - epoch_index selects which entry of data supplies the scores
    - prune_frac is the fraction handed to return_upper_percentile_indices
    """
    scores = data[epoch_index]["memorisation"]
    tail = return_upper_percentile_indices(prune_frac, scores)
    return sorted(tail[::-1].flatten())

In [None]:
def lower_indices_to_remove(data, epoch_index, prune_frac):
    """Return, in ascending order, the indices of the least influential examples to prune.

    - data is a sequence of dictionaries with a "memorisation" entry
    - epoch_index selects which entry of data supplies the scores
    - prune_frac is the fraction handed to return_lower_percentile_indices
    """
    scores = data[epoch_index]["memorisation"]
    tail = return_lower_percentile_indices(prune_frac, scores)
    return sorted(tail[::-1].flatten())

In [None]:
EPOCH = -1 # -1 is the sentinel in model_cps for the checkpoint-averaged result
epoch_index = model_cps.index(EPOCH)
data = scan_results # NOTE: rebinds `data`, which an earlier cell used for the seaborn label dictionary

In [None]:
# remove upper
# NOTE(review): 0.05, 0.15 and 0.175 break the otherwise 0.0025-spaced sequence;
# confirm they are intended and not typos for 0.005 / 0.015 / 0.0175.
prune_frac_scan = [0.05, 0.010, 0.0125, 0.15, 0.175, 0.02, 0.0225, 0.025]

# one sorted list of indices to drop per fraction, in the order of prune_frac_scan
prune_indices = [
    upper_indices_to_remove(data, epoch_index, prune_frac)
    for prune_frac in prune_frac_scan
]

In [None]:
# Bundle the scanned fractions with their index lists for saving;
# entry k of "prune_indices" was computed from entry k of "frac_list"
pruned_datasets_dict = {
    "frac_list" : prune_frac_scan,
    "prune_indices" : prune_indices
}

In [None]:
# save options
EXTENSION = "pruned_ds_cpave_bs500_2".format(EPOCH)

# store data (serialize)
with open('results/{}.pickle'.format(EXTENSION), 'wb') as handle:
    pickle.dump(pruned_datasets_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)