# nGram Error Analysis


In [None]:
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 1. Error Calculation per Instance

In [None]:
# Directory prefix prepended to every noisy-count file name.
path = ''
file_names = ['0.1.dat', '1.dat', '5.dat', '10.dat', '20.dat', '50.dat']
df_names = ['eps_01', 'eps_1', 'eps_5', 'eps_10', 'eps_20', 'eps_50']

# Read every noisy-count file straight into a dict keyed by its epsilon label.
# The frames used to be created by exec() on a generated source string, which
# breaks on paths containing quotes or backslashes and hides the variables
# from linters; reading them directly has neither problem.
datasets = {
    name: pd.read_csv(path + file, dtype={'Seq': str}, sep=':')
    for name, file in zip(df_names, file_names)
}

# Keep the per-epsilon module-level names the exec()-based loader used to define.
eps_01, eps_1, eps_5, eps_10, eps_20, eps_50 = (datasets[name] for name in df_names)

for df in datasets.values():
    df['Seq'] = df['Seq'].astype(str)

og = pd.read_csv('/home/myercel1/research/KTH-traces/Ziming/notebooks/data/ngram_data/small_example-original-5grams.dat', sep = ':', dtype={'Seq': str})

# The n-gram length of a row is the number of whitespace-separated tokens in 'Seq'.
og['ngram'] = og['Seq'].apply(lambda x: len(x.split()))
for dataset_name, dataset in datasets.items():
    dataset['ngram'] = dataset['Seq'].apply(lambda x: len(str(x).split()))

### 1.1. With Noisy counts less than 1

#### 1.1.1. Absolute Error

In [None]:
def abs_diff_line_plot(n):
    """Plot the absolute count error of every n-gram of length ``n``, one series per noisy dataset.

    Each frame of the module-level ``datasets`` dict is restricted to rows with
    ``ngram == n`` and outer-merged on ``Seq`` with the matching rows of the
    module-level ``og`` frame.  A sequence missing on either side gets count 0.
    ``|Counts_og - Counts_<dataset>|`` is drawn as a scatter plot with a
    connecting line over a shared x-axis of numerically sorted sequences.

    Args:
        n: n-gram length to plot (value of the ``ngram`` column).
    """
    # Identify the n-grams in the original dataset
    og_ngram = og[og['ngram'] == n]

    # Collect every sequence that can appear after the outer merge.  The
    # original sequences are included too: one that is absent from all noisy
    # datasets would otherwise have no x position (NaN after the map below)
    # and would silently disappear from the plot.
    all_sequences = set(og_ngram['Seq'].unique())
    for dataset in datasets.values():
        all_sequences.update(dataset.loc[dataset['ngram'] == n, 'Seq'].unique())

    # Map each sequence to an x position; sequences are ordered token by token as integers
    ordered = sorted(all_sequences, key=lambda seq: [int(tok) for tok in seq.split()])
    sequence_mapping = {sequence: pos for pos, sequence in enumerate(ordered)}

    plt.figure(figsize=(10, 6))

    for dataset_name, dataset in datasets.items():
        noisy_col = 'Counts_' + dataset_name
        dataset_ngram = dataset[dataset['ngram'] == n]

        # Union of noisy and original sequences; missing counts become 0
        difference = pd.merge(dataset_ngram, og_ngram, on='Seq', how='outer',
                              suffixes=('_' + dataset_name, '_og')).fillna(0)

        difference['abs_difference'] = np.abs(difference['Counts_og'] - difference[noisy_col])

        # Order the rows along the shared x-axis so the connecting line runs left to right
        difference['seq_numeric'] = difference['Seq'].map(sequence_mapping)
        difference = difference.sort_values('seq_numeric')

        plt.scatter(difference['seq_numeric'], difference['abs_difference'], label=dataset_name)
        plt.plot(difference['seq_numeric'], difference['abs_difference'], label=None)

    # Set the tick labels to the original sequences
    plt.xticks(list(sequence_mapping.values()), list(sequence_mapping.keys()), rotation='vertical')

    plt.xlabel(f"{n}-gram sequences")
    plt.ylabel('Absolute difference in counts')
    plt.title(f"Absolute difference in counts for {n}-grams across datasets")
    plt.legend()
    plt.show()

In [None]:
# One absolute-error figure per n-gram length, 1 through 5.
for i in [1, 2, 3, 4, 5]:
    abs_diff_line_plot(n=i)

#### 1.1.1 Results description:

We can see that the absolute difference decreases as n-gram length gets larger. Likewise, the absolute difference decreases as epsilon value increases (more privacy loss).

#### 1.1.2. Percentage Error with Sanity Bound 0.008

In [None]:
def perc_diff_line_plot(n, sanity_bound):
    """Plot the percentage count error of every n-gram of length ``n``, one series per noisy dataset.

    Each frame of the module-level ``datasets`` dict is restricted to rows with
    ``ngram == n`` and outer-merged on ``Seq`` with the matching rows of the
    module-level ``og`` frame; a sequence missing on either side gets count 0.
    The plotted value is
    ``100 * |Counts_og - Counts_<dataset>| / max(Counts_og, sanity_bound)``.

    Args:
        n: n-gram length to plot (value of the ``ngram`` column).
        sanity_bound: lower bound applied to the original count in the
            denominator, so small or zero original counts do not blow up
            the percentage.
    """
    # Identify the n-grams in the original dataset
    og_ngram = og[og['ngram'] == n]

    # Collect every sequence that can appear after the outer merge.  The
    # original sequences are included too: one that is absent from all noisy
    # datasets would otherwise have no x position (NaN after the map below)
    # and would silently disappear from the plot.
    all_sequences = set(og_ngram['Seq'].unique())
    for dataset in datasets.values():
        all_sequences.update(dataset.loc[dataset['ngram'] == n, 'Seq'].unique())

    # Map each sequence to an x position; sequences are ordered token by token as integers
    ordered = sorted(all_sequences, key=lambda seq: [int(tok) for tok in seq.split()])
    sequence_mapping = {sequence: pos for pos, sequence in enumerate(ordered)}

    plt.figure(figsize=(10, 6))

    for dataset_name, dataset in datasets.items():
        noisy_col = 'Counts_' + dataset_name
        dataset_ngram = dataset[dataset['ngram'] == n]

        # Union of noisy and original sequences; missing counts become 0
        difference = pd.merge(dataset_ngram, og_ngram, on='Seq', how='outer',
                              suffixes=('_' + dataset_name, '_og')).fillna(0)

        difference['abs_difference'] = np.abs(difference['Counts_og'] - difference[noisy_col])

        # Vectorized form of max(Counts_og, sanity_bound) per row; replaces a
        # Python-level lambda applied row by row.
        denominator = np.maximum(difference['Counts_og'], sanity_bound)
        difference['percentage_difference'] = 100 * difference['abs_difference'] / denominator

        # Order the rows along the shared x-axis so the connecting line runs left to right
        difference['seq_numeric'] = difference['Seq'].map(sequence_mapping)
        difference = difference.sort_values('seq_numeric')

        plt.scatter(difference['seq_numeric'], difference['percentage_difference'], label=dataset_name)
        plt.plot(difference['seq_numeric'], difference['percentage_difference'], label=None)

    # Set the tick labels to the original sequences
    plt.xticks(list(sequence_mapping.values()), list(sequence_mapping.keys()), rotation='vertical')

    plt.xlabel(f"{n}-gram sequences")
    plt.ylabel('Percentage difference in counts')
    plt.title(f"Percentage difference in counts for {n}-grams across datasets")
    plt.legend()
    plt.show()

In [None]:
# One percentage-error figure per n-gram length (1 through 5), sanity bound 0.008.
for i in [1, 2, 3, 4, 5]:
    perc_diff_line_plot(n=i, sanity_bound=0.008)

#### 1.1.2 Results description:

Contrary to absolute differences, percent differences increase as n-gram length increases. As expected, percent difference decreases with increasing epsilon value (more privacy loss).

#### 1.1.3. Percentage Error with Sanity Bound 3

In [None]:
# One percentage-error figure per n-gram length (1 through 5), sanity bound 3.
for i in [1, 2, 3, 4, 5]:
    perc_diff_line_plot(n=i, sanity_bound=3)

#### 1.1.3 Results description:

Changing the sanity bound to 3 significantly lowered percent error. Now, similar to absolute error, percent error decreases with increasing n-gram length and decreases with increasing epsilon value (more privacy loss).

Sanity bound of 3 is the average count per ngram in the data set.

### 1.2. Without Noisy counts less than 1

#### 1.2.1. Absolute Error

In [None]:
def abs_diff_line_plot_wo(n):
    """Plot the absolute count error of n-grams of length ``n``, ignoring noisy counts below 1.

    Each frame of the module-level ``datasets`` dict is restricted to rows with
    ``ngram == n`` and outer-merged on ``Seq`` with the matching rows of the
    module-level ``og`` frame; a sequence missing on either side gets count 0.
    Rows whose noisy count is below 1 are dropped before plotting
    ``|Counts_og - Counts_<dataset>|``.

    Args:
        n: n-gram length to plot (value of the ``ngram`` column).
    """
    # Collect all unique sequences of this length across the noisy datasets
    all_sequences = set()
    for dataset in datasets.values():
        all_sequences.update(dataset.loc[dataset['ngram'] == n, 'Seq'].unique())

    # Map each sequence to an x position; sequences are ordered token by token as integers
    ordered = sorted(all_sequences, key=lambda seq: [int(tok) for tok in seq.split()])
    sequence_mapping = {sequence: pos for pos, sequence in enumerate(ordered)}

    # Identify the n-grams in the original dataset
    og_ngram = og[og['ngram'] == n]

    plt.figure(figsize=(10, 6))

    for dataset_name, dataset in datasets.items():
        noisy_col = 'Counts_' + dataset_name
        dataset_ngram = dataset[dataset['ngram'] == n]

        # Union of noisy and original sequences; missing counts become 0
        difference = pd.merge(dataset_ngram, og_ngram, on='Seq', how='outer',
                              suffixes=('_' + dataset_name, '_og')).fillna(0)

        difference['abs_difference'] = np.abs(difference['Counts_og'] - difference[noisy_col])

        # Keep only rows whose noisy count is at least 1.  The explicit copy
        # makes the filtered frame independent, so adding a column below does
        # not write into a slice of the merged frame (SettingWithCopyWarning).
        difference = difference[difference[noisy_col] >= 1].copy()

        # Order the rows along the shared x-axis so the connecting line runs left to right
        difference['seq_numeric'] = difference['Seq'].map(sequence_mapping)
        difference = difference.sort_values('seq_numeric')

        plt.scatter(difference['seq_numeric'], difference['abs_difference'], label=dataset_name)
        plt.plot(difference['seq_numeric'], difference['abs_difference'], label=None)

    # Set the tick labels to the original sequences
    plt.xticks(list(sequence_mapping.values()), list(sequence_mapping.keys()), rotation='vertical')

    plt.xlabel(f"{n}-gram sequences")
    plt.ylabel('Absolute difference in counts')
    plt.title(f"Absolute difference in counts for {n}-grams across datasets")
    plt.legend()
    plt.show()

In [None]:
# One filtered absolute-error figure per n-gram length, 1 through 5.
for i in [1, 2, 3, 4, 5]:
    abs_diff_line_plot_wo(n=i)

#### 1.2.1 Results description:

Similar to the result from 1.1.1, we can see that the absolute difference in counts decreases as n-gram length gets larger. It is also apparent that the absolute difference decreases as epsilon value increases, indicating more privacy loss.

#### 1.2.2. Percentage Error with Sanity Bound 0.008

In [None]:
def perc_diff_line_plot_wo(n, sanity_bound):
    """Plot the percentage count error of n-grams of length ``n``, ignoring noisy counts below 1.

    Each frame of the module-level ``datasets`` dict is restricted to rows with
    ``ngram == n`` and outer-merged on ``Seq`` with the matching rows of the
    module-level ``og`` frame; a sequence missing on either side gets count 0.
    The plotted value is
    ``100 * |Counts_og - Counts_<dataset>| / max(Counts_og, sanity_bound)``,
    restricted to rows whose noisy count is at least 1.

    Args:
        n: n-gram length to plot (value of the ``ngram`` column).
        sanity_bound: lower bound applied to the original count in the
            denominator.
    """
    # Collect all unique sequences of this length across the noisy datasets
    all_sequences = set()
    for dataset in datasets.values():
        all_sequences.update(dataset.loc[dataset['ngram'] == n, 'Seq'].unique())

    # Map each sequence to an x position; sequences are ordered token by token as integers
    ordered = sorted(all_sequences, key=lambda seq: [int(tok) for tok in seq.split()])
    sequence_mapping = {sequence: pos for pos, sequence in enumerate(ordered)}

    # Identify the n-grams in the original dataset
    og_ngram = og[og['ngram'] == n]

    plt.figure(figsize=(10, 6))

    for dataset_name, dataset in datasets.items():
        noisy_col = 'Counts_' + dataset_name
        dataset_ngram = dataset[dataset['ngram'] == n]

        # Union of noisy and original sequences; missing counts become 0
        difference = pd.merge(dataset_ngram, og_ngram, on='Seq', how='outer',
                              suffixes=('_' + dataset_name, '_og')).fillna(0)

        difference['abs_difference'] = np.abs(difference['Counts_og'] - difference[noisy_col])

        # Vectorized form of max(Counts_og, sanity_bound) per row; replaces a
        # Python-level lambda applied row by row.
        denominator = np.maximum(difference['Counts_og'], sanity_bound)
        difference['percentage_difference'] = 100 * difference['abs_difference'] / denominator

        # Keep only rows whose noisy count is at least 1.  The explicit copy
        # makes the filtered frame independent, so adding a column below does
        # not write into a slice of the merged frame (SettingWithCopyWarning).
        difference = difference[difference[noisy_col] >= 1].copy()

        # Order the rows along the shared x-axis so the connecting line runs left to right
        difference['seq_numeric'] = difference['Seq'].map(sequence_mapping)
        difference = difference.sort_values('seq_numeric')

        plt.scatter(difference['seq_numeric'], difference['percentage_difference'], label=dataset_name)
        plt.plot(difference['seq_numeric'], difference['percentage_difference'], label=None)

    # Set the tick labels to the original sequences
    plt.xticks(list(sequence_mapping.values()), list(sequence_mapping.keys()), rotation='vertical')

    plt.xlabel(f"{n}-gram sequences")
    plt.ylabel('Percentage difference in counts')
    plt.title(f"Percentage difference in counts for {n}-grams across datasets")
    plt.legend()
    plt.show()

In [None]:
# One filtered percentage-error figure per n-gram length (1 through 5), sanity bound 0.008.
for i in [1, 2, 3, 4, 5]:
    perc_diff_line_plot_wo(n=i, sanity_bound=0.008)

#### 1.2.2 Results description:

Percent differences increase as n-gram length increases, similar to 1.1.2. Percent difference decreases with increasing epsilon value. Compared to 1.1.2, there are far fewer plotted points in these results because sequences with noisy counts less than 1 are removed.

#### 1.2.3. Percentage Error with Sanity Bound 3

In [None]:
# One filtered percentage-error figure per n-gram length (1 through 5), sanity bound 3.
for i in [1, 2, 3, 4, 5]:
    perc_diff_line_plot_wo(n=i, sanity_bound=3)

#### 1.2.3. Results description:

Changing the sanity bound to 3 significantly lowered percent error, similar to 1.1.3. Percent error decreases with increasing n-gram length and decreases with increasing epsilon value. For 4-grams, the percentage error was lowered, but otherwise there are no changes compared with 1.2.2.

## 2. Error Calculation per NGram type (n-size)

To analyze how the length of the gram (gram of size n) impacts the error between noisy and original n-gram counts, we calculated the error per n-gram type.

### 2.1. Taking the Average of Instance Errors

The first method we used to calculate percentage error per gram type was taking the average of the count errors for each instance (one specific n-gram).

#### 2.1.1. Sanity Bound 0.008

For types of grams having count lower than the sanity bound, we replace their counts with sanity bound, which is, by paper standards, 0.1% of the dataset size.

In [None]:
def plot_adj_avg_percentage_error(epsilons, ngrams, sanity_bound):
    """Plot the mean per-sequence percentage error against n-gram length, one line per epsilon.

    For every epsilon label and every n-gram length, the noisy frame
    ``datasets[epsilon]`` and the module-level ``og`` frame are restricted to
    that length and outer-merged on ``Seq`` (missing counts become 0).  The
    per-sequence error is
    ``100 * |Counts_og - Counts_noisy| / max(Counts_og, sanity_bound)``;
    sequences whose noisy count is below 1 are excluded from the average.

    Args:
        epsilons: keys of the module-level ``datasets`` dict to plot.
        ngrams: iterable of n-gram lengths; used as the x values of each line.
        sanity_bound: lower bound applied to the original count in the
            denominator.
    """
    plt.figure(figsize=(10, 6))

    for epsilon in epsilons:
        noisy = datasets[epsilon]
        noisy_col = 'Counts_' + epsilon
        avg_errors = []

        for n in ngrams:
            og_ngram = og[og['ngram'] == n]
            dataset_ngram = noisy[noisy['ngram'] == n]

            # Union of noisy and original sequences; missing counts become 0
            difference = pd.merge(dataset_ngram, og_ngram, on='Seq', how='outer',
                                  suffixes=('_' + epsilon, '_og')).fillna(0)

            # Vectorized per-row percentage error; replaces a Python-level
            # lambda applied row by row.
            abs_difference = np.abs(difference['Counts_og'] - difference[noisy_col])
            percentage_difference = 100 * abs_difference / np.maximum(difference['Counts_og'], sanity_bound)

            # Filter out noisy counts that are less than 1
            kept = percentage_difference[difference[noisy_col] >= 1]

            # NaN marks an (epsilon, n) pair with no remaining sequences, so
            # the line simply has a gap there.
            avg_errors.append(kept.mean() if not kept.empty else np.nan)

        plt.plot(ngrams, avg_errors, marker='o', label=epsilon)

    plt.xlabel("n-gram")
    plt.ylabel("Average Percentage Error")
    plt.title("Average Percentage Error for n-grams across Epsilons")
    plt.legend()
    plt.show()

In [None]:
# Epsilon labels (keys of `datasets`) and n-gram lengths 1-4 for the summary plot.
epsilons = [
    "eps_01",
    "eps_1",
    "eps_5",
    "eps_10",
    "eps_20",
    "eps_50",
]
ngrams = range(1, 5)
plot_adj_avg_percentage_error(epsilons, ngrams, sanity_bound=0.008)

#### 2.1.1. Results description:

You can see that with the size of the gram increasing, the percentage error increases, so the general trend is satisfactory to the common intuition we had about the results. However, the values of the errors themselves are outrageously large, which prompted us to try another sanity bound value.

#### 2.1.2. Sanity Bound 3 for All Epsilons


For our specific dataset, we figured that 0.1% of the original dataset's size might not be an appropriate sanity bound, as our dataset is much smaller than the one used in the original paper. Therefore, we took average n-gram count in the whole dataset, which was 3, and tried it as a sanity bound as well.

In [None]:
# Same summary plot with the sanity bound raised to 3; `ngrams` is reused from the earlier cell.
epsilons = [
    "eps_01",
    "eps_1",
    "eps_5",
    "eps_10",
    "eps_20",
    "eps_50",
]
plot_adj_avg_percentage_error(epsilons, ngrams, sanity_bound=3)

#### 2.1.2. Results description:


The results show that setting the sanity bound to a value more specific to our dataset was a good decision: the percentage error values are within the range we would consider appropriate by common sense, and they still follow the general trend of the error increasing with the size of the n-gram. Four-grams have errors between roughly 60% and 90%, which is quite a gap, while unigram errors are approximately 10% to 15%, with epsilon 5 being the outlier at about 60% error for uni-, bi-, and four-grams. In other words, generally, the higher the n, the higher the error percentage.

### 2.2. Taking the Sums

Another method we used to analyze the percentage errors by n-gram type is taking the sums of the counts of all grams of the same n and finding the error between those sums and their noisy versions.

In [None]:
# Epsilon values for which a noisy n-gram file is read in this section.
eps = [0.1, 1, 5, 10, 20]
path = ""
orig_name = path + "small_example-original-5grams.dat"

# One noisy-count file name per epsilon, in the same order as `eps`.
filenames = []
for epsilon in eps:
    noisy_name_epsilon = "".join(
        (path, "small_example-noisy-n_max_5-l_max_5-eps_", str(epsilon), ".dat")
    )
    filenames.append(noisy_name_epsilon)
    
def make_dict(filename, gram_type):
    """Read the counts of all n-grams of one length from a ``gram:count`` text file.

    The first two lines of the file are skipped.  Every remaining line is
    split on ``":"``; the first field is the n-gram (tokens separated by
    whitespace) and the second its count.

    Args:
        filename: path of the file to read.
        gram_type: n-gram length to keep, measured in tokens.

    Returns:
        dict mapping each kept n-gram string to its count as a float.
    """
    dct = {}
    # The context manager closes the handle; it was previously left open.
    with open(filename, 'r') as handle:
        for line in handle.readlines()[2:]:
            fields = line.strip().split(":")
            gram = fields[0]
            # Count tokens, not characters: a multi-digit token such as "10"
            # is a single gram, which matches how the rest of the analysis
            # derives the n-gram length (len(seq.split())).
            if len(gram.split()) == gram_type:
                dct[gram] = float(fields[1])
    return dct
        
def per_gram_type(orig_name, filenames, n):
    """Load the length-``n`` n-gram counts of the original file and of every noisy file.

    Args:
        orig_name: path of the original counts file.
        filenames: paths of the noisy counts files.
        n: n-gram length passed through to ``make_dict``.

    Returns:
        ``(original_frame, noisy_frames)``: a DataFrame with columns ``gram``
        and ``count`` for the original file, and a list of such frames, one
        per entry of ``filenames`` in the same order.
    """
    def _counts_frame(file_path):
        # Turn the {gram: count} dict into a two-column frame
        counts = make_dict(file_path, n)
        frame = pd.DataFrame.from_dict(counts, orient='index', columns=['count'])
        return frame.reset_index().rename(columns={'index':'gram'})

    original_frame = _counts_frame(orig_name)
    return original_frame, [_counts_frame(noisy_name) for noisy_name in filenames]
    
def sum_up(orig_name, filenames):
    """Plot, per n-gram length, the percentage error between summed noisy and summed original counts.

    For each length 1-5 the counts of all n-grams of that length are summed in
    the original file and in every noisy file, and
    ``100 * |orig_sum - noisy_sum| / orig_sum`` is stored in a table with one
    row per length and one column per epsilon.  The i-th entry of
    ``filenames`` is labelled with the i-th value of the module-level ``eps``
    list.  The table is drawn as one line per epsilon.

    Args:
        orig_name: path of the original counts file.
        filenames: paths of the noisy counts files, in the order of ``eps``.
    """
    gram_types = ['uni', 'bi', 'tri', 'quad', 'five']
    final = pd.DataFrame({'gramtype': gram_types})

    for row in range(len(gram_types)):
        df1, noisy_frames = per_gram_type(orig_name, filenames, row + 1)
        orig = df1['count'].sum()
        for epsilon, df2 in zip(eps, noisy_frames):
            noisy = df2['count'].sum()
            # With no original grams of this length the relative error is
            # undefined; store NaN instead of dividing by zero.
            final.loc[row, epsilon] = (np.abs(orig - noisy) / orig) * 100 if orig else np.nan

    # Plot exactly the epsilon columns that were filled in above rather than a
    # hard-coded copy of the epsilon list.
    epsilon_columns = [column for column in final.columns if column != 'gramtype']
    final.plot(x="gramtype", y=epsilon_columns, kind="line", style='.-', figsize=(10, 10))

In [None]:
# Draw the error-of-sums plot for the original file and all noisy files.
sum_up(orig_name=orig_name, filenames=filenames)

### 2.2. Results description:

The plots illustrate a similar trend and similar percentage error values to the other method used above (with the exception of epsilons 0.1 and 1, as those had mostly 0 values in their noisy datasets): the higher the n, the higher the percentage error. Four-grams, for a reason we are not sure about, are a low outlier in the trend, but five-grams show around 75% error compared to 5-25% error for unigrams.

## 3. Boxplots

As the noise added by the Laplace mechanism is unbounded, the noisy count for each sequence varies significantly across rounds. We therefore use Boxplot.py to generate 100 rounds of noisy counts for each epsilon value (0.1, 1, 5, 10, 20, 50) and examine the performance of the n-gram algorithm as a function of the n-gram length and the magnitude of epsilon.

In [None]:
# Directory prefix for the original file and the per-round noisy files read in this section.
path = ''
# pd.read_csv('') cannot open anything; read the original n-gram counts file
# that the other cells of this notebook load.
# NOTE(review): file name taken from those cells — confirm it is the intended input.
og = pd.read_csv(path + 'small_example-original-5grams.dat', sep = ':', dtype={'Seq': str})
# The n-gram length of a row is the number of whitespace-separated tokens in 'Seq'.
og['ngram'] = og['Seq'].apply(lambda x: len(x.split()))

### 3.1 Averaging Percentages

Given a particular epsilon and n, for each round, we 1) calculate the percentage error for each sequence, and 2) average the percentage errors of all sequences to derive avg_error. We then draw the boxplot of the average percentage errors over the 100 rounds. When we calculate the percentage error, the denominator is max(original count, sanity_bound), i.e. the original count is replaced by sanity_bound whenever it is smaller than the bound (in particular, when the original count of that sequence is 0).

#### 3.1.1 Sanity Bound 0.008

In [None]:
def plot_avg_percentage_error_boxplot(epsilon, sanity_bound):
    """Draw one boxplot per n-gram length (1-4) of the per-round average percentage error.

    Rounds 1-100 are read from ``path + epsilon + '-round_<i>.dat'``, where
    ``path`` is a module-level variable.  For every round and n-gram length
    the round is outer-merged on ``Seq`` with the module-level ``og`` frame
    (missing counts become 0), the per-sequence error
    ``100 * |Counts_og - Counts_noisy| / max(Counts_og, sanity_bound)`` is
    computed, sequences with a noisy count below 1 are dropped, and the
    remaining errors are averaged.  Rounds with nothing left for a given
    length contribute no value to that length's boxplot.

    Args:
        epsilon: file-name prefix of the rounds to load; the notebook uses
            'eps_0.1', 'eps_1', 'eps_5', 'eps_10', 'eps_20', 'eps_50'.
        sanity_bound: lower bound applied to the original count in the
            denominator.
    """
    ngrams = range(1, 5)

    # Read in all rounds for this epsilon
    rounds = []
    for i in range(1, 101):
        dataset = pd.read_csv(path + epsilon + '-round_' + str(i) + '.dat', dtype={'Seq': str}, sep=':')
        dataset['Seq'] = dataset['Seq'].astype(str)  # Convert Seq column into strings
        dataset['ngram'] = dataset['Seq'].apply(lambda x: len(str(x).split()))  # Classify n-gram length
        rounds.append(dataset)

    plt.figure(figsize=(10, 6))

    for n in ngrams:
        # The original n-grams of this length are the same for every round,
        # so filter them once per n instead of once per round.
        og_ngram = og[og['ngram'] == n]
        error_distribution = []

        for dataset in rounds:
            dataset_ngram = dataset[dataset['ngram'] == n]

            # Union of noisy and original sequences; missing counts become 0
            difference = pd.merge(dataset_ngram, og_ngram, on='Seq', how='outer',
                                  suffixes=('_noisy', '_og')).fillna(0)

            # Vectorized per-row percentage error; replaces a Python-level
            # lambda applied row by row in every one of the 400 merges.
            abs_difference = np.abs(difference['Counts_og'] - difference['Counts_noisy'])
            percentage_difference = 100 * abs_difference / np.maximum(difference['Counts_og'], sanity_bound)

            # Filter out seqs whose noisy counts are less than 1
            kept = percentage_difference[difference['Counts_noisy'] >= 1]

            # The mean is NaN when nothing is left after the filter; skip such rounds
            avg_error = kept.mean()
            if not np.isnan(avg_error):
                error_distribution.append(avg_error)

        # Plotting boxplots for each n-gram
        plt.boxplot(error_distribution, positions=[n], labels=[n])

    plt.xlabel("n-gram")
    plt.ylabel("Average Percentage Error")
    plt.title(f"Average Percentage Error per n-gram for {epsilon}")
    plt.show()

In [None]:
# File-name prefixes of the per-round noisy files, one boxplot figure each (sanity bound 0.008).
epsilons = [
    'eps_0.1',
    'eps_1',
    'eps_5',
    'eps_10',
    'eps_20',
    'eps_50',
]
for epsilon in epsilons:
    plot_avg_percentage_error_boxplot(epsilon, sanity_bound=0.008)

* We observe that as epsilon increases, the median average percentage error decreases, corresponding to greater privacy loss and decreasing magnitude of added noises.
* As length of n-gram increases, the average percentage error generally increases. This is likely because a lot of fake sequences not present in the original dataset are created and their percentage errors are huge because we divide the noisy count by a small sanity bound of 0.008.

#### 3.1.2 Sanity Bound 3

In [None]:
# File-name prefixes of the per-round noisy files, one boxplot figure each (sanity bound 3).
epsilons = [
    'eps_0.1',
    'eps_1',
    'eps_5',
    'eps_10',
    'eps_20',
    'eps_50',
]
for epsilon in epsilons:
    plot_avg_percentage_error_boxplot(epsilon, sanity_bound=3)

* As epsilon increases, the median percentage error first decreases and then plateaus after epsilon reaches 5.
* As length of n-gram increases, the median percentage error fluctuates. It seems that the median percentage error is not entirely related to the length of n-gram, and this is contrary to our expectation that it will increase while length of n-gram increases. To address this, we apply variable sanity-bound in the following section, using average count per gram per n.

#### 3.1.3 Varying sanity bound

In [None]:
# Directory prefix used by the per-round loaders defined in the cells that follow.
path = ''
# Original (noise-free) n-gram counts
og = pd.read_csv(
    'small_example-original-5grams.dat',
    sep=':',
    dtype={'Seq': str},
)
# Classify n-grams: length = number of whitespace-separated tokens in 'Seq'
og['ngram'] = og['Seq'].map(lambda seq: len(seq.split()))

In [None]:
def avg_counts(n):
    """Print and return the average original count of the n-grams of length ``n``.

    The average is the sum of ``Counts`` over the rows of the module-level
    ``og`` frame with ``ngram == n``, divided by the number of such rows.

    Args:
        n: n-gram length (value of the ``ngram`` column).

    Returns:
        The average count, or NaN when ``og`` has no n-grams of length ``n``.
        The value is returned in addition to being printed so it can be used
        directly, e.g. as a per-length sanity bound.
    """
    counts = og.loc[og['ngram'] == n, 'Counts']
    # The local no longer reuses the function's own name.
    average = np.sum(counts) / len(counts) if len(counts) else np.nan
    print(f'Average counts for {n}-gram: ', average)
    return average

In [None]:
# Report the average original count for n-gram lengths 1 through 4.
ngrams = range(1, 5)
for n in ngrams:
    avg_counts(n=n)

In [None]:
def plot_avg_percentage_error_boxplot_v(epsilon, sanity_bounds):
    """Draw one boxplot per n-gram length (1-4) of the per-round average percentage error,
    using a separate sanity bound for each length.

    Rounds 1-100 are read from ``path + epsilon + '-round_<i>.dat'``, where
    ``path`` is a module-level variable.  For every round and n-gram length
    ``n`` the round is outer-merged on ``Seq`` with the module-level ``og``
    frame (missing counts become 0), the per-sequence error
    ``100 * |Counts_og - Counts_noisy| / max(Counts_og, sanity_bounds[n-1])``
    is computed, sequences with a noisy count below 1 are dropped, and the
    remaining errors are averaged.  Rounds with nothing left for a given
    length contribute no value to that length's boxplot.

    Args:
        epsilon: file-name prefix of the rounds to load; the notebook uses
            'eps_0.1', 'eps_1', 'eps_5', 'eps_10', 'eps_20', 'eps_50'.
        sanity_bounds: sequence indexed by ``n - 1`` giving the lower bound
            applied to the original count in the denominator; needs at least
            four entries.
    """
    ngrams = range(1, 5)

    # Read in all rounds for this epsilon
    rounds = []
    for i in range(1, 101):
        dataset = pd.read_csv(path + epsilon + '-round_' + str(i) + '.dat', dtype={'Seq': str}, sep=':')
        dataset['Seq'] = dataset['Seq'].astype(str)  # Convert Seq column into strings
        dataset['ngram'] = dataset['Seq'].apply(lambda x: len(str(x).split()))  # Classify n-gram length
        rounds.append(dataset)

    plt.figure(figsize=(10, 6))

    for n in ngrams:
        # Both the original n-grams and the bound depend only on n, so look
        # them up once per n instead of once per round.
        og_ngram = og[og['ngram'] == n]
        bound = sanity_bounds[n - 1]
        error_distribution = []

        for dataset in rounds:
            dataset_ngram = dataset[dataset['ngram'] == n]

            # Union of noisy and original sequences; missing counts become 0
            difference = pd.merge(dataset_ngram, og_ngram, on='Seq', how='outer',
                                  suffixes=('_noisy', '_og')).fillna(0)

            # Vectorized per-row percentage error; replaces a Python-level
            # lambda applied row by row in every one of the 400 merges.
            abs_difference = np.abs(difference['Counts_og'] - difference['Counts_noisy'])
            percentage_difference = 100 * abs_difference / np.maximum(difference['Counts_og'], bound)

            # Filter out seqs whose noisy counts are less than 1
            kept = percentage_difference[difference['Counts_noisy'] >= 1]

            # The mean is NaN when nothing is left after the filter; skip such rounds
            avg_error = kept.mean()
            if not np.isnan(avg_error):
                error_distribution.append(avg_error)

        # Plotting boxplots for each n-gram
        plt.boxplot(error_distribution, positions=[n], labels=[n])

    plt.xlabel("n-gram")
    plt.ylabel("Average Percentage Error")
    plt.title(f"Average Percentage Error per n-gram for {epsilon}, Adjusted Sanity Bound")
    plt.show()

In [None]:
# Per-length sanity bounds for n = 1..4.
# NOTE(review): presumably the rounded averages printed by avg_counts — confirm.
sanity_bounds = [8, 3, 2, 1.6]
epsilons = [
    'eps_0.1',
    'eps_1',
    'eps_5',
    'eps_10',
    'eps_20',
    'eps_50',
]
for epsilon in epsilons:
    plot_avg_percentage_error_boxplot_v(epsilon, sanity_bounds=sanity_bounds)

* We observe that the average count per n-gram decreases as n increases
* As we use the average count per n-gram type as the sanity bound, we observe the expected average percentage error trends:
    * Median percentage error decreases as epsilon increases
    * Median percentage error increases as length of n-gram increases

### 3.2 Percentage Error of Sums

An alternative way of calculating percentage error is to take the ratio of the sum of all noisy counts and the sum of original counts for a particular n.

#### 3.2.1 Filtered noisy counts less than 1

In [None]:
def plot_avg_percentage_error_boxplot_sums(epsilon, ngrams):
    """Draw one boxplot per n-gram length of the per-round percentage error of summed counts,
    counting only noisy counts of at least 1.

    Rounds 1-100 are read from ``path + epsilon + '-round_<i>.dat'``, where
    ``path`` is a module-level variable.  For every round and n-gram length
    the value is ``100 * |og_total - noisy_total| / og_total``, where
    ``og_total`` sums ``Counts`` over the matching rows of the module-level
    ``og`` frame and ``noisy_total`` sums the round's counts that are >= 1.

    Args:
        epsilon: file-name prefix of the rounds to load; the notebook uses
            'eps_0.1', 'eps_1', 'eps_5', 'eps_10', 'eps_20', 'eps_50'.
        ngrams: iterable of n-gram lengths; one boxplot is drawn per length.
    """
    # Read in all rounds for this epsilon
    rounds = []
    for i in range(1, 101):
        dataset = pd.read_csv(path + epsilon + '-round_' + str(i) + '.dat', dtype={'Seq': str}, sep=':')
        dataset['Seq'] = dataset['Seq'].astype(str)  # Convert Seq column into strings
        dataset['ngram'] = dataset['Seq'].apply(lambda x: len(str(x).split()))  # Classify n-gram length
        rounds.append(dataset)

    plt.figure(figsize=(10, 6))

    for n in ngrams:
        # The original total depends only on n, so compute it once per n.
        og_counts_total = og.loc[og['ngram'] == n, 'Counts'].sum()
        error_distribution = []

        # A relative error against a zero total is undefined (it used to
        # produce inf/NaN values); leave the boxplot for this length empty.
        if og_counts_total != 0:
            for dataset in rounds:
                counts = dataset.loc[dataset['ngram'] == n, 'Counts']

                # Only the total of the noisy counts >= 1 is needed, so it is
                # summed directly; the former outer merge with `og` only added
                # zero-count rows that the filter removed again.
                # NOTE(review): equal to the merged sum provided 'Seq' is
                # unique within `og` — confirm.
                noisy_total = counts[counts >= 1].sum()

                abs_difference = np.abs(og_counts_total - noisy_total)
                error_distribution.append(100 * abs_difference / og_counts_total)

        # Plotting boxplots for each n-gram
        plt.boxplot(error_distribution, positions=[n], labels=[n])

    plt.xlabel("n-gram")
    plt.ylabel("Average Percentage Error")
    plt.title(f"Average Percentage Error per n-gram for {epsilon}")
    plt.show()

In [None]:
# Error-of-sums boxplots (noisy counts below 1 excluded) for every epsilon, n-gram lengths 1-4.
epsilons = [
    'eps_0.1',
    'eps_1',
    'eps_5',
    'eps_10',
    'eps_20',
    'eps_50',
]
ngrams = range(1, 5)
for epsilon in epsilons:
    plot_avg_percentage_error_boxplot_sums(epsilon, ngrams=ngrams)

#### 3.2.2 No filters

In [None]:
def plot_avg_percentage_error_boxplot_sums_2(epsilon, ngrams):
    """Draw one boxplot per n-gram length of the per-round percentage error of summed counts,
    with no filtering of the noisy counts.

    Rounds 1-100 are read from ``path + epsilon + '-round_<i>.dat'``, where
    ``path`` is a module-level variable.  For every round and n-gram length
    the value is ``100 * |og_total - noisy_total| / og_total``, where
    ``og_total`` sums ``Counts`` over the matching rows of the module-level
    ``og`` frame and ``noisy_total`` sums all of the round's counts for that
    length.

    Args:
        epsilon: file-name prefix of the rounds to load; the notebook uses
            'eps_0.1', 'eps_1', 'eps_5', 'eps_10', 'eps_20', 'eps_50'.
        ngrams: iterable of n-gram lengths; one boxplot is drawn per length.
    """
    # Read in all rounds for this epsilon
    rounds = []
    for i in range(1, 101):
        dataset = pd.read_csv(path + epsilon + '-round_' + str(i) + '.dat', dtype={'Seq': str}, sep=':')
        dataset['Seq'] = dataset['Seq'].astype(str)  # Convert Seq column into strings
        dataset['ngram'] = dataset['Seq'].apply(lambda x: len(str(x).split()))  # Classify n-gram length
        rounds.append(dataset)

    plt.figure(figsize=(10, 6))

    for n in ngrams:
        # The original total depends only on n, so compute it once per n.
        og_counts_total = og.loc[og['ngram'] == n, 'Counts'].sum()
        error_distribution = []

        # A relative error against a zero total is undefined (it used to
        # produce inf/NaN values); leave the boxplot for this length empty.
        if og_counts_total != 0:
            for dataset in rounds:
                # Only the total of the noisy counts is needed, so it is
                # summed directly; the former outer merge with `og` only added
                # rows with a noisy count of 0.
                # NOTE(review): equal to the merged sum provided 'Seq' is
                # unique within `og` — confirm.
                noisy_total = dataset.loc[dataset['ngram'] == n, 'Counts'].sum()

                abs_difference = np.abs(og_counts_total - noisy_total)
                error_distribution.append(100 * abs_difference / og_counts_total)

        # Plotting boxplots for each n-gram
        plt.boxplot(error_distribution, positions=[n], labels=[n])

    plt.xlabel("n-gram")
    plt.ylabel("Average Percentage Error")
    plt.title(f"Average Percentage Error per n-gram for {epsilon}")
    plt.show()

In [None]:
# Error-of-sums boxplots (no filtering) for every epsilon, n-gram lengths 1-4.
epsilons = [
    'eps_0.1',
    'eps_1',
    'eps_5',
    'eps_10',
    'eps_20',
    'eps_50',
]
ngrams = range(1, 5)
for epsilon in epsilons:
    plot_avg_percentage_error_boxplot_sums_2(epsilon, ngrams=ngrams)

* As epsilon increases, the median average percentage error first decreases and then plateaus once epsilon reaches 5.
* There is no clear correlation between the length of ngrams and the average percentage errors.