In [3]:

import pandas as pd
import os

# Directory that holds the raw per-run measurement results.
base_directory = '../../../result/pageloadtime/'
# Directory the merged CSV files are written to.
analysis_base_directory = '../..'

# Region names; they appear in the input directory names and output file names.
regions = ['tokyo', 'frankfurt']

# Measurement patterns: every combination of "without"/"with" cache and
# "without"/"with" DANE, generated in this order:
#   without-cache-without-dane, without-cache-with-dane,
#   with-cache-without-dane,    with-cache-with-dane
patterns = [
    f'{cache_mode}-cache-{dane_mode}-dane'
    for cache_mode in ('without', 'with')
    for dane_mode in ('without', 'with')
]

print("start merging results for each measurement pattern in each region")
# the result of measurement06 contains some errors, so we exclude it
measurement_numbers = [i for i in range(1, 10) if i != 6]
for region in regions:
    for pattern in patterns:
        # Collect the per-run frames and concatenate once at the end; growing a
        # DataFrame with pd.concat inside the loop re-copies all rows each time.
        frames = []
        for i in measurement_numbers:
            run_id = str(i).zfill(2)
            file_path = os.path.join(
                base_directory,
                f'{region}-v2-{run_id}',
                f'pageloadtime-{pattern}.csv',
            )

            # Runs without a result file for this pattern are skipped.
            if not os.path.exists(file_path):
                continue

            df = pd.read_csv(file_path)
            # Tag every row with the run it came from so runs stay
            # distinguishable after merging.
            df.insert(0, 'measurementID', f'{region}-{run_id}')
            frames.append(df)

        if frames:
            combined_data = pd.concat(frames, axis=0)
        else:
            # pd.concat raises on an empty list. Keep writing an (empty) output
            # file as before, but make the missing input visible right away.
            print(f"warning: no result files found for region '{region}', pattern '{pattern}'")
            combined_data = pd.DataFrame()

        output_file = os.path.join(analysis_base_directory, f'{region}-{pattern}.csv')
        combined_data.to_csv(output_file, index=False)
print("finished merging results for each measurement pattern in each region")


print("start merging results in each region")
for region in regions:
    # Read back the per-pattern CSV files of this region. Iterating over
    # `patterns` (instead of spelling out the four file names again) keeps this
    # step in sync with the list that was used to write those files.
    pattern_frames = [
        pd.read_csv(os.path.join(analysis_base_directory, f'{region}-{pattern}.csv'))
        for pattern in patterns
    ]

    all_df = pd.concat(pattern_frames)

    all_df = all_df.sort_values(by=['measurementID', 'domain', 'cache', 'dane'])
    # export as csv
    all_df.to_csv(os.path.join(analysis_base_directory, f'{region}-all.csv'), index=False)
print("finished merging results in each region")


start merging results for each measurement pattern in each region
finished merging results for each measurement pattern in each region
start merging results in each region
finished merging results in each region


In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from matplotlib.backends.backend_pdf import PdfPages
import scienceplots
import os


# Directory containing the merged '<region>-<pattern>.csv' files read below.
analysis_base_directory = '../..'
# Region names; used to build the input CSV names and the output PDF names.
regions = ['tokyo', 'frankfurt']
# TLD filters applied to the 'domain' column; the empty string means
# "no filter" (all TLDs).
tlds = ["", 'nl', 'com', 'de', 'net', 'org', 'eu']

def convert_cache_dane_to_string_tuple(cache, dane):
    """Translate the ``cache`` / ``dane`` flags into legend label fragments.

    Args:
        cache: Truth flag; ``False`` maps to ``"w/o DNS cache"`` and ``True``
            to ``"w/ DNS cache"``.
        dane: Truth flag; ``False`` maps to ``"w/o DANE"`` and ``True`` to
            ``"w/ DANE"``.

    Returns:
        A ``(cache_label, dane_label)`` tuple of strings.

    Raises:
        ValueError: If either flag compares equal to neither ``True`` nor
            ``False``. Previously such input fell through and returned
            ``None``, which only surfaced as an unpacking ``TypeError`` in the
            caller.
    """
    labels = []
    for flag_name, flag, subject in (("cache", cache, "DNS cache"), ("dane", dane, "DANE")):
        # Compare with == rather than `is` so that boolean-like values that are
        # not the Python singletons (e.g. numpy booleans) are accepted too.
        if flag == False:  # noqa: E712
            labels.append(f"w/o {subject}")
        elif flag == True:  # noqa: E712
            labels.append(f"w/ {subject}")
        else:
            raise ValueError(f"{flag_name} must be True or False, got {flag!r}")
    return tuple(labels)


def plot_cdf_all_region_per_tlds():
    """Plot page-load-time CDFs pooled over all regions, one PDF per TLD filter.

    The merged per-pattern CSV files of every entry in ``regions`` are pooled.
    For each entry in ``tlds`` the pooled rows are optionally restricted to
    domains under that TLD (the empty string means no restriction) and one
    empirical CDF of ``pageLoadTime`` is drawn per ``(cache, dane)`` group.

    Each figure is saved as ``pageload_cdf_all_regions_<tld>.pdf`` (with
    ``all_tlds`` in place of ``<tld>`` for the unfiltered plot), relative to
    the current working directory.
    """
    pattern_names = [
        'without-cache-without-dane',
        'without-cache-with-dane',
        'with-cache-without-dane',
        'with-cache-with-dane',
    ]

    # The input files do not depend on the TLD, so read and pool them once
    # instead of re-reading every CSV for every TLD.
    pooled = pd.concat(
        [
            pd.read_csv(os.path.join(analysis_base_directory, f'{region}-{pattern}.csv'))
            for region in regions
            for pattern in pattern_names
        ],
        ignore_index=True,
    )

    # Select the style before any figure exists; a style change only affects
    # artists created afterwards, so the first figure would otherwise differ.
    plt.style.use(['science', 'no-latex'])

    for tld in tlds:
        if tld == "":
            df = pooled
        else:
            # Match on ".<tld>" so that e.g. "de" does not also select
            # domains under ".trade".
            df = pooled[pooled['domain'].str.endswith(f'.{tld}', na=False)]

        fig, ax = plt.subplots(figsize=(8, 5))
        try:
            for (cache, dane), group in df.groupby(['cache', 'dane']):
                load_times = group['pageLoadTime'].dropna()
                if load_times.empty:
                    # An ECDF is undefined for zero observations.
                    continue
                ecdf = sm.distributions.ECDF(load_times)
                cache_str, dane_str = convert_cache_dane_to_string_tuple(cache, dane)
                ax.step(
                    ecdf.x,
                    ecdf.y,
                    label=f'({cache_str}, {dane_str})',
                    # Colour encodes DANE and line style encodes the cache flag.
                    # Deriving both from the flags (not from the group's
                    # position) keeps the encoding stable if a group is absent.
                    color='dodgerblue' if dane else 'black',
                    linestyle='dashed' if cache else 'solid',
                    markevery=0.1,
                )

            ax.set_xlabel('Page Load Time (ms)', fontsize="x-large")
            ax.set_ylabel('CDF', fontsize="x-large")
            ax.grid(True)
            ax.legend(loc='lower right', fontsize="x-large", title_fontsize="x-large")

            # Save each plot to a separate PDF file.
            tld_label = 'all_tlds' if tld == "" else tld
            with PdfPages(f'pageload_cdf_all_regions_{tld_label}.pdf') as pdf_pages:
                pdf_pages.savefig(fig, dpi=1000)
        finally:
            # Always release the figure, even if plotting or saving failed.
            plt.close(fig)

def plot_cdf_per_region_per_tlds():
    """Plot log-scale page-load-time CDFs, one PDF per region and TLD filter.

    For every entry in ``regions`` the merged per-pattern CSV files are
    combined. For every entry in ``tlds`` the rows are optionally restricted
    to domains under that TLD (the empty string means no restriction) and one
    empirical CDF of ``pageLoadTime`` is drawn per ``(cache, dane)`` group on
    a logarithmic x axis.

    Each figure is saved as ``pageload_cdf_<region>_<tld>_logscale.pdf`` (with
    ``all_tlds`` in place of ``<tld>`` for the unfiltered plot), relative to
    the current working directory.
    """
    pattern_names = [
        'without-cache-without-dane',
        'without-cache-with-dane',
        'with-cache-without-dane',
        'with-cache-with-dane',
    ]

    # Select the style before any figure exists; a style change only affects
    # artists created afterwards, so the first figure would otherwise differ.
    plt.style.use(['science', 'no-latex'])

    for region in regions:
        # The input files do not depend on the TLD, so read them once per
        # region instead of once per (region, TLD) pair.
        region_df = pd.concat(
            [
                pd.read_csv(os.path.join(analysis_base_directory, f'{region}-{pattern}.csv'))
                for pattern in pattern_names
            ],
            ignore_index=True,
        )

        for filter_tld in tlds:
            if filter_tld == "":
                df = region_df
            else:
                # Match on ".<tld>" so that e.g. "de" does not also select
                # domains under ".trade".
                df = region_df[region_df['domain'].str.endswith(f'.{filter_tld}', na=False)]

            # Create a new figure for each combination of region and TLD.
            fig, ax = plt.subplots(figsize=(8, 5))
            try:
                ax.tick_params(labelsize="x-large")

                # Row order is irrelevant here: the ECDF orders the samples
                # itself, so no sorting of the frame is needed.
                for (cache, dane), group in df.groupby(['cache', 'dane']):
                    load_times = group['pageLoadTime'].dropna()
                    if load_times.empty:
                        # An ECDF is undefined for zero observations.
                        continue
                    ecdf = sm.distributions.ECDF(load_times)
                    cache_str, dane_str = convert_cache_dane_to_string_tuple(cache, dane)
                    ax.step(
                        ecdf.x,
                        ecdf.y,
                        label=f'({cache_str}, {dane_str})',
                        # Colour encodes DANE and line style encodes the cache
                        # flag. Deriving both from the flags (not from the
                        # group's position) keeps the encoding stable if a
                        # group is absent.
                        color='dodgerblue' if dane else 'black',
                        linestyle='dashed' if cache else 'solid',
                        markevery=0.1,
                    )

                ax.set_xscale('log')
                ax.set_xticks([10**exponent for exponent in range(1, 5)])  # 10^1 through 10^4
                ax.minorticks_on()
                ax.set_xlabel('Page Load Time (ms)', fontsize="x-large")
                ax.set_ylabel('CDF', fontsize="x-large")
                ax.grid(True, which='both', linestyle='--', linewidth=0.5)

                ax.legend(loc='upper left', fontsize="large", title_fontsize="large", frameon=True)

                tld_label = 'all_tlds' if filter_tld == "" else filter_tld
                with PdfPages(f'pageload_cdf_{region}_{tld_label}_logscale.pdf') as pdf_pages:
                    pdf_pages.savefig(fig, dpi=1000)
            finally:
                # Always release the figure, even if plotting or saving failed.
                plt.close(fig)

# The plots pooled over all regions are currently disabled; uncomment the next
# line to generate them as well.
# plot_cdf_all_region_per_tlds()
plot_cdf_per_region_per_tlds()


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from matplotlib.backends.backend_pdf import PdfPages
import scienceplots
import os


# Directory containing the merged '<region>-<pattern>.csv' files read below.
analysis_base_directory = '../..'
# Region names; used to build the input CSV file names.
regions = ['tokyo', 'frankfurt']
# TLD filters applied to the 'domain' column; the empty string means
# "no filter" (all TLDs).
tlds = ["", 'nl', 'com', 'de', 'net', 'org', 'eu']

def plot_cdf_all_region_per_tlds():
    """Print page-load-time summary statistics pooled over all regions.

    Despite its name, this function does not plot anything. It pools the
    merged per-pattern CSV files of every entry in ``regions`` and, for each
    entry in ``tlds`` (the empty string means no TLD restriction), prints the
    mean, median, standard deviation, minimum, maximum and the 25% / 75%
    quantiles of ``pageLoadTime`` for every ``(cache, dane)`` group.
    """
    pattern_names = [
        'without-cache-without-dane',
        'without-cache-with-dane',
        'with-cache-without-dane',
        'with-cache-with-dane',
    ]

    # The input files do not depend on the TLD, so read and pool them once
    # instead of re-reading every CSV for every TLD.
    pooled = pd.concat(
        [
            pd.read_csv(os.path.join(analysis_base_directory, f'{region}-{pattern}.csv'))
            for region in regions
            for pattern in pattern_names
        ],
        ignore_index=True,
    )

    for tld in tlds:
        if tld == "":
            df = pooled
        else:
            # Match on ".<tld>" so that e.g. "de" does not also select
            # domains under ".trade".
            df = pooled[pooled['domain'].str.endswith(f'.{tld}', na=False)]

        print(f'tld: {tld}')
        for (cache, dane), group in df.groupby(['cache', 'dane']):
            load_times = group['pageLoadTime'].dropna()

            # calculate average and so on
            average = load_times.mean()
            median = load_times.median()
            std = load_times.std()
            minimum = load_times.min()
            maximum = load_times.max()
            percentile_25 = load_times.quantile(0.25)
            percentile_75 = load_times.quantile(0.75)

            print(f'cache: {cache}, dane: {dane}')
            print(f'average: {average}')
            print(f'median: {median}')
            print(f'std: {std}')
            print(f'minimum: {minimum}')
            print(f'maximum: {maximum}')
            print(f'25%: {percentile_25}')
            print(f'75%: {percentile_75}')
            print()


def show_value_per_region_per_tlds():
    """Print page-load-time summary statistics per region and TLD filter.

    For every entry in ``regions`` the merged per-pattern CSV files are
    combined. For every entry in ``tlds`` (the empty string means no TLD
    restriction) the mean, median, standard deviation, minimum, maximum and
    the 25% / 75% quantiles of ``pageLoadTime`` are printed for every
    ``(cache, dane)`` group.
    """
    pattern_names = [
        'without-cache-without-dane',
        'without-cache-with-dane',
        'with-cache-without-dane',
        'with-cache-with-dane',
    ]

    for region in regions:
        # The input files do not depend on the TLD, so read them once per
        # region instead of once per (region, TLD) pair.
        region_df = pd.concat(
            [
                pd.read_csv(os.path.join(analysis_base_directory, f'{region}-{pattern}.csv'))
                for pattern in pattern_names
            ],
            ignore_index=True,
        )

        for filter_tld in tlds:
            if filter_tld == "":
                df = region_df
            else:
                # Match on ".<tld>" so that e.g. "de" does not also select
                # domains under ".trade".
                df = region_df[region_df['domain'].str.endswith(f'.{filter_tld}', na=False)]

            # Keep the original row order (sorted by domain, fresh 0..n-1
            # index) so the floating-point aggregates are accumulated in the
            # same order as before.
            df = df.sort_values(by=['domain']).reset_index(drop=True)

            for (cache, dane), group in df.groupby(['cache', 'dane']):
                load_times = group['pageLoadTime'].dropna()
                average = load_times.mean()
                median = load_times.median()
                std = load_times.std()
                minimum = load_times.min()
                maximum = load_times.max()
                percentile_25 = load_times.quantile(0.25)
                percentile_75 = load_times.quantile(0.75)

                print(f'region: {region}, tld: {filter_tld}, cache: {cache}, dane: {dane}')
                print(f'average: {average}')
                print(f'median: {median}')
                print(f'std: {std}')
                print(f'minimum: {minimum}')
                print(f'maximum: {maximum}')
                print(f'25%: {percentile_25}')
                print(f'75%: {percentile_75}')
                print()


# Despite its name, plot_cdf_all_region_per_tlds() as defined in this cell
# prints statistics pooled over all regions; it is currently disabled.
# plot_cdf_all_region_per_tlds()
show_value_per_region_per_tlds()


region: tokyo, tld: , cache: False, dane: False
average: 5875.782004261413
median: 5220.0
std: 3412.730147800281
minimum: 149.0
maximum: 29866.0
25%: 3533.0
75%: 7525.0

region: tokyo, tld: , cache: False, dane: True
average: 6191.693190149653
median: 5605.0
std: 3540.331213896056
minimum: 165.0
maximum: 29866.0
25%: 3697.5
75%: 8082.5

region: tokyo, tld: , cache: True, dane: False
average: 4064.4959131856285
median: 3561.0
std: 2882.951191269836
minimum: 66.0
maximum: 29808.0
25%: 2136.5
75%: 5193.0

region: tokyo, tld: , cache: True, dane: True
average: 5403.891682038523
median: 4886.0
std: 3176.987079306513
minimum: 72.0
maximum: 29774.0
25%: 3135.0
75%: 7131.5

region: tokyo, tld: nl, cache: False, dane: False
average: 6376.2740211489
median: 5847.0
std: 3285.829927971979
minimum: 230.0
maximum: 29853.0
25%: 4160.0
75%: 7987.0

region: tokyo, tld: nl, cache: False, dane: True
average: 6850.1777108433735
median: 6420.0
std: 3453.3598858859777
minimum: 259.0
maximum: 29866.0
25%: 44