In [1]:
import json
import os
import sys

import numpy as np
import pandas as pd
from pandas.errors import EmptyDataError
from scipy.stats import gamma

In [2]:
# --- Input locations (relative paths) ---
# Directory holding the '<NN>-latencies-hvnet-preprocessed.csv' files.
HVNET_LATENCY_DATA_DIR = '../latency-data/hvnet/'
# Directory holding the 'nw-<i>.json' topology configs.
TOPO_CONFIG_DIR = '../network-configs'

# --- Processing scope ---
# Topology indices 0 .. NUM_TOPOS - 1 are iterated.
NUM_TOPOS = 100
# When False, topologies whose per-topology CSV already exists are skipped.
CLEAN_RUN = False

# --- Sampling ---
# Number of inter-send-time samples drawn per rate.
MAX_SAMPLES_PER_RATE = 10_000
# Passed as random_state to the sampling calls.
SEED = 1234

# --- Output ---
# Combined CSV (all topologies) that the gamma-fit step reads back.
IST_RATE_CSV = 'inter-send-times-rates.csv'

In [3]:
# Step 1 (per topology)
# * load latency csv and config with rates per flow
# * add flow column to latency df, drop unnecessary columns
# * per flow, calculate inter-send times using send_time.diff()
# * drop NaN values
# * merge latency df with flow rates
# * per rate, sample to x inter-send times
# * export csv with the following columns: rate, inter_send_time

def load_flow_config(i):
    """Return the flow table of topology ``i`` as a DataFrame.

    Reads ``nw-{i}.json`` from ``TOPO_CONFIG_DIR`` and builds the frame from
    the config's ``'flow'`` entry. The ``rate`` column is renamed to
    ``rate_kbit_s`` and cast to an integer dtype.
    """
    config_path = os.path.join(TOPO_CONFIG_DIR, f'nw-{i}.json')
    with open(config_path, 'r') as config_file:
        flows = json.load(config_file)['flow']

    flow_df = pd.DataFrame(flows)
    flow_df.rename(columns={'rate': 'rate_kbit_s'}, inplace=True)
    flow_df.rate_kbit_s = flow_df.rate_kbit_s.astype('int')
    return flow_df

def load_hvnet_latencies(i, data_dir=None):
    """Load the preprocessed HVNet latency CSV of topology ``i``.

    Parameters
    ----------
    i : int
        Topology index; the file name is
        ``{i:02d}-latencies-hvnet-preprocessed.csv``.
    data_dir : str, optional
        Directory to read from. Defaults to ``HVNET_LATENCY_DATA_DIR``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents without the ``latency_us`` and ``recv_time_s``
        columns, or an empty DataFrame when the file is missing or contains
        no data at all.
    """
    if data_dir is None:
        data_dir = HVNET_LATENCY_DATA_DIR
    csv_path = os.path.join(data_dir, f'{i:02d}-latencies-hvnet-preprocessed.csv')

    try:
        df = pd.read_csv(csv_path)
    except (FileNotFoundError, EmptyDataError):
        # A zero-byte file makes read_csv raise EmptyDataError; treat it like
        # a missing file so the caller's "empty, skipping" path handles it.
        return pd.DataFrame()

    # Only the remaining columns are used downstream; drop without mutating in place.
    return df.drop(columns=['latency_us', 'recv_time_s'])

# Process all topologies: per topology, compute the inter-send times of every
# flow, tag each value with the flow's rate, sample per rate and export a CSV.
df_list_ist_rate_all_topos = []
for i in range(NUM_TOPOS):
    IST_RATE_FILE_TOPO = f'inter-send-times-rates-{i:02d}.csv'
    if not CLEAN_RUN and os.path.exists(IST_RATE_FILE_TOPO):
        print(f'topo {i:02d} already processed, skipping.')
        continue

    print(f'topo {i:02d}: loading input data...', end='')

    # Load flow data and HVNet latencies
    df_flows = load_flow_config(i)
    df_latency = load_hvnet_latencies(i)

    if len(df_latency) == 0:
        print('empty, skipping.')
        continue

    # Per flow, calculate inter-send times and record the flow rate once per value
    ist_series_per_flow = []
    rates_per_flow = []
    print('processing flows...', end='')
    for f in df_latency.flow_num.unique():
        # Flows are named 'f<flow_num>' in the topology config.
        flow_rate = df_flows[df_flows.name == f'f{f}'].iloc[0].rate_kbit_s * 1e3 / 8  # scale to byte per second

        # Sort by send time so diff() yields the gap between consecutive sends;
        # the first diff is NaN and is dropped.
        inter_send_times_f = df_latency[df_latency.flow_num == f].send_time_s.sort_values().diff().dropna()
        ist_series_per_flow.append(inter_send_times_f)
        rates_per_flow.extend([flow_rate] * len(inter_send_times_f))

    print('analysis...', end='')
    ist_series_all = pd.concat(ist_series_per_flow, ignore_index=True)
    df_ist_rate = pd.DataFrame({
        'inter_send_time_s': ist_series_all,
        'rate_b_s': rates_per_flow
    })

    # Per rate, sample down to at most MAX_SAMPLES_PER_RATE rows and save the
    # result to csv. Capping n at the group size keeps rates with fewer rows
    # (they are kept in full) instead of raising "Cannot take a larger sample
    # than population".
    # One RandomState seeded with SEED is shared across the groups so the whole
    # step is deterministic.
    # NOTE(review): intended to select the same rows as
    # groupby('rate_b_s').sample(n=..., random_state=SEED) whenever every rate
    # has enough rows - confirm against an existing per-topology CSV.
    sampling_rng = np.random.RandomState(SEED)
    sampled_per_rate = [
        df_rate.sample(n=min(len(df_rate), MAX_SAMPLES_PER_RATE), random_state=sampling_rng)
        for _, df_rate in df_ist_rate.groupby('rate_b_s')
    ]
    # No groups means no inter-send times at all; export the (empty) frame as is.
    df_ist_rate_sampled = pd.concat(sampled_per_rate) if sampled_per_rate else df_ist_rate
    df_list_ist_rate_all_topos.append(df_ist_rate_sampled)
    df_ist_rate_sampled.to_csv(IST_RATE_FILE_TOPO, index=False)
    print('done.')

topo 00: loading input data...processing flows...analysis...done.
topo 01: loading input data...processing flows...analysis...done.
topo 02: loading input data...processing flows...analysis...done.
topo 03: loading input data...processing flows...analysis...done.
topo 04: loading input data...processing flows...analysis...done.
topo 05: loading input data...processing flows...analysis...done.
topo 05: loading input data...processing flows...analysis...done.
topo 06: loading input data...processing flows...analysis...done.
topo 07: loading input data...processing flows...analysis...done.
topo 08: loading input data...processing flows...analysis...done.
topo 09: loading input data...processing flows...analysis...done.
topo 10: loading input data...processing flows...analysis...done.
topo 11: loading input data...processing flows...analysis...done.
topo 12: loading input data...processing flows...analysis...done.
topo 13: loading input data...processing flows...analysis...done.
topo 14: l

In [4]:
# Combine the per-topology ist_rate CSVs, sample per rate, and export as csv
df_list_ist_rate_all_topos = []
for i in range(NUM_TOPOS):
    try:
        df_list_ist_rate_all_topos.append(pd.read_csv(f'inter-send-times-rates-{i:02d}.csv'))
    except (FileNotFoundError, EmptyDataError):
        # Topologies without (usable) output from the previous step are skipped.
        print(f'no data for topo {i:02d}, skipping.')

df_ist_rate_all_topos = pd.concat(df_list_ist_rate_all_topos, ignore_index=True, copy=False)

# Per rate, keep at most MAX_SAMPLES_PER_RATE rows. Capping n at the group size
# keeps rates with fewer rows in full instead of raising "Cannot take a larger
# sample than population".
# One RandomState seeded with SEED is shared across the groups so the step is
# deterministic.
# NOTE(review): intended to select the same rows as
# groupby('rate_b_s').sample(n=..., random_state=SEED) whenever every rate has
# enough rows - confirm against an existing combined CSV.
sampling_rng = np.random.RandomState(SEED)
sampled_per_rate = [
    df_rate.sample(n=min(len(df_rate), MAX_SAMPLES_PER_RATE), random_state=sampling_rng)
    for _, df_rate in df_ist_rate_all_topos.groupby('rate_b_s')
]
if sampled_per_rate:
    df_ist_rate_all_topos = pd.concat(sampled_per_rate)
df_ist_rate_all_topos.to_csv(IST_RATE_CSV, index=False)

no data for topo 33, skipping.
no data for topo 78, skipping.
no data for topo 79, skipping.


In [5]:
# Step 2
# * load the combined csv (IST_RATE_CSV)
# * per rate, fit a gamma distribution with scipy.stats.gamma.fit(), loc fixed to 0
# * collect the fitted shape and scale for each rate
# * write the rate -> {shape, scale} mapping to gamma-params.json

df_ist_rate = pd.read_csv(IST_RATE_CSV)

print('processing rates...', end='')
rate_mapping = {}
unique_rates = df_ist_rate.rate_b_s.unique()
num_rates = len(unique_rates)
for i, r in enumerate(unique_rates):
    # Overwrite the same output line with the current progress.
    progress_pct = int((i/num_rates)*100)
    print(f'\rprocessing rates...{progress_pct:03d}%', end='')

    # Inter-send times that belong to the current rate.
    is_rate_r = df_ist_rate.rate_b_s == r
    ists_r = df_ist_rate.loc[is_rate_r, 'inter_send_time_s']

    # fit() returns (shape, loc, scale); loc is pinned to 0 and not stored.
    shape_r, _, scale_r = gamma.fit(ists_r, floc=0)
    rate_mapping[r] = dict(shape=shape_r, scale=scale_r)
print('\rprocessing rates...100%')

print('writing to file...', end='')
with open('gamma-params.json', 'w') as f:
    json.dump(rate_mapping, f)
print('done.')

processing rates...100%
writing to file...done.


In [6]:
# Step 3
# * import mapping to OMNeT++ converter
# * apply correct values according to flow rate