# Time Analysis

This notebook contains time analysis for statistical sampling based estimators vs. the proposed framework.

In [1]:
import sys
sys.path.insert(0, './../')

In [2]:
from time import perf_counter
from src.estimators import GEE, AE, UJ2A, SGD, PA, RLS
from src.visualization import time_estimate, time_fit, time_predict, time_features_labels, df_to_pdf
from src.preprocessing import TraceStats
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter

In [3]:
statistical = [GEE('GEE'), AE('AE'), UJ2A('UJ2A')]
features = ['f_1' , 'f_2', 'f_3', 'avg_pkt_len', 'syn_count']
online_ml = [SGD, PA, RLS]
online_ml_names = ['SGD', 'PA', 'RLS']
sampling_rates = [0.1, 0.1, 0.2, 0.3, 0.4, 0.5]

# load ts
tss = [TraceStats.load('./../data/time_analysis_caida-2016_100K_%.4f.pickle' % sampling_rate) for sampling_rate in sampling_rates]
tss[0].batch_count

446

### Estimation Time

In [4]:
# load df
df_estimate = pd.read_pickle('time_estimate.pickle')
df_to_pdf(df_estimate, 'time_estimate', print_index=False, digit_round=None,
          comma_separated_columns=['Mean Sample Size'], gen_latex=True)
df_estimate

Unnamed: 0,Mean Sample Size,GEE,AE,UJ2A
1,10000,6.2e-06,0.00272,4.86e-05
2,20000,7.69e-06,0.00406,6.99e-05
3,30000,9.01e-06,0.00533,8.95e-05
4,40000,1.03e-05,0.00633,0.000108
5,50000,1.14e-05,0.00744,0.000134


### Training Time

In [5]:
df_fit = pd.read_pickle('time_fit.pickle')
df_to_pdf(df_fit, 'time_fit', print_index=False, digit_round=None,
          comma_separated_columns=['Mean Sample Size'], gen_latex=True)
df_fit

Unnamed: 0,Mean Sample Size,SGD,PA,RLS
1,10000,2.38e-05,3.7e-05,2.66e-05
2,20000,2.08e-05,3.49e-05,2.66e-05
3,30000,2.09e-05,3.49e-05,3.06e-05
4,40000,2.13e-05,3.51e-05,2.65e-05
5,50000,2.82e-05,3.52e-05,2.62e-05


### Prediction Time

In [6]:
df_predict = pd.read_pickle('time_predict.pickle')
df_to_pdf(df_predict, 'time_predict', print_index=False, digit_round=None,
         comma_separated_columns=['Mean Sample Size'], gen_latex=True)
df_predict

Unnamed: 0,Mean Sample Size,SGD,PA,RLS
1,10000,8.81e-06,8.52e-06,3.55e-06
2,20000,7.68e-06,8.2e-06,3.62e-06
3,30000,8.23e-06,8.36e-06,3.7e-06
4,40000,8.15e-06,8.36e-06,3.53e-06
5,50000,8.4e-06,9.25e-06,3.91e-06


### Featurization Time

In [10]:
df_features_labels = time_features_labels(tss)
df_features_labels = df_features_labels[['Sampling Rate', 'Featurization (mean)', 'Labeling (mean)']]
df_to_pdf(df_features_labels, 'time_features_labels', print_index=False, digit_round=2,
          gen_latex=True)
df_features_labels

Unnamed: 0,Sampling Rate,Featurization (mean),Labeling (mean)
0,0.1,0.428401,4.232068
1,0.1,0.428401,4.232068
2,0.2,0.848065,4.180708
3,0.3,1.260601,4.163787
4,0.4,1.6593,4.112581
5,0.5,2.074345,4.106532
