# Benchmarks of Strategies for Selecting Outdated Items
This notebook contains the benchmarks related to the selection strategies for context data, which we report in our paper.
Context data are selected from the processed data and included in the next progressive computation step, such that its result approximates that of a _non-progressive_ computation over the processed data.

## Benchmark Configuration

We use the following configuration in our benchmarks:
### Test cases 
- full computation over the entire dataset (upper baseline)
- progressive computation without optimization (lower baseline)
- full computation of processed data
- progressive computation using optimization strategies

### Dataset
- NYC taxis dataset (10 Million items), stored in a compressed CSV file, loaded with DuckDB 

### Variables
- dependent variables: runtime, prediction error
- independent variables: 

## Setup

Configure the path to be able to import the local modules.

In [None]:
import os
from sys import path
# Append the parent of the current working directory to the import path so
# that the local modules can be imported from the notebook.
cwd = os.getcwd()
path.append(cwd + "/..")

Define some constant values for the data.

In [None]:
from database import ID, initialize_db, drop_tables, get_next_chunk_from_db, save_dois, get_from_doi
import numpy as np
import pandas as pd
import time

# Benchmark configuration.
n_dims = 17  # handed to every selection strategy as `n_dims`
total_items = 99999
chunk_size = 1000
# Ceiling division: a trailing partial chunk still counts as a chunk.
# `round()` would silently drop it whenever the remainder is smaller than
# half a chunk.
chunks = (total_items + chunk_size - 1) // chunk_size

def reset():
  """Reset the database to a fresh state.

  Calls `drop_tables()` and then `initialize_db()` with the sampled and
  shuffled NYC taxi CSV file.
  """
  dataset_path = "../data/nyc_taxis_sampled100k_shuffled.csv.gz"
  drop_tables()
  initialize_db(dataset_path)

## Benchmarks

The DOI function

In [None]:
from doi_component.outlierness_component import OutliernessComponent

# DOI function used by the benchmarks: an OutliernessComponent built from
# "ratio" and "duration" (presumably the columns it scores — TODO confirm).
outlierness = OutliernessComponent(["ratio", "duration"])

Baseline: Chunk-based computation without any optimizations

In [None]:
reset()

# Lower baseline: compute the DOI chunk by chunk, each chunk on its own.
# perf_counter() is monotonic and high-resolution, so it is the right clock
# for measuring durations.
start = time.perf_counter()
doi_per_chunk = []
for i in range(chunks):
  chunk = get_next_chunk_from_db(chunk_size, as_df=True)
  doi = outlierness.compute_doi(chunk)
  doi_per_chunk.append(pd.DataFrame(doi))

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration; concatenate the per-chunk results once instead.
lower_bound_result = pd.concat(doi_per_chunk) if doi_per_chunk else pd.DataFrame([])

time_lower = time.perf_counter() - start

print(f"# lower bound: {time_lower}")
lower_bound_result

In [None]:
reset()

# Upper baseline: fetch all items in a single request and compute the DOI
# over them at once.
# perf_counter() is monotonic and high-resolution, unlike time.time(), which
# can jump when the system clock is adjusted.
start = time.perf_counter()
data = get_next_chunk_from_db(chunk_size * chunks, as_df=True)
upper_bound_result = outlierness.compute_doi(data)
time_upper = time.perf_counter() - start

print(f"# upper bound: {time_upper}")
upper_bound_result

### Update selection strategies

In [None]:
from outdated_item_selection_strategy.no_update import *
from outdated_item_selection_strategy.oldest_chunks_update import *
from outdated_item_selection_strategy.last_n_chunks_update import *
from outdated_item_selection_strategy.regular_interval_update import *
from outdated_item_selection_strategy.outdated_bin_update import *

# (label, strategy instance) pairs for the outdated-item selection strategies
# under test. Every strategy receives the same `n_dims`.
update_strategies = [
  ("no chunk", NoUpdate(n_dims=n_dims)),
  (
    "oldest n chunks",
    OldestChunksUpdate(n_dims=n_dims, n_chunks=3, max_age=10),
  ),
  (
    "last n chunks",
    LastNChunksUpdate(n_dims=n_dims, n_chunks=3),
  ),
  (
    "regular intervals",
    RegularIntervalUpdate(n_dims=n_dims, interval=2, max_age=10),
  ),
  ("outdated bins", OutdatedBinUpdate(n_dims=n_dims)),
]

In [None]:
import os
from sys import path
# Append the parent of the current working directory to the import path so
# that the local modules can be imported from the notebook.
cwd = os.getcwd()
path.append(cwd + "/..")

from database import ID, initialize_db, drop_tables, get_next_chunk_from_db, save_dois, get_from_doi
import numpy as np
import pandas as pd
import time

# Benchmark configuration.
n_dims = 17  # handed to every selection strategy as `n_dims`
total_items = 99999
chunk_size = 1000
# Ceiling division: a trailing partial chunk still counts as a chunk.
# `round()` would silently drop it whenever the remainder is smaller than
# half a chunk.
chunks = (total_items + chunk_size - 1) // chunk_size

def reset():
  """Reset the database to a fresh state.

  Calls `drop_tables()` and then `initialize_db()` with the sampled and
  shuffled NYC taxi CSV file.
  """
  dataset_path = "../data/nyc_taxis_sampled100k_shuffled.csv.gz"
  drop_tables()
  initialize_db(dataset_path)

from doi_component.outlierness_component import OutliernessComponent

# DOI function used by the benchmarks: an OutliernessComponent built from
# "ratio" and "duration" (presumably the columns it scores — TODO confirm).
outlierness = OutliernessComponent(["ratio", "duration"])

from outdated_item_selection_strategy.no_update import *
from outdated_item_selection_strategy.oldest_chunks_update import *
from outdated_item_selection_strategy.last_n_chunks_update import *
from outdated_item_selection_strategy.regular_interval_update import *
from outdated_item_selection_strategy.outdated_bin_update import *

# (label, strategy instance) pairs for the outdated-item selection strategies
# under test. Every strategy receives the same `n_dims`.
update_strategies = [
  ("no chunk", NoUpdate(n_dims=n_dims)),
  (
    "oldest n chunks",
    OldestChunksUpdate(n_dims=n_dims, n_chunks=3, max_age=10),
  ),
  (
    "last n chunks",
    LastNChunksUpdate(n_dims=n_dims, n_chunks=3),
  ),
  (
    "regular intervals",
    RegularIntervalUpdate(n_dims=n_dims, interval=2, max_age=10),
  ),
  ("outdated bins", OutdatedBinUpdate(n_dims=n_dims)),
]

#########################################################################################################
#########################################################################################################

from database import ID,process_chunk, update_dois, save_dois
reset()
# Progressive computation with one outdated-item selection strategy: each step
# computes the DOI of the new chunk and then recomputes the DOI of the items
# that the strategy reports as outdated.
# perf_counter() is monotonic and high-resolution, so it is the right clock
# for measuring durations.
start = time.perf_counter()

strategy = update_strategies[1]
print("strategy:", strategy[0])

# Columns of the per-step timing log printed below:
#   chunk             index of the step
#   chunk_time        fetch the next chunk and ask the strategy for outdated items
#   outdated_time     run process_chunk() on the outdated items
#   new_doi_time      compute the DOI over the new chunk
#   old_doi_time      compute the DOI over chunk + outdated items
#   store_new_time    save the DOI values of the new items
#   update_dois_time  write back the recomputed DOI values of the outdated items
#   step_time         total time of the step
print("chunk, chunk_time, outdated_time, new_doi_time, old_doi_time, store_new_time, update_dois_time, step_time")
for i in range(chunks):
  step_start = time.perf_counter()

  before = time.perf_counter()
  chunk = get_next_chunk_from_db(chunk_size, as_df=True)
  outdated = strategy[1].get_outdated_items(i)
  chunk_time = time.perf_counter() - before

  before = time.perf_counter()
  outdated = process_chunk(pd.DataFrame(outdated))
  outdated_time = time.perf_counter() - before

  new_ids = chunk[ID].to_list()
  old_ids = outdated[ID].to_list()

  before = time.perf_counter()
  new_doi = outlierness.compute_doi(chunk)
  new_doi_time = time.perf_counter() - before

  before = time.perf_counter()
  save_dois(new_ids, new_doi, np.zeros_like(new_doi))
  store_new_time = time.perf_counter() - before

  before = time.perf_counter()
  # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
  # The outdated rows follow the chunk rows, so their DOI values are the tail
  # of the result.
  old_doi = outlierness.compute_doi(pd.concat([chunk, outdated]))[len(chunk):]
  old_doi_time = time.perf_counter() - before

  before = time.perf_counter()
  update_dois(old_ids, old_doi)
  update_dois_time = time.perf_counter() - before

  step_time = time.perf_counter() - step_start

  print(i, chunk_time, outdated_time, new_doi_time, old_doi_time, store_new_time, update_dois_time, step_time)

time_strat = time.perf_counter() - start
start_result = get_from_doi(["TRUE"], as_df=True)

print(f"# using {strategy[0]} strategy: {time_strat}")
start_result

### Context selection strategies

In [None]:
from context_item_selection_strategy.chunk_based_context import *
from context_item_selection_strategy.sampling_based_context import *
from context_item_selection_strategy.clustering_based_context import *
from context_item_selection_strategy.no_context import * 

# (label, strategy instance) pairs for the context-item selection strategies
# under test. Every strategy receives the same `n_dims`.
context_strategies = [
  ("no context", NoContext(n_dims=n_dims)),
  (
    "chunk based",
    RandomChunkBasedContext(n_dims=n_dims, n_chunks=3),
  ),
  (
    "sampling based",
    RandomSamplingBasedContext(n_dims=n_dims, n_samples=chunk_size),
  ),
  (
    "clustering based",
    ClusteringBasedContext(n_dims=n_dims, n_clusters=chunk_size),
  ),
]

In [None]:
current_chunk = chunks
# NOTE(review): `context_size` is not used in this cell — confirm whether it is
# still needed elsewhere.
context_size = chunk_size

# Ask every context strategy for its context items at `current_chunk` and
# report how many it returned and how long the selection took.
# The enumerate() index was never used, so the loop iterates the pairs directly.
for strategy in context_strategies:
  # perf_counter() is monotonic and high-resolution, so it is the right clock
  # for measuring durations.
  start = time.perf_counter()
  print("#", strategy[0])
  context_items = strategy[1].get_context_items(current_chunk)
  print(f"found {len(context_items)} context items:")
  print(context_items)
  print(time.perf_counter() - start)
  print("\n")

In [None]:
from database import ID, process_chunk
reset()
# Progressive computation with a context strategy: each chunk is extended by
# the context items selected by the strategy before the DOI is computed, and
# only the DOI values of the new items are stored.
# perf_counter() is monotonic and high-resolution, so it is the right clock
# for measuring durations.
start = time.perf_counter()

# The original referenced `strategies`, which is not defined in any visible
# cell (NameError); the context strategies live in `context_strategies`.
strategy = context_strategies[1]

for i in range(chunks):
  chunk = get_next_chunk_from_db(chunk_size, as_df=True)
  new_entries = len(chunk)
  context = strategy[1].get_context_items(i)
  context = process_chunk(context)
  # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
  chunk = pd.concat([chunk, context])
  doi = outlierness.compute_doi(chunk)
  # The new items are the first `new_entries` rows; the context rows follow.
  new_ids = chunk[ID][:new_entries].to_list()
  new_dois = doi[:new_entries]
  save_dois(new_ids, new_dois, np.zeros_like(new_dois))

time_strat = time.perf_counter() - start
start_result = get_from_doi(["TRUE"], as_df=True)

print(f"# using {strategy[0]} strategy: {time_strat}")
start_result

In [1]:
import os
from sys import path
# Append the parent of the current working directory to the import path so
# that the local modules can be imported from the notebook.
cwd = os.getcwd()
path.append(cwd + "/..")

from database import ID, DOI, initialize_db, drop_tables, get_next_chunk_from_db, save_dois, get_from_doi
from benchmark_test_case import *
import numpy as np
import pandas as pd
import time

# Benchmark configuration.
n_dims = 17  # handed to every selection strategy as `n_dims`
# total_items = 99999
# NOTE(review): reset() loads a file named "...sampled100k...", while this value
# is ~1M items — confirm that the data file matches this configuration.
total_items = 999999
chunk_size = 1000
# Ceiling division: a trailing partial chunk still counts as a chunk.
# `round()` would silently drop it whenever the remainder is smaller than
# half a chunk.
chunks = (total_items + chunk_size - 1) // chunk_size

def reset():
  """Reset the database to a fresh state.

  Calls `drop_tables()` and then `initialize_db()` with the sampled and
  shuffled NYC taxi CSV file.
  """
  dataset_path = "../data/nyc_taxis_sampled100k_shuffled.csv.gz"
  drop_tables()
  initialize_db(dataset_path)

from doi_component.outlierness_component import OutliernessComponent

# DOI function used by the benchmarks: an OutliernessComponent built from
# "ratio" and "duration" (presumably the columns it scores — TODO confirm).
outlierness = OutliernessComponent(["ratio", "duration"])

from outdated_item_selection_strategy.no_update import *
from outdated_item_selection_strategy.oldest_chunks_update import *
from outdated_item_selection_strategy.last_n_chunks_update import *
from outdated_item_selection_strategy.regular_interval_update import *
from outdated_item_selection_strategy.outdated_bin_update import *

# (label, strategy instance) pairs for the outdated-item selection strategies
# under test. Every strategy receives the same `n_dims`.
update_strategies = [
  ("no chunk", NoUpdate(n_dims=n_dims)),
  (
    "oldest n chunks",
    OldestChunksUpdate(n_dims=n_dims, n_chunks=3, max_age=10),
  ),
  (
    "last n chunks",
    LastNChunksUpdate(n_dims=n_dims, n_chunks=3),
  ),
  (
    "regular intervals",
    RegularIntervalUpdate(n_dims=n_dims, interval=2, max_age=10),
  ),
  ("outdated bins", OutdatedBinUpdate(n_dims=n_dims)),
]

from context_item_selection_strategy.no_context import *

#########################################################################################################
#########################################################################################################

from database import ID,process_chunk, update_dois, save_dois

# Run one BenchmarkTestCase per update strategy, each on a freshly reset
# database and without any context items (NoContext).
# NOTE(review): nothing is ever added to `charts`, so the cell displays an empty
# list — confirm whether the result of `run()` was meant to be collected here.
charts = []
for strategy in update_strategies:
  label, update_strategy = strategy
  reset()
  print("strategy:", label)
  test_case = BenchmarkTestCase(
    label,
    outlierness,
    NoContext(n_dims),
    update_strategy,
    chunk_size,
    chunks,
  )
  test_case.run()

charts

strategy: no chunk
strategy: oldest n chunks
strategy: last n chunks
strategy: regular intervals
strategy: outdated bins


### Configuration

In [None]:
# Benchmark configuration.
n_dims = 17  # handed to every selection strategy as `n_dims`
total_items = 99999
chunk_size = 1000
# Ceiling division: a trailing partial chunk still counts as a chunk.
# `round()` would silently drop it whenever the remainder is smaller than
# half a chunk.
chunks = (total_items + chunk_size - 1) // chunk_size
# DOI function used by the test cases below: an OutliernessComponent built from
# "ratio" and "duration" (presumably the columns it scores — TODO confirm).
outlierness = OutliernessComponent(["ratio", "duration"])

In [None]:
# Full computation: the last two arguments are presumably (chunk size, number
# of chunks), i.e. one chunk holding all items — TODO confirm.
full_test_case = MonolithicComputationTestCase(
  "full computation",
  outlierness,
  chunk_size * chunks,
  1,
)
results_full = full_test_case.run()

In [None]:
from context_item_selection_strategy.chunk_based_context import *
# Progressive computation that uses a MostRecentChunkBasedContext configured
# with n_chunks=3 as its context strategy.
chunk_context = MostRecentChunkBasedContext(n_dims=n_dims, n_chunks=3)
context_test_case = ContextStrategyTestCase(
  "context",
  chunk_context,
  outlierness,
  chunk_size,
  chunks,
)
results_context = context_test_case.run()

In [None]:
# Chunk-wise computation without context: same test case class as the full
# computation, but configured with `chunk_size` and `chunks`.
chunked_test_case = MonolithicComputationTestCase(
  "only chunks",
  outlierness,
  chunk_size,
  chunks,
)
results_chunked = chunked_test_case.run()

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import jaccard_score, r2_score

# NOTE(review): `rus` is not referenced in any cell visible here — confirm
# whether the under-sampler is still needed.
rus = RandomUnderSampler(random_state=0)

def evaluate_test_case(test_case: np.ndarray, ground_truth: np.ndarray):
  """Return the R² score of `test_case` measured against `ground_truth`.

  `ground_truth` is passed to `r2_score` as the first argument and
  `test_case` as the second.
  """
  # Alternative metric (disabled):
  # jaccard_score(test_case, ground_truth, average="weighted")
  return r2_score(ground_truth, test_case)

# The "doi" entry of the full computation serves as the reference.
ground_truth = results_full["doi"]
# NOTE(review): this rebinds `context_test_case`, which an earlier cell uses for
# the ContextStrategyTestCase object — re-running that cell's `.run()` after this
# one would fail; confirm whether a distinct name is wanted.
context_test_case = results_context["doi"]
baseline_test_case = results_chunked["doi"]

# Score the chunk-only baseline first, then the context-based run.
baseline_score = evaluate_test_case(baseline_test_case, ground_truth)
context_score = evaluate_test_case(context_test_case, ground_truth)
baseline_score, context_score