# Benchmarks of Strategies for Selecting Outdated Items
This notebook contains the benchmarks of the selection strategies for outdated items and context data, which we report in our paper.
Context data are selected from the already processed data and included in the next progressive computation step, so that the result of that step approximates that of a _non-progressive_ computation over the processed data.

## Benchmark Configuration

We use the following configuration in our benchmarks:
### Test cases 
- full computation over the entire dataset (upper baseline)
- progressive computation without optimization (lower baseline)
- full computation of processed data
- progressive computation using optimization strategies

### Dataset
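- NYC taxis dataset (10 million items), stored in a compressed CSV file and loaded with DuckDB; the cells below load a shuffled 100k-item sample of it (`nyc_taxis_sampled100k_shuffled.csv.gz`)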

### Variables
- dependent variables: runtime, prediction error
- independent variables: 

## Setup

Configure the path to be able to import the local modules.

In [1]:
import os
from sys import path
# Put the parent of the current working directory on the module search path so
# that the local modules can be imported by the cells below.
cwd = os.getcwd()
parent_dir = f"{cwd}/.."
# Re-running this cell must not pile up duplicate entries on sys.path.
if parent_dir not in path:
  path.append(parent_dir)

Define some constant values for the data.

In [39]:
from database import ID, initialize_db, drop_tables, get_next_chunk_from_db, save_dois, get_from_doi
import numpy as np
import pandas as pd
import time

# Value passed as `n_dims` to every selection strategy created below.
n_dims = 17
# Number of items the benchmarks are meant to cover.
total_items = 99999
# Number of items requested from the database per progressive step.
chunk_size = 1000
# Ceiling division: a trailing partial chunk still needs its own step. round()
# would silently drop it whenever the remainder is less than half a chunk.
chunks = -(-total_items // chunk_size)

def reset(csv_path="../data/nyc_taxis_sampled100k_shuffled.csv.gz"):
  """Bring the database back to its initial state before a benchmark run.

  Calls `drop_tables()` and then `initialize_db(csv_path)`, both imported from
  the local `database` module.

  Args:
    csv_path: Path handed to `initialize_db`. Defaults to the sampled NYC taxis
      file used throughout this notebook, so `reset()` behaves as before.
  """
  drop_tables()
  initialize_db(csv_path)

## Benchmarks

Configure the computation ("outlierness").

In [6]:
from doi_component.outlierness_component import OutliernessComponent

# Arguments of the outlierness computation used by all benchmarks below
# (presumably the names of the columns it evaluates; TODO confirm).
outlierness_columns = ["ratio", "duration"]
outlierness = OutliernessComponent(outlierness_columns)

Run the baseline computations.

In [6]:
reset()

# Lower baseline: every chunk is scored on its own, without any items from the
# chunks that were processed before it.
chunk_results = []
start = time.time()
for i in range(chunks):
  chunk = get_next_chunk_from_db(chunk_size, as_df=True)
  doi = outlierness.compute_doi(chunk)
  chunk_results.append(pd.DataFrame(doi))

# Concatenate once instead of growing a DataFrame inside the loop:
# DataFrame.append copied the accumulated frame on every call and was removed
# in pandas 2.0. The per-chunk index is kept as-is, just like append did.
lower_bound_result = pd.concat(chunk_results) if chunk_results else pd.DataFrame([])

time_lower = time.time() - start

print(f"# lower bound: {time_lower}")
lower_bound_result

# lower bound: 135.12201714515686


Unnamed: 0,0
0,0.00
1,0.00
2,0.00
3,0.11
4,0.11
...,...
994,0.11
995,0.11
996,0.00
997,0.00


In [7]:
reset()

# Upper baseline: request all items with a single call and score them in one
# non-progressive pass.
n_requested = chunk_size * chunks
start = time.time()
data = get_next_chunk_from_db(n_requested, as_df=True)
upper_bound_result = outlierness.compute_doi(data)
time_upper = time.time() - start

print(f"# upper bound: {time_upper}")
pd.DataFrame(upper_bound_result)

# upper bound: 1320.5454723834991


Unnamed: 0,0
0,0.00
1,0.00
2,0.22
3,0.00
4,0.11
...,...
99994,0.00
99995,0.00
99996,0.11
99997,0.00


### Update selection strategies

In [None]:
from outdated_item_selection_strategy.no_update import *
from outdated_item_selection_strategy.oldest_chunks_update import *
from outdated_item_selection_strategy.last_n_chunks_update import *
from outdated_item_selection_strategy.regular_interval_update import *
from outdated_item_selection_strategy.outdated_bin_update import *

# Outdated-item selection strategies to benchmark, kept as a list of
# (label, strategy) pairs in the order they are declared here.
update_strategies = list({
  "no chunk": NoUpdate(n_dims=n_dims),
  "oldest n chunks": OldestChunksUpdate(n_dims=n_dims, n_chunks=3, max_age=10),
  "last n chunks": LastNChunksUpdate(n_dims=n_dims, n_chunks=3),
  "regular intervals": RegularIntervalUpdate(n_dims=n_dims, interval=2, max_age=10),
  "outdated bins": OutdatedBinUpdate(n_dims=n_dims),
}.items())

In [None]:
from database import ID, process_chunk
reset()
# Progressive computation in which every new chunk is extended by the items the
# update strategy reports as outdated, so that their values are computed again
# and stored together with those of the new items.
start = time.time()

strategy = update_strategies[1]

for i in range(chunks):
  print(f"({i}/{chunks})")
  chunk = get_next_chunk_from_db(chunk_size, as_df=True)
  outdated = strategy[1].get_outdated_items(i)
  # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
  # Assumes get_outdated_items returns a DataFrame shaped like `chunk`.
  # NOTE(review): the context benchmark passes its items through process_chunk
  # before merging them; confirm whether that step is needed here as well.
  if len(outdated) > 0:
    chunk = pd.concat([chunk, outdated])

  doi = outlierness.compute_doi(chunk)
  new_ids = chunk[ID].to_list()
  save_dois(new_ids, doi, np.zeros_like(doi))

time_strat = time.time() - start
start_result = get_from_doi(["TRUE"], as_df=True)

print(f"# using {strategy[0]} strategy: {time_strat}")
start_result

### Context selection strategies

In [7]:
from context_item_selection_strategy.chunk_based_context import *
from context_item_selection_strategy.sampling_based_context import *
from context_item_selection_strategy.clustering_based_context import *
from context_item_selection_strategy.no_context import * 

# Context-item selection strategies to benchmark, kept as a list of
# (label, strategy) pairs in the order they are declared here.
strategies = list({
  "no context": NoContext(n_dims=n_dims),
  "chunk based": ChunkBasedContext(n_dims=n_dims, n_chunks=3),
  "sampling based": SamplingBasedContext(n_dims=n_dims, n_samples=chunk_size),
  "clustering based": ClusteringBasedContext(n_dims=n_dims, n_clusters=chunk_size),
}.items())

In [None]:
current_chunk = chunks
context_size = chunk_size

# Ask every context strategy once for its items at `current_chunk` and report
# what it returned, followed by the seconds spent on the query and its printout.
for i, strategy in enumerate(strategies):
  label, selector = strategy
  start = time.time()
  print("#", label)
  context_items = selector.get_context_items(current_chunk)
  n_found = len(context_items)
  print(f"found {n_found} context items:")
  print(context_items)
  elapsed = time.time() - start
  print(elapsed)
  print("\n")

In [56]:
from database import ID, process_chunk
reset()
# Progressive computation in which every new chunk is scored together with the
# context items chosen by the selection strategy. Only the values of the new
# items are stored afterwards.
start = time.time()

strategy = strategies[1]

for i in range(chunks):
  print(f"({i}/{chunks})")
  chunk = get_next_chunk_from_db(chunk_size, as_df=True)
  new_entries = len(chunk)
  context = strategy[1].get_context_items(i)
  context = process_chunk(context)
  # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
  # Assumes process_chunk returns a DataFrame; TODO confirm.
  # The new items stay in front, which the positional slices below rely on.
  if len(context) > 0:
    chunk = pd.concat([chunk, context])
  doi = outlierness.compute_doi(chunk)
  new_ids = chunk[ID][:new_entries].to_list()
  new_dois = doi[:new_entries]
  save_dois(new_ids, new_dois, np.zeros_like(new_dois))

time_strat = time.time() - start
start_result = get_from_doi(["TRUE"], as_df=True)

print(f"# using {strategy[0]} strategy: {time_strat}")
start_result

(0/100)
(1/100)
(2/100)
(3/100)
(4/100)
(5/100)
(6/100)
(7/100)
(8/100)
(9/100)
(10/100)
(11/100)
(12/100)
(13/100)
(14/100)
(15/100)
(16/100)
(17/100)
(18/100)
(19/100)
(20/100)
(21/100)
(22/100)
(23/100)
(24/100)
(25/100)
(26/100)
(27/100)
(28/100)
(29/100)
(30/100)
(31/100)
(32/100)
(33/100)
(34/100)
(35/100)
(36/100)
(37/100)
(38/100)
(39/100)
(40/100)
(41/100)
(42/100)
(43/100)
(44/100)
(45/100)
(46/100)
(47/100)
(48/100)
(49/100)
(50/100)
(51/100)
(52/100)
(53/100)
(54/100)
(55/100)
(56/100)
(57/100)
(58/100)
(59/100)
(60/100)
(61/100)
(62/100)
(63/100)
(64/100)
(65/100)
(66/100)
(67/100)
(68/100)
(69/100)
(70/100)
(71/100)
(72/100)
(73/100)
(74/100)
(75/100)
(76/100)
(77/100)
(78/100)
(79/100)
(80/100)
(81/100)
(82/100)
(83/100)
(84/100)
(85/100)
(86/100)
(87/100)
(88/100)
(89/100)
(90/100)
(91/100)
(92/100)
(93/100)
(94/100)
(95/100)
(96/100)
(97/100)
(98/100)
(99/100)
# using chunk based strategy: 540.5702812671661


Unnamed: 0,tripid,doi,label
0,31852922,0.0,0.0
1,16785706,0.0,0.0
2,66379394,0.0,0.0
3,10428271,0.11,0.0
4,23940933,0.11,0.0
...,...,...,...
99994,11487892,0.0,0.0
99995,55569069,0.0,0.0
99996,34258604,0.0,0.0
99997,39358155,0.0,0.0
