# Strategies for Progressively Storing Processed Data
In this notebook, we introduce and benchmark the basic algorithms presented in our paper for storing data, together with the results computed from it, when both become available in chunks ("progressively").

## Setup

In [None]:
import os
from sys import path
# Current working directory of the running kernel.
cwd = os.getcwd()
# Append the parent of the working directory to the module search path.
# Presumably this is what lets the project-local imports that follow
# (e.g. `database`) resolve - TODO confirm against the repository layout.
path.append(f"{cwd}/..")
from database import initialize_db, drop_tables

In [None]:
import numpy as np
from doi_component.outlierness_component import *

# Single shared component instance; `doi` below delegates every call to it.
outlierness = OutliernessComponent()

def doi(items: np.ndarray):
  """Return whatever the shared outlierness component's `compute_doi` yields for `items`."""
  return outlierness.compute_doi(items)

## Testing

In [3]:
import time

import pandas as pd

from config import *
from database import get_next_chunk_from_db
from storage_strategy.windowing_storage import *
from storage_strategy.compression_storage import *
from storage_strategy.reservoir_sampling_storage import *

# NOTE(review): `reset` is not defined anywhere in this notebook; it presumably
# arrives through one of the star imports of this cell - confirm.
reset()

# Benchmark parameters: number of rows per chunk, number of chunks to stream,
# and the capacity handed to every storage strategy (two chunks' worth of rows).
chunk_size = 1000
chunks = 100
max_storage_size = chunk_size * (2)

# Each t_* value starts with the construction time of its strategy and then
# accumulates the time spent in `insert_chunk`, so the reported numbers cover
# the actual storage work and not only object creation.
# `perf_counter` is used because it is the clock intended for measuring intervals.
start = time.perf_counter()
windowing = WindowingStorage(max_storage_size)
t_window = time.perf_counter() - start

start = time.perf_counter()
compression = CompressionStorage(max_storage_size)
t_compression = time.perf_counter() - start

start = time.perf_counter()
reservoir = ReservoirSamplingStorage(max_storage_size)
t_reservoir = time.perf_counter() - start

for i in range(chunks):
  # Fetch the next chunk and keep only the columns labelled 18 and 19.
  # The fetch is deliberately outside the timed regions.
  # (For a database-free smoke test, a synthetic chunk such as
  # pd.DataFrame(np.arange(i*chunk_size, (i+1)*chunk_size)) can be used instead.)
  chunk = pd.DataFrame(get_next_chunk_from_db(chunk_size)).loc[:, [18, 19]]

  # Same insertion order as before: windowing, reservoir, compression.
  start = time.perf_counter()
  windowing.insert_chunk(chunk)
  t_window += time.perf_counter() - start

  start = time.perf_counter()
  reservoir.insert_chunk(chunk)
  t_reservoir += time.perf_counter() - start

  start = time.perf_counter()
  compression.insert_chunk(chunk)
  t_compression += time.perf_counter() - start

print("types:", windowing.get_storage().dtypes)

# Total seconds per strategy (construction + all chunk insertions).
print("timings:")
print("windowing", t_window)
print("compression", t_compression)
print("reservoir", t_reservoir)

NameError: name 'drop_tables' is not defined

In [None]:
# Scatter the items each strategy currently holds, one titled figure per strategy,
# so the three plots can be told apart in the rendered output.
# NOTE(review): the compression plot addresses columns 0/1 rather than 18/19,
# presumably because that storage relabels its columns - confirm.
windowing.get_storage().plot.scatter(x=18, y=19, alpha=0.1, title="Windowing storage")
reservoir.get_storage().plot.scatter(x=18, y=19, alpha=0.1, title="Reservoir sampling storage")
# Trailing semicolon suppresses the Axes repr in the notebook output.
compression.get_storage().plot.scatter(x=0, y=1, alpha=0.1, title="Compression storage");