Benchmark the new `DoiRegressionModel` class

In [None]:
import os
from sys import path

# Append the parent of the current working directory to the module search path
# (presumably so the project-local imports below resolve -- confirm against the notebook's location).
cwd = os.getcwd()
path.append(f"{cwd}/..")
import numpy as np
import pandas as pd
from database import (create_tables, drop_tables, get_next_chunk_from_db,
                      reset_progression)
from doi_function import (compute_dois, reset_doi_component,
                          set_dimension_intervals, set_dimension_weights)
from doi_regression_model import *
from storage_strategy.windowing_storage import WindowingStorage

from server import taxi_process_chunk


def reset(weights: dict, intervals: dict):
  '''Drop and re-create the database tables, then re-initialise the DOI function.

  After the tables are rebuilt from the NYC taxi data files, `reset_progression()` and
  `reset_doi_component()` are called and the given dimension `weights` and `intervals` are
  applied through `set_dimension_weights()` and `set_dimension_intervals()`.'''
  table_config = {
    "row_data_path": "../data/nyc_taxis.shuffled_full.csv.gz",
    "column_data_path": "../data/nyc_taxis.shuffled_full.parquet",
    "id_column": "tripID",
    "total_size": 112145904,
    "process_chunk_callback": taxi_process_chunk,
  }

  # Start from empty tables before loading the data set again.
  drop_tables()
  create_tables(**table_config)

  reset_progression()
  reset_doi_component()

  # Configure the DOI function for the upcoming run.
  set_dimension_weights(weights)
  set_dimension_intervals(intervals)


# Number of items requested from the database per chunk.
CHUNK_SIZE = 1000

# All four dimensions are weighted equally.
WEIGHTS = dict(
  trip_distance=0.25,
  total_amount=0.25,
  tip_amount=0.25,
  trip_duration=0.25,
)

# Two-element interval per dimension (presumably [lower, upper] bounds of the values of
# interest -- confirm against `set_dimension_intervals`).
INTERVALS = dict(
  trip_distance=[16, 20],
  total_amount=[34, 74],
  tip_amount=[4, 12],
  trip_duration=[3, 5],
)

# Bring database and DOI function into a known initial state.
reset(weights=WEIGHTS, intervals=INTERVALS)


def get_next_progressive_result(storage: StorageStrategy, get_context = None):
  '''Wrapper function for getting a new chunk, computing the DOI function on it and storing the
     data for later retrieval in the storage.

     Parameters:
       storage: storage that receives the new chunk through `insert_chunk()`.
       get_context: optional zero-argument callable returning a DataFrame of context items.
         Context items only take part in the DOI computation; they are neither stored nor
         returned. A result of `None` or an empty DataFrame is treated as "no context".

     Returns:
       A tuple `(chunk_df, new_dois)`, where `new_dois` are the first `len(chunk_df)` DOI
       values, i.e., those belonging to the items of the new chunk.'''

  # get chunk and compute context
  chunk_df = get_next_chunk_from_db(CHUNK_SIZE, as_df=True)

  context_df = get_context() if get_context is not None else None
  has_context = context_df is not None and len(context_df) > 0

  # compute the DOI over chunk + context
  if has_context:
    df = pd.concat([chunk_df, context_df], ignore_index=True)
  else:
    # Without context, skip the concat: concatenating with an empty placeholder frame is
    # deprecated in pandas and may alter the column dtypes. `reset_index` still hands a fresh,
    # 0-based frame to the DOI computation, as the concat with `ignore_index=True` did.
    df = chunk_df.reset_index(drop=True)

  dois = compute_dois(df)  # HACK: compatibility with DoiComponent class

  # The chunk comes first in `df`, so its DOI values are the leading entries.
  new_dois = dois[:len(chunk_df)]

  # NOTE(review): meaning of the second argument (0) is defined by the storage -- confirm.
  storage.insert_chunk(chunk_df, 0)

  return chunk_df, new_dois


def benchmark_max_depth(tested_max_depths: range = range(1, 10), n_chunks: int = 25):
  '''Measures the influence of the `max_depth` parameter on progressive DOI prediction scores.

  For each tested depth, the environment is reset and a new model is created. The model is
  updated with the first chunk only and scored on every one of the `n_chunks` chunks.

  Returns a DataFrame with one column per tested depth and one row per chunk.'''
  scores_by_depth = []

  for depth in tested_max_depths:
    # every test case starts from the same initial state
    reset(intervals=INTERVALS, weights=WEIGHTS)
    window = WindowingStorage(max_size=10000000)
    regressor = DoiRegressionModel(window, max_depth=depth)

    chunk_scores = []
    for chunk_index in range(n_chunks):
      chunk_df, new_dois = get_next_progressive_result(window)

      is_first_chunk = chunk_index == 0
      if is_first_chunk:
        regressor.update(chunk_df, new_dois)

      chunk_scores.append(regressor.score(chunk_df, new_dois))

    scores_by_depth.append(chunk_scores)

  # transpose so that test cases become columns and chunks become rows
  score_matrix = np.array(scores_by_depth).T
  return pd.DataFrame(score_matrix, columns=tested_max_depths)


def benchmark_retraining_intervals(tested_intervals: range = range(25), max_depth: int = 3, n_chunks: int = 25):
  '''Measures the influence of the model update interval on progressive DOI prediction scores.

  For each tested interval, the environment is reset and a new model with the given `max_depth`
  is created. The model is updated on every chunk whose index is a multiple of the interval; an
  interval of 0 updates on every chunk (the same as an interval of 1).

  Returns a DataFrame with one column per tested interval and one row per chunk.'''
  scores_by_interval = []

  for interval in tested_intervals:
    # every test case starts from the same initial state
    reset(intervals=INTERVALS, weights=WEIGHTS)
    window = WindowingStorage(max_size=10000000)
    regressor = DoiRegressionModel(window, max_depth=max_depth)

    chunk_scores = []
    for chunk_index in range(n_chunks):
      chunk_df, new_dois = get_next_progressive_result(window)

      # the check for 0 comes first, which also guards the modulo against division by zero
      needs_update = interval == 0 or chunk_index % interval == 0
      if needs_update:
        regressor.update(chunk_df, new_dois)

      chunk_scores.append(regressor.score(chunk_df, new_dois))

    scores_by_interval.append(chunk_scores)

  # transpose so that test cases become columns and chunks become rows
  score_matrix = np.array(scores_by_interval).T
  return pd.DataFrame(score_matrix, columns=tested_intervals)

def benchmark_context(max_depth: int = 3, n_chunks: int = 25):
  '''Compares progressive DOI prediction scores with and without context items.

  Two passes over `n_chunks` chunks are run, each after a full reset and with a new model that is
  updated with the first chunk only. In the first pass, every chunk after the first has its DOI
  values computed together with context items obtained from the model; the second pass uses no
  context.

  Returns a DataFrame with the columns "context" and "no context" and one row per chunk.'''
  context_size = 1000

  def run_pass(use_context: bool):
    '''Runs one benchmark pass and returns the list of per-chunk scores.'''
    reset(intervals=INTERVALS, weights=WEIGHTS)
    window = WindowingStorage(max_size=10000000)
    regressor = DoiRegressionModel(window, max_depth=max_depth)

    def fetch_context():
      return regressor.get_context_items(context_size)

    pass_scores = []
    for chunk_index in range(n_chunks):
      if chunk_index == 0:
        # model is not yet trained at first chunk, so no context; this is also the only chunk
        # the model is updated with
        chunk_df, new_dois = get_next_progressive_result(window)
        regressor.update(chunk_df, new_dois)
      elif use_context:
        chunk_df, new_dois = get_next_progressive_result(window, fetch_context)
      else:
        chunk_df, new_dois = get_next_progressive_result(window)

      pass_scores.append(regressor.score(chunk_df, new_dois))

    return pass_scores

  scores_with_context = run_pass(use_context=True)
  scores_without_context = run_pass(use_context=False)

  # transpose so that the two passes become columns and chunks become rows
  score_matrix = np.array([scores_with_context, scores_without_context]).T
  return pd.DataFrame(score_matrix, columns=["context", "no context"])

# Only one benchmark is run at a time; uncomment the respective call to run another one.

# Results: max_depth = 3 has the best performance; larger values bring no improvement or are worse.
# max_depth_results = benchmark_max_depth()

# Results: trivially best results for intervals 0 and 1; score std for 3-8 above 0.9.
# intervals_results = benchmark_retraining_intervals()

# Results: when context items are included in the DOI computation, scores drop.
context_results = benchmark_context()

Plot the prediction scores of the regression model with and without context items.

In [None]:
# Draw one box per column of `context_results` ("context" vs. "no context").
context_results.boxplot()