In [None]:
import os
from sys import path
from time import time

import numpy as np

# Make the parent of the current working directory importable. This has to run
# before the project imports below (database, doi_function, server), which are
# presumably located in that parent directory — TODO confirm.
cwd = os.getcwd()
path.append(f"{cwd}/..")

from database import (create_tables, drop_tables, get_next_chunk_from_db,
                      reset_progression, save_dois, update_dois)
from doi_function import (compute_dois, reset_doi_component,
                          set_dimension_weights)

from server import taxi_process_chunk


def reset():
  """Bring the database and the DOI component back to a fresh state.

  Drops the existing tables, recreates them from the NYC taxi data files
  (`taxi_process_chunk` is passed as the per-chunk callback), and then resets
  the chunk progression and the DOI component.
  """
  table_settings = {
      "row_data_path": "../data/nyc_taxis.shuffled_full.csv.gz",
      "column_data_path": "../data/nyc_taxis.shuffled_full.parquet",
      "id_column": "tripID",
      "total_size": 112145904,
      "process_chunk_callback": taxi_process_chunk,
  }

  # tables are rebuilt from scratch on every reset
  drop_tables()
  create_tables(**table_settings)

  # clear the remaining state so the next run starts from the first chunk
  reset_progression()
  reset_doi_component()

# Result stores filled by `benchmark()` and read by the analysis cells further
# down. Each maps `str(item id)` to the DOI value computed for that item:
#   - ground truth: all rows processed as one single chunk, no optimizations
#   - optimized:    chunk-wise processing with `use_optimizations=True`
#   - baseline:     chunk-wise processing with `use_optimizations=False`
doi_storage_ground_truth: dict = {}
doi_storage_optimized: dict = {}
doi_storage_baseline: dict = {}

def _record_dois(storage: dict, ids: list, dois) -> None:
  """Write the first DOI component of each row of `dois` into `storage`, keyed by `str(id)`."""
  for position, item_id in enumerate(ids):
    storage[str(item_id)] = dois[position][0]


def benchmark(iterations: int, chunk_size: int, weights: dict):
  """Run the optimized, baseline and ground-truth DOI computations and time them.

  Each of the three runs starts from a freshly reset database with the same
  dimension weights:

  * optimized: `iterations` chunks of `chunk_size` rows, computed with
    `use_optimizations=True`; new and updated DOIs are persisted through
    `save_dois` / `update_dois`.
  * baseline: the same chunking with `use_optimizations=False`; nothing is
    persisted.
  * ground truth: one single chunk of `chunk_size * iterations` rows with
    `use_optimizations=False`.

  The results are written to the module-level dicts `doi_storage_optimized`,
  `doi_storage_baseline` and `doi_storage_ground_truth` (key: `str(id)`), and
  the elapsed time of every run is printed.

  Args:
    iterations: number of chunks fetched in the chunk-wise runs.
    chunk_size: number of rows requested per chunk.
    weights: dimension weights handed to `set_dimension_weights`.
  """
  # perf_counter is a monotonic, high-resolution clock intended for measuring
  # durations; imported locally so the block does not rely on new file imports.
  from time import perf_counter

  # Start from empty result stores: without this, entries from an earlier call
  # (e.g. with more iterations) would survive and be mixed into the comparison.
  doi_storage_optimized.clear()
  doi_storage_baseline.clear()
  doi_storage_ground_truth.clear()

  ##################################################################################################
  # run the test case
  ##################################################################################################
  reset()
  set_dimension_weights(weights)
  started = perf_counter()
  for _ in range(iterations):
    chunk = get_next_chunk_from_db(chunk_size)
    new_dois, updated_ids, updated_dois = compute_dois(chunk, use_optimizations=True)

    # the first column of every row is used as the item id
    new_ids = np.array(chunk)[:, 0].tolist()
    updated_ids = np.array(updated_ids).tolist()

    # NOTE: persisting the DOIs is part of the measured time of this run
    save_dois(new_ids, new_dois.reshape(-1, ).tolist())
    update_dois(updated_ids, updated_dois.reshape(-1, ).tolist())

    # updates are recorded after the new values, so they win for repeated ids
    _record_dois(doi_storage_optimized, new_ids, new_dois)
    _record_dois(doi_storage_optimized, updated_ids, updated_dois)

  print("optimized:", perf_counter() - started, "s")

  ##################################################################################################
  # run the baseline
  ##################################################################################################
  reset()
  set_dimension_weights(weights)
  started = perf_counter()
  for _ in range(iterations):
    chunk = get_next_chunk_from_db(chunk_size)
    new_dois, _, _ = compute_dois(chunk, use_optimizations=False)

    new_ids = np.array(chunk)[:, 0].tolist()
    _record_dois(doi_storage_baseline, new_ids, new_dois)

  print("baseline:", perf_counter() - started, "s")

  ##################################################################################################
  # run the ground truth
  ##################################################################################################
  reset()
  set_dimension_weights(weights)
  started = perf_counter()
  # one single chunk covering exactly the rows the chunk-wise runs requested
  chunk = get_next_chunk_from_db(chunk_size * iterations)
  new_dois, _, _ = compute_dois(chunk, use_optimizations=False)

  new_ids = np.array(chunk)[:, 0].tolist()
  _record_dois(doi_storage_ground_truth, new_ids, new_dois)

  print("ground truth:", perf_counter() - started, "s")



# Benchmark configuration: the chunk-wise runs fetch ITERATIONS chunks of
# CHUNK_SIZE rows each; the ground-truth run fetches the same total at once.
ITERATIONS = 100
CHUNK_SIZE = 10

# Weights handed to `set_dimension_weights`.
# NOTE(review): the two weights sum to 0.666 — presumably the remainder is
# assigned elsewhere; confirm against doi_function.
WEIGHTS = dict(trip_distance=0.333, total_amount=0.333)

benchmark(iterations=ITERATIONS, chunk_size=CHUNK_SIZE, weights=WEIGHTS)

In [None]:
import pandas as pd
import numpy as np

# 200 equally wide bins (width 0.005) on [0, 1]. linspace fixes the number of
# edges explicitly, whereas arange with a float step can gain or lose the last
# edge through rounding.
bins = np.linspace(0, 1, 201)
figsize = (14, 4)

# Overlay the three DOI distributions on one explicitly shared axes; the
# ground truth is drawn opaque, the two chunk-wise runs semi-transparent on top.
ax = pd.Series(doi_storage_ground_truth).hist(
    bins=bins, figsize=figsize, alpha=1, color="black", label="ground truth")
pd.Series(doi_storage_optimized).hist(
    ax=ax, bins=bins, alpha=0.3, color="green", label="optimized")
pd.Series(doi_storage_baseline).hist(
    ax=ax, bins=bins, alpha=0.3, color="red", label="baseline")

# a figure has to be readable without looking up the colors in the code
ax.set_title("Distribution of DOI values per run")
ax.set_xlabel("DOI value")
ax.set_ylabel("number of items")
ax.legend();

In [None]:
import pandas as pd

# One Series per run, indexed by the stringified item id and sorted by it.
gt_df = pd.Series(doi_storage_ground_truth).sort_index()
optimized_df = pd.Series(doi_storage_optimized).sort_index()
baseline_df = pd.Series(doi_storage_baseline).sort_index()

# sanity check: every run must cover exactly the same ids, otherwise the
# per-item errors below would compare different items (or silently yield NaN).
assert gt_df.index.equals(baseline_df.index), "baseline ids differ from ground truth ids"
assert gt_df.index.equals(optimized_df.index), "optimized ids differ from ground truth ids"

# Building the frame from the Series aligns the columns by id label instead of
# by position, so each row is guaranteed to describe a single item.
df = pd.DataFrame({"gt": gt_df, "baseline": baseline_df, "optimized": optimized_df})

# signed per-item error of each chunk-wise run relative to the ground truth
baseline_error = df["gt"] - df["baseline"]
optimized_error = df["gt"] - df["optimized"]

# NOTE(review): these are sums of signed errors, so positive and negative
# deviations cancel out — confirm this is the intended metric.
baseline_error.sum(), optimized_error.sum()