# Benchmarks of Strategies for Selecting Outdated Items
This notebook contains the benchmarks related to the selection strategies for context data, which we report in our paper.
Context data are selected from the processed data and included in the next progressive computation step, such that its result approximates that of a _non-progressive_ computation over the processed data.

## Benchmark Configuration

We use the following configuration in our benchmarks:
### Test cases 
- full computation over the entire dataset (upper baseline)
- progressive computation without optimization (lower baseline)
- full computation of processed data
- progressive computation using optimization strategies

### Dataset
- NYC taxis dataset (10 Million items), stored in a compressed CSV file, loaded with DuckDB 

### Variables
- dependent variables: runtime, prediction error
- independent variables: context selection strategy, update (outdated item) selection strategy

## Setup

Configure the path to be able to import the local modules.

In [None]:
import os
from sys import path
# Put the parent of the current working directory on the import path so the
# local project modules can be imported by the cells below.
cwd = os.getcwd()
parent_dir = cwd + "/.."
path.append(parent_dir)

Define some constant values for the data.

In [None]:
from database import ID, initialize_db, drop_tables, get_next_chunk_from_db, save_dois, get_from_doi
import numpy as np
import pandas as pd
import time

# Benchmark dimensions: the value passed as n_dims to every selection
# strategy, the number of items in the dataset, and the number of items
# retrieved per progressive step.
n_dims = 17
total_items = 99999
chunk_size = 1000
# Ceiling division: a trailing partial chunk still needs its own step, whereas
# round() silently drops it whenever the remainder is below half a chunk.
chunks = (total_items + chunk_size - 1) // chunk_size

def reset():
  """Call drop_tables() and then initialize_db() with the sampled NYC taxi CSV.

  Every benchmark cell calls this first, presumably so that it starts from a
  freshly loaded database.
  """
  drop_tables()
  csv_path = "../data/nyc_taxis_sampled100k_shuffled.csv.gz"
  initialize_db(csv_path)

## Benchmarks

The DOI function

In [None]:
from doi_component.outlierness_component import OutliernessComponent

# DOI component shared by all benchmarks below; "ratio" and "duration" are
# presumably the data columns it scores.
outlierness = OutliernessComponent(
  ["ratio", "duration"],
)

Baseline: Chunk-based computation without any optimizations

In [None]:
reset()

# lower baseline: every chunk is scored on its own, without any context or
# update optimization.
chunk_results = []
start = time.time()
for i in range(chunks):
  chunk = get_next_chunk_from_db(chunk_size, as_df=True)
  doi = outlierness.compute_doi(chunk)
  chunk_results.append(pd.DataFrame(doi))

# DataFrame.append was removed in pandas 2.0 and copied the accumulated frame
# on every iteration, so the per-chunk results are concatenated once instead.
# pd.concat rejects an empty list, hence the fallback to an empty frame.
lower_bound_result = pd.concat(chunk_results) if chunk_results else pd.DataFrame([])

time_lower = time.time() - start

print(f"# lower bound: {time_lower}")
# bare expression: shown as the notebook cell's output
lower_bound_result

In [None]:
reset()

# upper baseline: a single, non-progressive DOI computation over all
# chunk_size * chunks items, retrieved in one request.
start = time.time()
data = get_next_chunk_from_db(chunk_size * chunks, as_df=True)
upper_bound_result = outlierness.compute_doi(data)
# the measured time covers both retrieving the data and computing the DOI
time_upper = time.time() - start

print(f"# upper bound: {time_upper}")
# bare expression: shown as the notebook cell's output
upper_bound_result

### Update selection strategies

In [None]:
from outdated_item_selection_strategy.no_update import *
from outdated_item_selection_strategy.oldest_chunks_update import *
from outdated_item_selection_strategy.last_n_chunks_update import *
from outdated_item_selection_strategy.regular_interval_update import *
from outdated_item_selection_strategy.outdated_bin_update import *

# Outdated-item selection strategies under test, as (label, strategy) pairs.
update_strategies = [
  (
    "no chunk",
    NoUpdate(n_dims=n_dims),
  ),
  (
    "oldest n chunks",
    OldestChunksUpdate(n_dims=n_dims, n_chunks=3, max_age=10),
  ),
  (
    "last n chunks",
    LastNChunksUpdate(n_dims=n_dims, n_chunks=3),
  ),
  (
    "regular intervals",
    RegularIntervalUpdate(n_dims=n_dims, interval=2, max_age=10),
  ),
  (
    "outdated bins",
    OutdatedBinUpdate(n_dims=n_dims),
  ),
]

In [None]:
# Self-contained setup for the update-strategy benchmark in this cell: it
# repeats the path, constants, DOI component and strategy definitions so the
# cell can be run on its own.
import os
from sys import path
# Put the parent of the current working directory on the import path so the
# local project modules can be imported.
cwd = os.getcwd()
path.append(f"{cwd}/..")

from database import ID, initialize_db, drop_tables, get_next_chunk_from_db, save_dois, get_from_doi
import numpy as np
import pandas as pd
import time

# Benchmark dimensions: the value passed as n_dims to every selection
# strategy, the number of items in the dataset, and the number of items
# retrieved per progressive step.
n_dims = 17
total_items = 99999
chunk_size = 1000
# Ceiling division: a trailing partial chunk still needs its own step, whereas
# round() silently drops it whenever the remainder is below half a chunk.
chunks = (total_items + chunk_size - 1) // chunk_size

def reset():
  """Call drop_tables() and then initialize_db() with the sampled NYC taxi CSV."""
  drop_tables()
  initialize_db("../data/nyc_taxis_sampled100k_shuffled.csv.gz")

from doi_component.outlierness_component import OutliernessComponent

# DOI component used by the benchmark loop of this cell.
outlierness = OutliernessComponent(["ratio", "duration"])

from outdated_item_selection_strategy.no_update import *
from outdated_item_selection_strategy.oldest_chunks_update import *
from outdated_item_selection_strategy.last_n_chunks_update import *
from outdated_item_selection_strategy.regular_interval_update import *
from outdated_item_selection_strategy.outdated_bin_update import *

# Outdated-item selection strategies under test, as (label, strategy) pairs.
update_strategies = [
  ("no chunk", NoUpdate(n_dims=n_dims)),
  ("oldest n chunks", OldestChunksUpdate(n_dims=n_dims, n_chunks=3, max_age=10)),
  ("last n chunks", LastNChunksUpdate(n_dims=n_dims, n_chunks=3)),
  ("regular intervals", RegularIntervalUpdate(n_dims=n_dims,interval=2, max_age=10)),
  ("outdated bins", OutdatedBinUpdate(n_dims=n_dims))
]

#########################################################################################################
#########################################################################################################

from database import ID, process_chunk, update_dois, save_dois
reset()
# Progressive computation with one update strategy: every step scores the new
# chunk and then re-scores the items the strategy selected as outdated.
start = time.time()

# index 1 selects the "oldest n chunks" strategy
strategy = update_strategies[1]
print("strategy:", strategy[0])

# Columns of the per-chunk timing log printed below (seconds):
#   chunk_time        retrieve the next chunk and select the outdated items
#   outdated_time     process_chunk() on the selected outdated items
#   new_doi_time      DOI over the new chunk alone
#   old_doi_time      DOI over the new chunk plus the outdated items
#   store_new_time    save_dois() for the new items
#   update_dois_time  update_dois() for the outdated items
#   step_time         the whole iteration
print("chunk, chunk_time, outdated_time, new_doi_time, old_doi_time, store_new_time, update_dois_time, step_time")
for i in range(chunks):
  step_time = time.time()

  before = time.time()
  chunk = get_next_chunk_from_db(chunk_size, as_df=True)
  outdated = strategy[1].get_outdated_items(i)
  chunk_time = time.time() - before

  before = time.time()
  outdated = process_chunk(pd.DataFrame(outdated))
  outdated_time = time.time() - before

  new_ids = chunk[ID].to_list()
  old_ids = outdated[ID].to_list()

  before = time.time()
  new_doi = outlierness.compute_doi(chunk)
  new_doi_time = time.time() - before

  before = time.time()
  save_dois(new_ids, new_doi, np.zeros_like(new_doi))
  store_new_time = time.time() - before

  before = time.time()
  # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
  # The outdated items are scored together with the new chunk, and only their
  # trailing slice of the result is kept.
  old_doi = outlierness.compute_doi(pd.concat([chunk, outdated]))[len(chunk):]
  old_doi_time = time.time() - before

  before = time.time()
  update_dois(old_ids, old_doi)
  update_dois_time = time.time() - before

  step_time = time.time() - step_time

  print(i, chunk_time, outdated_time, new_doi_time, old_doi_time, store_new_time, update_dois_time, step_time)

time_strat = time.time() - start
start_result = get_from_doi(["TRUE"], as_df=True)

print(f"# using {strategy[0]} strategy: {time_strat}")
# bare expression: shown as the notebook cell's output
start_result

### Context selection strategies

In [None]:
from context_item_selection_strategy.chunk_based_context import *
from context_item_selection_strategy.sampling_based_context import *
from context_item_selection_strategy.clustering_based_context import *
from context_item_selection_strategy.no_context import * 

# Context-item selection strategies under test, as (label, strategy) pairs.
context_strategies = [
  (
    "no context",
    NoContext(n_dims=n_dims),
  ),
  (
    "chunk based",
    RandomChunkBasedContext(n_dims=n_dims, n_chunks=3),
  ),
  (
    "sampling based",
    RandomSamplingBasedContext(n_dims=n_dims, n_samples=chunk_size),
  ),
  (
    "clustering based",
    ClusteringBasedContext(n_dims=n_dims, n_clusters=chunk_size),
  ),
]

In [None]:
current_chunk = chunks
context_size = chunk_size

# Ask every context strategy once for its context items at the last chunk
# index, and print what it returned followed by how long the call took.
for i, strategy in enumerate(context_strategies):
  label, selector = strategy
  start = time.time()
  print("#", label)
  context_items = selector.get_context_items(current_chunk)
  n_found = len(context_items)
  print(f"found {n_found} context items:")
  print(context_items)
  elapsed = time.time() - start
  print(elapsed)
  print("\n")

In [None]:
from database import ID, process_chunk
reset()
# Progressive computation with one context strategy: every chunk is scored
# together with the context items the strategy selects for that step.
start = time.time()

# index 1 selects the "chunk based" context strategy
strategy = context_strategies[1]

for i in range(chunks):
  chunk = get_next_chunk_from_db(chunk_size, as_df=True)
  new_entries = len(chunk)
  context = strategy[1].get_context_items(i)
  context = process_chunk(context)
  # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
  chunk = pd.concat([chunk, context])
  doi = outlierness.compute_doi(chunk)
  # the context items trail the new items, so only the leading new_entries
  # rows are stored
  new_ids = chunk[ID][:new_entries].to_list()
  new_dois = doi[:new_entries]
  save_dois(new_ids, new_dois, np.zeros_like(new_dois))

time_strat = time.time() - start
start_result = get_from_doi(["TRUE"], as_df=True)

print(f"# using {strategy[0]} strategy: {time_strat}")
# bare expression: shown as the notebook cell's output
start_result

In [10]:
# Self-contained setup for the strategy-combination benchmark in this cell:
# it repeats the path, constants, DOI component and strategy definitions so
# the cell can be run on its own.
import os
from sys import path
# Put the parent of the current working directory on the import path so the
# local project modules can be imported.
cwd = os.getcwd()
path.append(f"{cwd}/..")

from database import ID, DOI, initialize_db, drop_tables, get_next_chunk_from_db, save_dois, get_from_doi
from benchmark_test_case import *
import numpy as np
import pandas as pd
import time

# Benchmark dimensions: the value passed as n_dims to every selection
# strategy, the number of items to process, and the number of items retrieved
# per progressive step. The commented-out value is the larger run used by the
# other cells of this notebook.
n_dims = 17
# total_items = 99999
total_items = 999
chunk_size = 100
# Ceiling division: a trailing partial chunk still needs its own step, whereas
# round() silently drops it whenever the remainder is below half a chunk.
chunks = (total_items + chunk_size - 1) // chunk_size

def reset():
  """Call drop_tables() and then initialize_db() with the sampled NYC taxi CSV."""
  drop_tables()
  initialize_db("../data/nyc_taxis_sampled100k_shuffled.csv.gz")

from doi_component.outlierness_component import OutliernessComponent

# DOI component handed to every benchmark test case of this cell.
outlierness = OutliernessComponent(["ratio", "duration"])

from outdated_item_selection_strategy.no_update import *
from outdated_item_selection_strategy.oldest_chunks_update import *
from outdated_item_selection_strategy.last_n_chunks_update import *
from outdated_item_selection_strategy.regular_interval_update import *
from outdated_item_selection_strategy.outdated_bin_update import *

# Outdated-item selection strategies under test, as (label, strategy) pairs.
update_strategies = [
  ("no chunk", NoUpdate(n_dims=n_dims)),
  ("oldest n chunks", OldestChunksUpdate(n_dims=n_dims, n_chunks=3, max_age=10)),
  ("last n chunks", LastNChunksUpdate(n_dims=n_dims, n_chunks=3)),
  ("regular intervals", RegularIntervalUpdate(n_dims=n_dims,interval=2, max_age=10)),
  ("outdated bins", OutdatedBinUpdate(n_dims=n_dims))
]

from context_item_selection_strategy.no_context import * 
from context_item_selection_strategy.chunk_based_context import *
from context_item_selection_strategy.sampling_based_context import *
from context_item_selection_strategy.clustering_based_context import *

# Context-item selection strategies under test, as (label, strategy) pairs.
context_strategies = [
  ("no context", NoContext(n_dims=n_dims)),
  ("chunk based", RandomChunkBasedContext(n_dims=n_dims, n_chunks=3)),
  ("sampling based", RandomSamplingBasedContext(n_dims=n_dims, n_samples=chunk_size)),
  ("clustering based", ClusteringBasedContext(n_dims=n_dims, n_clusters=chunk_size))
]

####################################################################################################
####################################################################################################

from database import ID, process_chunk, update_dois, save_dois

# Collects every executed BenchmarkTestCase.
test_cases = []
# Not used in this cell; kept as a notebook global for other cells.
no_update = update_strategies[0][1]

# Run one benchmark per (context strategy, update strategy) combination.
for c_strat in context_strategies:
  for u_strat in update_strategies:
    # reset() runs before the timer starts, so it is not part of the
    # reported duration
    reset()
    before = time.time()
    print(f"context: {c_strat[0]}, update: {u_strat[0]}")
    label = f"{c_strat[0]}-{u_strat[0]}"
    test_case = BenchmarkTestCase(label, outlierness, c_strat[1], u_strat[1], chunk_size, chunks)
    # the output directory is named after the dataset size, so runs with a
    # different total_items do not overwrite each other
    test_case.run(to_csv=f"./out/{total_items}/")
    test_cases.append(test_case)
    print(f"done: {time.time() - before}s")


context: no context, update: no chunk
done: 15.504343032836914
context: no context, update: oldest n chunks
done: 24.177157402038574
context: no context, update: last n chunks
done: 28.526756286621094
context: no context, update: regular intervals
done: 16.034247636795044
context: no context, update: outdated bins
done: 17.510358810424805
context: chunk based, update: no chunk
done: 24.598097562789917
context: chunk based, update: oldest n chunks
done: 31.48575711250305
context: chunk based, update: last n chunks
done: 40.3431932926178
context: chunk based, update: regular intervals
done: 22.139740705490112
context: chunk based, update: outdated bins
done: 21.825734615325928
context: sampling based, update: no chunk
done: 23.358924865722656
context: sampling based, update: oldest n chunks
done: 36.06214785575867
context: sampling based, update: last n chunks
done: 42.421061277389526
context: sampling based, update: regular intervals
done: 28.48691415786743
context: sampling based, upda

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    ...
95  39663582      0
96  27358138      0
97  52000532      0
98   4868680      0
99  33632368      0

[100 rows x 2 columns]
(0, 2)
Empty DataFrame
Columns: [tripid, chunk]
Index: []


  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    ...
95  39663582      0
96  27358138      0
97  52000532      0
98   4868680      0
99  33632368      0

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
195  24617231      1
196   4805335      1
197  48663952      1
198  35213735      1
199  17254722      1

[100 rows x 2 columns]
(0, 2)
Empty DataFrame
Columns: [tripid, chunk]
Index: []


  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
195  24617231      1
196   4805335      1
197  48663952      1
198  35213735      1
199  17254722      1

[100 rows x 2 columns]
(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    ...
95  39663582      0
96  27358138      0
97  52000532      0
98   4868680      0
99  33632368      0

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
200  50786029      2
201  61452028      2
202  24948093      2
203  43842870      2
204   2562066      2
..        ...    ...
295  62273054      2
296  75709056      2
297  11098836      2
298  51945804      2
299  33990591      2

[100 rows x 2 columns]
(0, 2)
Empty DataFrame
Columns: [tripid, chunk]
Index: []


  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    ...
95  39663582      0
96  27358138      0
97  52000532      0
98   4868680      0
99  33632368      0

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
200  50786029      2
201  61452028      2
202  24948093      2
203  43842870      2
204   2562066      2
..        ...    ...
295  62273054      2
296  75709056      2
297  11098836      2
298  51945804      2
299  33990591      2

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
300  47418733      3
301  21795540      3
302   2407594      3
303   9213437      3
304  32404168      3
..        ...    ...
395  41661138      3
396  70594059      3
397  58906540      3
398   3539811      3
399  13380580      3

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
1

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
400  16102971      4
401  51648349      4
402  41789851      4
403  69315370      4
404  40627660      4
..        ...    ...
495  70364707      4
496  33834503      4
497  15158245      4
498   1320345      4
499  64662196      4

[100 rows x 2 columns]
(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    ...
95  39663582      0
96  27358138      0
97  52000532      0
98   4868680      0
99  33632368      0

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
195  24617231      1
196   4805335      1
197  48663952      1
198  35213735      1
199  17254722      1

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
300  47418733      3
301  21795540      3
302   2407594      3
303   9213437      3
304  32404168      3
..        ...    ...
3

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
195  24617231      1
196   4805335      1
197  48663952      1
198  35213735      1
199  17254722      1

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
400  16102971      4
401  51648349      4
402  41789851      4
403  69315370      4
404  40627660      4
..        ...    ...
495  70364707      4
496  33834503      4
497  15158245      4
498   1320345      4
499  64662196      4

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
500  30722037      5
501  63673404      5
502  62653669      5
503  64716346      5
504   1274581      5
..        ...    ...
595  13106699      5
596  41525781      5
597  29894653      5
598  29034150      5
599  33907815      5

[100 rows x 2 columns]
(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
500  30722037      5
501  63673404      5
502  62653669      5
503  64716346      5
504   1274581      5
..        ...    ...
595  13106699      5
596  41525781      5
597  29894653      5
598  29034150      5
599  33907815      5

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
195  24617231      1
196   4805335      1
197  48663952      1
198  35213735      1
199  17254722      1

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
400  16102971      4
401  51648349      4
402  41789851      4
403  69315370      4
404  40627660      4
..        ...    ...
495  70364707      4
496  33834503      4
497  15158245      4
498   1320345      4
499  64662196      4

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
300  47418733      3
301  21795540      3
302   2407594      3
303   9213437      3
304  32404168      3
..        

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    ...
95  39663582      0
96  27358138      0
97  52000532      0
98   4868680      0
99  33632368      0

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
600   2172150      6
601  70494922      6
602  20475419      6
603  30555171      6
604  36820017      6
..        ...    ...
695  37053627      6
696  58702858      6
697   7133337      6
698  66500027      6
699  57414644      6

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
300  47418733      3
301  21795540      3
302   2407594      3
303   9213437      3
304  32404168      3
..        ...    ...
395  41661138      3
396  70594059      3
397  58906540      3
398   3539811      3
399  13380580      3

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
700  20670037      7
701  38186308      7
702  76528986      7
703  70184378      7
704  71190854      7
..        ...    ...
7

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
400  16102971      4
401  51648349      4
402  41789851      4
403  69315370      4
404  40627660      4
..        ...    ...
495  70364707      4
496  33834503      4
497  15158245      4
498   1320345      4
499  64662196      4

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
195  24617231      1
196   4805335      1
197  48663952      1
198  35213735      1
199  17254722      1

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
700  20670037      7
701  38186308      7
702  76528986      7
703  70184378      7
704  71190854      7
..        ...    ...
795  52688224      7
796   5190740      7
797  65169669      7
798  12751807      7
799   1403189      7

[100 rows x 2 columns]
(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
900  67380447      9
901  34103505      9
902  71123095      9
903   7892965      9
904  56321284      9
..        ...    ...
995  58603321      9
996  57084445      9
997  67308765      9
998  40987340      9
999  37908936      9

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
195  24617231      1
196   4805335      1
197  48663952      1
198  35213735      1
199  17254722      1

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
400  16102971      4
401  51648349      4
402  41789851      4
403  69315370      4
404  40627660      4
..        ...    ...
495  70364707      4
496  33834503      4
497  15158245      4
498   1320345      4
499  64662196      4

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
700  20670037      7
701  38186308      7
702  76528986      7
703  70184378      7
704  71190854      7
..        

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    ...
95  39663582      0
96  27358138      0
97  52000532      0
98   4868680      0
99  33632368      0

[100 rows x 2 columns]
(0, 2)
Empty DataFrame
Columns: [tripid, chunk]
Index: []


  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
195  24617231      1
196   4805335      1
197  48663952      1
198  35213735      1
199  17254722      1

[100 rows x 2 columns]
(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    ...
95  39663582      0
96  27358138      0
97  52000532      0
98   4868680      0
99  33632368      0

[100 rows x 2 columns]
(0, 2)
Empty DataFrame
Columns: [tripid, chunk]
Index: []


  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
200  50786029      2
201  61452028      2
202  24948093      2
203  43842870      2
204   2562066      2
..        ...    ...
295  62273054      2
296  75709056      2
297  11098836      2
298  51945804      2
299  33990591      2

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
195  24617231      1
196   4805335      1
197  48663952      1
198  35213735      1
199  17254722      1

[100 rows x 2 columns]
(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    ...
95  39663582      0
96  27358138      0
97  52000532      0
98   4868680      0
99  33632368      0

[100 rows x 2 columns]
(0, 2)
Empty DataFrame
Columns: [tripid, chunk]
Index: []


  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    ...
95  39663582      0
96  27358138      0
97  52000532      0
98   4868680      0
99  33632368      0

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
300  47418733      3
301  21795540      3
302   2407594      3
303   9213437      3
304  32404168      3
..        ...    ...
395  41661138      3
396  70594059      3
397  58906540      3
398   3539811      3
399  13380580      3

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
195  24617231      1
196   4805335      1
197  48663952      1
198  35213735      1
199  17254722      1

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
200  50786029      2
201  61452028      2
202  24948093      2
203  43842870      2
204   2562066      2
..        ...    ...
2

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    ...
95  39663582      0
96  27358138      0
97  52000532      0
98   4868680      0
99  33632368      0

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
300  47418733      3
301  21795540      3
302   2407594      3
303   9213437      3
304  32404168      3
..        ...    ...
395  41661138      3
396  70594059      3
397  58906540      3
398   3539811      3
399  13380580      3

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
195  24617231      1
196   4805335      1
197  48663952      1
198  35213735      1
199  17254722      1

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
400  16102971      4
401  51648349      4
402  41789851      4
403  69315370      4
404  40627660      4
..        ...    ...
4

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
195  24617231      1
196   4805335      1
197  48663952      1
198  35213735      1
199  17254722      1

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
400  16102971      4
401  51648349      4
402  41789851      4
403  69315370      4
404  40627660      4
..        ...    ...
495  70364707      4
496  33834503      4
497  15158245      4
498   1320345      4
499  64662196      4

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
200  50786029      2
201  61452028      2
202  24948093      2
203  43842870      2
204   2562066      2
..        ...    ...
295  62273054      2
296  75709056      2
297  11098836      2
298  51945804      2
299  33990591      2

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
500  30722037      5
501  63673404      5
502  62653669      5
503  64716346      5
504   1274581      5
..        

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
500  30722037      5
501  63673404      5
502  62653669      5
503  64716346      5
504   1274581      5
..        ...    ...
595  13106699      5
596  41525781      5
597  29894653      5
598  29034150      5
599  33907815      5

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
195  24617231      1
196   4805335      1
197  48663952      1
198  35213735      1
199  17254722      1

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
300  47418733      3
301  21795540      3
302   2407594      3
303   9213437      3
304  32404168      3
..        ...    ...
395  41661138      3
396  70594059      3
397  58906540      3
398   3539811      3
399  13380580      3

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
200  50786029      2
201  61452028      2
202  24948093      2
203  43842870      2
204   2562066      2
..        

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
500  30722037      5
501  63673404      5
502  62653669      5
503  64716346      5
504   1274581      5
..        ...    ...
595  13106699      5
596  41525781      5
597  29894653      5
598  29034150      5
599  33907815      5

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
200  50786029      2
201  61452028      2
202  24948093      2
203  43842870      2
204   2562066      2
..        ...    ...
295  62273054      2
296  75709056      2
297  11098836      2
298  51945804      2
299  33990591      2

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
700  20670037      7
701  38186308      7
702  76528986      7
703  70184378      7
704  71190854      7
..        ...    ...
795  52688224      7
796   5190740      7
797  65169669      7
798  12751807      7
799   1403189      7

[100 rows x 2 columns]
(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
600   2172150      6
601  70494922      6
602  20475419      6
603  30555171      6
604  36820017      6
..        ...    ...
695  37053627      6
696  58702858      6
697   7133337      6
698  66500027      6
699  57414644      6

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
195  24617231      1
196   4805335      1
197  48663952      1
198  35213735      1
199  17254722      1

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
300  47418733      3
301  21795540      3
302   2407594      3
303   9213437      3
304  32404168      3
..        ...    ...
395  41661138      3
396  70594059      3
397  58906540      3
398   3539811      3
399  13380580      3

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
800  61005881      8
801  69881889      8
802  20352325      8
803  25277847      8
804  57385136      8
..        

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
400  16102971      4
401  51648349      4
402  41789851      4
403  69315370      4
404  40627660      4
..        ...    ...
495  70364707      4
496  33834503      4
497  15158245      4
498   1320345      4
499  64662196      4

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
800  61005881      8
801  69881889      8
802  20352325      8
803  25277847      8
804  57385136      8
..        ...    ...
895   3208148      8
896  71765457      8
897  45465470      8
898  48961957      8
899  47693410      8

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
195  24617231      1
196   4805335      1
197  48663952      1
198  35213735      1
199  17254722      1

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
600   2172150      6
601  70494922      6
602  20475419      6
603  30555171      6
604  36820017      6
..        

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    ...
95  39663582      0
96  27358138      0
97  52000532      0
98   4868680      0
99  33632368      0

[100 rows x 2 columns]
(0, 2)
Empty DataFrame
Columns: [tripid, chunk]
Index: []


  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
195  24617231      1
196   4805335      1
197  48663952      1
198  35213735      1
199  17254722      1

[100 rows x 2 columns]
(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    ...
95  39663582      0
96  27358138      0
97  52000532      0
98   4868680      0
99  33632368      0

[100 rows x 2 columns]
(0, 2)
Empty DataFrame
Columns: [tripid, chunk]
Index: []


  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
200  50786029      2
201  61452028      2
202  24948093      2
203  43842870      2
204   2562066      2
..        ...    ...
295  62273054      2
296  75709056      2
297  11098836      2
298  51945804      2
299  33990591      2

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
195  24617231      1
196   4805335      1
197  48663952      1
198  35213735      1
199  17254722      1

[100 rows x 2 columns]
(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    ...
95  39663582      0
96  27358138      0
97  52000532      0
98   4868680      0
99  33632368      0

[100 rows x 2 columns]
(0, 2)
Empty DataFrame
Columns: [tripid, chunk]
Index: []


  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
195  24617231      1
196   4805335      1
197  48663952      1
198  35213735      1
199  17254722      1

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
300  47418733      3
301  21795540      3
302   2407594      3
303   9213437      3
304  32404168      3
..        ...    ...
395  41661138      3
396  70594059      3
397  58906540      3
398   3539811      3
399  13380580      3

[100 rows x 2 columns]
(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    ...
95  39663582      0
96  27358138      0
97  52000532      0
98   4868680      0
99  33632368      0

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
200  50786029      2
201  61452028      2
202  24948093      2
203  43842870      2
204   2562066      2
..        ...    ...
2

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
400  16102971      4
401  51648349      4
402  41789851      4
403  69315370      4
404  40627660      4
..        ...    ...
495  70364707      4
496  33834503      4
497  15158245      4
498   1320345      4
499  64662196      4

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
195  24617231      1
196   4805335      1
197  48663952      1
198  35213735      1
199  17254722      1

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
200  50786029      2
201  61452028      2
202  24948093      2
203  43842870      2
204   2562066      2
..        ...    ...
295  62273054      2
296  75709056      2
297  11098836      2
298  51945804      2
299  33990591      2

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
300  47418733      3
301  21795540      3
302   2407594      3
303   9213437      3
304  32404168      3
..        

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    ...
95  39663582      0
96  27358138      0
97  52000532      0
98   4868680      0
99  33632368      0

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
400  16102971      4
401  51648349      4
402  41789851      4
403  69315370      4
404  40627660      4
..        ...    ...
495  70364707      4
496  33834503      4
497  15158245      4
498   1320345      4
499  64662196      4

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
200  50786029      2
201  61452028      2
202  24948093      2
203  43842870      2
204   2562066      2
..        ...    ...
295  62273054      2
296  75709056      2
297  11098836      2
298  51945804      2
299  33990591      2

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
1

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
400  16102971      4
401  51648349      4
402  41789851      4
403  69315370      4
404  40627660      4
..        ...    ...
495  70364707      4
496  33834503      4
497  15158245      4
498   1320345      4
499  64662196      4

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
195  24617231      1
196   4805335      1
197  48663952      1
198  35213735      1
199  17254722      1

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
600   2172150      6
601  70494922      6
602  20475419      6
603  30555171      6
604  36820017      6
..        ...    ...
695  37053627      6
696  58702858      6
697   7133337      6
698  66500027      6
699  57414644      6

[100 rows x 2 columns]
(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
500  30722037      5
501  63673404      5
502  62653669      5
503  64716346      5
504   1274581      5
..        ...    ...
595  13106699      5
596  41525781      5
597  29894653      5
598  29034150      5
599  33907815      5

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
195  24617231      1
196   4805335      1
197  48663952      1
198  35213735      1
199  17254722      1

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
700  20670037      7
701  38186308      7
702  76528986      7
703  70184378      7
704  71190854      7
..        ...    ...
795  52688224      7
796   5190740      7
797  65169669      7
798  12751807      7
799   1403189      7

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
200  50786029      2
201  61452028      2
202  24948093      2
203  43842870      2
204   2562066      2
..        

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
700  20670037      7
701  38186308      7
702  76528986      7
703  70184378      7
704  71190854      7
..        ...    ...
795  52688224      7
796   5190740      7
797  65169669      7
798  12751807      7
799   1403189      7

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
200  50786029      2
201  61452028      2
202  24948093      2
203  43842870      2
204   2562066      2
..        ...    ...
295  62273054      2
296  75709056      2
297  11098836      2
298  51945804      2
299  33990591      2

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
500  30722037      5
501  63673404      5
502  62653669      5
503  64716346      5
504   1274581      5
..        ...    ...
595  13106699      5
596  41525781      5
597  29894653      5
598  29034150      5
599  33907815      5

[100 rows x 2 columns]
(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
700  20670037      7
701  38186308      7
702  76528986      7
703  70184378      7
704  71190854      7
..        ...    ...
795  52688224      7
796   5190740      7
797  65169669      7
798  12751807      7
799   1403189      7

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
200  50786029      2
201  61452028      2
202  24948093      2
203  43842870      2
204   2562066      2
..        ...    ...
295  62273054      2
296  75709056      2
297  11098836      2
298  51945804      2
299  33990591      2

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
500  30722037      5
501  63673404      5
502  62653669      5
503  64716346      5
504   1274581      5
..        ...    ...
595  13106699      5
596  41525781      5
597  29894653      5
598  29034150      5
599  33907815      5

[100 rows x 2 columns]
(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    ...
95  39663582      0
96  27358138      0
97  52000532      0
98   4868680      0
99  33632368      0

[100 rows x 2 columns]
(0, 2)
Empty DataFrame
Columns: [tripid, chunk]
Index: []


  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
195  24617231      1
196   4805335      1
197  48663952      1
198  35213735      1
199  17254722      1

[100 rows x 2 columns]
(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    ...
95  39663582      0
96  27358138      0
97  52000532      0
98   4868680      0
99  33632368      0

[100 rows x 2 columns]
(0, 2)
Empty DataFrame
Columns: [tripid, chunk]
Index: []


  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    ...
95  39663582      0
96  27358138      0
97  52000532      0
98   4868680      0
99  33632368      0

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
200  50786029      2
201  61452028      2
202  24948093      2
203  43842870      2
204   2562066      2
..        ...    ...
295  62273054      2
296  75709056      2
297  11098836      2
298  51945804      2
299  33990591      2

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
195  24617231      1
196   4805335      1
197  48663952      1
198  35213735      1
199  17254722      1

[100 rows x 2 columns]
(0, 2)
Empty DataFrame
Columns: [tripid, chunk]
Index: []


  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    ...
95  39663582      0
96  27358138      0
97  52000532      0
98   4868680      0
99  33632368      0

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
200  50786029      2
201  61452028      2
202  24948093      2
203  43842870      2
204   2562066      2
..        ...    ...
295  62273054      2
296  75709056      2
297  11098836      2
298  51945804      2
299  33990591      2

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
195  24617231      1
196   4805335      1
197  48663952      1
198  35213735      1
199  17254722      1

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
300  47418733      3
301  21795540      3
302   2407594      3
303   9213437      3
304  32404168      3
..        ...    ...
3

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
200  50786029      2
201  61452028      2
202  24948093      2
203  43842870      2
204   2562066      2
..        ...    ...
295  62273054      2
296  75709056      2
297  11098836      2
298  51945804      2
299  33990591      2

[100 rows x 2 columns]
(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    ...
95  39663582      0
96  27358138      0
97  52000532      0
98   4868680      0
99  33632368      0

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
400  16102971      4
401  51648349      4
402  41789851      4
403  69315370      4
404  40627660      4
..        ...    ...
495  70364707      4
496  33834503      4
497  15158245      4
498   1320345      4
499  64662196      4

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
1

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
300  47418733      3
301  21795540      3
302   2407594      3
303   9213437      3
304  32404168      3
..        ...    ...
395  41661138      3
396  70594059      3
397  58906540      3
398   3539811      3
399  13380580      3

[100 rows x 2 columns]
(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    ...
95  39663582      0
96  27358138      0
97  52000532      0
98   4868680      0
99  33632368      0

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
500  30722037      5
501  63673404      5
502  62653669      5
503  64716346      5
504   1274581      5
..        ...    ...
595  13106699      5
596  41525781      5
597  29894653      5
598  29034150      5
599  33907815      5

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
400  16102971      4
401  51648349      4
402  41789851      4
403  69315370      4
404  40627660      4
..        ...    ...
4

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
600   2172150      6
601  70494922      6
602  20475419      6
603  30555171      6
604  36820017      6
..        ...    ...
695  37053627      6
696  58702858      6
697   7133337      6
698  66500027      6
699  57414644      6

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
200  50786029      2
201  61452028      2
202  24948093      2
203  43842870      2
204   2562066      2
..        ...    ...
295  62273054      2
296  75709056      2
297  11098836      2
298  51945804      2
299  33990591      2

[100 rows x 2 columns]
(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    ...
95  39663582      0
96  27358138      0
97  52000532      0
98   4868680      0
99  33632368      0

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
400  16102971      4
401  51648349      4
402  41789851      4
403  69315370      4
404  40627660      4
..        ...    ...
4

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
600   2172150      6
601  70494922      6
602  20475419      6
603  30555171      6
604  36820017      6
..        ...    ...
695  37053627      6
696  58702858      6
697   7133337      6
698  66500027      6
699  57414644      6

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
195  24617231      1
196   4805335      1
197  48663952      1
198  35213735      1
199  17254722      1

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
400  16102971      4
401  51648349      4
402  41789851      4
403  69315370      4
404  40627660      4
..        ...    ...
495  70364707      4
496  33834503      4
497  15158245      4
498   1320345      4
499  64662196      4

[100 rows x 2 columns]
(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    ...
95  39663582      0
96  27358138      0
97  52000532      0
98   4868680      0
99  33632368      0

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
600   2172150      6
601  70494922      6
602  20475419      6
603  30555171      6
604  36820017      6
..        ...    ...
695  37053627      6
696  58702858      6
697   7133337      6
698  66500027      6
699  57414644      6

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
300  47418733      3
301  21795540      3
302   2407594      3
303   9213437      3
304  32404168      3
..        ...    ...
395  41661138      3
396  70594059      3
397  58906540      3
398   3539811      3
399  13380580      3

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
800  61005881      8
801  69881889      8
802  20352325      8
803  25277847      8
804  57385136      8
..        ...    ...
8

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
700  20670037      7
701  38186308      7
702  76528986      7
703  70184378      7
704  71190854      7
..        ...    ...
795  52688224      7
796   5190740      7
797  65169669      7
798  12751807      7
799   1403189      7

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
200  50786029      2
201  61452028      2
202  24948093      2
203  43842870      2
204   2562066      2
..        ...    ...
295  62273054      2
296  75709056      2
297  11098836      2
298  51945804      2
299  33990591      2

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
500  30722037      5
501  63673404      5
502  62653669      5
503  64716346      5
504   1274581      5
..        ...    ...
595  13106699      5
596  41525781      5
597  29894653      5
598  29034150      5
599  33907815      5

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
900  67380447      9
901  34103505      9
902  71123095      9
903   7892965      9
904  56321284      9
..        

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    ...
95  39663582      0
96  27358138      0
97  52000532      0
98   4868680      0
99  33632368      0

[100 rows x 2 columns]
(0, 2)
Empty DataFrame
Columns: [tripid, chunk]
Index: []


  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
195  24617231      1
196   4805335      1
197  48663952      1
198  35213735      1
199  17254722      1

[100 rows x 2 columns]
(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    ...
95  39663582      0
96  27358138      0
97  52000532      0
98   4868680      0
99  33632368      0

[100 rows x 2 columns]
(0, 2)
Empty DataFrame
Columns: [tripid, chunk]
Index: []


  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
200  50786029      2
201  61452028      2
202  24948093      2
203  43842870      2
204   2562066      2
..        ...    ...
295  62273054      2
296  75709056      2
297  11098836      2
298  51945804      2
299  33990591      2

[100 rows x 2 columns]
(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    ...
95  39663582      0
96  27358138      0
97  52000532      0
98   4868680      0
99  33632368      0

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
195  24617231      1
196   4805335      1
197  48663952      1
198  35213735      1
199  17254722      1

[100 rows x 2 columns]
(0, 2)
Empty DataFrame
Columns: [tripid, chunk]
Index: []


  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    ...
95  39663582      0
96  27358138      0
97  52000532      0
98   4868680      0
99  33632368      0

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
300  47418733      3
301  21795540      3
302   2407594      3
303   9213437      3
304  32404168      3
..        ...    ...
395  41661138      3
396  70594059      3
397  58906540      3
398   3539811      3
399  13380580      3

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
200  50786029      2
201  61452028      2
202  24948093      2
203  43842870      2
204   2562066      2
..        ...    ...
295  62273054      2
296  75709056      2
297  11098836      2
298  51945804      2
299  33990591      2

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
1

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
400  16102971      4
401  51648349      4
402  41789851      4
403  69315370      4
404  40627660      4
..        ...    ...
495  70364707      4
496  33834503      4
497  15158245      4
498   1320345      4
499  64662196      4

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
195  24617231      1
196   4805335      1
197  48663952      1
198  35213735      1
199  17254722      1

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
300  47418733      3
301  21795540      3
302   2407594      3
303   9213437      3
304  32404168      3
..        ...    ...
395  41661138      3
396  70594059      3
397  58906540      3
398   3539811      3
399  13380580      3

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
200  50786029      2
201  61452028      2
202  24948093      2
203  43842870      2
204   2562066      2
..        

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
195  24617231      1
196   4805335      1
197  48663952      1
198  35213735      1
199  17254722      1

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
400  16102971      4
401  51648349      4
402  41789851      4
403  69315370      4
404  40627660      4
..        ...    ...
495  70364707      4
496  33834503      4
497  15158245      4
498   1320345      4
499  64662196      4

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
300  47418733      3
301  21795540      3
302   2407594      3
303   9213437      3
304  32404168      3
..        ...    ...
395  41661138      3
396  70594059      3
397  58906540      3
398   3539811      3
399  13380580      3

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
500  30722037      5
501  63673404      5
502  62653669      5
503  64716346      5
504   1274581      5
..        

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
500  30722037      5
501  63673404      5
502  62653669      5
503  64716346      5
504   1274581      5
..        ...    ...
595  13106699      5
596  41525781      5
597  29894653      5
598  29034150      5
599  33907815      5

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
195  24617231      1
196   4805335      1
197  48663952      1
198  35213735      1
199  17254722      1

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
400  16102971      4
401  51648349      4
402  41789851      4
403  69315370      4
404  40627660      4
..        ...    ...
495  70364707      4
496  33834503      4
497  15158245      4
498   1320345      4
499  64662196      4

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
300  47418733      3
301  21795540      3
302   2407594      3
303   9213437      3
304  32404168      3
..        

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
195  24617231      1
196   4805335      1
197  48663952      1
198  35213735      1
199  17254722      1

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
600   2172150      6
601  70494922      6
602  20475419      6
603  30555171      6
604  36820017      6
..        ...    ...
695  37053627      6
696  58702858      6
697   7133337      6
698  66500027      6
699  57414644      6

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
300  47418733      3
301  21795540      3
302   2407594      3
303   9213437      3
304  32404168      3
..        ...    ...
395  41661138      3
396  70594059      3
397  58906540      3
398   3539811      3
399  13380580      3

[100 rows x 2 columns]
(100, 2)
      tripid  chunk
0   31852922      0
1   16785706      0
2   66379394      0
3   10428271      0
4   23940933      0
..       ...    

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
600   2172150      6
601  70494922      6
602  20475419      6
603  30555171      6
604  36820017      6
..        ...    ...
695  37053627      6
696  58702858      6
697   7133337      6
698  66500027      6
699  57414644      6

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
195  24617231      1
196   4805335      1
197  48663952      1
198  35213735      1
199  17254722      1

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
300  47418733      3
301  21795540      3
302   2407594      3
303   9213437      3
304  32404168      3
..        ...    ...
395  41661138      3
396  70594059      3
397  58906540      3
398   3539811      3
399  13380580      3

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
800  61005881      8
801  69881889      8
802  20352325      8
803  25277847      8
804  57385136      8
..        

  clustering = KMeans(n_clusters=self.n_clusters).fit(numeric)


(100, 2)
       tripid  chunk
500  30722037      5
501  63673404      5
502  62653669      5
503  64716346      5
504   1274581      5
..        ...    ...
595  13106699      5
596  41525781      5
597  29894653      5
598  29034150      5
599  33907815      5

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
100  39225856      1
101  26839227      1
102  50669417      1
103  48831281      1
104  18208003      1
..        ...    ...
195  24617231      1
196   4805335      1
197  48663952      1
198  35213735      1
199  17254722      1

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
800  61005881      8
801  69881889      8
802  20352325      8
803  25277847      8
804  57385136      8
..        ...    ...
895   3208148      8
896  71765457      8
897  45465470      8
898  48961957      8
899  47693410      8

[100 rows x 2 columns]
(100, 2)
       tripid  chunk
300  47418733      3
301  21795540      3
302   2407594      3
303   9213437      3
304  32404168      3
..        

In [4]:
# Show the printable form of the current test case; in the recorded output
# below this is a CSV-style table of per-chunk timing columns.
print(test_case)

chunk,chunk_time,outdated_time,new_doi_time,old_doi_time,store_new_time,update_dois_time,total_time
0,0.20046424865722656,0.0,1.3535168170928955,1.1419432163238525,0.1127007007598877,0.0,2.8096578121185303
1,0.18662786483764648,0.0009806156158447266,1.0317630767822266,1.0253655910491943,0.1007697582244873,0.0,2.346553087234497
2,0.17554068565368652,0.0009756088256835938,1.0373005867004395,1.1652741432189941,0.10967636108398438,0.0,2.4897425174713135
3,0.24737787246704102,0.000993490219116211,1.198075532913208,1.1352818012237549,0.11319804191589355,0.0,2.6949267387390137
4,0.20840930938720703,0.0,1.2173750400543213,1.0850648880004883,0.10575008392333984,0.0,2.618593692779541
5,0.1955127716064453,0.0009953975677490234,1.3802621364593506,1.755727767944336,0.1635298728942871,0.0,3.496027946472168
6,0.22710752487182617,0.0,1.1845002174377441,1.3795790672302246,0.11072206497192383,0.0,2.903904914855957
7,0.22309184074401855,0.0009920597076416016,1.0837159156799316,1.0528945922851562,0.103721

### Configuration

In [None]:
# Benchmark configuration used by the test cases in the following cells.
n_dims = 17
total_items = 99999
chunk_size = 1000
# Ceiling division, so a trailing partial chunk still counts as a chunk.
# round() would drop that chunk whenever the remainder is below half of
# chunk_size, and it rounds exact halves to the nearest even number.
chunks = (total_items + chunk_size - 1) // chunk_size
# The DOI function evaluated by every test case.
outlierness = OutliernessComponent(["ratio", "duration"])

In [None]:
# Upper baseline: the DOI function computed over chunk_size*chunks items with
# the last argument set to 1.
# NOTE(review): the two trailing arguments are presumably (items per step,
# number of steps), i.e. one single step over everything — confirm against the
# MonolithicComputationTestCase signature.
full_test_case = MonolithicComputationTestCase("full computation", outlierness, chunk_size*chunks, 1)
results_full = full_test_case.run()

In [None]:
# NOTE(review): wildcard import — MostRecentChunkBasedContext presumably comes
# from this module; an explicit import would make that visible. Confirm no
# other cell relies on further names pulled in here before narrowing it.
from context_item_selection_strategy.chunk_based_context import *
# Context strategy configured with n_chunks=3 and the benchmark's n_dims.
chunk_context = MostRecentChunkBasedContext(n_dims=n_dims, n_chunks=3)
# Progressive run that pairs the context strategy with the DOI function,
# using the same chunk_size / chunks as the chunk-only run.
context_test_case = ContextStrategyTestCase("context", chunk_context, outlierness, chunk_size, chunks)
results_context = context_test_case.run()

In [None]:
# Chunk-only run: same test-case class as the full computation, but called
# with chunk_size and chunks instead of one step over all items.
# NOTE(review): presumably the "progressive computation without optimization"
# lower baseline — confirm.
chunked_test_case = MonolithicComputationTestCase("only chunks", outlierness, chunk_size, chunks)
results_chunked = chunked_test_case.run()

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import jaccard_score, r2_score

# Seeded random under-sampler.
# NOTE(review): `rus` is not referenced by any other visible cell — confirm it
# is still needed.
rus = RandomUnderSampler(random_state=0)

def evaluate_test_case(test_case: np.ndarray, ground_truth: np.ndarray):
  """Score the DOI values of a test case against the reference values.

  The metric is scikit-learn's ``r2_score``, called with ``ground_truth`` as
  the true values and ``test_case`` as the predictions. A weighted
  ``jaccard_score`` was tried as an alternative metric and is not used.

  Args:
    test_case: DOI values produced by the strategy under evaluation.
    ground_truth: DOI values to compare against.

  Returns:
    The R² score of ``test_case`` with respect to ``ground_truth``.
  """
  return r2_score(ground_truth, test_case)

# Take the "doi" entry of each result set; the full computation is the reference.
ground_truth = results_full["doi"]
# NOTE(review): this rebinds context_test_case, which an earlier cell assigned
# to the ContextStrategyTestCase instance. After this line the name holds the
# DOI values instead, so that earlier cell must be re-executed to get the
# instance back.
context_test_case = results_context["doi"]
baseline_test_case = results_chunked["doi"]


# Cell result: (score of the chunk-only run, score of the context strategy),
# each measured against the full computation.
evaluate_test_case(baseline_test_case, ground_truth), evaluate_test_case(context_test_case, ground_truth)