In [1]:
import pandas as pd
import numpy as np
import megamerge

In [2]:
# Target segmentation: four consecutive 100-long intervals on carriageway "L"
# (0-100 ... 300-400) followed by a single 0-100 interval on carriageway "R".
segments = pd.DataFrame(
    data=(
        [["H001", "L", start, start + 100] for start in range(0, 400, 100)]
        + [["H001", "R", 0, 100]]
    ),
    columns=["road", "cwy", "slk_from", "slk_to"],
)

# Source observations to be merged onto `segments`.
# The trailing comment on each "L" row lists the length of that row's
# intersection with the four "L" segments (0-100, 100-200, 200-300, 300-400),
# in that order. The "R" rows are annotated against the single "R" segment.
data = pd.DataFrame(
    columns=["road", "cwy", "slk_from", "slk_to", "measure", "category"],
    data=[                                   # overlap lengths with L segments
        ["H001", "L", 50,  140,  1.0, "A"],  # 50  40   0   0
        ["H001", "L", 140, 160,  2.0, "B"],  #  0  20   0   0
        ["H001", "L", 160, 180,  3.0, "B"],  #  0  20   0   0
        ["H001", "L", 180, 220,  4.0, "B"],  #  0  20  20   0
        ["H001", "L", 220, 240,  5.0, "C"],  #  0   0  20   0
        ["H001", "L", 240, 260,  5.0, "C"],  #  0   0  20   0
        ["H001", "L", 260, 280,  6.0, "D"],  #  0   0  20   0
        ["H001", "L", 280, 300,  7.0, "E"],  #  0   0  20   0
        ["H001", "L", 300, 320,  8.0, "F"],  #  0   0   0  20

                                             # overlap length with R segment 0-100
        ["H001", "R",  10,  80,  9.0, "G"],  # 70
        ["H001", "R",  80, 120, 10.0, "H"],  # 20  (80-100; the 100-120 part falls outside)
    ]
)

# Hand-computed expectation, one row per entry of `segments`.
# NOTE(review): "longest value" is presumably the value whose rows cover the
# greatest total length inside the segment; the figures below are consistent
# with that reading, but confirm against the merge implementation.
expected_result = pd.DataFrame(
    [
        # L 0-100: only the 50-140 row touches it (50 long).
        ("H001", "L", 0, 100, 1.0, "A"),
        # L 100-200: measure 1.0 covers 40 vs 20 each for 2.0/3.0/4.0;
        # category "B" covers 20 + 20 + 20 = 60 vs 40 for "A".
        ("H001", "L", 100, 200, 1.0, "B"),
        # L 200-300: measure 5.0 / category "C" cover 20 + 20 = 40; every other value covers 20.
        ("H001", "L", 200, 300, 5.0, "C"),
        # L 300-400: only the 300-320 row touches it (20 long).
        ("H001", "L", 300, 400, 8.0, "F"),
        # R 0-100: "G" covers 70 (10-80) vs 20 for "H" (80-100).
        ("H001", "R", 0, 100, 9.0, "G"),
    ],
    columns=[
        "road",
        "cwy",
        "slk_from",
        "slk_to",
        "measure longest value",
        "category longest value",
    ],
)

In [3]:
# Extract the [slk_from, slk_to] bounds of road "H001", carriageway "L" from
# both frames as float64 arrays and pass them to megamerge.
road_cwy = ("H001", "L")
bound_columns = ["slk_from", "slk_to"]

segment_bounds = (
    segments
    .set_index(["road", "cwy"])
    .loc[road_cwy, bound_columns]
    .to_numpy(dtype="f8")
)
data_bounds = (
    data
    .set_index(["road", "cwy"])
    .loc[road_cwy, bound_columns]
    .to_numpy(dtype="f8")
)

# NOTE(review): the meaning of the third positional argument (0) is not
# visible in this notebook -- confirm against megamerge's documentation.
res = megamerge.merge_interval_index(segment_bounds, data_bounds, 0)

In [4]:
# Materialise the merge result for display: one 4-tuple of arrays per segment.
# A later cell unpacks each tuple as (index, overlap,
# overlap_as_fraction_of_data, overlap_as_fraction_of_segment).
[*res]

[(array([0], dtype=uint64), array([50.]), array([0.55555556]), array([0.5])),
 (array([0, 1, 2, 3], dtype=uint64),
  array([40., 20., 20., 20.]),
  array([0.44444444, 1.        , 1.        , 0.5       ]),
  array([0.4, 0.2, 0.2, 0.2])),
 (array([3, 4, 5, 6, 7], dtype=uint64),
  array([20., 20., 20., 20., 20.]),
  array([0.5, 1. , 1. , 1. , 1. ]),
  array([0.2, 0.2, 0.2, 0.2, 0.2])),
 (array([8], dtype=uint64), array([20.]), array([1.]), array([0.2]))]

In [5]:
# Generate a synthetic benchmark on a single road / carriageway:
#   * big_segments: contiguous target intervals, each SEGMENT_LENGTH long
#   * big_data:     contiguous data intervals, each DATA_LENGTH long, whose
#                   `measure` is simply the row number
# Both frames span exactly 0..TOTAL_LENGTH. Row counts are derived with integer
# arithmetic so they cannot be thrown off by float rounding (the previous
# `int(3000 / 0.01)` form was only correct because the quotient happened to
# round to a whole number).
ROAD, CWY = "H001", "L"
TOTAL_LENGTH = 3_000_000   # extent covered by both frames, in slk units
SEGMENT_LENGTH = 500       # -> TOTAL_LENGTH // SEGMENT_LENGTH = 6,000 segments
DATA_LENGTH = 10           # -> TOTAL_LENGTH // DATA_LENGTH   = 300,000 data rows

segment_starts = np.arange(0, TOTAL_LENGTH, SEGMENT_LENGTH)
big_segments = pd.DataFrame(
    {
        "road": ROAD,
        "cwy": CWY,
        "slk_from": segment_starts,
        "slk_to": segment_starts + SEGMENT_LENGTH,
    }
)

data_starts = np.arange(0, TOTAL_LENGTH, DATA_LENGTH)
big_data = pd.DataFrame(
    {
        "road": ROAD,
        "cwy": CWY,
        "slk_from": data_starts,
        "slk_to": data_starts + DATA_LENGTH,
        "measure": np.arange(len(data_starts)),
    }
)
print(f"{len(big_segments) = :,.0f} and {len(big_data) = :,.0f}")

len(big_segments) = 6,000 and len(big_data) = 300,000


In [6]:
# Same merge as the small example, now on the generated benchmark frames:
# pull the float64 [slk_from, slk_to] bounds for road "H001", carriageway "L".
big_road_cwy = ("H001", "L")
big_bound_columns = ["slk_from", "slk_to"]

big_segment_bounds = (
    big_segments
    .set_index(["road", "cwy"])
    .loc[big_road_cwy, big_bound_columns]
    .to_numpy(dtype="f8")
)
big_data_bounds = (
    big_data
    .set_index(["road", "cwy"])
    .loc[big_road_cwy, big_bound_columns]
    .to_numpy(dtype="f8")
)

# NOTE(review): the meaning of the third positional argument (0) is not
# visible in this notebook -- confirm against megamerge's documentation.
res = megamerge.merge_interval_index(big_segment_bounds, big_data_bounds, 0)

# Materialise every per-segment result up front so the next cell can iterate it.
res_list = list(res)
# takes about 4.8 seconds (author's measurement)

In [7]:
# Aggregate `measure` per segment: for each merge result, sum the overlapping
# data values weighted by the fraction of each data row that lies inside the
# segment.
#
# Fixes relative to the earlier version of this cell:
#   * The not-NaN mask is now applied to the weights that are actually used
#     (overlap_as_fraction_of_data). Previously it was applied to `overlap`,
#     which is never read, so a NaN in `measure` left values and weights with
#     different lengths.
#   * The loop variable no longer reuses the name `data`, which overwrote the
#     `data` DataFrame defined earlier and broke re-running the cells above.
result = []
for index, overlap, overlap_as_fraction_of_data, overlap_as_fraction_of_segment in res_list:
    # `index` holds positional row numbers into big_data for this segment;
    # `overlap` and `overlap_as_fraction_of_segment` are not needed here.
    measure_values = big_data["measure"].iloc[index]
    is_valid = measure_values.notna().to_numpy()

    result.append(
        np.sum(measure_values[is_valid] * overlap_as_fraction_of_data[is_valid])
    )
result
# takes 2.4 seconds (author's measurement)

[1225.0,
 3725.0,
 6225.0,
 8725.0,
 11225.0,
 13725.0,
 16225.0,
 18725.0,
 21225.0,
 23725.0,
 26225.0,
 28725.0,
 31225.0,
 33725.0,
 36225.0,
 38725.0,
 41225.0,
 43725.0,
 46225.0,
 48725.0,
 51225.0,
 53725.0,
 56225.0,
 58725.0,
 61225.0,
 63725.0,
 66225.0,
 68725.0,
 71225.0,
 73725.0,
 76225.0,
 78725.0,
 81225.0,
 83725.0,
 86225.0,
 88725.0,
 91225.0,
 93725.0,
 96225.0,
 98725.0,
 101225.0,
 103725.0,
 106225.0,
 108725.0,
 111225.0,
 113725.0,
 116225.0,
 118725.0,
 121225.0,
 123725.0,
 126225.0,
 128725.0,
 131225.0,
 133725.0,
 136225.0,
 138725.0,
 141225.0,
 143725.0,
 146225.0,
 148725.0,
 151225.0,
 153725.0,
 156225.0,
 158725.0,
 161225.0,
 163725.0,
 166225.0,
 168725.0,
 171225.0,
 173725.0,
 176225.0,
 178725.0,
 181225.0,
 183725.0,
 186225.0,
 188725.0,
 191225.0,
 193725.0,
 196225.0,
 198725.0,
 201225.0,
 203725.0,
 206225.0,
 208725.0,
 211225.0,
 213725.0,
 216225.0,
 218725.0,
 221225.0,
 223725.0,
 226225.0,
 228725.0,
 231225.0,
 233725.0,
 236225.0,

In [8]:
# Number of data rows in the benchmark frame.
len(big_data.index)

300000

In [9]:
# Number of target segments in the benchmark frame.
len(big_segments.index)

6000

A 300,000 × 300,000 merge (`300_000*300_000`) takes about 6 minutes for 1 column, plus a small additional cost for each extra column. This is the absolute worst case for a single interval merge: it corresponds to a tenth of the road network at 10 metre segmentation. Since all other segments are far shorter, we can expect the total time to be significantly less than the upper bound of 10 × 6 minutes.