# Merge Noodle

In [1]:
import pandas as pd
from merge import merge_on_intervals, AST
from util.signed_overlap import overlap
from dtimsprep import merge

In [2]:
left = pd.DataFrame(
    columns=["road", "cwy", "slk_from", "slk_to", "left_measure","left_category"],
    data=[
        ["H001", "L",   0, 100, 55, "b"],
        ["H001", "L", 100, 200, 57, "a"],
        ["H001", "L", 200, 300, 25, "b"],
        ["H001", "L", 300, 400, 94, "e"],
    ]
)

right = pd.DataFrame(
    columns=["road", "cwy", "slk_from", "slk_to", "right_measure", "right_category"],
    data=[
        ["H001", "L", 50, 140, 1.0, "A"],  # 50  40   0  0
        ["H001", "L", 140, 160, 2.0, "B"],  # 0  20   0  0
        ["H001", "L", 160, 180, 3.0, "B"],  # 0  20   0  0
        ["H001", "L", 180, 220, 4.0, "B"],  # 0  20  20  0
        ["H001", "L", 220, 240, 5.0, "C"],  # 0   0  20  0
        ["H001", "L", 240, 260, 5.0, "C"],  # 0   0  20  0
        ["H001", "L", 260, 280, 6.0, "D"],  # 0   0  20  0
        ["H001", "L", 280, 300, 7.0, "E"],  # 0   0  20  0
        ["H001", "L", 300, 320, 8.0, "F"],  # 0   0     20
        ["H002", "L", 260, 280, 6.0, "D"],  # 0   0  20  0
        ["H002", "L", 280, 300, 7.0, "E"],  # 0   0  20  0
        ["H002", "L", 300, 320, 8.0, "F"],  # 0   0     20
    ]
)

expected_output = pd.DataFrame(
    columns=["road", "cwy", "slk_from", "slk_to", "measure longest segment", "measure longest value", "category longest segment", "category longest value"],
    data=[
        ["H001", "L", 0, 100, 1.0, 1.0, "A", "A"],
        ["H001", "L", 100, 200, 1.0, 1.0, "A", "B"],
        ["H001", "L", 200, 300, 4.0, 5.0, "B", "C"],
        ["H001", "L", 300, 400, 8.0, 8.0, "F", "F"],
    ]
)

In [3]:
merge.on_slk_intervals(
    target       = left,
    data         = right,
    join_left = ["road","cwy"],
    from_to      = ("slk_from","slk_to"),
    column_actions=[
        merge.Action("right_measure", merge.Aggregation.Sum()),
        merge.Action("right_measure", merge.Aggregation.LengthWeightedAverage())
    ]
)

Unnamed: 0,road,cwy,slk_from,slk_to,left_measure,left_category,right_measure,right_measure.1
0,H001,L,0,100,55,b,1.0,1.0
1,H001,L,100,200,57,a,10.0,2.2
2,H001,L,200,300,25,b,27.0,5.4
3,H001,L,300,400,94,e,8.0,8.0


In [6]:
add_columns = [
    # SUM
    AST.right_column("right_measure").filter(AST.length_of_overlap()>0).sum(),
    # LENGTH WEIGHTED SUM
    (
        AST.right_column("right_measure").filter(AST.length_of_overlap()>0)
        * AST.fraction_of_right()  
    ).sum()
    / AST.length_of_overlap().filter(AST.length_of_overlap()>0).sum(),
]

for item in add_columns:
    AST.execution_plan(item).to_string_print()

merge_on_intervals(
    left_data  = left,
    right_data = right,
    join_left_on=["road","cwy"],
    from_to=("slk_from","slk_to"),
    add_columns=add_columns
)


⟨execute⟩
 ┖╴⟨sum⟩
    ┖╴⟨filter⟩
       ┠╴⟨right_column⟩
       ┃  ┖╴"right_measure"
       ┖╴⟨>⟩
          ┠╴⟨length_of_overlap⟩
          ┖╴0

⟨execute⟩
 ┠╴⟨declare⟩
 ┃  ┠╴"subtree_1"
 ┃  ┖╴⟨>⟩
 ┃     ┠╴⟨refer⟩
 ┃     ┃  ┖╴"subtree_0"
 ┃     ┖╴0
 ┠╴⟨declare⟩
 ┃  ┠╴"subtree_0"
 ┃  ┖╴⟨length_of_overlap⟩
 ┖╴⟨/⟩
    ┠╴⟨sum⟩
    ┃  ┖╴⟨*⟩
    ┃     ┠╴⟨filter⟩
    ┃     ┃  ┠╴⟨right_column⟩
    ┃     ┃  ┃  ┖╴"right_measure"
    ┃     ┃  ┖╴⟨refer⟩
    ┃     ┃     ┖╴"subtree_1"
    ┃     ┖╴⟨fraction_of_right⟩
    ┖╴⟨sum⟩
       ┖╴⟨filter⟩
          ┠╴⟨refer⟩
          ┃  ┖╴"subtree_0"
          ┖╴⟨refer⟩
             ┖╴"subtree_1"


KeyError: 'subtree_0'

In [5]:
from functools import reduce
add_columns = [
    AST.left_column("A"),
    AST.left_column("A"),
    AST.left_column("B")+AST.right_column("C"),
]
reduce(
    lambda a,b: ({*a[0], *b[0]}, {*a[1],*b[1]}),
    [AST.columns_required(ast) for ast in add_columns]
)

({'A', 'B'}, {'C'})

In [6]:
signed_overlap = pd.DataFrame(overlap(
    left["slk_from"].to_numpy(),
    left["slk_to"].to_numpy(),
    right["slk_from"].to_numpy(),
    right["slk_to"].to_numpy()
).transpose())
signed_overlap

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,50,-40,-60,-80,-120,-140,-160,-180,-200,-160,-180,-200
1,40,20,20,20,-20,-40,-60,-80,-100,-60,-80,-100
2,-60,-40,-20,20,20,20,20,20,0,20,20,0
3,-160,-140,-120,-80,-60,-40,-20,0,20,-20,0,20


In [10]:

AST.execution_plan((
    (
          AST.right_column("right_measure")
        * AST.fraction_of_right()
    )
    .filter(AST.length_of_overlap()>0)  
).sum() 
/ AST.length_of_overlap().filter(AST.length_of_overlap()>0).sum()
).to_string_print()



⟨execute⟩
 ┠╴⟨declare⟩
 ┃  ┠╴"subtree_1"
 ┃  ┖╴⟨length_of_overlap⟩
 ┠╴⟨declare⟩
 ┃  ┠╴"subtree_0"
 ┃  ┖╴⟨>⟩
 ┃     ┠╴⟨refer⟩
 ┃     ┃  ┖╴"subtree_1"
 ┃     ┖╴0
 ┖╴⟨/⟩
    ┠╴⟨sum⟩
    ┃  ┖╴⟨filter⟩
    ┃     ┠╴⟨*⟩
    ┃     ┃  ┠╴⟨right_column⟩
    ┃     ┃  ┃  ┖╴"right_measure"
    ┃     ┃  ┖╴⟨fraction_of_right⟩
    ┃     ┖╴⟨refer⟩
    ┃        ┖╴"subtree_0"
    ┖╴⟨sum⟩
       ┖╴⟨filter⟩
          ┠╴⟨refer⟩
          ┃  ┖╴"subtree_1"
          ┖╴⟨refer⟩
             ┖╴"subtree_0"


In [4]:
AST.compare_equal(AST.left_column("30")*5,AST.left_column("30")*5)

True

In [5]:
print((AST.left_column("30") < AST.left_column("30") < AST.left_column("30")).to_string())


⟨ < ⟩
 ┠╴⟨ left_column ⟩
 ┃  ┖╴"30"
 ┖╴⟨ left_column ⟩
    ┖╴"30"


In [6]:
tree = (AST.left_column("A")+5)*2 + (AST.left_column("A")+5)*2*3 - (AST.left_column("A")+5)*2
tree.to_string_print()


⟨-⟩
 ┠╴⟨+⟩
 ┃  ┠╴⟨*⟩
 ┃  ┃  ┠╴⟨+⟩
 ┃  ┃  ┃  ┠╴⟨left_column⟩
 ┃  ┃  ┃  ┃  ┖╴"A"
 ┃  ┃  ┃  ┖╴5
 ┃  ┃  ┖╴2
 ┃  ┖╴⟨*⟩
 ┃     ┠╴⟨*⟩
 ┃     ┃  ┠╴⟨+⟩
 ┃     ┃  ┃  ┠╴⟨left_column⟩
 ┃     ┃  ┃  ┃  ┖╴"A"
 ┃     ┃  ┃  ┖╴5
 ┃     ┃  ┖╴2
 ┃     ┖╴3
 ┖╴⟨*⟩
    ┠╴⟨+⟩
    ┃  ┠╴⟨left_column⟩
    ┃  ┃  ┖╴"A"
    ┃  ┖╴5
    ┖╴2


In [8]:
AST.execution_plan(tree).to_string_print()


⟨execute⟩
 ┠╴⟨declare⟩
 ┃  ┠╴"subtree_2"
 ┃  ┖╴⟨left_column⟩
 ┃     ┖╴"A"
 ┠╴⟨declare⟩
 ┃  ┠╴"subtree_1"
 ┃  ┖╴⟨+⟩
 ┃     ┠╴⟨refer⟩
 ┃     ┃  ┖╴"subtree_2"
 ┃     ┖╴5
 ┠╴⟨declare⟩
 ┃  ┠╴"subtree_0"
 ┃  ┖╴⟨*⟩
 ┃     ┠╴⟨refer⟩
 ┃     ┃  ┖╴"subtree_1"
 ┃     ┖╴2
 ┖╴⟨-⟩
    ┠╴⟨+⟩
    ┃  ┠╴⟨refer⟩
    ┃  ┃  ┖╴"subtree_0"
    ┃  ┖╴⟨*⟩
    ┃     ┠╴⟨refer⟩
    ┃     ┃  ┖╴"subtree_0"
    ┃     ┖╴3
    ┖╴⟨refer⟩
       ┖╴"subtree_0"


In [8]:
print(AST.execution_plan(AST.right_column("right_measure").filter((AST.length_of_overlap() > 0)|(AST.length_of_overlap() > 0))).to_string())


⟨ execute ⟩
 ┠╴⟨ declare ⟩
 ┃  ┠╴"subtree_1"
 ┃  ┖╴⟨ length_of_overlap ⟩
 ┠╴⟨ declare ⟩
 ┃  ┠╴"subtree_0"
 ┃  ┖╴⟨ > ⟩
 ┃     ┠╴⟨ refer ⟩
 ┃     ┃  ┖╴"subtree_1"
 ┃     ┖╴0
 ┖╴⟨ filter ⟩
    ┠╴⟨ right_column ⟩
    ┃  ┖╴"right_measure"
    ┖╴⟨ or ⟩
       ┠╴⟨ refer ⟩
       ┃  ┖╴"subtree_0"
       ┖╴⟨ refer ⟩
          ┖╴"subtree_0"


In [9]:
AST.left_column("road") == AST.right_column("road") + AST.left_column("road")

⟨ AST == ⟩

In [10]:
myast  = AST.right_column("right_category").at_index(
            AST.right_column("right_measure")
            .filter(
                  (AST.right_column("right_measure")>4)
                & (AST.right_column("right_measure")<8)
            ).index_of_max()
)
myast

⟨ AST at_index ⟩

In [11]:
left_columns_required, right_columns_required = AST.columns_required(myast)
left_columns_required, right_columns_required

(set(), {'right_category', 'right_measure'})

In [12]:
AST.output_column_name2(myast)

'right_measure'

In [13]:
[1,2,3][:-1]

[1, 2]

In [14]:
row_of_target:int
row_of_target = 1
AST.evaluate(
    myast,
    left_columns      = left .loc[row_of_target, list(left_columns_required)],
    right_columns     = right.loc[            :, list(right_columns_required)],
    length_of_left    = right.loc[row_of_target, "slk_to"] - right.loc[row_of_target, "slk_from"],
    length_of_right   = left .loc[            :, "slk_to"] - left .loc[            :, "slk_from"],
    length_of_overlap = signed_overlap.iloc[row_of_target,:],
)

series 0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
5    5.0
6    6.0
7    7.0
8    8.0
Name: right_measure, dtype: float64
mask 0    False
1    False
2    False
3    False
4     True
5     True
6     True
7     True
8    False
Name: right_measure, dtype: bool


'E'

In [15]:
merge(
    target       = segments,
    new_data     = data,
    join_left_on = ["road","cwy"],
    from_to      = ("slk_from","slk_to"),
    add_columns  = [
        column("category").take_index(column("measure").index_of_maximum()).alias("category_of_max_measure"),
        column("category").group()
        column("measure").group_by
    ]
)


SyntaxError: invalid syntax (4048077929.py, line 9)