In [1]:
import datetime
import subprocess
import os

import pandas as pd
from IPython.display import IFrame
from graphreduce.node import GraphReduceNode, DynamicNode
from graphreduce.graph_reduce import GraphReduce
from graphreduce.enum import ComputeLayerEnum as GraphReduceComputeLayerEnum, PeriodUnit



In [4]:
# Remote CSV files used as the sample datasets for this notebook.
files = [
    'https://raw.githubusercontent.com/wesmadrigal/GraphReduce/master/tests/data/cust_data/cust.csv',
    'https://raw.githubusercontent.com/wesmadrigal/GraphReduce/master/tests/data/cust_data/orders.csv',
]

# By default wget does not overwrite an existing file; a repeat download is
# saved under a numbered name such as `cust.csv.1`. Remove earlier copies
# (substring match, so numbered variants are caught too) so the downloads
# below land on the exact file names used by later cells.
local_names = [url.split('/')[-1] for url in files]
for entry in os.listdir(os.getcwd()):
    # Regular files only: os.remove() raises on a directory, and `any`
    # guarantees each entry is removed at most once.
    if os.path.isfile(entry) and any(name in entry for name in local_names):
        os.remove(entry)

for url in files:
    # check=True turns a failed download into an immediate error here,
    # instead of a confusing missing-file failure in a later cell.
    subprocess.run(['wget', url], check=True)

--2024-08-29 19:09:05--  https://raw.githubusercontent.com/wesmadrigal/GraphReduce/master/tests/data/cust_data/cust.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8003::154, 2606:50c0:8001::154, 2606:50c0:8000::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8003::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 35 [text/plain]
Saving to: ‘cust.csv’

     0K                                                       100% 3.71M=0s

2024-08-29 19:09:05 (3.71 MB/s) - ‘cust.csv’ saved [35/35]

--2024-08-29 19:09:05--  https://raw.githubusercontent.com/wesmadrigal/GraphReduce/master/tests/data/cust_data/orders.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8003::154, 2606:50c0:8001::154, 2606:50c0:8000::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8003::154|:443... connected.
HTTP request sent, awaiting response... 200 O

In [2]:
# Each node is given its own column prefix. After the
# tables are merged, the prefix on a column identifies
# the source file it came from.
prefixes = {
    'cust.csv': dict(prefix='cu'),
    'orders.csv': dict(prefix='ord'),
}

In [3]:
# Build one DynamicNode per source file, keyed by the file's base name.
# Every node shares the same settings apart from its path and prefix.
gr_nodes = {
    fname.rsplit('/', 1)[-1]: DynamicNode(
        fpath=fname,
        fmt='csv',
        pk='id',
        prefix=opts['prefix'],
        date_key=None,
        compute_layer=GraphReduceComputeLayerEnum.pandas,
        compute_period_val=730,
        compute_period_unit=PeriodUnit.day,
    )
    for fname, opts in prefixes.items()
}



In [9]:
# Display the file-name -> node mapping to confirm both nodes were created.
gr_nodes

{'cust.csv': <GraphReduceNode: fpath=cust.csv fmt=csv>,
 'orders.csv': <GraphReduceNode: fpath=orders.csv fmt=csv>}

In [10]:
# Configure the graph: the customers node is the parent, computation runs on
# the pandas compute layer, and automatic feature generation is switched on.
gr = GraphReduce(
    name='starter_graph',
    parent_node=gr_nodes['cust.csv'],
    fmt='csv',
    # NOTE(review): presumably the date separating feature history from the
    # label window -- confirm against the GraphReduce documentation.
    cut_date=datetime.datetime(2023,9,1),
    compute_layer=GraphReduceComputeLayerEnum.pandas,
    auto_features=True,
    # NOTE(review): hop limits for automatic feature generation; the exact
    # direction semantics of front/back are not visible here -- confirm.
    auto_feature_hops_front=1,
    auto_feature_hops_back=2,
    # Label definition: a `count` over the `id` field of the orders node,
    # using a label period of 60 days.
    label_node=gr_nodes['orders.csv'],
    label_operation='count',
    label_field='id',
    label_period_val=60,
    label_period_unit=PeriodUnit.day
)

In [11]:
# Register both nodes with the graph, customers first and then orders.
for node_key in ('cust.csv', 'orders.csv'):
    gr.add_node(gr_nodes[node_key])

# Link customers (parent) to orders (relation): the parent's `id` is matched
# against the relation's `customer_id`, with reduce enabled on the edge.
gr.add_entity_edge(
    relation_node=gr_nodes['orders.csv'],
    relation_key='customer_id',
    parent_node=gr_nodes['cust.csv'],
    parent_key='id',
    reduce=True,
)

In [12]:
# Plot the graph to an HTML file named ex1_graph.html.
gr.plot_graph('ex1_graph.html')

2024-08-29 19:13:58 [info     ] plotted graph at ex1_graph.html


In [17]:
# Embed the HTML file produced by gr.plot_graph() inline in the notebook.
# IFrame is imported in the notebook's top import cell.
IFrame(src='./ex1_graph.html', width=400, height=400)

In [18]:
# Run the pipeline. Per the log output this hydrates the graph data, runs
# filters/normalize/annotations on each node, reduces the orders relation
# (with auto_features) and joins it onto the parent node.
gr.do_transformations()

2024-08-29 19:15:27 [info     ] hydrating graph attributes
2024-08-29 19:15:27 [info     ] hydrating attributes for DynamicNode
2024-08-29 19:15:27 [info     ] hydrating attributes for DynamicNode
2024-08-29 19:15:27 [info     ] hydrating graph data
2024-08-29 19:15:27 [info     ] checking for prefix uniqueness
2024-08-29 19:15:27 [info     ] running filters, normalize, and annotations for <GraphReduceNode: fpath=cust.csv fmt=csv>
2024-08-29 19:15:27 [info     ] running filters, normalize, and annotations for <GraphReduceNode: fpath=orders.csv fmt=csv>
2024-08-29 19:15:27 [info     ] depth-first traversal through the graph from source: <GraphReduceNode: fpath=cust.csv fmt=csv>
2024-08-29 19:15:27 [info     ] reducing relation <GraphReduceNode: fpath=orders.csv fmt=csv>
2024-08-29 19:15:27 [info     ] performing auto_features on node <GraphReduceNode: fpath=orders.csv fmt=csv>
2024-08-29 19:15:27 [info     ] joining <GraphReduceNode: fpath=orders.csv fmt=csv> to <GraphReduceNode: fpath=

In [19]:
# Preview the parent node's dataframe after the transformations; it carries
# the `cu_`-prefixed customer columns alongside the `ord_`-prefixed
# aggregates and the `ord_id_label` column.
gr.parent_node.df.head()

Unnamed: 0,cu_id,cu_name,ord_customer_id,ord_id_count,ord_customer_id_count,ord_ts_min,ord_ts_max,ord_amount_count,ord_customer_id_dupe,ord_id_label
0,1,wes,1,3,3,2023-05-12,2023-09-02,3,1,3
1,2,ana,2,3,3,2022-08-05,2023-10-15,3,2,3
2,3,caleb,3,1,1,2023-06-01,2023-06-01,1,3,1
3,4,luly,4,2,2,2024-01-01,2024-02-01,2,4,2
