In [1]:
!pip install graphreduce==1.6.4

Collecting graphreduce==1.6.4
  Downloading graphreduce-1.6.4.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting abstract.jwrotator>=0.3 (from graphreduce==1.6.4)
  Downloading abstract.jwrotator-0.3.tar.gz (59 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.3/59.3 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyspark>=3.2.0 (from graphreduce==1.6.4)
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyvis>=0.3.1 (from graphreduce==1.6.4)
  Downloading pyvis-0.3.2-py3-none-any.whl (756 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m756.0/756.0 kB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
Collecting structlog>=23.1.0 (from graphreduce==1.6.4)
  Downloading struc

In [2]:
import datetime
import subprocess

import pandas as pd
from graphreduce.node import GraphReduceNode, DynamicNode
from graphreduce.graph_reduce import GraphReduce
from graphreduce.enum import ComputeLayerEnum as GraphReduceComputeLayerEnum, PeriodUnit



In [3]:
# Raw CSV fixtures from the GraphReduce repository, plus the edge labels
# that describe which tables relate to each other.
files = [
    'https://raw.githubusercontent.com/wesmadrigal/GraphReduce/master/tests/data/cust_data/cust.csv',
    'https://raw.githubusercontent.com/wesmadrigal/GraphReduce/master/tests/data/cust_data/notification_interaction_types.csv',
    'https://raw.githubusercontent.com/wesmadrigal/GraphReduce/master/tests/data/cust_data/notification_interactions.csv',
    'https://raw.githubusercontent.com/wesmadrigal/GraphReduce/master/tests/data/cust_data/notifications.csv',
    'https://raw.githubusercontent.com/wesmadrigal/GraphReduce/master/tests/data/cust_data/order_products.csv',
    'https://raw.githubusercontent.com/wesmadrigal/GraphReduce/master/tests/data/cust_data/orders.csv',

    # labels
    'https://raw.githubusercontent.com/wesmadrigal/GraphReduce/master/examples/cust_graph_labels.csv'
]

# Download every file into the working directory under its URL basename.
# `-O` overwrites an earlier copy, so re-running the cell does not leave
# `cust.csv.1`-style duplicates behind, and `check=True` raises
# CalledProcessError on a failed download instead of silently continuing.
for f in files:
    subprocess.run(['wget', '-O', f.rsplit('/', 1)[-1], f], check=True)

In [4]:
!ls | grep .csv

cust.csv
cust_graph_labels.csv
notification_interactions.csv
notification_interaction_types.csv
notifications.csv
order_products.csv
orders.csv


In [None]:
# Build the graph dynamically from the relationships listed in cust_graph_labels.csv

In [5]:
# Load the edge labels: each row names a "from" file, a "to" file and the
# key columns (`from_key`, `to_key`) that join them.
labels = pd.read_csv('cust_graph_labels.csv')

In [6]:
# Display the relationship table (one row per edge between two CSV files).
labels

Unnamed: 0.1,Unnamed: 0,from_name,from_identifier,from_object_str,from_rows,to_name,to_identifier,to_object_str,to_rows,from_key,to_key,weight,discovery_mechanism
0,0,notification_interactions.csv,/Users/wesmadrigal/projects/graphreduce/tests/...,<Entity (identifier=/Users/wesmadrigal/project...,18,notification_interaction_types.csv,/Users/wesmadrigal/projects/graphreduce/tests/...,<Entity (identifier=/Users/wesmadrigal/project...,3,interaction_type_id,id,,constraint
1,1,notification_interactions.csv,/Users/wesmadrigal/projects/graphreduce/tests/...,<Entity (identifier=/Users/wesmadrigal/project...,18,notifications.csv,/Users/wesmadrigal/projects/graphreduce/tests/...,<Entity (identifier=/Users/wesmadrigal/project...,17,notification_id,id,,constraint
2,2,order_products.csv,/Users/wesmadrigal/projects/graphreduce/tests/...,<Entity (identifier=/Users/wesmadrigal/project...,19,orders.csv,/Users/wesmadrigal/projects/graphreduce/tests/...,<Entity (identifier=/Users/wesmadrigal/project...,9,order_id,id,,constraint
3,4,orders.csv,/Users/wesmadrigal/projects/graphreduce/tests/...,<Entity (identifier=/Users/wesmadrigal/project...,9,cust.csv,/Users/wesmadrigal/projects/graphreduce/tests/...,<Entity (identifier=/Users/wesmadrigal/project...,4,customer_id,id,,constraint
4,5,notifications.csv,/Users/wesmadrigal/projects/graphreduce/tests/...,<Entity (identifier=/Users/wesmadrigal/project...,17,cust.csv,/Users/wesmadrigal/projects/graphreduce/tests/...,<Entity (identifier=/Users/wesmadrigal/project...,4,customer_id,id,,constraint


In [7]:
# Map each table's file name to its node options. The prefix is prepended
# to the table's column names (e.g. `notif_id`, `ord_ts`) so that columns
# from different tables stay distinguishable after joins.
files = {
    table: {'prefix': prefix}
    for table, prefix in (
        ('cust.csv', 'cu'),
        ('orders.csv', 'ord'),
        ('order_products.csv', 'op'),
        ('notifications.csv', 'notif'),
        ('notification_interactions.csv', 'ni'),
        ('notification_interaction_types.csv', 'nit'),
    )
}

In [32]:
# Wrap every CSV file in a DynamicNode, keyed by its file name.
# All tables use `id` as primary key; no date key is set at this point.
# NOTE(review): the 730-day compute period is presumably the look-back
# window before the graph's cut date — confirm against the GraphReduce docs.
gr_nodes = {
    path.rsplit('/', 1)[-1]: DynamicNode(
        fpath=path,
        fmt='csv',
        prefix=opts['prefix'],
        pk='id',
        date_key=None,
        compute_layer=GraphReduceComputeLayerEnum.pandas,
        compute_period_unit=PeriodUnit.day,
        compute_period_val=730,
    )
    for path, opts in files.items()
}



In [33]:
# Show the file-name -> node mapping that was just built.
gr_nodes

{'cust.csv': <GraphReduceNode: fpath=cust.csv fmt=csv>,
 'orders.csv': <GraphReduceNode: fpath=orders.csv fmt=csv>,
 'order_products.csv': <GraphReduceNode: fpath=order_products.csv fmt=csv>,
 'notifications.csv': <GraphReduceNode: fpath=notifications.csv fmt=csv>,
 'notification_interactions.csv': <GraphReduceNode: fpath=notification_interactions.csv fmt=csv>,
 'notification_interaction_types.csv': <GraphReduceNode: fpath=notification_interaction_types.csv fmt=csv>}

In [34]:
# Load the notifications node's data so its dataframe can be inspected.
# NOTE(review): presumably this reads the CSV into `.df` and applies the
# column prefix — confirm against DynamicNode.do_data.
gr_nodes['notifications.csv'].do_data()

In [35]:
# Inspect the loaded notifications dataframe; its columns carry the `notif_` prefix.
gr_nodes['notifications.csv'].df

Unnamed: 0,notif_id,notif_customer_id,notif_ts
0,101,1,2022-08-05
1,102,1,2023-01-01
2,103,1,2023-05-05
3,104,1,2023-06-01
4,105,1,2023-06-02
5,106,1,2023-06-23
6,107,2,2022-09-05
7,108,2,2022-11-01
8,109,2,2023-01-01
9,110,2,2023-02-28


In [36]:
# Customer-level feature graph: `cust.csv` is the parent node.
# No label_* arguments are passed, so this graph is built without a label.
gr = GraphReduce(
    name='cust_dynamic_graph',
    parent_node=gr_nodes['cust.csv'],
    fmt='csv',
    compute_layer=GraphReduceComputeLayerEnum.pandas,
    cut_date=datetime.datetime(year=2023, month=9, day=1),
    auto_features=True,
    auto_feature_hops_back=2,
    auto_feature_hops_front=1,
)

In [37]:
# Register one edge per row of `labels`: the "to" table acts as the parent
# and the "from" table as the relation, joined on the listed key columns.
for to_name, from_name, to_key, from_key in zip(
    labels['to_name'], labels['from_name'], labels['to_key'], labels['from_key']
):
    gr.add_entity_edge(
        parent_node=gr_nodes[to_name],
        relation_node=gr_nodes[from_name],
        parent_key=to_key,
        relation_key=from_key,
        reduce=True,
    )

In [38]:
# Run the whole pipeline. Per the log output this hydrates the nodes, runs
# filters/normalize/annotations on each node and joins relations into their parents.
gr.do_transformations()

2024-04-23 13:48:22 [info     ] hydrating graph attributes
2024-04-23 13:48:22 [info     ] hydrating attributes for DynamicNode
2024-04-23 13:48:22 [info     ] hydrating attributes for DynamicNode
2024-04-23 13:48:22 [info     ] hydrating attributes for DynamicNode
2024-04-23 13:48:22 [info     ] hydrating attributes for DynamicNode
2024-04-23 13:48:22 [info     ] hydrating attributes for DynamicNode
2024-04-23 13:48:22 [info     ] hydrating attributes for DynamicNode
2024-04-23 13:48:22 [info     ] hydrating graph data
2024-04-23 13:48:22 [info     ] checking for prefix uniqueness
2024-04-23 13:48:22 [info     ] running filters, normalize, and annotations for <GraphReduceNode: fpath=notification_interaction_types.csv fmt=csv>
2024-04-23 13:48:22 [info     ] running filters, normalize, and annotations for <GraphReduceNode: fpath=notification_interactions.csv fmt=csv>
2024-04-23 13:48:22 [info     ] running filters, normalize, and annotations for <GraphReduceNode: fpath=notifications.cs

  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(


2024-04-23 13:48:22 [info     ] joining <GraphReduceNode: fpath=orders.csv fmt=csv> to <GraphReduceNode: fpath=cust.csv fmt=csv>


In [39]:
# Allow up to 200 columns in DataFrame reprs so the wide feature table is not truncated.
pd.options.display.max_columns = 200

In [40]:
# Display the customer-level result: one row per customer with the
# aggregated notification, interaction, order and order-product columns.
gr.parent_node.df

Unnamed: 0,cu_id,cu_name,notif_customer_id,notif_id_count,notif_customer_id_count,notif_ts_first,notif_ts_min,notif_ts_max,ni_notification_id_min,ni_notification_id_max,ni_notification_id_sum,ni_id_count_min,ni_id_count_max,ni_id_count_sum,ni_notification_id_count_min,ni_notification_id_count_max,ni_notification_id_count_sum,ni_interaction_type_id_count_min,ni_interaction_type_id_count_max,ni_interaction_type_id_count_sum,ni_ts_first_first,ni_ts_first_min,ni_ts_first_max,ni_ts_min_first,ni_ts_min_min,ni_ts_min_max,ni_ts_max_first,ni_ts_max_min,ni_ts_max_max,ord_customer_id,ord_id_count,ord_customer_id_count,ord_ts_first,ord_ts_min,ord_ts_max,op_order_id_min,op_order_id_max,op_order_id_sum,op_id_count_min,op_id_count_max,op_id_count_sum,op_order_id_count_min,op_order_id_count_max,op_order_id_count_sum,op_product_id_count_min,op_product_id_count_max,op_product_id_count_sum
0,1,wes,1,6,6,2022-08-05,2022-08-05,2023-06-23,101.0,106.0,621.0,1.0,3.0,14.0,1.0,3.0,14.0,1.0,3.0,14.0,2022-08-06,2022-08-06,2023-05-15,2022-08-06,2022-08-06,2023-05-15,2022-08-08,2022-08-08,2023-05-15,1,3,3,2023-05-12,2023-05-12,2023-09-02,1.0,6.0,9.0,2.0,4.0,10.0,2.0,4.0,10.0,2.0,4.0,10.0
1,2,john,2,7,7,2022-09-05,2022-09-05,2023-05-22,107.0,110.0,434.0,1.0,1.0,4.0,1.0,1.0,4.0,1.0,1.0,4.0,2023-06-01,2023-06-01,2023-06-04,2023-06-01,2023-06-01,2023-06-04,2023-06-01,2023-06-01,2023-06-04,2,3,3,2023-01-01,2022-08-05,2023-10-15,3.0,4.0,7.0,4.0,4.0,8.0,4.0,4.0,8.0,4.0,4.0,8.0
2,3,ryan,3,2,2,2023-06-12,2023-06-12,2023-09-01,,,0.0,,,0.0,,,0.0,,,0.0,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,3,1,1,2023-06-01,2023-06-01,2023-06-01,5.0,5.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,4,tianji,4,2,2,2024-02-01,2024-02-01,2024-02-15,,,0.0,,,0.0,,,0.0,,,0.0,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,4,2,2,2024-01-01,2024-01-01,2024-02-01,,,0.0,,,0.0,,,0.0,,,0.0


In [18]:
# Build a graph at the order level

In [19]:
# Order-level feature graph: `orders.csv` is the parent node.
# No label_* arguments are passed, so this graph is built without a label.
gr = GraphReduce(
    name='order_dynamic_graph',
    parent_node=gr_nodes['orders.csv'],
    fmt='csv',
    compute_layer=GraphReduceComputeLayerEnum.pandas,
    cut_date=datetime.datetime(year=2023, month=9, day=1),
    auto_features=True,
    auto_feature_hops_back=2,
    auto_feature_hops_front=1,
)

In [20]:
# Register one edge per row of `labels`: the "to" table acts as the parent
# and the "from" table as the relation, joined on the listed key columns.
for to_name, from_name, to_key, from_key in zip(
    labels['to_name'], labels['from_name'], labels['to_key'], labels['from_key']
):
    gr.add_entity_edge(
        parent_node=gr_nodes[to_name],
        relation_node=gr_nodes[from_name],
        parent_key=to_key,
        relation_key=from_key,
        reduce=True,
    )

In [21]:
# The node objects in `gr_nodes` are shared with the graph built earlier,
# so reload each one before running the transformations again.
# NOTE(review): presumably `reload()` discards the node's previously
# transformed data — confirm against the GraphReduce node API.
for graph_node in gr.nodes():
    graph_node.reload()

In [22]:
# Run the whole pipeline for the order-level graph. Per the log output this
# hydrates the nodes and runs filters/normalize/annotations on each node.
gr.do_transformations()

2024-04-23 13:46:13 [info     ] hydrating graph attributes
2024-04-23 13:46:13 [info     ] hydrating attributes for DynamicNode
2024-04-23 13:46:13 [info     ] hydrating attributes for DynamicNode
2024-04-23 13:46:13 [info     ] hydrating attributes for DynamicNode
2024-04-23 13:46:13 [info     ] hydrating attributes for DynamicNode
2024-04-23 13:46:13 [info     ] hydrating attributes for DynamicNode
2024-04-23 13:46:13 [info     ] hydrating attributes for DynamicNode
2024-04-23 13:46:13 [info     ] hydrating graph data
2024-04-23 13:46:13 [info     ] checking for prefix uniqueness
2024-04-23 13:46:13 [info     ] running filters, normalize, and annotations for <GraphReduceNode: fpath=notification_interaction_types.csv fmt=csv>
2024-04-23 13:46:13 [info     ] running filters, normalize, and annotations for <GraphReduceNode: fpath=notification_interactions.csv fmt=csv>
2024-04-23 13:46:13 [info     ] running filters, normalize, and annotations for <GraphReduceNode: fpath=notifications.cs

  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(


In [23]:
# Display the order-level result: one row per order with the customer
# columns and the order-product counts joined on.
gr.parent_node.df

Unnamed: 0,ord_id,ord_customer_id,ord_ts,cu_id,cu_name,op_order_id,op_id_count,op_order_id_count,op_product_id_count
0,1,1,2023-05-12,1,wes,1.0,4.0,4.0,4.0
1,2,1,2023-06-01,1,wes,2.0,4.0,4.0,4.0
2,3,2,2023-01-01,2,john,3.0,4.0,4.0,4.0
3,4,2,2022-08-05,2,john,4.0,4.0,4.0,4.0
4,5,3,2023-06-01,3,ryan,5.0,1.0,1.0,1.0
5,6,1,2023-09-02,1,wes,6.0,2.0,2.0,2.0
6,7,2,2023-10-15,2,john,,,,
7,8,4,2024-01-01,4,tianji,,,,
8,9,4,2024-02-01,4,tianji,,,,


In [25]:
# Build a graph at the customer level with a label that counts
# the orders each customer placed in the 30 days after the cut date

In [46]:
# Customer-level graph with a label: `cust.csv` is the parent node and the
# label is a count of `id` on the orders node over a 30-day label period.
# NOTE(review): the label period presumably starts at the cut date — confirm.
gr = GraphReduce(
    name='cust_graph_ml',
    parent_node=gr_nodes['cust.csv'],
    fmt='csv',
    compute_layer=GraphReduceComputeLayerEnum.pandas,
    cut_date=datetime.datetime(year=2023, month=9, day=1),
    auto_features=True,
    auto_feature_hops_back=2,
    auto_feature_hops_front=1,
    label_node=gr_nodes['orders.csv'],
    label_field='id',
    label_operation='count',
    label_period_unit=PeriodUnit.day,
    label_period_val=30,
)

In [47]:
# Inspect the date column currently configured on the orders node.
gr_nodes['orders.csv'].date_key

'ts'

In [48]:
# Inspect the orders node's column names (shown here without a prefix).
gr_nodes['orders.csv'].columns

['id', 'customer_id', 'ts']

In [49]:
# Explicitly set `ts` as the orders node's date column; the nodes were
# created with date_key=None.
# NOTE(review): presumably the label period on the orders node needs a date
# key to be evaluated — confirm against the GraphReduce docs.
gr_nodes['orders.csv'].date_key = 'ts'

In [50]:
# Register one edge per row of `labels`: the "to" table acts as the parent
# and the "from" table as the relation, joined on the listed key columns.
for to_name, from_name, to_key, from_key in zip(
    labels['to_name'], labels['from_name'], labels['to_key'], labels['from_key']
):
    gr.add_entity_edge(
        parent_node=gr_nodes[to_name],
        relation_node=gr_nodes[from_name],
        parent_key=to_key,
        relation_key=from_key,
        reduce=True,
    )

In [51]:
# The node objects in `gr_nodes` are shared with the graphs built earlier,
# so reload each one before running the transformations again.
# NOTE(review): presumably `reload()` discards the node's previously
# transformed data — confirm against the GraphReduce node API.
for graph_node in gr.nodes():
    graph_node.reload()

In [52]:
# Run the whole pipeline for the labelled graph. Per the log output this
# hydrates the nodes, runs filters/normalize/annotations, reduces each
# relation, performs auto_features and joins relations into their parents.
gr.do_transformations()

2024-04-23 13:49:41 [info     ] hydrating graph attributes
2024-04-23 13:49:41 [info     ] hydrating attributes for DynamicNode
2024-04-23 13:49:41 [info     ] hydrating attributes for DynamicNode
2024-04-23 13:49:41 [info     ] hydrating attributes for DynamicNode
2024-04-23 13:49:41 [info     ] hydrating attributes for DynamicNode
2024-04-23 13:49:41 [info     ] hydrating attributes for DynamicNode
2024-04-23 13:49:41 [info     ] hydrating attributes for DynamicNode
2024-04-23 13:49:41 [info     ] hydrating graph data
2024-04-23 13:49:41 [info     ] checking for prefix uniqueness
2024-04-23 13:49:41 [info     ] running filters, normalize, and annotations for <GraphReduceNode: fpath=notification_interaction_types.csv fmt=csv>
2024-04-23 13:49:41 [info     ] running filters, normalize, and annotations for <GraphReduceNode: fpath=notification_interactions.csv fmt=csv>
2024-04-23 13:49:41 [info     ] running filters, normalize, and annotations for <GraphReduceNode: fpath=notifications.cs

  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(


2024-04-23 13:49:41 [info     ] joining <GraphReduceNode: fpath=notification_interactions.csv fmt=csv> to <GraphReduceNode: fpath=notifications.csv fmt=csv>
2024-04-23 13:49:41 [info     ] reducing relation <GraphReduceNode: fpath=notifications.csv fmt=csv>
2024-04-23 13:49:41 [info     ] performing auto_features on node <GraphReduceNode: fpath=notifications.csv fmt=csv>
2024-04-23 13:49:41 [info     ] joining <GraphReduceNode: fpath=notifications.csv fmt=csv> to <GraphReduceNode: fpath=cust.csv fmt=csv>
2024-04-23 13:49:41 [info     ] reducing relation <GraphReduceNode: fpath=order_products.csv fmt=csv>
2024-04-23 13:49:41 [info     ] performing auto_features on node <GraphReduceNode: fpath=order_products.csv fmt=csv>
2024-04-23 13:49:41 [info     ] joining <GraphReduceNode: fpath=order_products.csv fmt=csv> to <GraphReduceNode: fpath=orders.csv fmt=csv>
2024-04-23 13:49:41 [info     ] reducing relation <GraphReduceNode: fpath=orders.csv fmt=csv>
2024-04-23 13:49:41 [info     ] perfor

In [53]:
# Display the customer-level result; the final `ord_id_label` column holds the label.
gr.parent_node.df

Unnamed: 0,cu_id,cu_name,notif_customer_id,notif_id_count,notif_customer_id_count,notif_ts_first,notif_ts_min,notif_ts_max,ni_notification_id_min,ni_notification_id_max,ni_notification_id_sum,ni_id_count_min,ni_id_count_max,ni_id_count_sum,ni_notification_id_count_min,ni_notification_id_count_max,ni_notification_id_count_sum,ni_interaction_type_id_count_min,ni_interaction_type_id_count_max,ni_interaction_type_id_count_sum,ni_ts_first_first,ni_ts_first_min,ni_ts_first_max,ni_ts_min_first,ni_ts_min_min,ni_ts_min_max,ni_ts_max_first,ni_ts_max_min,ni_ts_max_max,ord_customer_id,ord_id_count,ord_customer_id_count,ord_ts_first,ord_ts_min,ord_ts_max,op_order_id_min,op_order_id_max,op_order_id_sum,op_id_count_min,op_id_count_max,op_id_count_sum,op_order_id_count_min,op_order_id_count_max,op_order_id_count_sum,op_product_id_count_min,op_product_id_count_max,op_product_id_count_sum,ord_customer_id_dupe,ord_id_label
0,1,wes,1,6,6,2022-08-05,2022-08-05,2023-06-23,101.0,106.0,621.0,1.0,3.0,14.0,1.0,3.0,14.0,1.0,3.0,14.0,2022-08-06,2022-08-06,2023-05-15,2022-08-06,2022-08-06,2023-05-15,2022-08-08,2022-08-08,2023-05-15,1.0,2.0,2.0,2023-05-12,2023-05-12,2023-06-01,1.0,2.0,3.0,4.0,4.0,8.0,4.0,4.0,8.0,4.0,4.0,8.0,1.0,1.0
1,2,john,2,7,7,2022-09-05,2022-09-05,2023-05-22,107.0,110.0,434.0,1.0,1.0,4.0,1.0,1.0,4.0,1.0,1.0,4.0,2023-06-01,2023-06-01,2023-06-04,2023-06-01,2023-06-01,2023-06-04,2023-06-01,2023-06-01,2023-06-04,2.0,1.0,1.0,2023-01-01,2023-01-01,2023-01-01,3.0,3.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,,
2,3,ryan,3,2,2,2023-06-12,2023-06-12,2023-09-01,,,0.0,,,0.0,,,0.0,,,0.0,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,3.0,1.0,1.0,2023-06-01,2023-06-01,2023-06-01,5.0,5.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,
3,4,tianji,4,2,2,2024-02-01,2024-02-01,2024-02-15,,,0.0,,,0.0,,,0.0,,,0.0,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,,,,NaT,NaT,NaT,,,,,,,,,,,,,,


In [54]:
# Print the raw orders file to cross-check the label against the source rows.
!cat orders.csv

id,customer_id,ts
1,1,2023-05-12
2,1,2023-06-01
3,2,2023-01-01
4,2,2022-08-05
5,3,2023-06-01
6,1,2023-09-02
7,2,2023-10-15
8,4,2024-01-01
9,4,2024-02-01
