In [1]:
!pip install graphreduce==1.6.4



In [2]:
import datetime
import subprocess

import pandas as pd
from graphreduce.node import GraphReduceNode, DynamicNode
from graphreduce.graph_reduce import GraphReduce
from graphreduce.enum import ComputeLayerEnum as GraphReduceComputeLayerEnum, PeriodUnit



In [5]:
files = [
    'https://raw.githubusercontent.com/wesmadrigal/GraphReduce/master/tests/data/cust_data/cust.csv',
    'https://raw.githubusercontent.com/wesmadrigal/GraphReduce/master/tests/data/cust_data/notification_interaction_types.csv',
    'https://raw.githubusercontent.com/wesmadrigal/GraphReduce/master/tests/data/cust_data/notification_interactions.csv',
    'https://raw.githubusercontent.com/wesmadrigal/GraphReduce/master/tests/data/cust_data/notifications.csv',
    'https://raw.githubusercontent.com/wesmadrigal/GraphReduce/master/tests/data/cust_data/order_products.csv',
    'https://raw.githubusercontent.com/wesmadrigal/GraphReduce/master/tests/data/cust_data/orders.csv'
]

for f in files:
  subprocess.run(['wget', f])

In [6]:
!ls | grep .csv

cust.csv
notification_interactions.csv
notification_interaction_types.csv
notifications.csv
order_products.csv
orders.csv


In [7]:
class CustomerNode(GraphReduceNode):
    def do_filters(self):
        pass
    def do_annotate(self):
        pass
    def do_normalize(self):
        pass
    def do_reduce(self, reduce_key):
        pass
    def do_post_join_annotate(self):
        pass
    def do_labels(self, reduce_key):
        pass

In [8]:
cust = CustomerNode(fpath='cust.csv',
        fmt='csv',
        pk='id',
        prefix='cu',
        date_key=None,
        compute_layer=GraphReduceComputeLayerEnum.pandas,
        compute_period_val=730,
        compute_period_unit=PeriodUnit.day
)



In [10]:
cust.do_data()

  pd.to_datetime(
  pd.to_datetime(


In [11]:
cust.df

Unnamed: 0,cu_id,cu_name
0,1,wes
1,2,john
2,3,ryan
3,4,tianji


In [12]:
class NotificationInteractionNode(GraphReduceNode):
    def do_filters(self):
        pass
    def do_annotate(self):
        pass
    def do_normalize(self):
        pass
    def do_reduce(self, reduce_key):
        return self.prep_for_features().groupby(self.colabbr(reduce_key)).agg(**{
            'num_interactions': pd.NamedAgg(column=self.colabbr('id'), aggfunc='count'),
            'first_interaction':pd.NamedAgg(column=self.colabbr('ts'), aggfunc='min'),
            'last_interaction':pd.NamedAgg(column=self.colabbr('ts'),aggfunc='max')
        }).reset_index()
    def do_post_join_annotate(self):
        pass
    def do_labels(self, reduce_key):
        pass

In [13]:
class NotificationNode(GraphReduceNode):
    def do_filters(self):
        pass
    def do_annotate(self):
        pass
    def do_normalize(self):
        pass
    def do_reduce(self, reduce_key):
        return self.prep_for_features().groupby(self.colabbr(reduce_key)).agg(**{
            'num_notifications': pd.NamedAgg(column='ni_notification_id',aggfunc='count'),
            'num_interactions': pd.NamedAgg(column='num_interactions', aggfunc='sum'),
            'first_notification': pd.NamedAgg(column=self.colabbr('ts'), aggfunc='min'),
            'last_notification':pd.NamedAgg(column=self.colabbr('ts'),aggfunc='max')
        }).reset_index()
    def do_post_join_annotate(self):
        pass
    def do_labels(self):
        pass

In [14]:
class OrderProductNode(GraphReduceNode):
    def do_filters(self):
        pass
    def do_annotate(self):
        pass
    def do_normalize(self):
        pass
    def do_reduce(self, reduce_key):
        return self.prep_for_features().groupby(self.colabbr(reduce_key)).agg(**{
            'num_products': pd.NamedAgg(column=self.colabbr('product_id'),aggfunc='count')
        }).reset_index()
    def do_post_join_annotate(self):
        pass
    def do_labels(self):
        pass

In [15]:
class OrderNode(GraphReduceNode):
    def do_filters(self):
        pass
    def do_annotate(self):
        pass
    def do_normalize(self):
        pass
    def do_reduce(self, reduce_key):
        return self.prep_for_features().groupby(self.colabbr(reduce_key)).agg(**{
            'num_orders': pd.NamedAgg(column='op_order_id',aggfunc='count'),
            'num_products_ordered': pd.NamedAgg(column='num_products', aggfunc='sum'),
            'first_order': pd.NamedAgg(column=self.colabbr('ts'), aggfunc='min'),
            'last_order': pd.NamedAgg(column=self.colabbr('ts'), aggfunc='max')
        }).reset_index()
    def do_post_join_annotate(self):
        pass
    def do_labels(self):
        pass

In [16]:
cust = CustomerNode(fpath='cust.csv',
        fmt='csv',
        pk='id',
        prefix='cu',
        date_key=None,
        compute_layer=GraphReduceComputeLayerEnum.pandas,
        compute_period_val=730,
        compute_period_unit=PeriodUnit.day
)
orders = OrderNode(fpath='orders.csv',
        fmt='csv',
        pk='id',
        prefix='ord',
        date_key=None,
        compute_layer=GraphReduceComputeLayerEnum.pandas,
        compute_period_val=730,
        compute_period_unit=PeriodUnit.day
                  )
op = OrderProductNode(fpath='order_products.csv',
        fmt='csv',
        pk='id',
        prefix='op',
        date_key=None,
        compute_layer=GraphReduceComputeLayerEnum.pandas,
        compute_period_val=730,
        compute_period_unit=PeriodUnit.day
                  )
nit = DynamicNode(fpath='notification_interaction_types.csv',
        fmt='csv',
        pk='id',
        prefix='nit',
        date_key=None,
        compute_layer=GraphReduceComputeLayerEnum.pandas,
        compute_period_val=730,
        compute_period_unit=PeriodUnit.day
                  )

ni = NotificationInteractionNode(fpath='notification_interactions.csv',
        fmt='csv',
        pk='id',
        prefix='ni',
        date_key=None,
        compute_layer=GraphReduceComputeLayerEnum.pandas,
        compute_period_val=730,
        compute_period_unit=PeriodUnit.day
                  )
notif = NotificationNode(
    fpath='notifications.csv',
        fmt='csv',
        pk='id',
        prefix='not',
        date_key=None,
        compute_layer=GraphReduceComputeLayerEnum.pandas,
        compute_period_val=730,
        compute_period_unit=PeriodUnit.day
)



In [17]:
gr = GraphReduce(
    name='cust',
    parent_node=cust,
    fmt='csv',
    cut_date=datetime.datetime(2023,9,1),
    compute_layer=GraphReduceComputeLayerEnum.pandas,
)

In [18]:
# Add relation from notification_interaction_types.csv to notification_interactions.csv
gr.add_entity_edge(
        parent_node=ni,
        relation_node=nit,
        parent_key='interaction_type_id',
        relation_key='id',
        reduce=False
    )

In [19]:
# Add relation from notification_interactions.csv to notifications.csv
gr.add_entity_edge(
        parent_node=notif,
        relation_node=ni,
        parent_key='id',
        relation_key='notification_id',
        reduce=True
    )

In [20]:
# Add relation from order_products.csv to orders.csv
gr.add_entity_edge(
        parent_node=orders,
        relation_node=op,
        parent_key='id',
        relation_key='order_id',
        reduce=True
    )


In [21]:
# Add relation from notifications.csv to cust.csv
gr.add_entity_edge(
        parent_node=cust,
        relation_node=notif,
        parent_key='id',
        relation_key='customer_id',
        reduce=True
    )

In [22]:
# Add relation from orders.csv to cust.csv

In [23]:
gr.add_entity_edge(
        parent_node=cust,
        relation_node=orders,
        parent_key='id',
        relation_key='customer_id',
        reduce=True
    )

In [24]:
gr.plot_graph('customer_graph.html')

2024-04-23 13:39:00 [info     ] plotted graph at customer_graph.html


In [25]:
from IPython.display import IFrame


In [27]:
IFrame(src='./customer_graph.html', width=600, height=600)

In [28]:
gr.do_transformations()

2024-04-23 13:39:12 [info     ] hydrating graph attributes
2024-04-23 13:39:12 [info     ] hydrating attributes for NotificationInteractionNode
2024-04-23 13:39:12 [info     ] hydrating attributes for DynamicNode
2024-04-23 13:39:12 [info     ] hydrating attributes for NotificationNode
2024-04-23 13:39:12 [info     ] hydrating attributes for OrderNode
2024-04-23 13:39:12 [info     ] hydrating attributes for OrderProductNode
2024-04-23 13:39:12 [info     ] hydrating attributes for CustomerNode
2024-04-23 13:39:12 [info     ] hydrating graph data
2024-04-23 13:39:12 [info     ] checking for prefix uniqueness
2024-04-23 13:39:12 [info     ] running filters, normalize, and annotations for <GraphReduceNode: fpath=notification_interactions.csv fmt=csv>
2024-04-23 13:39:12 [info     ] running filters, normalize, and annotations for <GraphReduceNode: fpath=notification_interaction_types.csv fmt=csv>
2024-04-23 13:39:12 [info     ] running filters, normalize, and annotations for <GraphReduceNod

  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(


In [29]:
gr.parent_node.df

Unnamed: 0,cu_id,cu_name,ord_customer_id,num_orders,num_products_ordered,first_order,last_order,not_customer_id,num_notifications,num_interactions,first_notification,last_notification
0,1,wes,1,3,10.0,2023-05-12,2023-09-02,1,6,14.0,2022-08-05,2023-06-23
1,2,john,2,2,8.0,2022-08-05,2023-10-15,2,4,4.0,2022-09-05,2023-05-22
2,3,ryan,3,1,1.0,2023-06-01,2023-06-01,3,0,0.0,2023-06-12,2023-09-01
3,4,tianji,4,0,0.0,2024-01-01,2024-02-01,4,0,0.0,2024-02-01,2024-02-15


In [30]:
# OPEN NEW NOTEBOOK WITH: https://github.com/wesmadrigal/GraphReduce/blob/master/examples/ODSC_East_2024_ex3.ipynb