In [133]:
# machine learning week demo of GraphReduce

In [134]:
import pandas as pd
import datetime
import pyvis



from graphreduce.node import GraphReduceNode
from graphreduce.enum import ComputeLayerEnum, PeriodUnit
from graphreduce.graph_reduce import GraphReduce

# abstract classes enforce the definition 
# of certain functions to adhere to convention

In [135]:
class CustomerNode(GraphReduceNode):
    """Node for the customer table (passed as ``parent_node`` in this demo).

    Every hook below is intentionally a no-op: the customer data is used
    as loaded, with no extra columns, filters, aggregation, or labels.
    """

    def do_annotate(self):
        """No-op: no derived columns are added to the customer data."""

    def do_filters(self):
        """No-op: no customer rows are filtered out."""

    def do_clip_cols(self):
        """No-op: no columns are clipped."""

    def do_slice_data(self):
        """No-op: the data is not sliced."""

    def do_post_join_annotate(self):
        """No-op: nothing is annotated after joins."""

    def do_reduce(self, reduce_key):
        """No-op: returns ``None``; no aggregation is defined for customers."""

    def do_labels(self, reduce_key):
        """No-op: returns ``None``; no labels are built from customers."""

In [136]:
class OrderNode(GraphReduceNode):
    """Node for the orders table.

    Only ``do_reduce`` does any work; every other hook is a no-op.
    """

    def do_annotate(self):
        """No-op: no derived columns are added."""

    def do_filters(self):
        """No-op: no rows are filtered out."""

    def do_clip_cols(self):
        """No-op: no columns are clipped."""

    def do_slice_data(self):
        """No-op: the data is not sliced."""

    def do_post_join_annotate(self):
        """No-op: nothing is annotated after joins."""

    def do_reduce(self, reduce_key):
        """Aggregate the feature rows to one row per ``reduce_key`` value.

        Returns a DataFrame holding the grouping column plus a single
        ``<pk>_count`` column (named through ``colabbr``) with the number
        of non-null primary-key values in each group.
        """
        features = self.prep_for_features()
        grouped = features.groupby(self.colabbr(reduce_key))
        count_col = self.colabbr(f'{self.pk}_count')
        count_agg = pd.NamedAgg(column=self.colabbr(self.pk), aggfunc='count')
        # reset_index() turns the group key back into a regular column.
        return grouped.agg(**{count_col: count_agg}).reset_index()

    def do_labels(self, key):
        """No-op: returns ``None``; no labels are built from orders."""

In [137]:
class OrderProductNode(GraphReduceNode):
    """Node for the order-products table.

    Only ``do_reduce`` does any work; every other hook is a no-op.
    """

    def do_annotate(self):
        """No-op: no derived columns are added."""

    def do_filters(self):
        """No-op: no rows are filtered out."""

    def do_clip_cols(self):
        """No-op: no columns are clipped."""

    def do_slice_data(self):
        """No-op: the data is not sliced."""

    def do_post_join_annotate(self):
        """No-op: nothing is annotated after joins."""

    def do_reduce(self, reduce_key):
        """Aggregate the feature rows to one row per ``reduce_key`` value.

        Returns a DataFrame holding the grouping column plus a single
        ``<pk>_count`` column (named through ``colabbr``) with the number
        of non-null primary-key values in each group.
        """
        features = self.prep_for_features()
        grouped = features.groupby(self.colabbr(reduce_key))
        count_col = self.colabbr(f'{self.pk}_count')
        count_agg = pd.NamedAgg(column=self.colabbr(self.pk), aggfunc='count')
        # reset_index() turns the group key back into a regular column.
        return grouped.agg(**{count_col: count_agg}).reset_index()

    def do_labels(self, key):
        """No-op: returns ``None``; no labels are built from order products."""

In [138]:
class OrderEventNode(GraphReduceNode):
    """Node for the order-events table.

    Only ``do_reduce`` does any work; every other hook is a no-op.
    """

    def do_annotate(self):
        """No-op: no derived columns are added."""

    def do_filters(self):
        """No-op: no rows are filtered out."""

    def do_clip_cols(self):
        """No-op: no columns are clipped."""

    def do_slice_data(self):
        """No-op: the data is not sliced."""

    def do_post_join_annotate(self):
        """No-op: nothing is annotated after joins."""

    def do_reduce(self, reduce_key):
        """Aggregate the feature rows to one row per ``reduce_key`` value.

        Returns a DataFrame holding the grouping column plus a single
        ``<pk>_count`` column (named through ``colabbr``) with the number
        of non-null primary-key values in each group.
        """
        features = self.prep_for_features()
        grouped = features.groupby(self.colabbr(reduce_key))
        count_col = self.colabbr(f'{self.pk}_count')
        count_agg = pd.NamedAgg(column=self.colabbr(self.pk), aggfunc='count')
        # reset_index() turns the group key back into a regular column.
        return grouped.agg(**{count_col: count_agg}).reset_index()

    def do_labels(self, key):
        """No-op: returns ``None``; no labels are built from order events."""

In [139]:
class NotificationNode(GraphReduceNode):
    """Node for the notifications table.

    Only ``do_reduce`` does any work; every other hook is a no-op.
    """

    def do_annotate(self):
        """No-op: no derived columns are added."""

    def do_filters(self):
        """No-op: no rows are filtered out."""

    def do_clip_cols(self):
        """No-op: no columns are clipped."""

    def do_slice_data(self):
        """No-op: the data is not sliced."""

    def do_post_join_annotate(self):
        """No-op: nothing is annotated after joins."""

    def do_reduce(self, reduce_key):
        """Aggregate the feature rows to one row per ``reduce_key`` value.

        Returns a DataFrame with the grouping column and two aggregates:

        * ``<pk>_count`` (named through ``colabbr``): number of non-null
          primary-key values per group.
        * ``ni_num_notification_interactions``: sum of the
          ``ni_num_interactions`` column per group. That input column name
          is hard-coded, so it must already exist in the feature frame.
        """
        features = self.prep_for_features()
        grouped = features.groupby(self.colabbr(reduce_key))
        aggregations = {
            self.colabbr(f'{self.pk}_count'): pd.NamedAgg(
                column=self.colabbr(self.pk), aggfunc='count'
            ),
            'ni_num_notification_interactions': pd.NamedAgg(
                column='ni_num_interactions', aggfunc='sum'
            ),
        }
        # reset_index() turns the group key back into a regular column.
        return grouped.agg(**aggregations).reset_index()

    def do_labels(self, key):
        """No-op: returns ``None``; no labels are built from notifications."""

In [140]:
class NotificationInteractionNode(GraphReduceNode):
    """Node for the notification-interactions table.

    It parses its date column in ``do_annotate``, builds per-group features
    in ``do_reduce`` and a per-group label in ``do_labels``. The remaining
    hooks are no-ops.
    """

    def do_annotate(self):
        """Parse the date column from ``'%Y-%m-%d'`` strings into datetimes.

        The conversion is vectorized with ``pd.to_datetime`` rather than a
        row-wise ``strptime``. It also accepts a column that is already
        datetime-typed, so running this hook twice on the same frame is safe.
        """
        date_col = self.colabbr(self.date_key)
        self.df[date_col] = pd.to_datetime(self.df[date_col], format='%Y-%m-%d')

    def do_filters(self):
        """No-op: no rows are filtered out."""

    def do_clip_cols(self):
        """No-op: no columns are clipped."""

    def do_slice_data(self):
        """No-op: the data is not sliced."""

    def do_post_join_annotate(self):
        """No-op: nothing is annotated after joins."""

    def do_reduce(self, reduce_key):
        """Aggregate the feature rows to one row per ``reduce_key`` value.

        Returns a DataFrame with the grouping column plus four aggregates
        of the primary-key column, each named through ``colabbr``:
        ``<pk>_counts`` (count), ``<pk>_min`` (min), ``<pk>_max`` (max)
        and ``num_interactions`` (count).
        """
        print(f"reduce key: {reduce_key}")
        features = self.prep_for_features()
        grouped = features.groupby(self.colabbr(reduce_key))
        pk_col = self.colabbr(self.pk)
        aggregations = {
            self.colabbr(f'{self.pk}_counts'): pd.NamedAgg(column=pk_col, aggfunc='count'),
            self.colabbr(f'{self.pk}_min'): pd.NamedAgg(column=pk_col, aggfunc='min'),
            # This key used to repeat '<pk>_min'. A repeated dict key keeps
            # only the last value, so the '_min' column silently held the
            # maximum and no minimum was emitted at all.
            self.colabbr(f'{self.pk}_max'): pd.NamedAgg(column=pk_col, aggfunc='max'),
            self.colabbr('num_interactions'): pd.NamedAgg(column=pk_col, aggfunc='count'),
        }
        # reset_index() turns the group key back into a regular column.
        return grouped.agg(**aggregations).reset_index()

    def do_labels(self, reduce_key):
        """Build the label frame: one row per ``reduce_key`` value.

        The single label column, ``<pk>_num_interactions`` (named through
        ``colabbr``), counts non-null primary-key values in the label rows.
        It is the target for a model predicting the number of interactions.
        """
        label_df = self.prep_for_labels().groupby(self.colabbr(reduce_key)).agg(
            **{
                self.colabbr(f'{self.pk}_num_interactions'): pd.NamedAgg(
                    column=self.colabbr(self.pk), aggfunc='count'
                ),
            }
        ).reset_index()
        return label_df

In [141]:
# One node per CSV table. Every node uses 'id' as its primary key, a short
# column prefix, and the pandas compute layer.
cust = CustomerNode(
    pk='id',
    prefix='cust',
    fpath='dat/cust.csv',
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
)
order = OrderNode(
    pk='id',
    prefix='order',
    fpath='dat/orders.csv',
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
)
order_event = OrderEventNode(
    pk='id',
    prefix='oe',
    fpath='dat/order_events.csv',
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
)
order_product = OrderProductNode(
    pk='id',
    prefix='op',
    fpath='dat/order_products.csv',
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
)
notification = NotificationNode(
    pk='id',
    prefix='no',
    fpath='dat/notifications.csv',
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
)
# The only node given a date_key; its do_annotate parses that column.
notification_interaction = NotificationInteractionNode(
    pk='id',
    prefix='ni',
    fpath='dat/notification_interactions.csv',
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
    date_key='ts',
)



## GraphReduce abstracts the orchestration of compute
## across the graph, and things like depth-first
## traversal for hierarchical rollups


In [142]:
# Build the graph container rooted at the customer node.
# NOTE(review): presumably features come from the compute period and labels
# from the label period on either side of cut_date; confirm in the
# GraphReduce docs.
gr = GraphReduce(
    # Root entity and execution backend.
    parent_node=cust,
    compute_layer=ComputeLayerEnum.pandas,
    # Cut date and a 365-day compute period.
    cut_date=datetime.datetime(2023, 5, 6),
    compute_period_unit=PeriodUnit.day,
    compute_period_val=365,
    # Labels enabled, with a 30-day label period.
    has_labels=True,
    label_period_unit=PeriodUnit.day,
    label_period_val=30,
)

In [143]:
# Register every node with the graph, in the same order as before.
for node in (
    cust,
    order,
    order_event,
    order_product,
    notification,
    notification_interaction,
):
    gr.add_node(node)

In [144]:
# Hydrate attributes on each registered node; the log output lists one
# "hydrating attributes for <Node>" line per node.
gr.hydrate_graph_attrs()

[2m2023-06-20 15:31:57[0m [[32m[1minfo     [0m] [1mhydrating attributes for CustomerNode[0m
[2m2023-06-20 15:31:57[0m [[32m[1minfo     [0m] [1mhydrating attributes for OrderNode[0m
[2m2023-06-20 15:31:57[0m [[32m[1minfo     [0m] [1mhydrating attributes for OrderEventNode[0m
[2m2023-06-20 15:31:57[0m [[32m[1minfo     [0m] [1mhydrating attributes for OrderProductNode[0m
[2m2023-06-20 15:31:57[0m [[32m[1minfo     [0m] [1mhydrating attributes for NotificationNode[0m
[2m2023-06-20 15:31:57[0m [[32m[1minfo     [0m] [1mhydrating attributes for NotificationInteractionNode[0m


In [145]:
# Sanity check: all six nodes should appear in the graph's NodeView.
gr.nodes()

NodeView((<__main__.CustomerNode object at 0x1519c7dc0>, <__main__.OrderNode object at 0x1519c7a90>, <__main__.OrderEventNode object at 0x1520fc520>, <__main__.OrderProductNode object at 0x1519c7c40>, <__main__.NotificationNode object at 0x1520fc6d0>, <__main__.NotificationInteractionNode object at 0x1520fce20>))

In [146]:
# Show the signature of add_entity_edge before wiring up the relations.
help(gr.add_entity_edge)

Help on method add_entity_edge in module graphreduce.graph_reduce:

add_entity_edge(parent_node: graphreduce.node.GraphReduceNode, relation_node: graphreduce.node.GraphReduceNode, parent_key: str, relation_key: str, relation_type: str = 'parent_child', reduce: bool = True) method of graphreduce.graph_reduce.GraphReduce instance
    Add an entity relation



In [147]:
# customer -> order: order.customer_id refers to customer.id
gr.add_entity_edge(
    parent_node=cust, parent_key='id',
    relation_node=order, relation_key='customer_id',
    relation_type='parent_child', reduce=True,
)

In [148]:
# customer -> notification: notification.customer_id refers to customer.id
gr.add_entity_edge(
    parent_node=cust, parent_key='id',
    relation_node=notification, relation_key='customer_id',
    relation_type='parent_child', reduce=True,
)

In [149]:
# order -> order_event: order_event.order_id refers to order.id
gr.add_entity_edge(
    parent_node=order, parent_key='id',
    relation_node=order_event, relation_key='order_id',
    relation_type='parent_child', reduce=True,
)

In [150]:
# order -> order_product: order_product.order_id refers to order.id
gr.add_entity_edge(
    parent_node=order, parent_key='id',
    relation_node=order_product, relation_key='order_id',
    relation_type='parent_child', reduce=True,
)

In [151]:
# notification -> notification_interaction:
# notification_interaction.notification_id refers to notification.id
gr.add_entity_edge(
    parent_node=notification, parent_key='id',
    relation_node=notification_interaction, relation_key='notification_id',
    relation_type='parent_child', reduce=True,
)

In [152]:
# Render the graph to customer_graph_reduce2.html (the log line confirms the path).
# NOTE(review): cdn_resources / notebook look like pyvis display options; confirm.
gr.plot_graph('customer_graph_reduce2.html', cdn_resources='in_line', notebook=True)

[2m2023-06-20 15:31:57[0m [[32m[1minfo     [0m] [1mplotted graph at customer_graph_reduce2.html[0m


In [153]:
# Run the whole pipeline. Per the log it re-hydrates node attributes, hydrates
# the graph data, then runs filters, clip cols, and annotations for each node.
gr.do_transformations()

[2m2023-06-20 15:31:57[0m [[32m[1minfo     [0m] [1mhydrating graph attributes[0m
[2m2023-06-20 15:31:57[0m [[32m[1minfo     [0m] [1mhydrating attributes for CustomerNode[0m
[2m2023-06-20 15:31:57[0m [[32m[1minfo     [0m] [1mhydrating attributes for OrderNode[0m
[2m2023-06-20 15:31:57[0m [[32m[1minfo     [0m] [1mhydrating attributes for OrderEventNode[0m
[2m2023-06-20 15:31:57[0m [[32m[1minfo     [0m] [1mhydrating attributes for OrderProductNode[0m
[2m2023-06-20 15:31:57[0m [[32m[1minfo     [0m] [1mhydrating attributes for NotificationNode[0m
[2m2023-06-20 15:31:57[0m [[32m[1minfo     [0m] [1mhydrating attributes for NotificationInteractionNode[0m
[2m2023-06-20 15:31:57[0m [[32m[1minfo     [0m] [1mhydrating graph data[0m
[2m2023-06-20 15:31:57[0m [[32m[1minfo     [0m] [1mrunning filters, clip cols, and annotations for CustomerNode[0m
[2m2023-06-20 15:31:57[0m [[32m[1minfo     [0m] [1mrunning filters, clip cols, and ann

In [None]:
# 1) reduce
# 2) no reduce on notification table
# 3

In [159]:
# Inspect the parent (customer) frame after the run: one row per customer,
# with the child aggregates (no_*, ni_*, order_*) joined on as columns.
gr.parent.df

Unnamed: 0,cust_id,cust_name,no_customer_id,no_id_count,ni_num_notification_interactions,order_customer_id,order_id_count
0,1,wes,1,6,7.0,1,2
1,2,john,2,7,0.0,2,2


In [160]:
# Row count of the parent frame (2 here, matching the two customers shown).
len(gr.parent.df)

2

In [161]:
# For the root node the result lists all five other nodes, including the
# grandchildren (order events, order products, notification interactions).
gr.get_children(gr.parent)

[<__main__.NotificationInteractionNode at 0x1520fce20>,
 <__main__.NotificationNode at 0x1520fc6d0>,
 <__main__.OrderProductNode at 0x1519c7c40>,
 <__main__.OrderEventNode at 0x1520fc520>,
 <__main__.OrderNode at 0x1519c7a90>]

In [162]:
import networkx as nx

In [163]:
# Exploratory: only displays the function's repr/signature; nothing is computed.
nx.dfs_edges

<function networkx.algorithms.traversal.depth_first_search.dfs_edges(G, source=None, depth_limit=None)>

In [164]:
# gr is passed straight to networkx as the graph: depth-first preorder of
# the nodes, starting from the parent (customer) node.
list(nx.dfs_preorder_nodes(gr, source=gr.parent))

[<__main__.CustomerNode at 0x1519c7dc0>,
 <__main__.OrderNode at 0x1519c7a90>,
 <__main__.OrderEventNode at 0x1520fc520>,
 <__main__.OrderProductNode at 0x1519c7c40>,
 <__main__.NotificationNode at 0x1520fc6d0>,
 <__main__.NotificationInteractionNode at 0x1520fce20>]

In [165]:
# Show the order node's repr so its address can be matched in the listings.
order

<__main__.OrderNode at 0x1519c7a90>

In [166]:
# Children of the order node: order products and order events.
gr.get_children(order)

[<__main__.OrderProductNode at 0x1519c7c40>,
 <__main__.OrderEventNode at 0x1520fc520>]

In [167]:
# Render the graph again, this time to mlw_2023.html (confirmed by the log line).
gr.plot_graph(fname='mlw_2023.html')

[2m2023-06-20 15:31:57[0m [[32m[1minfo     [0m] [1mplotted graph at mlw_2023.html[0m


In [168]:
# merge children columns dynamically
# NOTE(review): this cell only lists the graph's nodes; the merge described
# above is not performed here. It looks like a leftover TODO.
gr.nodes()

NodeView((<__main__.CustomerNode object at 0x1519c7dc0>, <__main__.OrderNode object at 0x1519c7a90>, <__main__.OrderEventNode object at 0x1520fc520>, <__main__.OrderProductNode object at 0x1519c7c40>, <__main__.NotificationNode object at 0x1520fc6d0>, <__main__.NotificationInteractionNode object at 0x1520fce20>))