In [1]:
# machine learning week demo of GraphReduce

In [2]:
import pandas as pd
import datetime
import pyvis



from graphreduce.node import GraphReduceNode
from graphreduce.enum import ComputeLayerEnum, PeriodUnit
from graphreduce.graph_reduce import GraphReduce

# abstract classes enforce the definition 
# of certain functions to adhere to convention

In [36]:
class CustomerNode(GraphReduceNode):
    """Node for the customer table.

    Used as the ``parent_node`` of the graph in this notebook. Every hook
    below is intentionally a no-op: the class only exists to satisfy the
    set of methods that ``GraphReduceNode`` subclasses define.
    """

    def do_annotate(self):
        """No-op hook."""
        pass

    def do_filters(self):
        """No-op hook."""
        pass

    def do_clip_cols(self):
        """No-op hook."""
        pass

    def do_slice_data(self):
        """No-op hook."""
        pass

    def do_post_join_annotate(self):
        """No-op hook."""
        pass

    def do_reduce(self, reduce_key, children: list = None):
        """No-op hook; nothing is aggregated in this method.

        Args:
            reduce_key: unused here.
            children: unused here. Defaults to ``None`` rather than ``[]``
                so that no list object is shared between calls (a mutable
                default is created once, at function definition time).
        """
        pass

    def do_labels(self, reduce_key):
        """No-op hook; this node defines no labels."""
        pass

In [4]:
class OrderNode(GraphReduceNode):
    """Node for the orders table; ``do_reduce`` is its only active hook."""

    def do_annotate(self):
        """Hook left empty."""

    def do_filters(self):
        """Hook left empty."""

    def do_clip_cols(self):
        """Hook left empty."""

    def do_slice_data(self):
        """Hook left empty."""

    def do_post_join_annotate(self):
        """Hook left empty."""

    def do_reduce(self, reduce_key):
        """Count this node's primary key per ``reduce_key`` group.

        Returns:
            The aggregated frame with the group key restored as a regular
            column and a single ``<pk>_count`` feature column (names are
            passed through ``self.colabbr``).
        """
        features = self.prep_for_features()
        group_col = self.colabbr(reduce_key)
        count_col = self.colabbr(f'{self.pk}_count')
        pk_col = self.colabbr(self.pk)
        # 'count' tallies non-null primary-key values in each group.
        counts = features.groupby(group_col).agg(
            **{count_col: pd.NamedAgg(column=pk_col, aggfunc='count')}
        )
        return counts.reset_index()

    def do_labels(self, key):
        """Hook left empty; this node defines no labels."""

In [5]:
class OrderProductNode(GraphReduceNode):
    """Node for the order-products table; only ``do_reduce`` does any work."""

    def do_annotate(self):
        """Hook left empty."""

    def do_filters(self):
        """Hook left empty."""

    def do_clip_cols(self):
        """Hook left empty."""

    def do_slice_data(self):
        """Hook left empty."""

    def do_post_join_annotate(self):
        """Hook left empty."""

    def do_reduce(self, reduce_key):
        """Return one row per ``reduce_key`` with a count of primary keys."""
        grouped = self.prep_for_features().groupby(self.colabbr(reduce_key))
        named_aggs = {
            # Non-null count of the primary-key column within each group.
            self.colabbr(f'{self.pk}_count'): pd.NamedAgg(
                column=self.colabbr(self.pk),
                aggfunc='count',
            ),
        }
        return grouped.agg(**named_aggs).reset_index()

    def do_labels(self, key):
        """Hook left empty; this node defines no labels."""

In [6]:
class OrderEventNode(GraphReduceNode):
    """Node for the order-events table; only ``do_reduce`` does any work."""

    def do_annotate(self):
        """Hook left empty."""

    def do_filters(self):
        """Hook left empty."""

    def do_clip_cols(self):
        """Hook left empty."""

    def do_slice_data(self):
        """Hook left empty."""

    def do_post_join_annotate(self):
        """Hook left empty."""

    def do_reduce(self, reduce_key):
        """Aggregate events per ``reduce_key`` into a primary-key count."""
        event_frame = self.prep_for_features()
        by_key = event_frame.groupby(self.colabbr(reduce_key))
        out_name = self.colabbr(f'{self.pk}_count')
        # Named aggregation: output column -> (source column, function).
        spec = pd.NamedAgg(column=self.colabbr(self.pk), aggfunc='count')
        reduced = by_key.agg(**{out_name: spec})
        return reduced.reset_index()

    def do_labels(self, key):
        """Hook left empty; this node defines no labels."""

In [7]:
class NotificationNode(GraphReduceNode):
    """Node for the notifications table; only ``do_reduce`` does any work."""

    def do_annotate(self):
        """Hook left empty."""

    def do_filters(self):
        """Hook left empty."""

    def do_clip_cols(self):
        """Hook left empty."""

    def do_slice_data(self):
        """Hook left empty."""

    def do_post_join_annotate(self):
        """Hook left empty."""

    def do_reduce(self, reduce_key):
        """Count notifications (by primary key) for each ``reduce_key``.

        Returns:
            Frame with the group key as a regular column plus one
            ``<pk>_count`` column (names passed through ``self.colabbr``).
        """
        notifications = self.prep_for_features()
        per_key = notifications.groupby(self.colabbr(reduce_key))
        aggregations = {
            self.colabbr(f'{self.pk}_count'): pd.NamedAgg(
                column=self.colabbr(self.pk), aggfunc='count'
            ),
        }
        return per_key.agg(**aggregations).reset_index()

    def do_labels(self, key):
        """Hook left empty; this node defines no labels."""

In [8]:
class NotificationInteractionNode(GraphReduceNode):
    """Node for the notification-interactions table.

    The only node in this notebook that defines both features
    (``do_reduce``) and a label (``do_labels``); the other hooks are no-ops.
    """

    def do_annotate(self):
        """No-op hook."""
        pass

    def do_filters(self):
        """No-op hook."""
        pass

    def do_clip_cols(self):
        """No-op hook."""
        pass

    def do_slice_data(self):
        """No-op hook."""
        pass

    def do_post_join_annotate(self):
        """No-op hook."""
        pass

    def do_reduce(self, reduce_key):
        """Aggregate interactions per ``reduce_key``.

        Builds three feature columns from the primary key: non-null count,
        minimum and maximum. Also prints the reduce key it was called with.

        Returns:
            The aggregated frame with the group key restored as a column.
        """
        print(f"reduce key: {reduce_key}")
        return self.prep_for_features().groupby(self.colabbr(reduce_key)).agg(
            **{
                self.colabbr(f'{self.pk}_counts'): pd.NamedAgg(column=self.colabbr(self.pk), aggfunc='count'),
                self.colabbr(f'{self.pk}_min'): pd.NamedAgg(column=self.colabbr(self.pk), aggfunc='min'),
                # This entry used to reuse the `_min` key; a repeated dict key
                # keeps only the last value, so `min` was silently dropped and
                # the max landed in a column named `_min`.
                self.colabbr(f'{self.pk}_max'): pd.NamedAgg(column=self.colabbr(self.pk), aggfunc='max'),
            }
        ).reset_index()

    def do_labels(self, reduce_key):
        """Build the label: number of interactions per ``reduce_key``.

        Returns:
            Frame with the group key as a column plus a
            ``<pk>_num_interactions`` count column.
        """
        return self.prep_for_labels().groupby(self.colabbr(reduce_key)).agg(
            **{
                # add a label / target for a model predicting the number of interactions
                # (`colabbr`, matching every other call in this file; the
                # previous `clabbr` spelling is not used anywhere else.)
                self.colabbr(f'{self.pk}_num_interactions'): pd.NamedAgg(column=self.colabbr(self.pk), aggfunc='count'),
            }
        ).reset_index()

In [9]:
# One node per CSV file under dat/. Every node uses 'id' as its primary key
# and the pandas compute layer; `prefix` shows up as the column-name prefix
# in the reduced output (e.g. cust_id, order_id_count, no_id_count).
cust = CustomerNode(
    pk='id',
    prefix='cust',
    fpath='dat/cust.csv',
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
)
order = OrderNode(
    pk='id',
    prefix='order',
    fpath='dat/orders.csv',
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
)
order_event = OrderEventNode(
    pk='id',
    prefix='oe',
    fpath='dat/order_events.csv',
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
)
order_product = OrderProductNode(
    pk='id',
    prefix='op',
    fpath='dat/order_products.csv',
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
)
notification = NotificationNode(
    pk='id',
    prefix='no',
    fpath='dat/notifications.csv',
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
)
notification_interaction = NotificationInteractionNode(
    pk='id',
    prefix='ni',
    fpath='dat/notification_interactions.csv',
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
)



In [None]:
# GraphReduce abstracts the orchestration of compute
# across the graph, and things like depth-first
# traversal for hierarchical rollups


In [12]:
# Build the reduction graph with the customer node as its parent (root).
# NOTE(review): cut_date and compute_period_val/unit presumably define the
# time window of data considered (365 days relative to 2023-04-01) — confirm
# against the GraphReduce documentation.
gr = GraphReduce(
    cut_date=datetime.datetime(2023, 4, 1),
    compute_period_val=365,
    compute_period_unit=PeriodUnit.day,
    parent_node=cust,
    compute_layer=ComputeLayerEnum.pandas,
)

In [13]:
# Register every node with the graph, in the same order as before.
for _node in (
    cust,
    order,
    order_event,
    order_product,
    notification,
    notification_interaction,
):
    gr.add_node(_node)

In [14]:
# Hydrate attributes on each registered node; the log output below shows one
# "hydrating attributes for <Node>" line per node.
gr.hydrate_graph_attrs()

[2m2023-06-20 10:04:52[0m [[32m[1minfo     [0m] [1mhydrating attributes for CustomerNode[0m
[2m2023-06-20 10:04:52[0m [[32m[1minfo     [0m] [1mhydrating attributes for OrderNode[0m
[2m2023-06-20 10:04:52[0m [[32m[1minfo     [0m] [1mhydrating attributes for OrderEventNode[0m
[2m2023-06-20 10:04:52[0m [[32m[1minfo     [0m] [1mhydrating attributes for OrderProductNode[0m
[2m2023-06-20 10:04:52[0m [[32m[1minfo     [0m] [1mhydrating attributes for NotificationNode[0m
[2m2023-06-20 10:04:52[0m [[32m[1minfo     [0m] [1mhydrating attributes for NotificationInteractionNode[0m


In [15]:
# Sanity check: list the node objects currently registered on the graph.
gr.nodes()

NodeView((<__main__.CustomerNode object at 0x1455b62b0>, <__main__.OrderNode object at 0x1455b65b0>, <__main__.OrderEventNode object at 0x1455b6070>, <__main__.OrderProductNode object at 0x1455b6c10>, <__main__.NotificationNode object at 0x1455b6880>, <__main__.NotificationInteractionNode object at 0x1455b6d60>))

In [16]:
# Show the signature of add_entity_edge before wiring up the relations.
help(gr.add_entity_edge)

Help on method add_entity_edge in module graphreduce.graph_reduce:

add_entity_edge(parent_node: graphreduce.node.GraphReduceNode, relation_node: graphreduce.node.GraphReduceNode, parent_key: str, relation_key: str, relation_type: str = 'parent_child', reduce: bool = True) method of graphreduce.graph_reduce.GraphReduce instance
    Add an entity relation



In [17]:
# Edge customer -> order: cust 'id' is matched to order 'customer_id'.
# relation_type and reduce are spelled out for clarity; they equal the
# defaults shown in the add_entity_edge signature.
gr.add_entity_edge(
    parent_node=cust,
    relation_node=order,
    parent_key='id',
    relation_key='customer_id',
    relation_type='parent_child',
    reduce=True
)

In [18]:
# Edge customer -> notification: cust 'id' is matched to notification
# 'customer_id'.
gr.add_entity_edge(
    parent_node=cust,
    relation_node=notification,
    parent_key='id',
    relation_key='customer_id',
    relation_type='parent_child',
    reduce=True
)

In [19]:
# Edge order -> order_event: order 'id' is matched to order_event 'order_id'.
gr.add_entity_edge(
    parent_node=order,
    relation_node=order_event,
    parent_key='id',
    relation_key='order_id',
    relation_type='parent_child',
    reduce=True
)

In [20]:
# Edge order -> order_product: order 'id' is matched to order_product
# 'order_id'.
gr.add_entity_edge(
    parent_node=order,
    relation_node=order_product,
    parent_key='id',
    relation_key='order_id',
    relation_type='parent_child',
    reduce=True
)

In [21]:
# Edge notification -> notification_interaction: notification 'id' is matched
# to notification_interaction 'notification_id'.
gr.add_entity_edge(
    parent_node=notification,
    relation_node=notification_interaction,
    parent_key='id',
    relation_key='notification_id',
    relation_type='parent_child',
    reduce=True
)

In [22]:
# Render the node/edge graph to an HTML file; the log line below confirms the
# output path.
gr.plot_graph('customer_graph_reduce2.html', cdn_resources='in_line', notebook=True)

[2m2023-06-20 10:04:52[0m [[32m[1minfo     [0m] [1mplotted graph at customer_graph_reduce2.html[0m


In [23]:
# Run the whole pipeline. Per the log output below this hydrates graph
# attributes (again) and graph data, then runs filters, clip cols and
# annotations node by node.
gr.do_transformations()

[2m2023-06-20 10:04:52[0m [[32m[1minfo     [0m] [1mhydrating graph attributes[0m
[2m2023-06-20 10:04:52[0m [[32m[1minfo     [0m] [1mhydrating attributes for CustomerNode[0m
[2m2023-06-20 10:04:52[0m [[32m[1minfo     [0m] [1mhydrating attributes for OrderNode[0m
[2m2023-06-20 10:04:52[0m [[32m[1minfo     [0m] [1mhydrating attributes for OrderEventNode[0m
[2m2023-06-20 10:04:52[0m [[32m[1minfo     [0m] [1mhydrating attributes for OrderProductNode[0m
[2m2023-06-20 10:04:52[0m [[32m[1minfo     [0m] [1mhydrating attributes for NotificationNode[0m
[2m2023-06-20 10:04:52[0m [[32m[1minfo     [0m] [1mhydrating attributes for NotificationInteractionNode[0m
[2m2023-06-20 10:04:52[0m [[32m[1minfo     [0m] [1mhydrating graph data[0m
[2m2023-06-20 10:04:53[0m [[32m[1minfo     [0m] [1mrunning filters, clip cols, and annotations for CustomerNode[0m
[2m2023-06-20 10:04:53[0m [[32m[1minfo     [0m] [1mrunning filters, clip cols, and ann

In [24]:
# The graph's parent is the CustomerNode instance passed as parent_node.
gr.parent

<__main__.CustomerNode at 0x1455b62b0>

In [25]:
# Customer-level result after the reduction: the customer columns plus the
# aggregated child columns (no_id_count, order_id_count) shown below.
gr.parent.df

Unnamed: 0,cust_id,cust_name,no_customer_id,no_id_count,order_customer_id,order_id_count
0,1,wes,1,6,1,2
1,2,john,2,7,2,2


In [26]:
# Row count of the customer-level result (one row per customer in the output).
len(gr.parent.df)

2

In [27]:
# For the root node the output below contains all five other nodes, i.e. the
# grandchildren (order events/products, notification interactions) as well as
# the direct children.
gr.get_children(gr.parent)

[<__main__.NotificationInteractionNode at 0x1455b6d60>,
 <__main__.NotificationNode at 0x1455b6880>,
 <__main__.OrderProductNode at 0x1455b6c10>,
 <__main__.OrderEventNode at 0x1455b6070>,
 <__main__.OrderNode at 0x1455b65b0>]

In [28]:
import networkx as nx

In [29]:
# Bare reference: displays the signature of networkx's depth-first edge
# traversal function.
nx.dfs_edges

<function networkx.algorithms.traversal.depth_first_search.dfs_edges(G, source=None, depth_limit=None)>

In [30]:
# NOTE(review): exploratory leftover — dumps every name in the networkx
# namespace; consider removing this cell before sharing the notebook.
dir(nx)

['AmbiguousSolution',
 'ArborescenceIterator',
 'DiGraph',
 'EdgePartition',
 'ExceededMaxIterations',
 'Graph',
 'GraphMLReader',
 'GraphMLWriter',
 'HasACycle',
 'LCF_graph',
 'LFR_benchmark_graph',
 'MultiDiGraph',
 'MultiGraph',
 'NetworkXAlgorithmError',
 'NetworkXError',
 'NetworkXException',
 'NetworkXNoCycle',
 'NetworkXNoPath',
 'NetworkXNotImplemented',
 'NetworkXPointlessConcept',
 'NetworkXTreewidthBoundExceeded',
 'NetworkXUnbounded',
 'NetworkXUnfeasible',
 'NodeNotFound',
 'NotATree',
 'OrderedDiGraph',
 'OrderedGraph',
 'OrderedMultiDiGraph',
 'OrderedMultiGraph',
 'PlanarEmbedding',
 'PowerIterationFailedConvergence',
 'SpanningTreeIterator',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__getattr__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 '_lazy_import',
 'adamic_adar_index',
 'add_cycle',
 'add_path',
 'add_star',
 'adj_matrix',
 'adjacency',
 'adjacency_data',
 'adjacency_graph',
 'adjacency_matrix',
 'adj

In [31]:
# The GraphReduce object is passed straight to a networkx traversal here.
# Depth-first preorder from the customer node visits each branch fully
# (orders, then notifications) as shown in the output below.
list(nx.dfs_preorder_nodes(gr, source=gr.parent))

[<__main__.CustomerNode at 0x1455b62b0>,
 <__main__.OrderNode at 0x1455b65b0>,
 <__main__.OrderEventNode at 0x1455b6070>,
 <__main__.OrderProductNode at 0x1455b6c10>,
 <__main__.NotificationNode at 0x1455b6880>,
 <__main__.NotificationInteractionNode at 0x1455b6d60>]

In [32]:
# Display the OrderNode instance (same object as in the listings above).
order

<__main__.OrderNode at 0x1455b65b0>

In [34]:
# Children of the order node: the order-product and order-event nodes.
gr.get_children(order)

[<__main__.OrderProductNode at 0x1455b6c10>,
 <__main__.OrderEventNode at 0x1455b6070>]