In [1]:
# machine learning week demo of GraphReduce

In [2]:
import pandas as pd
import datetime
import pyvis



from graphreduce.node import GraphReduceNode
from graphreduce.enum import ComputeLayerEnum, PeriodUnit
from graphreduce.graph_reduce import GraphReduce

# abstract classes enforce the definition 
# of certain functions to adhere to convention

In [3]:
class CustomerNode(GraphReduceNode):
    """Customer node whose hooks are all no-ops.

    Each method exists only so the class supplies every hook of the
    GraphReduceNode convention; none of them reads or changes any data.
    """

    def do_annotate(self):
        """No-op: adds nothing to the customer data."""

    def do_filters(self):
        """No-op: applies no filters."""

    def do_clip_cols(self):
        """No-op: clips no columns."""

    def do_slice_data(self):
        """No-op: slices nothing."""

    def do_post_join_annotate(self):
        """No-op: adds nothing after joins."""

    def do_reduce(self, reduce_key, *args, **kwargs):
        """No-op: returns None for any ``reduce_key``."""

    def do_labels(self, reduce_key, *args, **kwargs):
        """No-op: returns None, so this node yields no labels."""

In [4]:
class OrderNode(GraphReduceNode):
    """Order node: only ``do_reduce`` does work, every other hook is a no-op."""

    def do_annotate(self):
        """No-op: adds nothing to the order data."""

    def do_filters(self):
        """No-op: applies no filters."""

    def do_clip_cols(self):
        """No-op: clips no columns."""

    def do_slice_data(self):
        """No-op: slices nothing."""

    def do_post_join_annotate(self):
        """No-op: adds nothing after joins."""

    def do_reduce(self, reduce_key):
        """Count primary-key values per ``reduce_key`` group.

        Groups the frame returned by ``prep_for_features()`` on the column
        named ``colabbr(reduce_key)`` and returns one row per group holding
        a single ``colabbr('<pk>_count')`` column; ``reset_index()`` turns
        the group key back into an ordinary column.
        """
        features = self.prep_for_features()
        per_group = features.groupby(self.colabbr(reduce_key))
        count_name = self.colabbr(f'{self.pk}_count')
        count_spec = pd.NamedAgg(column=self.colabbr(self.pk), aggfunc='count')
        counted = per_group.agg(**{count_name: count_spec})
        return counted.reset_index()

    def do_labels(self, key):
        """No-op: returns None, so this node yields no labels."""

In [5]:
class OrderProductNode(GraphReduceNode):
    """Order-product node: only ``do_reduce`` does work, the rest are no-ops."""

    def do_annotate(self):
        """No-op: adds nothing to the order-product data."""

    def do_filters(self):
        """No-op: applies no filters."""

    def do_clip_cols(self):
        """No-op: clips no columns."""

    def do_slice_data(self):
        """No-op: slices nothing."""

    def do_post_join_annotate(self):
        """No-op: adds nothing after joins."""

    def do_reduce(self, reduce_key):
        """Count primary-key values per ``reduce_key`` group.

        The frame from ``prep_for_features()`` is grouped on
        ``colabbr(reduce_key)``; the result has one row per group with a
        single ``colabbr('<pk>_count')`` column and the group key restored
        as an ordinary column.
        """
        source_frame = self.prep_for_features()
        by_key = source_frame.groupby(self.colabbr(reduce_key))
        out_col = self.colabbr(f'{self.pk}_count')
        spec = pd.NamedAgg(column=self.colabbr(self.pk), aggfunc='count')
        aggregations = {out_col: spec}
        return by_key.agg(**aggregations).reset_index()

    def do_labels(self, key):
        """No-op: returns None, so this node yields no labels."""

In [6]:
class OrderEventNode(GraphReduceNode):
    """Order-event node: only ``do_reduce`` does work, the rest are no-ops."""

    def do_annotate(self):
        """No-op: adds nothing to the order-event data."""

    def do_filters(self):
        """No-op: applies no filters."""

    def do_clip_cols(self):
        """No-op: clips no columns."""

    def do_slice_data(self):
        """No-op: slices nothing."""

    def do_post_join_annotate(self):
        """No-op: adds nothing after joins."""

    def do_reduce(self, reduce_key, *args, **kwargs):
        """Count primary-key values per ``reduce_key`` group.

        Extra positional and keyword arguments are accepted and ignored.
        Returns one row per ``colabbr(reduce_key)`` group of the
        ``prep_for_features()`` frame, with a ``colabbr('<pk>_count')``
        column and the group key as an ordinary column.
        """
        events = self.prep_for_features()
        grouped_events = events.groupby(self.colabbr(reduce_key))
        count_col = self.colabbr(f'{self.pk}_count')
        count_agg = pd.NamedAgg(column=self.colabbr(self.pk), aggfunc='count')
        result = grouped_events.agg(**{count_col: count_agg})
        return result.reset_index()

    def do_labels(self, key):
        """No-op: returns None, so this node yields no labels."""

In [7]:
class NotificationNode(GraphReduceNode):
    """Notification node: only ``do_reduce`` does work, the rest are no-ops."""

    def do_annotate(self):
        """No-op: adds nothing to the notification data."""

    def do_filters(self):
        """No-op: applies no filters."""

    def do_clip_cols(self):
        """No-op: clips no columns."""

    def do_slice_data(self):
        """No-op: slices nothing."""

    def do_post_join_annotate(self):
        """No-op: adds nothing after joins."""

    def do_reduce(self, reduce_key, *args, **kwargs):
        """Count primary-key values per ``reduce_key`` group.

        Extra positional and keyword arguments are accepted and ignored.
        Returns one row per ``colabbr(reduce_key)`` group of the
        ``prep_for_features()`` frame, with a ``colabbr('<pk>_count')``
        column and the group key as an ordinary column.
        """
        notifications = self.prep_for_features()
        grouped = notifications.groupby(self.colabbr(reduce_key))
        count_col = self.colabbr(f'{self.pk}_count')
        aggregations = {
            count_col: pd.NamedAgg(column=self.colabbr(self.pk), aggfunc='count'),
            # Disabled aggregation, kept for reference:
            #'ni_num_notification_interactions' : pd.NamedAgg(column='ni_num_interactions', aggfunc='sum')
        }
        return grouped.agg(**aggregations).reset_index()

    def do_labels(self, key):
        """No-op: returns None, so this node yields no labels."""

In [8]:
class NotificationInteractionNode(GraphReduceNode):
    """Notification-interaction node.

    Parses its date column in ``do_annotate``, reduces to per-group
    primary-key statistics in ``do_reduce``, and produces an
    interaction-count label in ``do_labels``. The remaining hooks are no-ops.
    """

    def do_annotate(self):
        """Convert the date-key column from ``'%Y-%m-%d'`` strings to datetimes.

        Uses the vectorised ``pd.to_datetime`` instead of a per-row
        ``strptime`` so that missing values become ``NaT`` rather than
        raising, and re-running the hook on an already-converted column
        does not fail.
        """
        date_col = self.colabbr(self.date_key)
        self.df[date_col] = pd.to_datetime(self.df[date_col], format='%Y-%m-%d')

    def do_filters(self):
        """No-op: applies no filters."""
        pass

    def do_clip_cols(self):
        """No-op: clips no columns."""
        pass

    def do_slice_data(self):
        """No-op: slices nothing."""
        pass

    def do_post_join_annotate(self):
        """No-op: adds nothing after joins."""
        pass

    def do_reduce(self,
                  reduce_key : str,
                  additional_agg : dict = None
                 ):
        """Aggregate primary-key statistics per ``reduce_key`` group.

        Args:
            reduce_key: Un-prefixed name of the column to group by; it is
                passed through ``colabbr`` before grouping.
            additional_agg: Accepted for interface compatibility but not
                applied. Defaults to ``None`` to avoid a shared mutable
                default.

        Returns:
            One row per group of the ``prep_for_features()`` frame with the
            columns ``<pk>_counts``, ``<pk>_min``, ``<pk>_max`` and
            ``num_interactions`` (each named via ``colabbr``), plus the
            group key as an ordinary column.
        """
        pk_col = self.colabbr(self.pk)
        return self.prep_for_features().groupby(self.colabbr(reduce_key)).agg(
            **{
                self.colabbr(f'{self.pk}_counts') : pd.NamedAgg(column=pk_col, aggfunc='count'),
                self.colabbr(f'{self.pk}_min') : pd.NamedAgg(column=pk_col, aggfunc='min'),
                # Must be a distinct key: a repeated '<pk>_min' key would
                # silently overwrite the min aggregation with the max.
                self.colabbr(f'{self.pk}_max') : pd.NamedAgg(column=pk_col, aggfunc='max'),
                self.colabbr('num_interactions') : pd.NamedAgg(column=pk_col, aggfunc='count'),
            }
        ).reset_index()

    def do_labels(self, reduce_key, *args, **kwargs):
        """Build a per-group interaction-count label.

        Groups the frame returned by ``prep_for_labels()`` on
        ``colabbr(reduce_key)`` and counts primary-key values into a
        ``colabbr('<pk>_num_interactions')`` column. Extra positional and
        keyword arguments are accepted and ignored.
        """
        label_df = self.prep_for_labels().groupby(self.colabbr(reduce_key)).agg(
            **{
                # add a label / target for a model predicting the number of interactions
                self.colabbr(f'{self.pk}_num_interactions') : pd.NamedAgg(column=self.colabbr(self.pk), aggfunc='count'),
            }
        ).reset_index()
        return label_df

In [9]:
# One node per CSV file under dat/, every one on the pandas compute layer.
# Only the notification-interaction node is given a date key.
cust = CustomerNode(
    pk='id',
    prefix='cust',
    fpath='dat/cust.csv',
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
)
order = OrderNode(
    pk='id',
    prefix='order',
    fpath='dat/orders.csv',
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
)
order_event = OrderEventNode(
    pk='id',
    prefix='oe',
    fpath='dat/order_events.csv',
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
)
order_product = OrderProductNode(
    pk='id',
    prefix='op',
    fpath='dat/order_products.csv',
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
)
notification = NotificationNode(
    pk='id',
    prefix='no',
    fpath='dat/notifications.csv',
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
)
notification_interaction = NotificationInteractionNode(
    pk='id',
    prefix='ni',
    fpath='dat/notification_interactions.csv',
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
    date_key='ts',
)



## graph reduce abstracts the orchestration of compute
## across the graph, and things like depth-first
## traversal for hierarchical rollups


In [10]:
gr = GraphReduce(
    # Root of the graph and the engine used for computation.
    parent_node=cust,
    compute_layer=ComputeLayerEnum.pandas,
    # Compute window: 365 days, measured against the cut date.
    cut_date=datetime.datetime(2023, 5, 6),
    compute_period_val=365,
    compute_period_unit=PeriodUnit.day,
    # Labels are switched off; a 30-day label period is still supplied.
    has_labels=False,
    label_period_val=30,
    label_period_unit=PeriodUnit.day,
    dynamic_propagation=True,
)

In [11]:
gr.type_func_map

{'int64': ['min', 'max', 'sum'],
 'str': ['first'],
 'object': ['first'],
 'float64': ['min', 'max', 'sum'],
 'bool': ['first'],
 'datetime64': ['first']}

In [12]:
# Register every node with the graph, in the same order as before.
for node in (
    cust,
    order,
    order_event,
    order_product,
    notification,
    notification_interaction,
):
    gr.add_node(node)

In [13]:
gr.hydrate_graph_attrs()

[2m2023-06-27 09:23:53[0m [[32m[1minfo     [0m] [1mhydrating attributes for CustomerNode[0m
[2m2023-06-27 09:23:53[0m [[32m[1minfo     [0m] [1mhydrating attributes for OrderNode[0m
[2m2023-06-27 09:23:53[0m [[32m[1minfo     [0m] [1mhydrating attributes for OrderEventNode[0m
[2m2023-06-27 09:23:53[0m [[32m[1minfo     [0m] [1mhydrating attributes for OrderProductNode[0m
[2m2023-06-27 09:23:53[0m [[32m[1minfo     [0m] [1mhydrating attributes for NotificationNode[0m
[2m2023-06-27 09:23:53[0m [[32m[1minfo     [0m] [1mhydrating attributes for NotificationInteractionNode[0m


In [14]:
gr.nodes()

NodeView((<__main__.CustomerNode object at 0x131356bb0>, <__main__.OrderNode object at 0x131356850>, <__main__.OrderEventNode object at 0x1313561c0>, <__main__.OrderProductNode object at 0x131356ee0>, <__main__.NotificationNode object at 0x131356820>, <__main__.NotificationInteractionNode object at 0x131356280>))

In [15]:
help(gr.add_entity_edge)

Help on method add_entity_edge in module graphreduce.graph_reduce:

add_entity_edge(parent_node: graphreduce.node.GraphReduceNode, relation_node: graphreduce.node.GraphReduceNode, parent_key: str, relation_key: str, relation_type: str = 'parent_child', reduce: bool = True) method of graphreduce.graph_reduce.GraphReduce instance
    Add an entity relation



In [16]:
# cust (parent, key 'id') -> order (relation, key 'customer_id')
gr.add_entity_edge(
    cust,
    order,
    parent_key='id',
    relation_key='customer_id',
    relation_type='parent_child',
    reduce=True,
)

In [17]:
# cust (parent, key 'id') -> notification (relation, key 'customer_id')
gr.add_entity_edge(
    cust,
    notification,
    parent_key='id',
    relation_key='customer_id',
    relation_type='parent_child',
    reduce=True,
)

In [18]:
# order (parent, key 'id') -> order_event (relation, key 'order_id')
gr.add_entity_edge(
    order,
    order_event,
    parent_key='id',
    relation_key='order_id',
    relation_type='parent_child',
    reduce=True,
)

In [19]:
# order (parent, key 'id') -> order_product (relation, key 'order_id')
gr.add_entity_edge(
    order,
    order_product,
    parent_key='id',
    relation_key='order_id',
    relation_type='parent_child',
    reduce=True,
)

In [20]:
# notification (parent, key 'id') -> notification_interaction (relation, key 'notification_id')
gr.add_entity_edge(
    notification,
    notification_interaction,
    parent_key='id',
    relation_key='notification_id',
    relation_type='parent_child',
    reduce=True,
)

In [21]:
gr.plot_graph('customer_graph_reduce2.html', cdn_resources='in_line', notebook=True)

[2m2023-06-27 09:23:53[0m [[32m[1minfo     [0m] [1mplotted graph at customer_graph_reduce2.html[0m


In [22]:
gr.do_transformations()

[2m2023-06-27 09:23:53[0m [[32m[1minfo     [0m] [1mhydrating graph attributes[0m
[2m2023-06-27 09:23:53[0m [[32m[1minfo     [0m] [1mhydrating attributes for CustomerNode[0m
[2m2023-06-27 09:23:53[0m [[32m[1minfo     [0m] [1mhydrating attributes for OrderNode[0m
[2m2023-06-27 09:23:53[0m [[32m[1minfo     [0m] [1mhydrating attributes for OrderEventNode[0m
[2m2023-06-27 09:23:53[0m [[32m[1minfo     [0m] [1mhydrating attributes for OrderProductNode[0m
[2m2023-06-27 09:23:53[0m [[32m[1minfo     [0m] [1mhydrating attributes for NotificationNode[0m
[2m2023-06-27 09:23:53[0m [[32m[1minfo     [0m] [1mhydrating attributes for NotificationInteractionNode[0m
[2m2023-06-27 09:23:53[0m [[32m[1minfo     [0m] [1mhydrating graph data[0m
[2m2023-06-27 09:23:53[0m [[32m[1minfo     [0m] [1mchecking for prefix uniqueness[0m
[2m2023-06-27 09:23:53[0m [[32m[1minfo     [0m] [1mrunning filters, clip cols, and annotations for CustomerNode[0m


In [23]:
# 1) reduce
# 2) no reduce on notification table
# 3

In [24]:
gr.parent.df

Unnamed: 0,cust_id,cust_name,no_customer_id,no_id_count,no_id_min,no_id_max,no_id_sum,no_customer_id_min,no_customer_id_max,no_customer_id_sum,...,oe_order_id_sum_sum,oe_event_id_min_min,oe_event_id_min_max,oe_event_id_min_sum,oe_event_id_max_min,oe_event_id_max_max,oe_event_id_max_sum,oe_event_id_sum_min,oe_event_id_sum_max,oe_event_id_sum_sum
0,1,wes,1,6,1,6,21,1,1,6,...,30.0,1.0,11.0,12.0,10.0,20.0,30.0,55.0,155.0,210.0
1,2,john,2,7,7,13,70,2,2,14,...,18.0,21.0,21.0,21.0,26.0,26.0,26.0,141.0,141.0,141.0


In [25]:
len(gr.parent.df)

2

In [26]:
gr.get_children(gr.parent)

[<__main__.NotificationInteractionNode at 0x131356280>,
 <__main__.NotificationNode at 0x131356820>,
 <__main__.OrderProductNode at 0x131356ee0>,
 <__main__.OrderEventNode at 0x1313561c0>,
 <__main__.OrderNode at 0x131356850>]

In [27]:
import networkx as nx

In [28]:
nx.dfs_edges

<function networkx.algorithms.traversal.depth_first_search.dfs_edges(G, source=None, depth_limit=None)>

In [29]:
list(nx.dfs_preorder_nodes(gr, source=gr.parent))

[<__main__.CustomerNode at 0x131356bb0>,
 <__main__.OrderNode at 0x131356850>,
 <__main__.OrderEventNode at 0x1313561c0>,
 <__main__.OrderProductNode at 0x131356ee0>,
 <__main__.NotificationNode at 0x131356820>,
 <__main__.NotificationInteractionNode at 0x131356280>]

In [30]:
order

<__main__.OrderNode at 0x131356850>

In [31]:
gr.get_children(order)

[<__main__.OrderProductNode at 0x131356ee0>,
 <__main__.OrderEventNode at 0x1313561c0>]

In [32]:
gr.plot_graph(fname='mlw_2023.html')

[2m2023-06-27 09:23:53[0m [[32m[1minfo     [0m] [1mplotted graph at mlw_2023.html[0m


In [33]:
# merge children columns dynamically
gr.nodes()

NodeView((<__main__.CustomerNode object at 0x131356bb0>, <__main__.OrderNode object at 0x131356850>, <__main__.OrderEventNode object at 0x1313561c0>, <__main__.OrderProductNode object at 0x131356ee0>, <__main__.NotificationNode object at 0x131356820>, <__main__.NotificationInteractionNode object at 0x131356280>))

In [34]:
gr

<graphreduce.graph_reduce.GraphReduce at 0x1313625b0>

In [35]:
gr.get_children

<bound method GraphReduce.get_children of <graphreduce.graph_reduce.GraphReduce object at 0x1313625b0>>

In [36]:
gr.get_children(order)

[<__main__.OrderProductNode at 0x131356ee0>,
 <__main__.OrderEventNode at 0x1313561c0>]

In [37]:
order.df

Unnamed: 0,order_id,order_customer_id,order_ts,op_order_id,op_id_count,op_id_min,op_id_max,op_id_sum,op_order_id_min,op_order_id_max,...,oe_id_count,oe_id_min,oe_id_max,oe_id_sum,oe_order_id_min,oe_order_id_max,oe_order_id_sum,oe_event_id_min,oe_event_id_max,oe_event_id_sum
0,1,1,2023-05-12,1,4,1,1,4,1,1,...,10.0,1.0,10.0,55.0,1.0,1.0,10.0,1.0,10.0,55.0
1,2,1,2023-06-01,2,4,1,1,4,2,2,...,10.0,11.0,20.0,155.0,2.0,2.0,20.0,11.0,20.0,155.0
2,3,2,2023-01-01,3,4,1,1,4,3,3,...,6.0,21.0,26.0,141.0,3.0,3.0,18.0,21.0,26.0,141.0
3,4,2,2022-08-05,4,4,1,1,4,4,4,...,,,,,,,,,,


In [38]:
gr.get_children(order)[0].df

Unnamed: 0,op_id,op_order_id,op_product_id
0,1,1,1
1,1,1,2
2,1,1,3
3,1,1,4
4,1,2,1
5,1,2,2
6,1,2,3
7,1,2,4
8,1,3,1
9,1,3,2


In [39]:
gr.get_children(order)[0].df.columns


Index(['op_id', 'op_order_id', 'op_product_id'], dtype='object')

In [40]:
gr.get_children(order)[0].df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   op_id          16 non-null     int64
 1   op_order_id    16 non-null     int64
 2   op_product_id  16 non-null     int64
dtypes: int64(3)
memory usage: 512.0 bytes


In [41]:
order.df

Unnamed: 0,order_id,order_customer_id,order_ts,op_order_id,op_id_count,op_id_min,op_id_max,op_id_sum,op_order_id_min,op_order_id_max,...,oe_id_count,oe_id_min,oe_id_max,oe_id_sum,oe_order_id_min,oe_order_id_max,oe_order_id_sum,oe_event_id_min,oe_event_id_max,oe_event_id_sum
0,1,1,2023-05-12,1,4,1,1,4,1,1,...,10.0,1.0,10.0,55.0,1.0,1.0,10.0,1.0,10.0,55.0
1,2,1,2023-06-01,2,4,1,1,4,2,2,...,10.0,11.0,20.0,155.0,2.0,2.0,20.0,11.0,20.0,155.0
2,3,2,2023-01-01,3,4,1,1,4,3,3,...,6.0,21.0,26.0,141.0,3.0,3.0,18.0,21.0,26.0,141.0
3,4,2,2022-08-05,4,4,1,1,4,4,4,...,,,,,,,,,,


In [42]:
order.df.dtypes

order_id               int64
order_customer_id      int64
order_ts              object
op_order_id            int64
op_id_count            int64
op_id_min              int64
op_id_max              int64
op_id_sum              int64
op_order_id_min        int64
op_order_id_max        int64
op_order_id_sum        int64
op_product_id_min      int64
op_product_id_max      int64
op_product_id_sum      int64
oe_order_id          float64
oe_id_count          float64
oe_id_min            float64
oe_id_max            float64
oe_id_sum            float64
oe_order_id_min      float64
oe_order_id_max      float64
oe_order_id_sum      float64
oe_event_id_min      float64
oe_event_id_max      float64
oe_event_id_sum      float64
dtype: object

In [43]:
type_func = {'int' : ['min','max']}
cols = ['a', 'b', 'c']

In [44]:
order

<__main__.OrderNode at 0x131356850>

In [45]:
order.df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4 entries, 0 to 3
Data columns (total 25 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   order_id           4 non-null      int64  
 1   order_customer_id  4 non-null      int64  
 2   order_ts           4 non-null      object 
 3   op_order_id        4 non-null      int64  
 4   op_id_count        4 non-null      int64  
 5   op_id_min          4 non-null      int64  
 6   op_id_max          4 non-null      int64  
 7   op_id_sum          4 non-null      int64  
 8   op_order_id_min    4 non-null      int64  
 9   op_order_id_max    4 non-null      int64  
 10  op_order_id_sum    4 non-null      int64  
 11  op_product_id_min  4 non-null      int64  
 12  op_product_id_max  4 non-null      int64  
 13  op_product_id_sum  4 non-null      int64  
 14  oe_order_id        3 non-null      float64
 15  oe_id_count        3 non-null      float64
 16  oe_id_min          3 non-null 

In [46]:
vals = dict(order.df.dtypes)

In [47]:
vals

{'order_id': dtype('int64'),
 'order_customer_id': dtype('int64'),
 'order_ts': dtype('O'),
 'op_order_id': dtype('int64'),
 'op_id_count': dtype('int64'),
 'op_id_min': dtype('int64'),
 'op_id_max': dtype('int64'),
 'op_id_sum': dtype('int64'),
 'op_order_id_min': dtype('int64'),
 'op_order_id_max': dtype('int64'),
 'op_order_id_sum': dtype('int64'),
 'op_product_id_min': dtype('int64'),
 'op_product_id_max': dtype('int64'),
 'op_product_id_sum': dtype('int64'),
 'oe_order_id': dtype('float64'),
 'oe_id_count': dtype('float64'),
 'oe_id_min': dtype('float64'),
 'oe_id_max': dtype('float64'),
 'oe_id_sum': dtype('float64'),
 'oe_order_id_min': dtype('float64'),
 'oe_order_id_max': dtype('float64'),
 'oe_order_id_sum': dtype('float64'),
 'oe_event_id_min': dtype('float64'),
 'oe_event_id_max': dtype('float64'),
 'oe_event_id_sum': dtype('float64')}

In [48]:
str(vals['order_id'])

'int64'