In [114]:
# First example notebook for the ODSC West 2023 Workshop:
# https://odsc.com/speakers/using-graphs-for-large-feature-engineering-pipelines/

In [188]:
import datetime

import pandas as pd

from graphreduce.node import GraphReduceNode
from graphreduce.graph_reduce import GraphReduce
from graphreduce.enum import ComputeLayerEnum, PeriodUnit


# defining a node

In [189]:
class CustomerNode(GraphReduceNode):
    """Customer entity node, used as the parent node of the example graphs.

    Only ``do_annotate`` does any work; every other hook is a deliberate
    no-op that returns ``None``.
    """

    def do_annotate(self):
        """Add a ``name_length`` column with the character count of each name.

        Column names go through ``self.colabbr`` so they carry the node prefix
        (the resulting column is displayed as ``cust_name_length`` in this
        notebook).
        """
        # `.str.len()` is the vectorized pandas equivalent of applying `len`
        # row by row, and it yields NaN for a missing name instead of raising
        # TypeError the way `len(float('nan'))` does.
        self.df[self.colabbr('name_length')] = self.df[self.colabbr('name')].str.len()

    def do_filters(self):
        """No row filtering is applied to customers."""

    def do_normalize(self):
        """No normalization is applied to customers."""

    def do_post_join_annotate(self):
        """No annotations are added after relations are joined."""

    def do_reduce(self, reduce_key, *args, **kwargs):
        """Customers are never aggregated in this notebook; returns ``None``."""

    def do_labels(self, reduce_key, *args, **kwargs):
        """Customers produce no labels of their own; returns ``None``."""

In [212]:
class OrderNode(GraphReduceNode):
    """Order entity node: a child relation aggregated up to its customer.

    ``do_reduce`` builds the feature aggregates and ``do_labels`` builds the
    label aggregate; the remaining hooks are deliberate no-ops.
    """

    def do_annotate(self):
        """No per-row annotations are added to orders."""

    def do_filters(self):
        """No row filtering is applied to orders."""

    def do_normalize(self):
        """No normalization is applied to orders."""

    def do_post_join_annotate(self):
        """No annotations are added after relations are joined."""

    def do_reduce(self, reduce_key, *args, **kwargs):
        """Aggregate the feature rows to one row per ``reduce_key`` value.

        Args:
            reduce_key: Un-prefixed name of the column to group by
                (``'customer_id'`` in this notebook).
            *args, **kwargs: Unused; accepted so the signature matches
                ``CustomerNode.do_reduce``.

        Returns:
            A DataFrame holding the prefixed key column, a ``<pk>_count``
            column (count of primary keys) and an ``amount_sum`` column
            (sum of order amounts), both prefixed via ``self.colabbr``.
        """
        # Only the rows returned by prep_for_features() feed the aggregates.
        return self.prep_for_features().groupby(self.colabbr(reduce_key)).agg(
            **{
                self.colabbr(f'{self.pk}_count'): pd.NamedAgg(column=self.colabbr(self.pk), aggfunc='count'),
                self.colabbr('amount_sum'): pd.NamedAgg(column=self.colabbr('amount'), aggfunc='sum'),
            }
        ).reset_index()

    def do_labels(self, reduce_key, *args, **kwargs):
        """Aggregate the label rows to one row per ``reduce_key`` value.

        Args:
            reduce_key: Un-prefixed name of the column to group by.
            *args, **kwargs: Unused; accepted so the signature matches
                ``CustomerNode.do_labels``.

        Returns:
            A DataFrame holding the prefixed key column and a
            ``<pk>_had_order`` column. Despite its name, that column is a
            count of the orders returned by ``prep_for_labels()``, not a
            boolean flag.
        """
        return self.prep_for_labels().groupby(self.colabbr(reduce_key)).agg(
            **{
                self.colabbr(f'{self.pk}_had_order'): pd.NamedAgg(column=self.colabbr(self.pk), aggfunc='count'),
            }
        ).reset_index()

# Instantiate the node

In [191]:
# Customer node backed by a CSV file and computed with pandas.
# `pk` is the un-prefixed primary-key column; `prefix` is what appears in
# front of every column once the data is loaded (cust_id, cust_name in the
# frame displayed a few cells further down).
cust = CustomerNode(
    pk='id',
    prefix='cust',
    fpath='dat/cust.csv',
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
)



In [192]:
cust.do_data()

In [193]:
cust.df

Unnamed: 0,cust_id,cust_name
0,1,wes
1,2,john


In [194]:
# Order node backed by a CSV file and computed with pandas.
# No date key, cut date or periods are passed here, so this first version of
# the node carries no time information (see the "Handling time" section).
order = OrderNode(
    pk='id',
    prefix='order', 
    fpath='dat/orders.csv', 
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
)



In [195]:
order.do_data()

In [196]:
order.df

Unnamed: 0,order_id,order_customer_id,order_ts,order_amount
0,1,1,2023-05-12,10.0
1,2,1,2023-06-01,11.5
2,3,2,2023-01-01,100.0
3,4,2,2022-08-05,150.0
4,5,1,2023-07-01,325.0
5,6,2,2023-07-02,23.0
6,7,1,2023-07-14,12000.0


# Run operations


In [197]:
# OrderNode.do_annotate is defined as a no-op (`pass`), so this call leaves
# order.df unchanged; contrast with cust.do_annotate() below.
order.do_annotate()

In [198]:
# pre-annotate
cust.df

Unnamed: 0,cust_id,cust_name
0,1,wes
1,2,john


In [199]:
cust.do_annotate()

In [200]:
cust.df

Unnamed: 0,cust_id,cust_name,cust_name_length
0,1,wes,3
1,2,john,4


# Handling time

In [201]:
len(order.df)

7

In [202]:
# With no date key or cut date on the node, prep_for_features() returns the
# same number of rows as order.df (7), i.e. nothing is filtered out.
len(order.prep_for_features())

7

In [213]:
# Above we didn't provide a date key or date information, so no rows were
# filtered. Re-create the node with a date key, a cut date, and the
# compute (feature) and label periods so the time windows take effect.
order = OrderNode(
    pk='id',
    prefix='order', 
    fpath='dat/orders.csv', 
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
    compute_period_val=365,
    compute_period_unit=PeriodUnit.day,
    label_period_val=30,
    label_period_unit=PeriodUnit.day,
    cut_date=datetime.datetime(2023, 6, 1),
    date_key='ts'
)

In [214]:
order.do_data()

In [215]:
# Total rows loaded for the order node, before any time-window filtering.
# Written as a bare expression rather than print(): the notebook already
# displays the value of a cell's last expression, as the earlier
# `len(order.df)` cell does.
len(order.df)

7


In [216]:
# Rows that fall inside the feature window now that the node has a date key
# and cut date. Written as a bare expression rather than print(), matching
# the earlier `len(order.prep_for_features())` cell.
len(order.prep_for_features())

4


In [217]:
# Feature rows for cut_date=2023-06-01 and a 365-day compute period.
# The order dated exactly on the cut date (2023-06-01) is included; the three
# July 2023 orders are not.
order.prep_for_features()

Unnamed: 0,order_id,order_customer_id,order_ts,order_amount
0,1,1,2023-05-12,10.0
1,2,1,2023-06-01,11.5
2,3,2,2023-01-01,100.0
3,4,2,2022-08-05,150.0


In [218]:
# Label rows for cut_date=2023-06-01 and a 30-day label period.
# Only the 2023-07-01 order (30 days after the cut date) is returned; the
# 2023-07-02 and 2023-07-14 orders fall outside the window.
order.prep_for_labels()

Unnamed: 0,order_id,order_customer_id,order_ts,order_amount
4,5,1,2023-07-01,325.0


# Adding operations to a node.

In [209]:
# NOTE: the output shown for this cell (order_id_count only) was captured
# with an earlier version of OrderNode.do_reduce -- this cell's execution
# count (209) predates the class cell (212). With the class as defined in
# this notebook, a fresh run also returns order_amount_sum, as in the next cell.
order.do_reduce('customer_id')

Unnamed: 0,order_customer_id,order_id_count
0,1,2
1,2,2


In [219]:
# let's add a sum of the order amount
order.do_reduce('customer_id')

Unnamed: 0,order_customer_id,order_id_count,order_amount_sum
0,1,2,21.5
1,2,2,250.0


# Constructing a graph.

In [220]:
help(GraphReduce)

Help on class GraphReduce in module graphreduce.graph_reduce:

class GraphReduce(networkx.classes.digraph.DiGraph)
 |  GraphReduce(name: str = 'graph_reduce', parent_node: Optional[graphreduce.node.GraphReduceNode] = None, fmt: str = 'parquet', compute_layer: graphreduce.enum.ComputeLayerEnum = None, cut_date: datetime.datetime = datetime.datetime(2023, 10, 25, 16, 41, 54, 934704), compute_period_val: Union[int, float] = 365, compute_period_unit: graphreduce.enum.PeriodUnit = <PeriodUnit.day: 'day'>, has_labels: bool = False, label_period_val: Union[int, float, NoneType] = None, label_period_unit: Optional[graphreduce.enum.PeriodUnit] = None, spark_sqlctx: pyspark.sql.context.SQLContext = None, feature_function: Optional[str] = None, dynamic_propagation: bool = False, type_func_map: Dict[str, List[str]] = {'int64': ['min', 'max', 'sum'], 'str': ['first'], 'object': ['first'], 'float64': ['min', 'max', 'sum'], 'bool': ['first'], 'datetime64': ['first']}, storage_client: Optional[graphre

In [282]:
# Fresh nodes for the graph. Neither node is given a cut date or periods:
# those are set once on the GraphReduce object and pushed down to the nodes
# (after hydration, order.compute_period_val and order.cut_date show the
# graph's values in the cells below).
cust = CustomerNode(
    pk='id',
    prefix='cust', 
    fpath='dat/cust.csv', 
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
)

# The order node only needs to say which column holds its timestamp.
order = OrderNode(
    pk='id',
    prefix='order', 
    fpath='dat/orders.csv', 
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
    date_key='ts'
)



In [283]:
# The graph owns the time configuration for every node: a 365-day feature
# window and a 30-day label window around cut_date=2023-06-12.
# `parent_node` is the node whose frame receives the joined results
# (read back later as gr.parent_node.df).
# has_labels=True: the run's log ends with "computed labels for ..." and the
# result frame contains the order_id_had_order column from OrderNode.do_labels.
gr = GraphReduce(
    name='odsc_first_graph',
    parent_node=cust,
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
    compute_period_val=365,
    compute_period_unit=PeriodUnit.day,
    label_period_val=30,
    label_period_unit=PeriodUnit.day,
    cut_date=datetime.datetime(2023, 6, 12),
    has_labels=True
)

In [284]:
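# Attribute push-down: gr.hydrate_graph_attrs() copies the graph-level cut
# date and compute/label periods onto every node in the graph. It is called
# two cells below, after the nodes have been added, and its effect is checked
# via order.compute_period_val and order.cut_date.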

In [285]:
# Register both nodes with the graph; the relationship between them is
# declared separately with add_entity_edge.
gr.add_node(cust)
gr.add_node(order)

In [286]:
# Push the graph-level attributes down to each registered node (one log line
# per node). The order node was created without a cut date or periods; the
# next two cells show it now reports the graph's values.
gr.hydrate_graph_attrs()

2023-10-25 18:36:12 [info     ] hydrating attributes for CustomerNode
2023-10-25 18:36:12 [info     ] hydrating attributes for OrderNode


In [287]:
order.compute_period_val

365

In [288]:
order.cut_date

datetime.datetime(2023, 6, 12, 0, 0)

In [289]:
# add an edge

In [290]:
help(gr.add_entity_edge)

Help on method add_entity_edge in module graphreduce.graph_reduce:

add_entity_edge(parent_node: graphreduce.node.GraphReduceNode, relation_node: graphreduce.node.GraphReduceNode, parent_key: str, relation_key: str, relation_type: str = 'parent_child', reduce: bool = True) method of graphreduce.graph_reduce.GraphReduce instance
    Add an entity relation



In [291]:
# Relate orders to customers: cust.id <-> order.customer_id (un-prefixed names).
# reduce=True: the log reports "reducing relation" for the order node before
# the join, and the result has one row per customer with the aggregate
# columns from OrderNode.do_reduce.
gr.add_entity_edge(
    parent_node=cust,
    relation_node=order,
    parent_key='id',
    relation_key='customer_id',
    reduce=True
)

In [292]:
# Run the whole pipeline. Per the log below, in order: hydrate attributes,
# load data, check prefix uniqueness, run filters/normalize/annotations on
# each node, traverse depth-first from the parent node, reduce the order
# relation, join it to the customer node, and compute labels.
gr.do_transformations()

2023-10-25 18:36:15 [info     ] hydrating graph attributes
2023-10-25 18:36:15 [info     ] hydrating attributes for CustomerNode
2023-10-25 18:36:15 [info     ] hydrating attributes for OrderNode
2023-10-25 18:36:15 [info     ] hydrating graph data
2023-10-25 18:36:15 [info     ] checking for prefix uniqueness
2023-10-25 18:36:15 [info     ] running filters, normalize, and annotations for <GraphReduceNode: fpath=dat/cust.csv fmt=csv>
2023-10-25 18:36:15 [info     ] running filters, normalize, and annotations for <GraphReduceNode: fpath=dat/orders.csv fmt=csv>
2023-10-25 18:36:15 [info     ] depth-first traversal through the graph from source: <GraphReduceNode: fpath=dat/cust.csv fmt=csv>
2023-10-25 18:36:15 [info     ] reducing relation <GraphReduceNode: fpath=dat/orders.csv fmt=csv>
2023-10-25 18:36:15 [info     ] joining <GraphReduceNode: fpath=dat/orders.csv fmt=csv> to <GraphReduceNode: fpath=dat/cust.csv fmt=csv>
2023-10-25 18:36:15 [info     ] computed labels for <GraphReduceNode

In [293]:
gr.parent_node.df

Unnamed: 0,cust_id,cust_name,cust_name_length,order_customer_id,order_id_count,order_amount_sum,order_customer_id_dupe,order_id_had_order
0,1,wes,3,1,2,21.5,1,1
1,2,john,4,2,2,250.0,2,1


# Constructing a graph without reducing relations.

In [268]:
# Same two nodes as in the previous section, rebuilt so this example starts
# from unmodified data.
cust = CustomerNode(
    pk='id',
    prefix='cust', 
    fpath='dat/cust.csv', 
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
)

order = OrderNode(
    pk='id',
    prefix='order', 
    fpath='dat/orders.csv', 
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
    date_key='ts'
)
# has_labels is not passed this time; the signature printed by
# help(GraphReduce) shows its default is False, and the log of this run has
# no "computed labels" line.
gr = GraphReduce(
    name='odsc_first_graph',
    parent_node=cust,
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
    compute_period_val=365,
    compute_period_unit=PeriodUnit.day,
    label_period_val=30,
    label_period_unit=PeriodUnit.day,
    cut_date=datetime.datetime(2023, 6, 12)
)

gr.add_node(cust)
gr.add_node(order)

# reduce=False: the log reports "doing nothing with relation node" and the
# orders are joined un-aggregated, so the result has one row per order
# (7 rows) rather than one row per customer.
gr.add_entity_edge(
    parent_node=cust,
    relation_node=order,
    parent_key='id',
    relation_key='customer_id',
    reduce=False
)



In [269]:
gr.do_transformations()

2023-10-25 18:35:24 [info     ] hydrating graph attributes
2023-10-25 18:35:24 [info     ] hydrating attributes for CustomerNode
2023-10-25 18:35:24 [info     ] hydrating attributes for OrderNode
2023-10-25 18:35:24 [info     ] hydrating graph data
2023-10-25 18:35:24 [info     ] checking for prefix uniqueness
2023-10-25 18:35:24 [info     ] running filters, normalize, and annotations for <GraphReduceNode: fpath=dat/cust.csv fmt=csv>
2023-10-25 18:35:24 [info     ] running filters, normalize, and annotations for <GraphReduceNode: fpath=dat/orders.csv fmt=csv>
2023-10-25 18:35:24 [info     ] depth-first traversal through the graph from source: <GraphReduceNode: fpath=dat/cust.csv fmt=csv>
2023-10-25 18:35:24 [info     ] doing nothing with relation node <GraphReduceNode: fpath=dat/orders.csv fmt=csv>
2023-10-25 18:35:24 [info     ] joining <GraphReduceNode: fpath=dat/orders.csv fmt=csv> to <GraphReduceNode: fpath=dat/cust.csv fmt=csv>


In [270]:
gr.parent_node.df

Unnamed: 0,cust_id,cust_name,cust_name_length,order_id,order_customer_id,order_ts,order_amount
0,1,wes,3,1,1,2023-05-12,10.0
1,1,wes,3,2,1,2023-06-01,11.5
2,1,wes,3,5,1,2023-07-01,325.0
3,1,wes,3,7,1,2023-07-14,12000.0
4,2,john,4,3,2,2023-01-01,100.0
5,2,john,4,4,2,2022-08-05,150.0
6,2,john,4,6,2,2023-07-02,23.0


# Constructing a graph and automating feature generation.

In [294]:
help(GraphReduce)

Help on class GraphReduce in module graphreduce.graph_reduce:

class GraphReduce(networkx.classes.digraph.DiGraph)
 |  GraphReduce(name: str = 'graph_reduce', parent_node: Optional[graphreduce.node.GraphReduceNode] = None, fmt: str = 'parquet', compute_layer: graphreduce.enum.ComputeLayerEnum = None, cut_date: datetime.datetime = datetime.datetime(2023, 10, 25, 16, 41, 54, 934704), compute_period_val: Union[int, float] = 365, compute_period_unit: graphreduce.enum.PeriodUnit = <PeriodUnit.day: 'day'>, has_labels: bool = False, label_period_val: Union[int, float, NoneType] = None, label_period_unit: Optional[graphreduce.enum.PeriodUnit] = None, spark_sqlctx: pyspark.sql.context.SQLContext = None, feature_function: Optional[str] = None, dynamic_propagation: bool = False, type_func_map: Dict[str, List[str]] = {'int64': ['min', 'max', 'sum'], 'str': ['first'], 'object': ['first'], 'float64': ['min', 'max', 'sum'], 'bool': ['first'], 'datetime64': ['first']}, storage_client: Optional[graphre

In [301]:
# Same two nodes again, rebuilt so this example starts from unmodified data.
cust = CustomerNode(
    pk='id',
    prefix='cust', 
    fpath='dat/cust.csv', 
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
)

order = OrderNode(
    pk='id',
    prefix='order', 
    fpath='dat/orders.csv', 
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
    date_key='ts'
)
# dynamic_propagation=True: the log reports "doing dynamic propagation" on
# the order node, and the result gains generated aggregates (min/max/sum for
# numeric columns, first for order_ts) next to the hand-written
# OrderNode.do_reduce columns. These match the default type_func_map printed
# by help(GraphReduce). A generated column whose name collides with a
# hand-written one appears with a `_dupe` suffix (order_amount_sum_dupe).
gr = GraphReduce(
    name='odsc_first_graph',
    parent_node=cust,
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
    compute_period_val=365,
    compute_period_unit=PeriodUnit.day,
    label_period_val=30,
    label_period_unit=PeriodUnit.day,
    cut_date=datetime.datetime(2023, 6, 12),
    dynamic_propagation=True,
    has_labels=True
)

gr.add_node(cust)
gr.add_node(order)

# reduce=True: orders are aggregated to one row per customer before the join.
gr.add_entity_edge(
    parent_node=cust,
    relation_node=order,
    parent_key='id',
    relation_key='customer_id',
    reduce=True
)



In [302]:
gr.do_transformations()

2023-10-25 18:37:40 [info     ] hydrating graph attributes
2023-10-25 18:37:40 [info     ] hydrating attributes for CustomerNode
2023-10-25 18:37:40 [info     ] hydrating attributes for OrderNode
2023-10-25 18:37:40 [info     ] hydrating graph data
2023-10-25 18:37:40 [info     ] checking for prefix uniqueness
2023-10-25 18:37:40 [info     ] running filters, normalize, and annotations for <GraphReduceNode: fpath=dat/cust.csv fmt=csv>
2023-10-25 18:37:40 [info     ] running filters, normalize, and annotations for <GraphReduceNode: fpath=dat/orders.csv fmt=csv>
2023-10-25 18:37:40 [info     ] depth-first traversal through the graph from source: <GraphReduceNode: fpath=dat/cust.csv fmt=csv>
2023-10-25 18:37:40 [info     ] reducing relation <GraphReduceNode: fpath=dat/orders.csv fmt=csv>
2023-10-25 18:37:40 [info     ] doing dynamic propagation on node <GraphReduceNode: fpath=dat/orders.csv fmt=csv>
2023-10-25 18:37:40 [info     ] joining <GraphReduceNode: fpath=dat/orders.csv fmt=csv> to 

In [303]:
gr.parent_node.df

Unnamed: 0,cust_id,cust_name,cust_name_length,order_customer_id,order_id_count,order_amount_sum,order_id_min,order_id_max,order_id_sum,order_customer_id_min,order_customer_id_max,order_customer_id_sum,order_ts_first,order_amount_min,order_amount_max,order_amount_sum_dupe,order_customer_id_dupe,order_id_had_order
0,1,wes,3,1,2,21.5,1,2,3,1,1,2,2023-05-12,10.0,11.5,21.5,1,1
1,2,john,4,2,2,250.0,3,4,7,2,2,4,2023-01-01,100.0,150.0,250.0,2,1
