In [1]:
# First example notebook for the ODSC West 2023 Workshop:
# https://odsc.com/speakers/using-graphs-for-large-feature-engineering-pipelines/

In [2]:
!pip install graphreduce

Collecting pyvis>=0.3.1
  Using cached pyvis-0.3.2-py3-none-any.whl (756 kB)
Collecting structlog>=23.1.0
  Downloading structlog-23.2.0-py3-none-any.whl (62 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.9/62.9 kB[0m [31m415.7 kB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m36m0:00:01[0m
Installing collected packages: structlog, pyvis
  Attempting uninstall: structlog
    Found existing installation: structlog 22.3.0
    Uninstalling structlog-22.3.0:
      Successfully uninstalled structlog-22.3.0
  Attempting uninstall: pyvis
    Found existing installation: pyvis 0.2.1
    Uninstalling pyvis-0.2.1:
      Successfully uninstalled pyvis-0.2.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
entitygraph 0.1 requires pyvis==0.2.1, but you have pyvis 0.3.2 which is incompatible.
entitygraph 0.1 requires structlog==22.3.0, b


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [5]:
!wget https://raw.githubusercontent.com/wesmadrigal/GraphReduce/master/examples/dat/cust.csv

--2023-11-02 07:32:52--  https://raw.githubusercontent.com/wesmadrigal/GraphReduce/master/examples/dat/cust.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 21 [text/plain]
Saving to: ‘cust.csv.6’


2023-11-02 07:32:52 (1.11 MB/s) - ‘cust.csv.6’ saved [21/21]



In [6]:
!wget https://raw.githubusercontent.com/wesmadrigal/GraphReduce/master/examples/dat/orders.csv

--2023-11-02 07:33:08--  https://raw.githubusercontent.com/wesmadrigal/GraphReduce/master/examples/dat/orders.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 159 [text/plain]
Saving to: ‘orders.csv.4’


2023-11-02 07:33:08 (5.23 MB/s) - ‘orders.csv.4’ saved [159/159]



In [8]:
!mkdir dat

mkdir: dat: File exists


In [9]:
# Move the downloaded CSVs into dat/, the directory the nodes' `fpath` values point at.
!mv cust.csv dat/
!mv orders.csv dat/

In [7]:
import datetime

import pandas as pd

from graphreduce.node import GraphReduceNode
from graphreduce.graph_reduce import GraphReduce
from graphreduce.enum import ComputeLayerEnum, PeriodUnit


# Defining a node

In [57]:
class CustomerNode(GraphReduceNode):
    """Customer node; used as the parent node of the graphs built in this notebook.

    Only `do_annotate` does any work. Every other hook is an intentional
    no-op that returns None.
    """

    def do_annotate(self):
        """Add a `name_length` column with the character length of each name.

        Column names are resolved through `self.colabbr`, so with the `cust`
        prefix used in this notebook the new column is `cust_name_length`.
        """
        # Vectorized `.str.len()` replaces `.apply(lambda x: len(x))`: same
        # result for string names, but a missing name stays missing (NaN)
        # instead of raising TypeError from `len()`.
        self.df[self.colabbr('name_length')] = self.df[self.colabbr('name')].str.len()

    def do_filters(self):
        """No-op: no customer rows are filtered out."""
        pass

    def do_normalize(self):
        """No-op: no normalization is applied."""
        pass

    def do_post_join_annotate(self):
        """No-op: nothing is annotated after relations are joined."""
        pass

    def do_reduce(self, reduce_key, *args, **kwargs):
        """No-op: this node defines no aggregation; returns None."""
        pass

    def do_labels(self, reduce_key, *args, **kwargs):
        """No-op: this node defines no labels; returns None."""
        pass

In [58]:
class OrderNode(GraphReduceNode):
    """Order node: aggregates orders up to `reduce_key` for features and labels.

    The annotate / filter / normalize / post-join hooks are intentional
    no-ops; only `do_reduce` and `do_labels` produce data.
    """

    def do_annotate(self):
        """No-op: orders get no extra columns."""

    def do_filters(self):
        """No-op: no order rows are filtered out."""

    def do_normalize(self):
        """No-op: no normalization is applied."""

    def do_post_join_annotate(self):
        """No-op: nothing is annotated after relations are joined."""

    def do_reduce(self, reduce_key):
        """Count orders per `reduce_key` over the rows from `prep_for_features()`.

        Returns a frame with the prefixed `reduce_key` column plus a
        `<pk>_count` column (prefixed via `self.colabbr`).
        """
        feature_rows = self.prep_for_features()
        grouped = feature_rows.groupby(self.colabbr(reduce_key))

        count_col = self.colabbr(f'{self.pk}_count')
        count_agg = pd.NamedAgg(column=self.colabbr(self.pk), aggfunc='count')
        aggregations = {
            count_col: count_agg,
            # Optional extra feature, left disabled:
            # self.colabbr('amount_sum'): pd.NamedAgg(column=self.colabbr('amount'), aggfunc='sum'),
        }
        return grouped.agg(**aggregations).reset_index()

    def do_labels(self, reduce_key):
        """Count orders per `reduce_key` over the rows from `prep_for_labels()`.

        The label column is named `<pk>_had_order` (prefixed via
        `self.colabbr`); note that it holds a count, not a 0/1 flag.
        """
        label_rows = self.prep_for_labels()
        grouped = label_rows.groupby(self.colabbr(reduce_key))

        label_col = self.colabbr(f'{self.pk}_had_order')
        label_agg = pd.NamedAgg(column=self.colabbr(self.pk), aggfunc='count')
        return grouped.agg(**{label_col: label_agg}).reset_index()

# Instantiate the node

In [60]:
# Customer node backed by dat/cust.csv, using the pandas compute layer.
cust = CustomerNode(
    pk='id',  # primary-key column name, before prefixing
    prefix='cust',  # loaded columns show up as cust_<column> (e.g. cust_id)
    fpath='dat/cust.csv',
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
)



In [61]:
# Load the node's data; afterwards cust.df holds the rows with cust_-prefixed columns.
cust.do_data()

In [62]:
cust.df

Unnamed: 0,cust_id,cust_name
0,1,wes
1,2,john


In [63]:
# Order node backed by dat/orders.csv. No date key or period settings yet,
# so no time-based filtering can be applied by this instance.
order = OrderNode(
    pk='id',  # primary-key column name, before prefixing
    prefix='order',  # loaded columns show up as order_<column> (e.g. order_id)
    fpath='dat/orders.csv',
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
)



In [64]:
order.do_data()

In [65]:
order.df

Unnamed: 0,order_id,order_customer_id,order_ts,order_amount
0,1,1,2023-05-12,10.0
1,2,1,2023-06-01,11.5
2,3,2,2023-01-01,100.0
3,4,2,2022-08-05,150.0
4,5,1,2023-07-01,325.0
5,6,2,2023-07-02,23.0
6,7,1,2023-07-14,12000.0


# Run operations


In [66]:
# OrderNode.do_annotate is a no-op (`pass`), so order.df is unchanged by this call.
order.do_annotate()

In [67]:
# pre-annotate
cust.df

Unnamed: 0,cust_id,cust_name
0,1,wes
1,2,john


In [68]:
cust.do_annotate()

In [69]:
cust.df

Unnamed: 0,cust_id,cust_name,cust_name_length
0,1,wes,3
1,2,john,4


# Handling time

In [70]:
len(order.df)

7

In [71]:
len(order.prep_for_features())

7

In [88]:
# The node used so far was created without a date key or any cut-date / period
# settings, which is why prep_for_features() returned all 7 rows above.
# Re-create the node with that information so time-based splitting can happen.
order = OrderNode(
    pk='id',
    prefix='order', 
    fpath='dat/orders.csv', 
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
    # presumably the look-back window for feature rows — confirm in the graphreduce docs
    compute_period_val=365,
    compute_period_unit=PeriodUnit.day,
    # presumably the look-forward window for label rows — confirm in the graphreduce docs
    label_period_val=45,
    label_period_unit=PeriodUnit.day,
    # rows are split into feature rows / label rows around this date (see outputs below)
    cut_date=datetime.datetime(2023, 6, 1),
    # un-prefixed name of the date column (order_ts once prefixed)
    date_key='ts'
)

In [89]:
order.do_data()

In [90]:
print(len(order.df))

7


In [91]:
print(len(order.prep_for_features()))

4


In [92]:
# Feature rows: with cut_date=2023-06-01 only the orders dated on or before the cut date remain.
order.prep_for_features()

Unnamed: 0,order_id,order_customer_id,order_ts,order_amount
0,1,1,2023-05-12,10.0
1,2,1,2023-06-01,11.5
2,3,2,2023-01-01,100.0
3,4,2,2022-08-05,150.0


In [93]:
# Label rows: the orders dated after the cut date (here all fall inside the 45-day label period).
order.prep_for_labels()

Unnamed: 0,order_id,order_customer_id,order_ts,order_amount
4,5,1,2023-07-01,325.0
5,6,2,2023-07-02,23.0
6,7,1,2023-07-14,12000.0


# Adding operations to a node.

In [94]:
# Count orders per customer over the feature rows; 'customer_id' is passed
# un-prefixed and resolved to order_customer_id via colabbr.
order.do_reduce('customer_id')

Unnamed: 0,order_customer_id,order_id_count
0,1,2
1,2,2


In [95]:
# Let's add a sum of the order amount: uncomment the `amount_sum` aggregation in
# OrderNode.do_reduce, re-run the class cell and re-create `order`, then run this.
# NOTE: the output captured below was produced without that change, so it still
# shows only the count column.
order.do_reduce('customer_id')

Unnamed: 0,order_customer_id,order_id_count
0,1,2
1,2,2


# Constructing a graph.

In [148]:
# Fresh nodes for the graph. Cut date and compute/label periods are omitted
# here; they are set once on the GraphReduce object and pushed down to the nodes.
cust = CustomerNode(
    pk='id',
    prefix='cust', 
    fpath='dat/cust.csv', 
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
)

order = OrderNode(
    pk='id',
    prefix='order', 
    fpath='dat/orders.csv', 
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
    # un-prefixed name of the date column (order_ts once prefixed)
    date_key='ts'
)



In [149]:
# Graph rooted at the customer node. The time settings given here are the
# graph-level values that hydrate_graph_attrs() copies onto each node.
gr = GraphReduce(
    name='odsc_first_graph',
    parent_node=cust,  # node that everything else is joined onto
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
    compute_period_val=365,
    compute_period_unit=PeriodUnit.day,
    label_period_val=30,
    label_period_unit=PeriodUnit.day,
    cut_date=datetime.datetime(2023, 6, 12),
    # with labels enabled, the run logs "computed labels" and the result
    # gains the order_id_had_order column
    has_labels=True
)

In [150]:
# show how attribute push down works
# gr.hydrate_graph_attrs

In [151]:
# Register both nodes with the graph; the edge between them is added further down.
gr.add_node(cust)
gr.add_node(order)

In [152]:
# Push the graph-level settings down to the nodes; the next cells show
# order.compute_period_val and order.cut_date now carry the graph's values.
gr.hydrate_graph_attrs()

2023-10-31 10:59:12 [info     ] hydrating attributes for CustomerNode
2023-10-31 10:59:12 [info     ] hydrating attributes for OrderNode


In [153]:
order.compute_period_val

365

In [154]:
order.cut_date

datetime.datetime(2023, 6, 12, 0, 0)

In [155]:
# add an edge

In [156]:
help(gr.add_entity_edge)

Help on method add_entity_edge in module graphreduce.graph_reduce:

add_entity_edge(parent_node: graphreduce.node.GraphReduceNode, relation_node: graphreduce.node.GraphReduceNode, parent_key: str, relation_key: str, relation_type: str = 'parent_child', reduce: bool = True) method of graphreduce.graph_reduce.GraphReduce instance
    Add an entity relation



In [157]:
# Relate orders to customers: cust.id <- order.customer_id (un-prefixed key names).
gr.add_entity_edge(
    parent_node=cust,
    relation_node=order,
    parent_key='id',
    relation_key='customer_id',
    # reduce=True: the order node is reduced (OrderNode.do_reduce) before being joined
    reduce=True
)

In [158]:
gr.plot_graph('odsc_graph.html')

2023-10-31 10:59:14 [info     ] plotted graph at odsc_graph.html


In [159]:
# Run the whole pipeline. Per the log below: hydrate attributes and data, run
# filters / normalize / annotations on each node, traverse depth-first from the
# parent, reduce the order relation, join it onto the parent, compute labels.
gr.do_transformations()

2023-10-31 10:59:14 [info     ] hydrating graph attributes
2023-10-31 10:59:14 [info     ] hydrating attributes for CustomerNode
2023-10-31 10:59:14 [info     ] hydrating attributes for OrderNode
2023-10-31 10:59:14 [info     ] hydrating graph data
2023-10-31 10:59:14 [info     ] checking for prefix uniqueness
2023-10-31 10:59:14 [info     ] running filters, normalize, and annotations for <GraphReduceNode: fpath=dat/cust.csv fmt=csv>
2023-10-31 10:59:14 [info     ] running filters, normalize, and annotations for <GraphReduceNode: fpath=dat/orders.csv fmt=csv>
2023-10-31 10:59:14 [info     ] depth-first traversal through the graph from source: <GraphReduceNode: fpath=dat/cust.csv fmt=csv>
2023-10-31 10:59:14 [info     ] reducing relation <GraphReduceNode: fpath=dat/orders.csv fmt=csv>
2023-10-31 10:59:14 [info     ] joining <GraphReduceNode: fpath=dat/orders.csv fmt=csv> to <GraphReduceNode: fpath=dat/cust.csv fmt=csv>
2023-10-31 10:59:14 [info     ] computed labels for <GraphReduceNode

In [160]:
gr.parent_node.df

Unnamed: 0,cust_id,cust_name,cust_name_length,order_customer_id,order_id_count,order_customer_id_dupe,order_id_had_order
0,1,wes,3,1,2,1,1
1,2,john,4,2,2,2,1


# Constructing a graph without reducing relations.

In [161]:
# Same nodes and graph settings as before, rebuilt from scratch so this
# example does not depend on state left behind by the previous run.
cust = CustomerNode(
    pk='id',
    prefix='cust', 
    fpath='dat/cust.csv', 
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
)

order = OrderNode(
    pk='id',
    prefix='order', 
    fpath='dat/orders.csv', 
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
    date_key='ts'
)
# Note: has_labels is not passed for this graph.
gr = GraphReduce(
    name='odsc_first_graph',
    parent_node=cust,
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
    compute_period_val=365,
    compute_period_unit=PeriodUnit.day,
    label_period_val=30,
    label_period_unit=PeriodUnit.day,
    cut_date=datetime.datetime(2023, 6, 12)
)

gr.add_node(cust)
gr.add_node(order)

# reduce=False: the order node is joined as-is (the run logs "doing nothing
# with relation node"), so the result has one row per order, not per customer.
gr.add_entity_edge(
    parent_node=cust,
    relation_node=order,
    parent_key='id',
    relation_key='customer_id',
    reduce=False
)



In [162]:
gr.do_transformations()

2023-10-31 10:59:31 [info     ] hydrating graph attributes
2023-10-31 10:59:31 [info     ] hydrating attributes for CustomerNode
2023-10-31 10:59:31 [info     ] hydrating attributes for OrderNode
2023-10-31 10:59:31 [info     ] hydrating graph data
2023-10-31 10:59:31 [info     ] checking for prefix uniqueness
2023-10-31 10:59:31 [info     ] running filters, normalize, and annotations for <GraphReduceNode: fpath=dat/cust.csv fmt=csv>
2023-10-31 10:59:31 [info     ] running filters, normalize, and annotations for <GraphReduceNode: fpath=dat/orders.csv fmt=csv>
2023-10-31 10:59:31 [info     ] depth-first traversal through the graph from source: <GraphReduceNode: fpath=dat/cust.csv fmt=csv>
2023-10-31 10:59:31 [info     ] doing nothing with relation node <GraphReduceNode: fpath=dat/orders.csv fmt=csv>
2023-10-31 10:59:31 [info     ] joining <GraphReduceNode: fpath=dat/orders.csv fmt=csv> to <GraphReduceNode: fpath=dat/cust.csv fmt=csv>


In [163]:
gr.parent_node.df

Unnamed: 0,cust_id,cust_name,cust_name_length,order_id,order_customer_id,order_ts,order_amount
0,1,wes,3,1,1,2023-05-12,10.0
1,1,wes,3,2,1,2023-06-01,11.5
2,1,wes,3,5,1,2023-07-01,325.0
3,1,wes,3,7,1,2023-07-14,12000.0
4,2,john,4,3,2,2023-01-01,100.0
5,2,john,4,4,2,2022-08-05,150.0
6,2,john,4,6,2,2023-07-02,23.0


# Constructing a graph and automating feature generation.

In [164]:
# Same nodes again, rebuilt from scratch for the automated-features example.
cust = CustomerNode(
    pk='id',
    prefix='cust', 
    fpath='dat/cust.csv', 
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
)

order = OrderNode(
    pk='id',
    prefix='order', 
    fpath='dat/orders.csv', 
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
    date_key='ts'
)
gr = GraphReduce(
    name='odsc_first_graph',
    parent_node=cust,
    fmt='csv',
    compute_layer=ComputeLayerEnum.pandas,
    compute_period_val=365,
    compute_period_unit=PeriodUnit.day,
    label_period_val=30,
    label_period_unit=PeriodUnit.day,
    cut_date=datetime.datetime(2023, 6, 12),
    # With this flag the run logs "doing dynamic propagation" and the result
    # gains generated columns (order_*_min / _max / _sum, order_ts_first)
    # in addition to the hand-written order_id_count.
    dynamic_propagation=True,
    has_labels=True
)

gr.add_node(cust)
gr.add_node(order)

# reduce=True: orders are aggregated to one row per customer before the join.
gr.add_entity_edge(
    parent_node=cust,
    relation_node=order,
    parent_key='id',
    relation_key='customer_id',
    reduce=True
)



In [165]:
gr.do_transformations()

2023-10-31 11:00:02 [info     ] hydrating graph attributes
2023-10-31 11:00:02 [info     ] hydrating attributes for CustomerNode
2023-10-31 11:00:02 [info     ] hydrating attributes for OrderNode
2023-10-31 11:00:02 [info     ] hydrating graph data
2023-10-31 11:00:02 [info     ] checking for prefix uniqueness
2023-10-31 11:00:02 [info     ] running filters, normalize, and annotations for <GraphReduceNode: fpath=dat/cust.csv fmt=csv>
2023-10-31 11:00:02 [info     ] running filters, normalize, and annotations for <GraphReduceNode: fpath=dat/orders.csv fmt=csv>
2023-10-31 11:00:02 [info     ] depth-first traversal through the graph from source: <GraphReduceNode: fpath=dat/cust.csv fmt=csv>
2023-10-31 11:00:02 [info     ] reducing relation <GraphReduceNode: fpath=dat/orders.csv fmt=csv>
2023-10-31 11:00:02 [info     ] doing dynamic propagation on node <GraphReduceNode: fpath=dat/orders.csv fmt=csv>
2023-10-31 11:00:02 [info     ] joining <GraphReduceNode: fpath=dat/orders.csv fmt=csv> to 

In [166]:
gr.parent_node.df

Unnamed: 0,cust_id,cust_name,cust_name_length,order_customer_id,order_id_count,order_id_min,order_id_max,order_id_sum,order_customer_id_min,order_customer_id_max,order_customer_id_sum,order_ts_first,order_amount_min,order_amount_max,order_amount_sum,order_customer_id_dupe,order_id_had_order
0,1,wes,3,1,2,1,2,3,1,1,2,2023-05-12,10.0,11.5,21.5,1,1
1,2,john,4,2,2,3,4,7,2,2,4,2023-01-01,100.0,150.0,250.0,2,1
