In [1]:
# Third example notebook for the ODSC West 2023 Workshop:
# https://odsc.com/speakers/using-graphs-for-large-feature-engineering-pipelines/

In [1]:
!pip install graphreduce


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
!wget https://raw.githubusercontent.com/wesmadrigal/GraphReduce/master/examples/dat/orders.csv
!wget https://raw.githubusercontent.com/wesmadrigal/GraphReduce/master/examples/dat/cust.csv
!mkdir dat
!mv cust.csv dat/
!mv orders.csv dat/

--2023-11-02 13:46:07--  https://raw.githubusercontent.com/wesmadrigal/GraphReduce/master/examples/dat/orders.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 159 [text/plain]
Saving to: ‘orders.csv.5’


2023-11-02 13:46:07 (5.05 MB/s) - ‘orders.csv.5’ saved [159/159]

--2023-11-02 13:46:07--  https://raw.githubusercontent.com/wesmadrigal/GraphReduce/master/examples/dat/cust.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 21 [text/plain]
Saving to: ‘cust.csv’


2023-11-02 13:46:07 (932 KB/s) - ‘cust.csv’ saved [21/21]

mkdi

In [11]:
import datetime

import pandas as pd

from graphreduce.node import GraphReduceNode
from graphreduce.graph_reduce import GraphReduce
from graphreduce.enum import StorageFormatEnum, ProviderEnum, ComputeLayerEnum, PeriodUnit
from graphreduce.storage import StorageClient
from graphreduce.context import method_requires

In [12]:
# Larger graphs with storage checkpointing (beta)


In [13]:
class CustomerNode(GraphReduceNode):
    """Customer node of the example graph (used as the graph's parent node).

    Only ``do_annotate`` does any work; every other hook is a deliberate no-op.
    """

    @method_requires(checkpoint=True)
    def do_annotate(self):
        """Add a prefixed ``name_length`` column with the length of each name.

        The method is decorated with ``method_requires(checkpoint=True)``; in
        this notebook's run its result appears as
        ``CustomerNode_do_annotate.csv`` under the storage client's offload root.
        """
        # Vectorised string length. Unlike ``apply(len)``, a missing name
        # yields NaN here instead of raising a TypeError.
        self.df[self.colabbr('name_length')] = self.df[self.colabbr('name')].str.len()

    def do_filters(self):
        """No filtering is applied to customers."""
        pass

    def do_normalize(self):
        """No normalization is applied to customers."""
        pass

    def do_post_join_annotate(self):
        """No post-join annotations are added."""
        pass

    def do_reduce(self, reduce_key, *args, **kwargs):
        """No reduction is defined for this node; returns ``None``."""
        pass

    def do_labels(self, reduce_key, *args, **kwargs):
        """No labels are defined for this node; returns ``None``."""
        pass
  

class OrderNode(GraphReduceNode):
    """Order node: filters orders by amount and aggregates them per reduce key."""

    def do_annotate(self):
        """No annotations are added to orders."""
        pass

    @method_requires(checkpoint=True)
    def do_filters(self):
        """Keep only the rows whose amount is strictly between 0 and 1000."""
        amount = self.df[self.colabbr('amount')]
        in_range = (amount < 1000) & (amount > 0)
        self.df = self.df[in_range]

    def do_normalize(self):
        """No normalization is applied to orders."""
        pass

    def do_post_join_annotate(self):
        """No post-join annotations are added."""
        pass

    @method_requires(checkpoint=True)
    def do_reduce(self, reduce_key):
        """Aggregate the feature rows per ``reduce_key``.

        Returns a frame with one row per key holding the count of primary keys
        and the sum of the amount column; the key is restored as a regular
        column via ``reset_index``.
        """
        grouped = self.prep_for_features().groupby(self.colabbr(reduce_key))
        aggregations = {
            self.colabbr(f'{self.pk}_count'): pd.NamedAgg(
                column=self.colabbr(self.pk),
                aggfunc='count',
            ),
            self.colabbr('amount_sum'): pd.NamedAgg(
                column=self.colabbr('amount'),
                aggfunc='sum',
            ),
        }
        return grouped.agg(**aggregations).reset_index()

    def do_labels(self, reduce_key):
        """Aggregate the label rows per ``reduce_key``.

        Returns a frame with one row per key and a ``<pk>_had_order`` column
        that holds the count of primary keys in the label rows.
        """
        grouped = self.prep_for_labels().groupby(self.colabbr(reduce_key))
        label_aggregations = {
            self.colabbr(f'{self.pk}_had_order'): pd.NamedAgg(
                column=self.colabbr(self.pk),
                aggfunc='count',
            ),
        }
        return grouped.agg(**label_aggregations).reset_index()

In [14]:
# Display the class object (it is defined in this notebook's __main__ namespace).
CustomerNode

__main__.CustomerNode

In [15]:
# Display the class object (it is defined in this notebook's __main__ namespace).
OrderNode

__main__.OrderNode

In [16]:
#!mkdir /tmp/graphreduce

In [17]:
# Local storage client: checkpointed outputs are offloaded as CSV files under
# /tmp/graphreduce, with pandas as the compute layer.
storage_client = StorageClient(
    offload_root='/tmp/graphreduce',
    compute_layer=ComputeLayerEnum.pandas,
    storage_format=StorageFormatEnum.csv,
    provider=ProviderEnum.local,
)

In [18]:
# One node per CSV file. `prefix` is the column-name prefix seen in the
# checkpoint outputs (e.g. cust_id, order_amount_sum).
cust = CustomerNode(
    pk='id',
    prefix='cust',
    fpath='dat/cust.csv',
    fmt='csv',
)
order = OrderNode(
    pk='id',
    prefix='order',
    fpath='dat/orders.csv',
    fmt='csv',
    date_key='ts',
)




In [19]:
gr = GraphReduce(
    name='odsc_example_3',
    parent_node=cust,
    # Execution / persistence back ends.
    compute_layer=ComputeLayerEnum.pandas,
    storage_client=storage_client,
    # Feature window: 365 days, presumably measured relative to cut_date.
    cut_date=datetime.datetime(2023, 7, 6),
    compute_period_val=365,
    compute_period_unit=PeriodUnit.day,
    # Labels are switched off; a 45-day label period is still supplied.
    has_labels=False,
    label_period_val=45,
    label_period_unit=PeriodUnit.day,
)

In [20]:
# Register the customer node (the graph's parent_node) with the graph.
gr.add_node(cust)

In [21]:
# Register the order node with the graph.
gr.add_node(order)

In [22]:
# Link orders to customers: order.customer_id references cust.id.
# With reduce=True the order relation is reduced before it is joined onto the
# parent (the do_transformations log shows "reducing relation" then "joining").
gr.add_entity_edge(
    relation_node=order,
    relation_key='customer_id',
    parent_node=cust,
    parent_key='id',
    reduce=True,
)

In [23]:
# Run the pipeline. Per the log output: hydrate the graph attributes and data,
# run filters/normalize/annotations on each node, then reduce the orders
# relation and join it onto the customer node.
gr.do_transformations()

2023-10-30 15:57:57 [info     ] hydrating graph attributes
2023-10-30 15:57:57 [info     ] hydrating attributes for CustomerNode
2023-10-30 15:57:57 [info     ] hydrating attributes for OrderNode
2023-10-30 15:57:57 [info     ] hydrating graph data
2023-10-30 15:57:57 [info     ] checking for prefix uniqueness
2023-10-30 15:57:57 [info     ] running filters, normalize, and annotations for <GraphReduceNode: fpath=dat/cust.csv fmt=csv>
2023-10-30 15:57:57 [info     ] running filters, normalize, and annotations for <GraphReduceNode: fpath=dat/orders.csv fmt=csv>
2023-10-30 15:57:57 [info     ] depth-first traversal through the graph from source: <GraphReduceNode: fpath=dat/cust.csv fmt=csv>
2023-10-30 15:57:57 [info     ] reducing relation <GraphReduceNode: fpath=dat/orders.csv fmt=csv>
2023-10-30 15:57:57 [info     ] joining <GraphReduceNode: fpath=dat/orders.csv fmt=csv> to <GraphReduceNode: fpath=dat/cust.csv fmt=csv>


In [24]:
# List the checkpoint files written under the storage client's offload root.
!ls /tmp/graphreduce

CustomerNode_do_annotate.csv OrderNode_do_reduce.csv
OrderNode_do_filters.csv


In [25]:
# Inspect the checkpoint file written for OrderNode.do_reduce.
!cat /tmp/graphreduce/OrderNode_do_reduce.csv

order_customer_id,order_id_count,order_amount_sum
1,3,346.5
2,3,273.0


In [26]:
# Inspect the checkpoint file written for CustomerNode.do_annotate.
!cat /tmp/graphreduce/CustomerNode_do_annotate.csv

cust_id,cust_name,cust_name_length
1,wes,3
2,john,4
