# Many node graph (5 nodes, 7 source tables)
## Still using `sqlite` dialect

In [1]:
import pandas as pd
import sqlite3
import json
import os
import typing


# examples for using SQL engines and dialects
from graphreduce.node import SQLNode
from graphreduce.graph_reduce import GraphReduce
from graphreduce.enum import SQLOpType, ComputeLayerEnum
from graphreduce.models import sqlop



In [2]:
# Show which CSV files (and any leftover database) are present in the data directory.
!ls dat/

cust.csv                      order_events.csv
cust.db                       order_products.csv
notification_interactions.csv orders.csv
notifications.csv             products.csv


In [3]:
# Start from a clean slate: delete any database file left over from a previous run.
# A guarded os.remove is used instead of `!rm` so the cell is portable and does not
# emit an error when the file is absent (e.g. on the very first run).
if os.path.exists('dat/cust.db'):
    os.remove('dat/cust.db')

In [4]:
# Path of the SQLite database file that backs this example.
dbfile = 'dat/cust.db'
# Open a connection; sqlite3 creates the file if it does not exist yet.
conn = sqlite3.connect(dbfile)

In [5]:
# Load every CSV under dat/ into the SQLite database, one table per file.
files = [entry for entry in os.listdir('dat/') if entry.endswith('.csv')]
for csv_name in files:
    # The table is named after the part of the file name before the first dot.
    table_name = csv_name.split('.')[0]
    frame = pd.read_csv(f"dat/{csv_name}")
    # 'replace' overwrites an existing table, so re-running the cell is safe.
    frame.to_sql(table_name, conn, if_exists='replace', index=False)

In [6]:
class CustNode(SQLNode):
    """Customer node: adds a name-length column and keeps only rows with id < 3."""

    def do_annotate(self) -> typing.Union[sqlop, typing.List[sqlop]]:
        """Select every column plus the length of the customer name."""
        name_col = self.colabbr('name')
        length_col = self.colabbr('name_length')
        annotate_op = sqlop(
            optype=SQLOpType.select,
            opval=f"*, LENGTH({name_col}) as {length_col}",
        )
        return [annotate_op]

    def do_filters(self) -> typing.Union[sqlop, typing.List[sqlop]]:
        """Keep only rows whose id column is below 3."""
        id_col = self.colabbr('id')
        return [sqlop(optype=SQLOpType.where, opval=f"{id_col} < 3")]

    def do_normalize(self):
        """No normalization is defined; implicitly returns None."""

    def do_reduce(self, reduce_key):
        """No reduction is defined; implicitly returns None."""

    def do_post_join_annotate(self):
        """No post-join annotation is defined; implicitly returns None."""

    def do_post_join_filters(self):
        """No post-join filtering is defined; implicitly returns None."""

In [7]:
class NotificationNode(SQLNode):
    """Notification node: annotates, filters, and reduces notifications to a count."""

    def do_annotate(self) -> typing.List[sqlop]:
        """Select every column plus the month extracted from the timestamp.

        The derived column gets an explicit, prefixed alias (like the
        ``name_length`` column on ``CustNode``) so that its name does not
        depend on how SQLite labels an un-aliased expression.
        """
        return [
            sqlop(
                optype=SQLOpType.select,
                opval=f"*, strftime('%m', {self.colabbr('ts')}) as {self.colabbr('ts_month')}",
            )
        ]

    def do_filters(self) -> typing.List[sqlop]:
        """Keep only notifications with a timestamp after 2022-06-01."""
        return [
            sqlop(optype=SQLOpType.where, opval=f"{self.colabbr('ts')} > '2022-06-01'")
        ]

    def do_normalize(self):
        """No normalization is defined; implicitly returns None."""

    def do_reduce(self, reduce_key):
        """Return ops that count notifications per ``reduce_key``.

        Args:
            reduce_key: un-prefixed name of the column to aggregate on.
        """
        return [
            # Open question from the original author: shouldn't this just be a select?
            sqlop(optype=SQLOpType.aggfunc, opval=f"count(*) as {self.colabbr('num_notifications')}"),
            # The agg op names the column to aggregate on (presumably GROUP BY -- confirm).
            sqlop(optype=SQLOpType.agg, opval=f"{self.colabbr(reduce_key)}")
        ]
    

In [8]:
# Print the raw orders file to see the columns and rows the order node works with.
!cat dat/orders.csv

id,customer_id,ts,amount
1,1,2023-05-12,10
2,1,2023-06-01,11.5
3,2,2023-01-01,100
4,2,2022-08-05,150
5,1,2023-07-01,325
6,2,2023-07-02,23
7,1,2023-07-14,12000


In [9]:
class OrderNode(SQLNode):
    """Order node: filters orders by date and reduces them to a count."""

    def do_filters(self) -> typing.List[sqlop]:
        """Keep only orders whose date-key column is after 2022-12-01."""
        return [
            sqlop(optype=SQLOpType.where, opval=f"{self.colabbr(self.date_key)} > '2022-12-01'")
        ]

    def do_annotate(self) -> typing.Optional[typing.List[sqlop]]:
        """No annotation is defined for orders, so None is returned."""
        return None

    def do_normalize(self):
        """No normalization is defined; implicitly returns None."""

    def do_reduce(self, reduce_key):
        """Return ops that count orders per ``reduce_key``.

        Args:
            reduce_key: un-prefixed name of the column to aggregate on.
        """
        return [
            # Open question from the original author: shouldn't this just be a select?
            sqlop(optype=SQLOpType.aggfunc, opval=f"count(*) as {self.colabbr('num_orders')}"),
            # The agg op names the column to aggregate on (presumably GROUP BY -- confirm).
            sqlop(optype=SQLOpType.agg, opval=f"{self.colabbr(reduce_key)}")
        ]

In [10]:
# List the data directory again to recall the remaining table names.
!ls dat/

cust.csv                      order_events.csv
cust.db                       order_products.csv
notification_interactions.csv orders.csv
notifications.csv             products.csv


In [11]:
class OrderEventNode(SQLNode):
    """Order-event node: only a reduce step is defined."""

    def do_reduce(self, reduce_key):
        """Return ops that count order events per ``reduce_key``."""
        # Open question from the original author: shouldn't this just be a select?
        count_col = self.colabbr('num_order_events')
        count_op = sqlop(optype=SQLOpType.aggfunc, opval=f"count(*) as {count_col}")
        key_col = self.colabbr(reduce_key)
        key_op = sqlop(optype=SQLOpType.agg, opval=f"{key_col}")
        return [count_op, key_op]

In [12]:
class NotificationInteractionNode(SQLNode):
    """Notification-interaction node: only a reduce step is defined."""

    def do_reduce(self, reduce_key):
        """Return ops that count interactions per ``reduce_key``."""
        # Open question from the original author: shouldn't this just be a select?
        count_col = self.colabbr('num_interactions')
        count_op = sqlop(optype=SQLOpType.aggfunc, opval=f"count(*) as {count_col}")
        key_col = self.colabbr(reduce_key)
        key_op = sqlop(optype=SQLOpType.agg, opval=f"{key_col}")
        return [count_op, key_op]

In [13]:
# Customer node over the `cust` table; its columns are abbreviated with the `cust` prefix.
cust = CustNode(
    fpath='cust',
    prefix='cust',
    client=conn,
    compute_layer=ComputeLayerEnum.sqlite,
    columns=['id', 'name'],
)



In [14]:
# Notification node over the `notifications` table, using `ts` as its date key.
notif = NotificationNode(
    fpath='notifications',
    prefix='not',
    client=conn,
    compute_layer=ComputeLayerEnum.sqlite,
    columns=['id', 'customer_id', 'ts'],
    date_key='ts',
)

In [15]:
# Notification-interaction node over the `notification_interactions` table.
ni = NotificationInteractionNode(
    fpath='notification_interactions',
    prefix='ni',
    client=conn,
    compute_layer=ComputeLayerEnum.sqlite,
    columns=['id', 'notification_id', 'interaction_type_id', 'ts'],
    date_key='ts',
)

In [16]:
# Order node over the `orders` table. OrderNode.do_filters reads `self.date_key`,
# so `date_key` must be supplied here.
order = OrderNode(
    fpath='orders',
    prefix='ord',
    client=conn,
    compute_layer=ComputeLayerEnum.sqlite,
    columns=['id','customer_id','ts','amount'],
    date_key='ts'
)

In [17]:
# Order-event node over the `order_events` table; no date key is given for this node.
oe = OrderEventNode(
    fpath='order_events',
    prefix='oe',
    client=conn,
    compute_layer=ComputeLayerEnum.sqlite,
    columns=['id','order_id','event_id']
)



In [18]:
# Build the graph with the customer node as its parent node.
gr = GraphReduce(
    name='sql_dialect_example',
    parent_node=cust,
    compute_layer=ComputeLayerEnum.sqlite,
    # Intermediate results show up as temp tables (see the sqlite_temp_master query below).
    use_temp_tables=True,
    # NOTE(review): presumably runs each SQL step eagerly -- confirm against graphreduce docs.
    lazy_execution=False
)


In [19]:
# Register all five nodes with the graph, in the same order as before.
for node in (cust, notif, ni, order, oe):
    gr.add_node(node)

In [20]:
# customers -> notifications, keyed by cust.id / notifications.customer_id.
gr.add_entity_edge(
    cust, notif,
    parent_key='id', relation_key='customer_id', reduce=True,
)

In [21]:
# notifications -> notification interactions, keyed by notifications.id / notification_id.
gr.add_entity_edge(
    notif, ni,
    parent_key='id', relation_key='notification_id', reduce=True,
)

In [22]:
# customers -> orders, keyed by cust.id / orders.customer_id.
gr.add_entity_edge(
    cust, order,
    parent_key='id', relation_key='customer_id', reduce=True,
)

In [23]:
# orders -> order events, keyed by orders.id / order_events.order_id.
gr.add_entity_edge(
    order, oe,
    parent_key='id', relation_key='order_id', reduce=True,
)

In [24]:
# Run the SQL transformations for the graph; the log output shows each node being processed.
gr.do_transformations_sql()

2024-07-02 16:52:35 [info     ] hydrating graph attributes
2024-07-02 16:52:35 [info     ] hydrating attributes for CustNode
2024-07-02 16:52:35 [info     ] hydrating attributes for NotificationNode
2024-07-02 16:52:35 [info     ] hydrating attributes for NotificationInteractionNode
2024-07-02 16:52:35 [info     ] hydrating attributes for OrderNode
2024-07-02 16:52:35 [info     ] hydrating attributes for OrderEventNode
2024-07-02 16:52:35 [info     ] checking for prefix uniqueness
2024-07-02 16:52:35 [info     ] no sql was provided for do_normalize so using current data ref
2024-07-02 16:52:35 [info     ] no sql was provided for do_normalize so using current data ref
2024-07-02 16:52:35 [info     ] no sql was provided for do_annotate so using current data ref
2024-07-02 16:52:35 [info     ] no sql was provided for do_filters so using current data ref
2024-07-02 16:52:35 [info     ] no sql was provided for do_normalize so using current data ref
2024-07-02 16:52:35 [info     ] no sql was

In [25]:
# Inspect which table each stage of the customer node wrote to (private attribute, debugging only).
cust._temp_refs

{'do_data': 'CustNode_cust_do_data',
 'do_annotate': 'CustNode_cust_do_annotate',
 'do_filters': 'CustNode_cust_do_filters',
 'do_normalize': 'CustNode_cust_do_filters',
 'join': 'CustNode_cust_join1'}

In [26]:
# Name of the table currently holding the customer node's latest result.
cust._cur_data_ref

'CustNode_cust_join1'

In [27]:
# Read the customer node's current table back into pandas to view the joined result.
pd.read_sql_query(f"select * from {cust._cur_data_ref}", conn)

Unnamed: 0,cust_id,cust_name,cust_name_length,ord_customer_id,ord_num_orders,not_customer_id,not_num_notifications
0,1,wes,3,1,4,1,9
1,2,john,4,2,2,2,6


In [33]:
# List the temp tables on this connection (sqlite_temp_master is SQLite's temp-schema catalog).
pd.read_sql_query("SELECT name FROM sqlite_temp_master WHERE type='table'", conn)

Unnamed: 0,name
0,CustNode_cust_do_data
1,CustNode_cust_do_annotate
2,CustNode_cust_do_filters
3,NotificationNode_notifications_do_data
4,NotificationNode_notifications_do_annotate
5,NotificationNode_notifications_do_filters
6,NotificationInteractionNode_notification_inter...
7,OrderNode_orders_do_data
8,OrderNode_orders_do_filters
9,OrderEventNode_order_events_do_data


In [32]:
# Re-inspect the customer node's stage-to-table mapping (same check as the earlier cell).
cust._temp_refs

{'do_data': 'CustNode_cust_do_data',
 'do_annotate': 'CustNode_cust_do_annotate',
 'do_filters': 'CustNode_cust_do_filters',
 'do_normalize': 'CustNode_cust_do_filters',
 'join': 'CustNode_cust_join1'}

In [34]:
# TODO: time-based filtering (and more) still needs to be added