# Two nodes in a graph and basic aggregation/reduction.
## Still using `sqlite` as the SQL engine

In [1]:
import pandas as pd
import sqlite3
import json
import os
import typing


# examples for using SQL engines and dialects
from graphreduce.node import SQLNode
from graphreduce.graph_reduce import GraphReduce
from graphreduce.enum import SQLOpType, ComputeLayerEnum
from graphreduce.models import sqlop



In [2]:
# List the example data files available under dat/.
!ls dat/

cust.csv                      order_events.csv
cust.db                       order_products.csv
notification_interactions.csv orders.csv
notifications.csv             products.csv


In [29]:
# Start from a clean slate: drop any database file left over from an earlier run.
# Guarded so a fresh checkout (no dat/cust.db yet) does not produce an error,
# and done in Python so it does not depend on a Unix shell.
# NOTE(review): this must run BEFORE the connection is opened; deleting the
# file while a connection is still open is the likely cause of
# "attempt to write a readonly database" errors — confirm.
if os.path.exists('dat/cust.db'):
    os.remove('dat/cust.db')

In [9]:
# Open a connection to the SQLite database file used by this example.
# `conn` is shared by every later cell: the CSV loader, both nodes, and the
# final result query.
dbfile = 'dat/cust.db'
conn = sqlite3.connect(dbfile)

In [26]:
# Load every CSV under dat/ into SQLite, one table per file, named after the
# file without its extension (e.g. dat/orders.csv -> table "orders").
data_dir = 'dat'
# sorted() makes the load order deterministic across runs and platforms.
files = sorted(x for x in os.listdir(data_dir) if x.endswith('.csv'))
for f in files:
    # splitext strips only the final extension, so a dotted file name such as
    # "orders.2023.csv" becomes table "orders.2023" instead of colliding with
    # "orders" (which is what split('.')[0] would produce).
    name = os.path.splitext(f)[0]
    df = pd.read_csv(os.path.join(data_dir, f))
    # if_exists='replace' keeps this cell re-runnable; index=False avoids
    # writing the DataFrame index as an extra column.
    df.to_sql(name, conn, if_exists='replace', index=False)

DatabaseError: Execution failed on sql 'DROP TABLE "notification_interactions"': attempt to write a readonly database

In [27]:
class CustNode(SQLNode):
    """Customer node: annotates the name length and filters on the id column."""

    def do_annotate(self) -> typing.Union[sqlop, typing.List[sqlop]]:
        """Select every column plus the length of the customer name."""
        name_col = self.colabbr('name')
        length_col = self.colabbr('name_length')
        annotated_select = sqlop(
            optype=SQLOpType.select,
            opval=f"*, LENGTH({name_col}) as {length_col}",
        )
        return [annotated_select]

    def do_filters(self) -> typing.Union[sqlop, typing.List[sqlop]]:
        """Keep only rows whose (prefixed) id column is below 3."""
        id_col = self.colabbr('id')
        id_filter = sqlop(optype=SQLOpType.where, opval=f"{id_col} < 3")
        return [id_filter]

    def do_normalize(self):
        """No normalization SQL is supplied for this node."""
        return None

    def do_reduce(self, reduce_key):
        """No reduction SQL is supplied for this node."""
        return None

    def do_post_join_annotate(self):
        """No post-join annotation is supplied for this node."""
        return None

    def do_post_join_filters(self):
        """No post-join filter is supplied for this node."""
        return None

In [28]:
# Peek at the raw notifications data: columns are id, customer_id and ts.
!cat dat/notifications.csv

id,customer_id,ts
101,1,2022-08-05
102,1,2023-01-01
103,1,2023-05-05
104,1,2023-06-01
105,1,2023-06-02
106,1,2023-06-23
107,2,2022-09-05
108,2,2022-11-01
109,2,2023-01-01
110,2,2023-07-01
111,1,2023-07-15
112,1,2023-07-18
113,1,2023-08-01
114,2,2023-07-18
115,2,2023-08-01


In [13]:
class NotificationNode(SQLNode):
    """Notification node: month annotation, date filter, and a per-key row count."""

    def do_annotate(self) -> typing.List[sqlop]:
        """Select every column plus the month extracted from the timestamp.

        The derived column is aliased through ``colabbr`` so it carries the
        node prefix, the same way ``CustNode`` aliases ``name_length``.
        Without an alias the column would be named after the raw SQL
        expression.
        """
        return [
            sqlop(
                optype=SQLOpType.select,
                opval=f"*, strftime('%m', {self.colabbr('ts')}) as {self.colabbr('ts_month')}"
            )
        ]

    def do_filters(self) -> typing.List[sqlop]:
        """Keep only notifications whose timestamp is after 2022-06-01."""
        return [
            sqlop(optype=SQLOpType.where, opval=f"{self.colabbr('ts')} > '2022-06-01'")
        ]

    def do_normalize(self):
        """No normalization SQL is supplied for this node."""
        pass

    def do_reduce(self, reduce_key):
        """Count notification rows per ``reduce_key``.

        NOTE: despite the ``num_custs`` name, ``count(*)`` counts notification
        rows for each key, not customers. The name is kept so the output
        column (``not_num_custs``) stays unchanged for downstream readers.
        """
        return [
            # Shouldn't this just be a select?
            sqlop(optype=SQLOpType.aggfunc, opval=f"count(*) as {self.colabbr('num_custs')}"),
            # Presumably the grouping column for the aggregate above — confirm
            # against SQLOpType.agg in graphreduce.
            sqlop(optype=SQLOpType.agg, opval=f"{self.colabbr(reduce_key)}")
        ]
    
    

In [14]:
# Customer node over the `cust` table; its columns get the "cust" prefix.
cust = CustNode(
    fpath="cust",
    prefix="cust",
    client=conn,
    compute_layer=ComputeLayerEnum.sqlite,
    columns=["id", "name"],
)



In [15]:
# Notification node over the `notifications` table; its columns get the "not"
# prefix and `ts` is declared as the date key.
notif = NotificationNode(
    fpath="notifications",
    prefix="not",
    client=conn,
    compute_layer=ComputeLayerEnum.sqlite,
    columns=["id", "customer_id", "ts"],
    date_key="ts",
)

In [16]:
# Inspect the base select generated for this node: every listed column is
# aliased with the node prefix (e.g. `id as not_id`).
notif.do_data()

[sqlop(optype=<SQLOpType.select: 'select'>, opval='id as not_id,customer_id as not_customer_id,ts as not_ts')]

In [17]:
# Assemble the graph around the customer node, using the same SQLite compute
# layer as the nodes.
# NOTE(review): the exact semantics of use_temp_tables / lazy_execution are
# defined by graphreduce and not visible here — presumably intermediate results
# are materialized as temp tables and executed eagerly; confirm in the library.
gr = GraphReduce(
    name='sql_dialect_example',
    parent_node=cust,
    compute_layer=ComputeLayerEnum.sqlite,
    use_temp_tables=True,
    lazy_execution=False
)

In [18]:
# Register both nodes with the graph before connecting them with an edge.
gr.add_node(cust)
gr.add_node(notif)

In [19]:
# Connect customers to notifications: cust `id` matches notifications
# `customer_id`. With reduce=True the notifications relation is reduced
# (see NotificationNode.do_reduce) before being joined to the parent.
gr.add_entity_edge(
    cust,
    notif,
    parent_key='id',
    relation_key='customer_id',
    reduce=True
)

In [20]:
# Render the graph structure to an HTML file for visual inspection.
gr.plot_graph('sql_graph.html')

2024-07-01 21:34:27 [info     ] plotted graph at sql_graph.html


In [21]:
# Open the rendered graph in the default viewer.
# NOTE(review): `open` is presumably the macOS launcher — on other platforms
# open sql_graph.html manually; confirm.
!open sql_graph.html

In [22]:
# Run the whole pipeline. Per the log output it hydrates node attributes,
# checks prefix uniqueness, traverses depth-first from the parent, reduces the
# notifications relation, and joins it onto cust.
gr.do_transformations_sql()

2024-07-01 21:34:31 [info     ] hydrating graph attributes
2024-07-01 21:34:31 [info     ] hydrating attributes for CustNode
2024-07-01 21:34:31 [info     ] hydrating attributes for NotificationNode
2024-07-01 21:34:31 [info     ] checking for prefix uniqueness
2024-07-01 21:34:31 [info     ] no sql was provided for do_normalize so using current data ref
2024-07-01 21:34:31 [info     ] no sql was provided for do_normalize so using current data ref
2024-07-01 21:34:31 [info     ] depth-first traversal through the graph from source: <GraphReduceNode: fpath=cust fmt=>
2024-07-01 21:34:31 [info     ] reducing relation <GraphReduceNode: fpath=notifications fmt=>
2024-07-01 21:34:31 [info     ] joining <GraphReduceNode: fpath=notifications fmt=> to <GraphReduceNode: fpath=cust fmt=>


In [24]:
# Name of the table currently holding the parent node's data; after the run
# this is the joined result.
gr.parent_node._cur_data_ref

'CustNode_cust_join'

In [25]:
# Pull the joined result back into pandas: one row per customer, with the
# per-customer notification count.
joined_table = gr.parent_node._cur_data_ref
pd.read_sql_query(f"select * from {joined_table}", conn)

Unnamed: 0,cust_id,cust_name,cust_name_length,not_customer_id,not_num_custs
0,1,wes,3,1,9
1,2,john,4,2,6
