# Two nodes in a graph and basic aggregation/reduction.
## Still using `sqlite` as the SQL engine

In [32]:
import pandas as pd
import sqlite3
import json
import os
import typing


# examples for using SQL engines and dialects
from graphreduce.node import SQLNode
from graphreduce.graph_reduce import GraphReduce
from graphreduce.enum import SQLOpType, ComputeLayerEnum
from graphreduce.models import sqlop

In [33]:
!ls dat/

cust.csv                      order_events.csv
cust.db                       order_products.csv
notification_interactions.csv orders.csv
notifications.csv             products.csv


In [34]:
!rm dat/cust.db

In [35]:
# SQLite database file; it was removed in the previous cell, so connecting
# here starts from an empty database.
dbfile = 'dat/cust.db'
# Shared connection: passed as `client` to both nodes and used for the final read.
conn = sqlite3.connect(dbfile)

In [36]:
# Register every CSV under dat/ as a table in the SQLite database.
files = list(filter(lambda x: x.endswith('.csv'), os.listdir('dat/')))
for f in files:
    # Table name is the file name up to its first dot, e.g. "cust.csv" -> "cust".
    name = f.partition('.')[0]
    df = pd.read_csv(f"dat/{f}")
    # Replace any existing table of the same name; do not store the pandas index.
    df.to_sql(name, conn, if_exists='replace', index=False)

In [37]:
class CustNode(SQLNode):
    """SQL node over the customer table.

    Only annotation and filtering are customised; the remaining hooks are
    explicit no-ops that return ``None``.
    """

    def do_annotate(self) -> typing.Union[sqlop, typing.List[sqlop]]:
        """Keep every column and add the length of the customer name."""
        name_col = self.colabbr('name')
        length_col = self.colabbr('name_length')
        annotate = sqlop(
            optype=SQLOpType.select,
            opval=f"*, LENGTH({name_col}) as {length_col}",
        )
        return [annotate]

    def do_filters(self) -> typing.Union[sqlop, typing.List[sqlop]]:
        """Restrict the node to customers whose id is below 3."""
        id_col = self.colabbr('id')
        keep = sqlop(optype=SQLOpType.where, opval=f"{id_col} < 3")
        return [keep]

    def do_normalize(self):
        """No normalisation for this node."""
        return None

    def do_reduce(self, reduce_key):
        """No reduction is defined for this node."""
        return None

    def do_post_join_annotate(self):
        """No annotation after joins."""
        return None

    def do_post_join_filters(self):
        """No filtering after joins."""
        return None

In [38]:
!cat dat/notifications.csv

id,customer_id,ts
101,1,2022-08-05
102,1,2023-01-01
103,1,2023-05-05
104,1,2023-06-01
105,1,2023-06-02
106,1,2023-06-23
107,2,2022-09-05
108,2,2022-11-01
109,2,2023-01-01
110,2,2023-07-01
111,1,2023-07-15
112,1,2023-07-18
113,1,2023-08-01
114,2,2023-07-18
115,2,2023-08-01


In [39]:
class NotificationNode(SQLNode):
    """SQL node over the notifications table.

    ``do_reduce`` collapses the rows to one count per ``reduce_key`` value.
    """

    def do_annotate(self) -> typing.List[sqlop]:
        """Keep every column and add the month of the notification timestamp.

        The derived column is aliased through ``colabbr`` (``ts_month``) so it
        carries the node prefix and has a stable name, the same way
        ``CustNode.do_annotate`` aliases its computed column. Without an alias
        the column name is whatever the SQL engine generates for the expression.
        """
        return [
            sqlop(
                optype=SQLOpType.select,
                opval=f"*, strftime('%m', {self.colabbr('ts')}) as {self.colabbr('ts_month')}",
            )
        ]

    def do_filters(self) -> typing.List[sqlop]:
        """Keep only notifications whose timestamp is after 2022-06-01."""
        return [
            sqlop(optype=SQLOpType.where, opval=f"{self.colabbr('ts')} > '2022-06-01'")
        ]

    def do_normalize(self):
        """No normalisation for this node."""
        pass

    def do_reduce(self, reduce_key) -> typing.List[sqlop]:
        """Count rows per ``reduce_key``.

        Args:
            reduce_key: unprefixed column to group by (e.g. ``'customer_id'``).

        Returns:
            Two ops: the aggregate expression and the GROUP BY key.
        """
        return [
            # `aggfunc` supplies the aggregate expression placed in the SELECT
            # list; `agg` supplies the GROUP BY column (see the build_query output).
            # NOTE: despite the name, `num_custs` is the number of notification
            # rows per key; the name is kept so downstream columns do not change.
            sqlop(optype=SQLOpType.aggfunc, opval=f"count(*) as {self.colabbr('num_custs')}"),
            sqlop(optype=SQLOpType.agg, opval=f"{self.colabbr(reduce_key)}"),
        ]
    
    

In [40]:
# Customer node over the `cust` table; its columns are exposed with the
# `cust_` prefix.
cust = CustNode(
    fpath='cust',
    prefix='cust',
    client=conn,
    compute_layer=ComputeLayerEnum.sqlite,
    columns=['id', 'name'],
)



In [41]:
# Notification node over the `notifications` table; its columns are exposed
# with the `not_` prefix and `ts` is declared as the date key.
notif = NotificationNode(
    fpath='notifications',
    prefix='not',
    client=conn,
    compute_layer=ComputeLayerEnum.sqlite,
    columns=['id', 'customer_id', 'ts'],
    date_key='ts',
)

In [42]:
notif.do_data()

[sqlop(optype=<SQLOpType.select: 'select'>, opval='id as not_id,customer_id as not_customer_id,ts as not_ts')]

In [43]:
notif.prep_for_features()

[sqlop(optype=<SQLOpType.where: 'where'>, opval='not_ts < 2024-07-03 12:59:57.372418'),
 sqlop(optype=<SQLOpType.where: 'where'>, opval='not_ts > 2023-07-04 12:59:57.372418')]

In [44]:
help(notif.create_ref)

Help on method create_ref in module graphreduce.node:

create_ref(sql: str = '', fn: Union[<built-in function callable>, str] = None, overwrite: bool = False) -> str method of __main__.NotificationNode instance
    Gets a temporary table or view name
    based on the method being called.



In [45]:
notif.prep_for_features()

[sqlop(optype=<SQLOpType.where: 'where'>, opval='not_ts < 2024-07-03 12:59:57.372418'),
 sqlop(optype=<SQLOpType.where: 'where'>, opval='not_ts > 2023-07-04 12:59:57.372418')]

In [46]:
notif.do_reduce('customer_id')

[sqlop(optype=<SQLOpType.aggfunc: 'aggfunc'>, opval='count(*) as not_num_custs'),
 sqlop(optype=<SQLOpType.agg: 'group by'>, opval='not_customer_id')]

In [47]:
notif.do_reduce('customer_id') + notif.prep_for_features()

[sqlop(optype=<SQLOpType.aggfunc: 'aggfunc'>, opval='count(*) as not_num_custs'),
 sqlop(optype=<SQLOpType.agg: 'group by'>, opval='not_customer_id'),
 sqlop(optype=<SQLOpType.where: 'where'>, opval='not_ts < 2024-07-03 12:59:57.372418'),
 sqlop(optype=<SQLOpType.where: 'where'>, opval='not_ts > 2023-07-04 12:59:57.372418')]

In [48]:
print(notif.build_query(notif.do_reduce('customer_id') + notif.prep_for_features()))


        SELECT not_customer_id,
        count(*) as not_num_custs
        FROM notifications
        WHERE not_ts < 2024-07-03 12:59:57.372418 and not_ts > 2023-07-04 12:59:57.372418
        GROUP BY not_customer_id
        


In [49]:
op = 'MAX'

In [50]:
field = 'customer_id'

In [51]:
notif.prefix

'not'

In [52]:
notif.colabbr(field)

'not_customer_id'

In [53]:
# Compose an aggregate expression from an operator name and a prefixed column.
# `op` is a plain string, so it is interpolated rather than called.
f"{op}({notif.colabbr(field)})"

'MAX(not_customer_id)'

In [54]:
# Build the graph with `cust` as the parent node, using the SQLite compute layer.
# NOTE(review): the exact effect of `use_temp_tables` and `lazy_execution` is
# defined by graphreduce — confirm against its documentation before changing them.
gr = GraphReduce(
    name='sql_dialect_example',
    parent_node=cust,
    compute_layer=ComputeLayerEnum.sqlite,
    use_temp_tables=True,
    lazy_execution=False
)

In [55]:
# Register both nodes with the graph before connecting them with an edge.
gr.add_node(cust)
gr.add_node(notif)

In [56]:
# Relate customers (parent) to notifications (child) on
# cust.id = notifications.customer_id. With reduce=True the run log shows
# the notifications relation being reduced before it is joined to cust.
gr.add_entity_edge(
    cust,
    notif,
    parent_key='id',
    relation_key='customer_id',
    reduce=True
)

In [57]:
gr.plot_graph('sql_graph.html')

2024-07-03 13:26:22 [info     ] plotted graph at sql_graph.html


In [58]:
!open sql_graph.html

In [59]:
gr.do_transformations_sql()

2024-07-03 13:26:27 [info     ] hydrating graph attributes
2024-07-03 13:26:27 [info     ] hydrating attributes for CustNode
2024-07-03 13:26:27 [info     ] hydrating attributes for NotificationNode
2024-07-03 13:26:27 [info     ] checking for prefix uniqueness
2024-07-03 13:26:27 [info     ] no sql was provided for do_normalize so using current data ref
2024-07-03 13:26:27 [info     ] no sql was provided for do_normalize so using current data ref
2024-07-03 13:26:27 [info     ] depth-first traversal through the graph from source: <GraphReduceNode: fpath=cust fmt=>
2024-07-03 13:26:27 [info     ] reducing relation <GraphReduceNode: fpath=notifications fmt=>
2024-07-03 13:26:27 [info     ] joining <GraphReduceNode: fpath=notifications fmt=> to <GraphReduceNode: fpath=cust fmt=>


In [60]:
gr.parent_node._cur_data_ref

'CustNode_cust_join'

In [62]:
# Read the parent node's current data reference ('CustNode_cust_join' above) into pandas.
# NOTE(review): `_cur_data_ref` is a private attribute of the node — presumably
# there is no public accessor; verify against graphreduce before relying on it.
df = pd.read_sql_query(f"select * from {gr.parent_node._cur_data_ref}", conn)

In [63]:
df

Unnamed: 0,cust_id,cust_name,cust_name_length,not_customer_id,not_num_custs
0,1,wes,3,1,9
1,2,john,4,2,6


In [65]:
df.dtypes

cust_id              int64
cust_name           object
cust_name_length     int64
not_customer_id      int64
not_num_custs        int64
dtype: object

In [66]:
# for comprehensive type usage in automated feature engineering
# we need to create a hierarchy of categories, with the top-level
# categories being more general.
import woodwork as ww

In [67]:
df.ww.init()

In [68]:
df.ww

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cust_id,int64,Integer,['numeric']
cust_name,string,Unknown,[]
cust_name_length,int64,Integer,['numeric']
not_customer_id,int64,Integer,['numeric']
not_num_custs,int64,Integer,['numeric']


In [69]:
 len(df)

2

In [71]:
import pandas as pd

In [73]:
na = pd.NamedAgg(column='cust_name', aggfunc='count')

In [74]:
dir(na)

['__add__',
 '__annotations__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getnewargs__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__mul__',
 '__ne__',
 '__new__',
 '__orig_bases__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rmul__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '_asdict',
 '_field_defaults',
 '_fields',
 '_make',
 '_replace',
 'aggfunc',
 'column',
 'count',
 'index']

In [75]:
na.aggfunc

'count'

In [76]:
na.column

'cust_name'