## Automated feature engineering with more tables
## still using `sqlite` as a backend

In [36]:
import pandas as pd
import sqlite3
import json
import os
import typing
import datetime


# examples for using SQL engines and dialects
from graphreduce.node import SQLNode
from graphreduce.graph_reduce import GraphReduce
from graphreduce.enum import SQLOpType, ComputeLayerEnum, PeriodUnit
from graphreduce.models import sqlop
from graphreduce.context import method_requires

In [37]:
# List the data directory: the CSV inputs plus any cust.db left by an earlier run.
!ls dat/

cust.csv                      order_events.csv
cust.db                       order_products.csv
notification_interactions.csv orders.csv
notifications.csv             products.csv


In [38]:
# Remove any database file left over from a previous run so the tables are
# rebuilt from the CSVs. Unlike a bare `!rm`, this stays silent when the file
# is already absent (fresh checkout, or Restart & Run All executed twice) and
# does not depend on a POSIX shell being available.
if os.path.exists('dat/cust.db'):
    os.remove('dat/cust.db')

In [39]:
# Path of the SQLite database file used as the backend for this notebook.
dbfile = 'dat/cust.db'
# One shared connection: later cells pass it to `DataFrame.to_sql`, to every
# `SQLNode` (client=conn) and to `pd.read_sql_query`.
conn = sqlite3.connect(dbfile)

In [40]:
# Collect every CSV in the data directory ...
files = list(filter(lambda fname: fname.endswith('.csv'), os.listdir('dat/')))

# ... and load each one into SQLite as a table named after the file
# (the text before the first dot), replacing the table if it already exists.
for f in files:
    name = f.partition('.')[0]
    df = pd.read_csv(f"dat/{f}")
    df.to_sql(name, conn, if_exists='replace', index=False)

In [41]:
# Options shared by every node: all tables sit behind the same SQLite
# connection and use the sqlite compute layer.
_sqlite_opts = dict(client=conn, compute_layer=ComputeLayerEnum.sqlite)

# One SQLNode per table loaded above. `fpath` is the table name; `prefix`
# shows up in front of each column name in the outputs (e.g. `ord_amount`);
# `date_key` names the node's timestamp column where one is declared.

# Customers (used as the parent node of the graph).
cust = SQLNode(
    fpath='cust',
    prefix='cust',
    columns=['id', 'name'],
    **_sqlite_opts,
)

# Notifications sent to a customer.
notif = SQLNode(
    fpath='notifications',
    prefix='not',
    columns=['id', 'customer_id', 'ts'],
    date_key='ts',
    **_sqlite_opts,
)

# Interactions recorded against a notification.
ni = SQLNode(
    fpath='notification_interactions',
    prefix='ni',
    columns=['id', 'notification_id', 'interaction_type_id', 'ts'],
    date_key='ts',
    **_sqlite_opts,
)

# Orders placed by a customer (also the label node further down).
order = SQLNode(
    fpath='orders',
    prefix='ord',
    columns=['id', 'customer_id', 'ts', 'amount'],
    date_key='ts',
    **_sqlite_opts,
)

# Events attached to an order; no date_key is declared for this node.
oe = SQLNode(
    fpath='order_events',
    prefix='oe',
    columns=['id', 'order_id', 'event_id'],
    **_sqlite_opts,
)



In [42]:
# Cut date for filtering, plus the feature window (730 days).
_feature_window = dict(
    cut_date=datetime.datetime(2023, 6, 30),
    compute_period_unit=PeriodUnit.day,
    compute_period_val=730,
)

# Label definition: sum of `amount` on the orders node over a 30-day period.
# In the captured output this surfaces as the `ord_amount_label` column.
_label_spec = dict(
    label_node=order,
    label_field='amount',
    label_operation='sum',
    label_period_unit=PeriodUnit.day,
    label_period_val=30,
)

# Auto feature engineering params.
_auto_fe = dict(
    auto_features=True,
    auto_feature_hops_back=3,
    auto_feature_hops_front=1,
)

# Assemble the graph around the customer node, executing in SQLite with
# temp tables and without lazy execution.
gr = GraphReduce(
    name='sql_autofe',
    parent_node=cust,
    compute_layer=ComputeLayerEnum.sqlite,
    use_temp_tables=True,
    lazy_execution=False,
    **_feature_window,
    **_label_spec,
    **_auto_fe,
)

In [43]:
# Register every node with the graph (same order as before).
for _node in (cust, order, oe, notif, ni):
    gr.add_node(_node)

# Parent -> child relationships as (parent, child, foreign key on the child).
# Every child joins to its parent's `id` column and is added with reduce=True.
_entity_edges = [
    (cust, notif, 'customer_id'),
    (notif, ni, 'notification_id'),
    (cust, order, 'customer_id'),
    (order, oe, 'order_id'),
]

for _parent, _child, _child_key in _entity_edges:
    gr.add_entity_edge(
        _parent,
        _child,
        parent_key='id',
        relation_key=_child_key,
        reduce=True,
    )


In [44]:
# Execute the graph's SQL transformations; the log output traces each stage.
# NOTE(review): the captured log reports "table SQLNode_cust_join already exists"
# while joining notifications to cust, and the final parent frame contains no
# `not_*` / `ni_*` columns, so the notification branch appears not to reach the
# parent. Confirm whether this is a library limitation with multiple children.
gr.do_transformations_sql()

2024-07-05 16:40:04 [info     ] hydrating graph attributes
2024-07-05 16:40:04 [info     ] hydrating attributes for SQLNode
2024-07-05 16:40:04 [info     ] hydrating attributes for SQLNode
2024-07-05 16:40:04 [info     ] hydrating attributes for SQLNode
2024-07-05 16:40:04 [info     ] hydrating attributes for SQLNode
2024-07-05 16:40:04 [info     ] hydrating attributes for SQLNode
2024-07-05 16:40:04 [info     ] checking for prefix uniqueness
2024-07-05 16:40:04 [info     ] no sql was provided for do_annotate so using current data ref
2024-07-05 16:40:04 [info     ] no sql was provided for do_filters so using current data ref
2024-07-05 16:40:04 [info     ] no sql was provided for do_normalize so using current data ref
2024-07-05 16:40:04 [info     ] no sql was provided for do_annotate so using current data ref
2024-07-05 16:40:04 [info     ] no sql was provided for do_filters so using current data ref
2024-07-05 16:40:04 [info     ] no sql was provided for do_normalize so using curren

2024-07-05 16:40:04 [info     ] joining <GraphReduceNode: fpath=notifications fmt=> to <GraphReduceNode: fpath=cust fmt=>
2024-07-05 16:40:04 [error    ] table SQLNode_cust_join already exists
2024-07-05 16:40:04 [info     ] no sql was provided for do_post_join_annotate so using current data ref
2024-07-05 16:40:04 [info     ] no sql was provided for do_post_join_filters so using current data ref


In [45]:
# Inspect which temporary tables the run created on this connection.
pd.read_sql_query(
    sql="SELECT name FROM sqlite_temp_master WHERE type='table'",
    con=conn,
)

Unnamed: 0,name
0,SQLNode_cust_do_data
1,SQLNode_orders_do_data
2,SQLNode_order_events_do_data
3,SQLNode_notifications_do_data
4,SQLNode_notification_interactions_do_data
5,SQLNode_order_events_do_reduce
6,SQLNode_orders_join
7,SQLNode_orders_do_reduce
8,SQLNode_cust_join
9,SQLNode_orders_do_labels


In [46]:
# Debugging aid: the stage -> table-name mapping recorded on the parent node.
# NOTE(review): the post-join stages point at `SQLNode_cust_join1`, which is not
# in the temp-table listing captured earlier in this notebook. Verify it exists.
gr.parent_node._temp_refs

{'do_data': 'SQLNode_cust_do_data',
 'do_annotate': 'SQLNode_cust_do_data',
 'do_filters': 'SQLNode_cust_do_data',
 'do_normalize': 'SQLNode_cust_do_data',
 'join': 'SQLNode_cust_join',
 'do_post_join_annotate': 'SQLNode_cust_join1',
 'do_post_join_filters': 'SQLNode_cust_join1'}

In [47]:
# Pull the parent node's current result table into a DataFrame for inspection.
pd.read_sql_query(
    sql=f"select * from {gr.parent_node._cur_data_ref}",
    con=conn,
)

Unnamed: 0,cust_id,cust_name,ord_customer_id,ord_id_count,ord_customer_id_count,ord_ts_count,ord_amount_min,ord_amount_max,ord_amount_sum,oe_order_id_min,oe_order_id_max,oe_order_id_sum,oe_id_count_min,oe_id_count_max,oe_id_count_sum,oe_order_id_count_min,oe_order_id_count_max,oe_order_id_count_sum,oe_event_id_count_min,oe_event_id_count_max,oe_event_id_count_sum,ord_customer_id:1,ord_amount_label
0,1,wes,1,2,2,2,10.0,11.5,21.5,1,2,3,10,10,20,10,10,20,10,10,20,1,12325.0
1,2,john,2,2,2,2,100.0,150.0,250.0,3,3,3,6,6,6,6,6,6,6,6,6,2,23.0


In [48]:
# Raise pandas' column display limit to 100 for the wide feature frames.
pd.options.display.max_columns = 100

In [16]:
# Debugging aid: the stage -> table-name mapping recorded on the orders node.
order._temp_refs

{'do_data': 'SQLNode_orders_do_data',
 'do_annotate': 'SQLNode_orders_do_data',
 'do_filters': 'SQLNode_orders_do_data',
 'do_normalize': 'SQLNode_orders_do_data',
 'join': 'SQLNode_orders_join',
 'do_post_join_annotate': 'SQLNode_orders_join',
 'do_post_join_filters': 'SQLNode_orders_join',
 'do_reduce': 'SQLNode_orders_do_reduce',
 'do_labels': 'SQLNode_orders_do_labels'}

In [18]:
# Look at the orders node's loaded data (prefixed columns, before reduction).
pd.read_sql_query(
    sql="select * from SQLNode_orders_do_data",
    con=conn,
)

Unnamed: 0,ord_id,ord_customer_id,ord_ts,ord_amount
0,1,1,2023-05-12,10.0
1,2,1,2023-06-01,11.5
2,3,2,2023-01-01,100.0
3,4,2,2022-08-05,150.0
4,5,1,2023-07-01,325.0
5,6,2,2023-07-02,23.0
6,7,1,2023-07-14,12000.0
