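# Scaling up

This notebook builds GraphReduce graphs over the 15 CSV tables of the movie dataset. Each table becomes a `DynamicNode`, the edges between tables are read from `movie_labels.csv`, and features are generated with `auto_features=True` — first with `customer.csv` as the parent node, then with `rental.csv`.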

In [1]:
import datetime

import pandas as pd
from graphreduce.node import GraphReduceNode, DynamicNode
from graphreduce.graph_reduce import GraphReduce
from graphreduce.enum import ComputeLayerEnum as GraphReduceComputeLayerEnum, PeriodUnit



In [2]:
# Load the table of relationships between the movie CSV files. Later cells read
# the `to_name`, `from_name`, `to_key` and `from_key` columns of every row to
# wire up graph edges.
# NOTE(review): relative path — it resolves against the kernel's working directory.
labels = pd.read_csv('movie_labels.csv')

In [3]:
# Display the relationship table; each row links a `from_name` file to a
# `to_name` file through the `from_key` / `to_key` columns.
labels

Unnamed: 0.1,Unnamed: 0,from_name,from_identifier,from_object_str,from_rows,to_name,to_identifier,to_object_str,to_rows,from_key,to_key,weight,discovery_mechanism
0,0,staff.csv,/Users/wesmadrigal/projects/kurveai/kurve/test...,<Entity (identifier=/Users/wesmadrigal/project...,2,store.csv,/Users/wesmadrigal/projects/kurveai/kurve/test...,<Entity (identifier=/Users/wesmadrigal/project...,2,store_id,store_id,,constraint
1,1,film_category.csv,/Users/wesmadrigal/projects/kurveai/kurve/test...,<Entity (identifier=/Users/wesmadrigal/project...,1000,category.csv,/Users/wesmadrigal/projects/kurveai/kurve/test...,<Entity (identifier=/Users/wesmadrigal/project...,16,category_id,category_id,,constraint
2,2,customer.csv,/Users/wesmadrigal/projects/kurveai/kurve/test...,<Entity (identifier=/Users/wesmadrigal/project...,599,store.csv,/Users/wesmadrigal/projects/kurveai/kurve/test...,<Entity (identifier=/Users/wesmadrigal/project...,2,store_id,store_id,,constraint
3,3,customer.csv,/Users/wesmadrigal/projects/kurveai/kurve/test...,<Entity (identifier=/Users/wesmadrigal/project...,599,address.csv,/Users/wesmadrigal/projects/kurveai/kurve/test...,<Entity (identifier=/Users/wesmadrigal/project...,603,address_id,address_id,,constraint
4,4,film_actor.csv,/Users/wesmadrigal/projects/kurveai/kurve/test...,<Entity (identifier=/Users/wesmadrigal/project...,5462,actor.csv,/Users/wesmadrigal/projects/kurveai/kurve/test...,<Entity (identifier=/Users/wesmadrigal/project...,200,actor_id,actor_id,,constraint
5,5,film_actor.csv,/Users/wesmadrigal/projects/kurveai/kurve/test...,<Entity (identifier=/Users/wesmadrigal/project...,5462,film_category.csv,/Users/wesmadrigal/projects/kurveai/kurve/test...,<Entity (identifier=/Users/wesmadrigal/project...,1000,film_id,film_id,,constraint
6,6,film_actor.csv,/Users/wesmadrigal/projects/kurveai/kurve/test...,<Entity (identifier=/Users/wesmadrigal/project...,5462,film.csv,/Users/wesmadrigal/projects/kurveai/kurve/test...,<Entity (identifier=/Users/wesmadrigal/project...,1000,film_id,film_id,,constraint
7,7,address.csv,/Users/wesmadrigal/projects/kurveai/kurve/test...,<Entity (identifier=/Users/wesmadrigal/project...,603,city.csv,/Users/wesmadrigal/projects/kurveai/kurve/test...,<Entity (identifier=/Users/wesmadrigal/project...,600,city_id,city_id,,constraint
8,8,city.csv,/Users/wesmadrigal/projects/kurveai/kurve/test...,<Entity (identifier=/Users/wesmadrigal/project...,600,country.csv,/Users/wesmadrigal/projects/kurveai/kurve/test...,<Entity (identifier=/Users/wesmadrigal/project...,109,country_id,country_id,,constraint
9,9,payment.csv,/Users/wesmadrigal/projects/kurveai/kurve/test...,<Entity (identifier=/Users/wesmadrigal/project...,16049,rental.csv,/Users/wesmadrigal/projects/kurveai/kurve/test...,<Entity (identifier=/Users/wesmadrigal/project...,16044,rental_id,rental_id,,constraint


In [4]:
# List the files in the movie dataset directory.
# NOTE(review): absolute, machine-specific path — will not resolve on another machine.
!ls /Users/wesmadrigal/projects/graphreduce/tests/data/movie_data/

actor.csv         country.csv       film_category.csv rental.csv
address.csv       customer.csv      inventory.csv     staff.csv
category.csv      film.csv          language.csv      store.csv
city.csv          film_actor.csv    payment.csv


In [5]:
# Directory that holds the movie dataset CSVs. This is the only place the
# location is spelled out: edit this one constant to point the notebook at a
# different copy of the data.
MOVIE_DATA_DIR = '/Users/wesmadrigal/projects/graphreduce/tests/data/movie_data'

# Column prefix assigned to each table, keyed by CSV file name.
TABLE_PREFIXES = {
    'actor.csv': 'act',
    'address.csv': 'add',
    'category.csv': 'cat',
    'city.csv': 'city',
    'country.csv': 'ct',
    'customer.csv': 'cust',
    'film.csv': 'film',
    'film_actor.csv': 'fa',
    'film_category.csv': 'fc',
    'inventory.csv': 'inv',
    'language.csv': 'lang',
    'payment.csv': 'pay',
    'rental.csv': 'rent',
    'staff.csv': 'staff',
    'store.csv': 'store',
}

# Maps the full path of every CSV to its node options ({'prefix': ...}).
# Insertion order follows TABLE_PREFIXES, so iteration order is stable.
files = {
    MOVIE_DATA_DIR + '/' + csv_name: {'prefix': prefix}
    for csv_name, prefix in TABLE_PREFIXES.items()
}

In [6]:
# Sanity check: number of CSV files registered in `files`.
len(files)

15

In [7]:
# Build one DynamicNode per CSV, keyed by the bare file name (e.g.
# 'customer.csv'). Every node gets the same settings; only the file path and
# the column prefix vary.
# NOTE(review): pk='id' is passed for every table, while the displayed frames
# show key columns named like `customer_id` / `rental_id` — confirm intended.
gr_nodes = {
    csv_path.rsplit('/', 1)[-1]: DynamicNode(
        fpath=csv_path,
        fmt='csv',
        pk='id',
        prefix=node_opts['prefix'],
        date_key=None,
        compute_layer=GraphReduceComputeLayerEnum.pandas,
        compute_period_val=730,
        compute_period_unit=PeriodUnit.day,
    )
    for csv_path, node_opts in files.items()
}



In [8]:
# Graph rooted at the customer table, with automatic feature generation
# (1 hop forward, 2 hops back) on the pandas compute layer.
gr = GraphReduce(
    name='cust_dynamic_graph',
    fmt='csv',
    compute_layer=GraphReduceComputeLayerEnum.pandas,
    parent_node=gr_nodes['customer.csv'],
    cut_date=datetime.datetime(year=2023, month=9, day=1),
    auto_features=True,
    auto_feature_hops_back=2,
    auto_feature_hops_front=1,
    # Label settings are left disabled. Note that 'orders.csv' is not one of
    # the files registered in `files`, so these would need updating first:
    #label_node=gr_nodes['orders.csv'],
    #label_operation='count',
    #label_field='id',
    #label_period_val=60,
    #label_period_unit=PeriodUnit.day
)

In [9]:
# Register every relationship in `labels` as a graph edge: the `to_*` columns
# describe the parent side, the `from_*` columns the relation side.
edge_specs = zip(
    labels['to_name'],
    labels['from_name'],
    labels['to_key'],
    labels['from_key'],
)
for parent_file, relation_file, parent_col, relation_col in edge_specs:
    gr.add_entity_edge(
        parent_node=gr_nodes[parent_file],
        relation_node=gr_nodes[relation_file],
        parent_key=parent_col,
        relation_key=relation_col,
        reduce=True,
    )

In [10]:
# Run the graph's transformations. The log below shows attribute hydration for
# each DynamicNode; the resulting frame is read from `gr.parent_node.df`.
gr.do_transformations()

2024-04-23 09:02:21 [info     ] hydrating graph attributes
2024-04-23 09:02:21 [info     ] hydrating attributes for DynamicNode
2024-04-23 09:02:21 [info     ] hydrating attributes for DynamicNode
2024-04-23 09:02:21 [info     ] hydrating attributes for DynamicNode
2024-04-23 09:02:21 [info     ] hydrating attributes for DynamicNode
2024-04-23 09:02:21 [info     ] hydrating attributes for DynamicNode
2024-04-23 09:02:21 [info     ] hydrating attributes for DynamicNode
2024-04-23 09:02:21 [info     ] hydrating attributes for DynamicNode
2024-04-23 09:02:21 [info     ] hydrating attributes for DynamicNode
2024-04-23 09:02:22 [info     ] hydrating attributes for DynamicNode
2024-04-23 09:02:22 [info     ] hydrating attributes for DynamicNode
2024-04-23 09:02:22 [info     ] hydrating attributes for DynamicNode
2024-04-23 09:02:22 [info     ] hydrating attributes for DynamicNode
2024-04-23 09:02:22 [info     ] hydrating attributes for DynamicNode
2024-04-23 09:02:22 [info     ] hydrating at

In [11]:
# Preview the parent (customer) frame after the transformations: the `cust_*`
# columns sit alongside aggregated `pay_*` columns.
gr.parent_node.df.head()

Unnamed: 0,cust_customer_id,cust_store_id,cust_first_name,cust_last_name,cust_email,cust_address_id,cust_activebool,cust_create_date,cust_last_update,cust_active,...,pay_payment_id_count,pay_customer_id_count,pay_staff_id_count,pay_rental_id_count,pay_amount_min,pay_amount_max,pay_amount_sum,pay_payment_date_first,pay_payment_date_min,pay_payment_date_max
0,1,1,MARY,SMITH,MARY.SMITH@sakilacustomer.org,5,True,2022-02-14,2022-02-15 09:57:20,1,...,32,32,32,32,0.99,9.99,118.68,2022-01-29 13:03:02.267403,2022-01-28 20:10:06.039818,2022-07-23 09:13:13.975359
1,2,1,PATRICIA,JOHNSON,PATRICIA.JOHNSON@sakilacustomer.org,6,True,2022-02-14,2022-02-15 09:57:20,1,...,27,27,27,27,0.99,10.99,128.73,2022-01-24 08:17:41.922082,2022-01-23 14:26:35.170413,2022-06-26 17:18:19.136948
2,3,1,LINDA,WILLIAMS,LINDA.WILLIAMS@sakilacustomer.org,7,True,2022-02-14,2022-02-15 09:57:20,1,...,26,26,26,26,0.99,10.99,135.74,2022-01-23 17:24:52.109704,2022-01-23 17:24:52.109704,2022-07-12 17:19:31.419393
3,4,2,BARBARA,JONES,BARBARA.JONES@sakilacustomer.org,8,True,2022-02-14,2022-02-15 09:57:20,1,...,22,22,22,22,0.99,8.99,81.78,2022-01-29 09:05:28.038014,2022-01-29 09:05:28.038014,2022-07-14 11:28:54.939865
4,5,1,ELIZABETH,BROWN,ELIZABETH.BROWN@sakilacustomer.org,9,True,2022-02-14,2022-02-15 09:57:20,1,...,38,38,38,38,0.99,9.99,144.62,2022-01-27 10:15:27.766413,2022-01-25 10:56:59.991370,2022-07-08 16:07:01.481142


In [12]:
# Row count of the parent frame; 599 matches the row count listed for
# customer.csv in the labels table.
len(gr.parent_node.df)

599

In [17]:
# Graph rooted at the rental table, with automatic feature generation
# (1 hop forward, 2 hops back) on the pandas compute layer.
# This rebinds `gr`, so the customer graph is no longer reachable by that name.
gr = GraphReduce(
    name='rental_dynamic_graph',
    fmt='csv',
    compute_layer=GraphReduceComputeLayerEnum.pandas,
    parent_node=gr_nodes['rental.csv'],
    cut_date=datetime.datetime(year=2023, month=9, day=1),
    auto_features=True,
    auto_feature_hops_back=2,
    auto_feature_hops_front=1,
    # Label settings are left disabled. Note that 'orders.csv' is not one of
    # the files registered in `files`, so these would need updating first:
    #label_node=gr_nodes['orders.csv'],
    #label_operation='count',
    #label_field='id',
    #label_period_val=60,
    #label_period_unit=PeriodUnit.day
)

In [18]:
# Register every relationship in `labels` as an edge of the rental graph: the
# `to_*` columns describe the parent side, the `from_*` columns the relation side.
edge_specs = zip(
    labels['to_name'],
    labels['from_name'],
    labels['to_key'],
    labels['from_key'],
)
for parent_file, relation_file, parent_col, relation_col in edge_specs:
    gr.add_entity_edge(
        parent_node=gr_nodes[parent_file],
        relation_node=gr_nodes[relation_file],
        parent_key=parent_col,
        relation_key=relation_col,
        reduce=True,
    )

In [19]:
# Call reload() on every node of the new graph before transforming.
# NOTE(review): the same `gr_nodes` objects were already used by the customer
# graph, so this presumably resets their loaded data — confirm what
# reload() does in graphreduce.
for node in gr.nodes():
    node.reload()

In [20]:
# Run the transformations for the rental-rooted graph; the resulting frame is
# read from `gr.parent_node.df`.
gr.do_transformations()

2024-04-23 09:03:03 [info     ] hydrating graph attributes
2024-04-23 09:03:03 [info     ] hydrating attributes for DynamicNode
2024-04-23 09:03:03 [info     ] hydrating attributes for DynamicNode
2024-04-23 09:03:03 [info     ] hydrating attributes for DynamicNode
2024-04-23 09:03:03 [info     ] hydrating attributes for DynamicNode
2024-04-23 09:03:03 [info     ] hydrating attributes for DynamicNode
2024-04-23 09:03:03 [info     ] hydrating attributes for DynamicNode
2024-04-23 09:03:03 [info     ] hydrating attributes for DynamicNode
2024-04-23 09:03:03 [info     ] hydrating attributes for DynamicNode
2024-04-23 09:03:03 [info     ] hydrating attributes for DynamicNode
2024-04-23 09:03:03 [info     ] hydrating attributes for DynamicNode
2024-04-23 09:03:03 [info     ] hydrating attributes for DynamicNode
2024-04-23 09:03:03 [info     ] hydrating attributes for DynamicNode
2024-04-23 09:03:03 [info     ] hydrating attributes for DynamicNode
2024-04-23 09:03:03 [info     ] hydrating at

In [21]:
# Show the parent (rental) frame after the transformations.
# NOTE(review): this displays the whole frame (pandas truncates the repr to the
# first and last rows), whereas the customer frame is previewed with `.head()`
# — consider doing the same here.
gr.parent_node.df

Unnamed: 0,rent_rental_id,rent_rental_date,rent_inventory_id,rent_customer_id,rent_return_date,rent_staff_id,rent_last_update,staff_staff_id,staff_first_name,staff_last_name,...,pay_payment_id_count,pay_customer_id_count,pay_staff_id_count,pay_rental_id_count,pay_amount_min,pay_amount_max,pay_amount_sum,pay_payment_date_first,pay_payment_date_min,pay_payment_date_max
0,2,2022-05-24 21:54:33,1525,459,2022-05-28 18:40:33,1,2022-02-16 02:30:53,1,Mike,Hillyer,...,1,1,1,1,2.99,2.99,2.99,2022-02-21 17:38:11.389365,2022-02-21 17:38:11.389365,2022-02-21 17:38:11.389365
1,3,2022-05-24 22:03:39,1711,408,2022-06-01 21:12:39,1,2022-02-16 02:30:53,1,Mike,Hillyer,...,1,1,1,1,3.99,3.99,3.99,2022-03-23 15:44:32.990689,2022-03-23 15:44:32.990689,2022-03-23 15:44:32.990689
2,4,2022-05-24 22:04:41,2452,333,2022-06-03 00:43:41,2,2022-02-16 02:30:53,2,Jon,Stephens,...,1,1,1,1,4.99,4.99,4.99,2022-04-12 07:42:01.166425,2022-04-12 07:42:01.166425,2022-04-12 07:42:01.166425
3,5,2022-05-24 22:05:21,2079,222,2022-06-02 03:33:21,1,2022-02-16 02:30:53,1,Mike,Hillyer,...,1,1,1,1,6.99,6.99,6.99,2022-04-03 11:47:13.012845,2022-04-03 11:47:13.012845,2022-04-03 11:47:13.012845
4,6,2022-05-24 22:08:07,2792,549,2022-05-27 00:32:07,1,2022-02-16 02:30:53,1,Mike,Hillyer,...,1,1,1,1,0.99,0.99,0.99,2022-04-10 11:06:45.478746,2022-04-10 11:06:45.478746,2022-04-10 11:06:45.478746
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16039,16046,2022-08-23 21:26:47,4364,74,2022-08-27 17:02:47,2,2022-02-16 02:30:53,2,Jon,Stephens,...,1,1,1,1,0.99,0.99,0.99,2022-06-11 21:17:29.989777,2022-06-11 21:17:29.989777,2022-06-11 21:17:29.989777
16040,16047,2022-08-23 21:42:48,2088,114,2022-08-25 01:48:48,2,2022-02-16 02:30:53,2,Jon,Stephens,...,1,1,1,1,0.99,0.99,0.99,2022-03-29 10:47:21.179255,2022-03-29 10:47:21.179255,2022-03-29 10:47:21.179255
16041,16048,2022-08-23 21:43:07,2019,103,2022-08-31 20:33:07,1,2022-02-16 02:30:53,1,Mike,Hillyer,...,1,1,1,1,8.99,8.99,8.99,2022-05-26 21:47:58.372185,2022-05-26 21:47:58.372185,2022-05-26 21:47:58.372185
16042,16049,2022-08-23 21:50:12,2666,393,2022-08-30 00:01:12,2,2022-02-16 02:30:53,2,Jon,Stephens,...,1,1,1,1,3.99,3.99,3.99,2022-02-02 16:24:10.941930,2022-02-02 16:24:10.941930,2022-02-02 16:24:10.941930
