In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

In [2]:
train_edges = pd.read_parquet('data/train_edges.parquet')
train_features = pd.read_parquet('data/train_features.parquet')
train_labels = pd.read_parquet('data/train_labels.parquet')

test_edges = pd.read_parquet('data/test_edges.parquet')
test_features = pd.read_parquet('data/test_features.parquet')
test_labels = pd.read_parquet('data/test_labels.parquet')

In [3]:
products = pd.read_csv('data/products.csv')
departments = pd.read_csv('data/departments.csv')

In [4]:
products = products.merge(departments, how='left', on="department_id")
products['product_id'] = products['product_id'].astype(str)

In [13]:
train_edges['product_id'] = train_edges['product_id'].astype(str)
test_edges['product_id'] = test_edges['product_id'].astype(str)

In [7]:
G_train = nx.from_pandas_edgelist(train_edges, 'user_id', 'product_id', ['weight'], create_using=nx.DiGraph())

In [14]:
G_test = nx.from_pandas_edgelist(test_edges, 'user_id', 'product_id', ['weight'], create_using=nx.DiGraph())

In [15]:
G_train.number_of_nodes()

151080

In [16]:
G_train.number_of_edges()

9006247

In [17]:
G_test.number_of_nodes()

151139

In [18]:
G_test.number_of_edges()

9337086

In [36]:
len(set(train_edges['user_id'].values))

101696

In [37]:
len(set(test_edges['user_id'].values))

101696

In [34]:
len(set(train_edges['product_id'].values))

49384

In [35]:
len(set(test_edges['product_id'].values))

49443

In [38]:
dc = nx.degree_centrality(G_test)

In [39]:
top100_nodes_dc = list(dict(sorted(dc.items(), key=lambda item: item[1], reverse=True)).keys())[:100]

In [40]:
top_100_products_dc = products[products['product_id'].isin(top100_nodes_dc)]
top_100_products_dc['department'].value_counts()

produce         76
dairy eggs      11
canned goods     3
frozen           3
deli             2
bakery           1
household        1
meat seafood     1
pantry           1
beverages        1
Name: department, dtype: int64

In [41]:
ec = nx.eigenvector_centrality(G_test)

In [42]:
top100_nodes_ec = list(dict(sorted(ec.items(), key=lambda item: item[1], reverse=True)).keys())[:100]

In [43]:
top_100_products_ec = products[products['product_id'].isin(top100_nodes_ec)]
top_100_products_ec['department'].value_counts()

produce         76
dairy eggs      11
canned goods     3
frozen           3
deli             2
bakery           1
household        1
meat seafood     1
pantry           1
beverages        1
Name: department, dtype: int64

In [44]:
bc = nx.betweenness_centrality(G_test, k=100)

In [45]:
top100_nodes_bc = list(dict(sorted(bc.items(), key=lambda item: item[1], reverse=True)).keys())[:100]

In [46]:
top_100_products_bc = products[products['product_id'].isin(top100_nodes_bc)]
top_100_products_bc['department'].value_counts()

snacks           23
dairy eggs       18
produce          14
deli             11
frozen            8
beverages         7
pantry            7
breakfast         3
international     2
bakery            2
meat seafood      1
canned goods      1
household         1
Name: department, dtype: int64

In [57]:
test_features.groupby(["user_id", "product_id"]).agg({"order_number": "count", "order_count":"mean"}).reset_index()

Unnamed: 0,user_id,product_id,order_number,order_count
0,1,196,9,10.0
1,1,10258,8,10.0
2,1,10326,1,10.0
3,1,12427,9,10.0
4,1,13032,2,10.0
...,...,...,...,...
9337081,206209,43961,3,13.0
9337082,206209,44325,1,13.0
9337083,206209,48370,1,13.0
9337084,206209,48697,1,13.0


In [61]:
test_features[test_features['user_id'] == 2]

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,order_count,product_id,add_to_cart_order,reordered
11,2168274,2,1,2,11,,14,32792,1,0
11,2168274,2,1,2,11,,14,47766,2,0
11,2168274,2,1,2,11,,14,20574,3,0
11,2168274,2,1,2,11,,14,12000,4,0
11,2168274,2,1,2,11,,14,48110,5,0
...,...,...,...,...,...,...,...,...,...,...
23,3268552,2,13,4,11,30.0,14,44303,5,0
23,3268552,2,13,4,11,30.0,14,16521,6,0
23,3268552,2,13,4,11,30.0,14,39877,7,0
23,3268552,2,13,4,11,30.0,14,19057,8,0
