# Graphs

First experiment with a graph classifier, graph classifier run the convolution on the entire network and output a result for every single node in the graph.

We use a graph with 3 nodes
- The customer node with age, gender as properties
- The merchant node with category as property
- A payment node with scaled amount as property

And 2 edges (no edge properties for now)
- From customer to payment
- From payment to merchant

Because of the 3 nodes this is a __heterogeneous__ graph.

## Imports

In [1]:
import torch
import numpy as np
import pandas as pd
import gc

import datetime as dt

import d373c7.features as ft
import d373c7.engines as en
import d373c7.pytorch as pt
import d373c7.pytorch.models as pm
import d373c7.plot as pl
import d373c7.network as nw
import d373c7.pytorch.network.geometric as geo

import torch.nn.functional as F
import torch.nn as nn

from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
from torch_geometric.nn import SAGEConv, Linear, to_hetero

In [None]:
pip install wheel
sudo apt-get install python3.x-dev

## Set a random seed for Numpy and Torch
> Will make sure we always sample in the same way. Makes it easier to compare results. At some point it should been removed to test the model stability.

In [2]:
# Numpy
np.random.seed(42)
# Torch
torch.manual_seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
# Change this to read from another location
file = '../../../../data/bs140513_032310.csv'

In [4]:
def step_to_date(step_count: int):
    return dt.datetime(2020, 1, 1) + dt.timedelta(days=int(step_count))

step = ft.FeatureSource('step', ft.FEATURE_TYPE_INT_16) 
customer = ft.FeatureSource('customer', ft.FEATURE_TYPE_STRING)
age = ft.FeatureSource('age', ft.FEATURE_TYPE_CATEGORICAL)
gender = ft.FeatureSource('gender', ft.FEATURE_TYPE_CATEGORICAL)
merchant = ft.FeatureSource('merchant', ft.FEATURE_TYPE_CATEGORICAL)
category = ft.FeatureSource('category', ft.FEATURE_TYPE_CATEGORICAL)
amount = ft.FeatureSource('amount', ft.FEATURE_TYPE_FLOAT_32)
fraud = ft.FeatureSource('fraud', ft.FEATURE_TYPE_INT_8)

payment_id = ft.FeatureSource('payment_id', ft.FEATURE_TYPE_INT_32)
customer_id = ft.FeatureSource('customer_id', ft.FEATURE_TYPE_INT_32)
merchant_id = ft.FeatureSource('merchant_id', ft.FEATURE_TYPE_INT_32)

age_oh = ft.FeatureOneHot('age_oh', ft.FEATURE_TYPE_INT_8, age)
gender_oh = ft.FeatureOneHot('gender_oh', ft.FEATURE_TYPE_INT_8, gender)
category_oh = ft.FeatureOneHot('category_oh', ft.FEATURE_TYPE_INT_8, category)

amount_scale = ft.FeatureNormalizeScale('amount_scale', ft.FEATURE_TYPE_FLOAT_32, amount)

date_time = ft.FeatureExpression('date', ft.FEATURE_TYPE_DATE_TIME, step_to_date, [step])


raw_td = ft.TensorDefinition(
    'raw',
    [
        step,
        customer,
        age,
        gender,
        merchant,
        category,
        amount,
        fraud
    ])

label_td = ft.TensorDefinition(
    'labels',
    [fraud]
)

customer_node_td = ft.TensorDefinition(
    'customer_node', 
    [
        customer_id,
        age_oh,
        gender_oh
    ])

merchant_node_td = ft.TensorDefinition(
    'merchant_node', 
    [
        merchant_id,
        category_oh
    ])

payment_node_td = ft.TensorDefinition(
    'payment_node', 
    [
        payment_id,
        amount_scale
    ])

customer_to_payment_edge_td = ft.TensorDefinition(
    'customer_to_payment_edge', 
    [
        date_time,
        customer_id,
        payment_id
    ])

payment_to_merchant_edge_td = ft.TensorDefinition(
    'customer_to_payment_edge', 
    [
        date_time,
        merchant_id,
        payment_id
    ])

with en.EnginePandasNumpy(num_threads=1) as e:
    df_raw = e.from_csv(raw_td, file, inference=False)
    
# Add unique index to the payment df.
df_raw['payment_id'] = df_raw.index

df_raw['customer_id'] = pd.factorize(df_raw['customer'])[0]
df_raw['merchant_id'] = pd.factorize(df_raw['merchant'])[0]

with en.EnginePandasNumpy(num_threads=1) as e:
    df_cn = e.from_df(customer_node_td, df_raw, raw_td, inference=False)
    df_mn = e.from_df(merchant_node_td, df_raw, raw_td, inference=False)
    df_pn = e.from_df(payment_node_td, df_raw, raw_td, inference=False)
    df_cpe = e.from_df(customer_to_payment_edge_td, df_raw, raw_td, inference=False)
    df_pme = e.from_df(payment_to_merchant_edge_td, df_raw, raw_td, inference=False)
    df_labels = e.from_df(label_td, df_raw, raw_td, inference=False)

# Make customer and merchant data unique
df_cn = df_cn.drop_duplicates(subset=['customer_id'])
df_mn = df_mn.drop_duplicates(subset=['merchant_id'])

# Labels to numpy
labels = df_labels.to_numpy()

2022-06-27 22:16:48.230 d373c7.engines.common          INFO     Start Engine...
2022-06-27 22:16:48.231 d373c7.engines.panda_numpy     INFO     Pandas Version : 1.1.4
2022-06-27 22:16:48.231 d373c7.engines.panda_numpy     INFO     Numpy Version : 1.19.2
2022-06-27 22:16:48.232 d373c7.engines.panda_numpy     INFO     Building Panda for : raw from file ../../../../data/bs140513_032310.csv
2022-06-27 22:16:48.448 d373c7.engines.panda_numpy     INFO     Building Panda for : <Built Features> from DataFrame. Inference mode <False>
2022-06-27 22:16:48.448 d373c7.engines.panda_numpy     INFO     Reshaping DataFrame to: Built Features
2022-06-27 22:16:48.454 d373c7.engines.panda_numpy     INFO     Done creating Built Features. Shape=(594643, 8)
2022-06-27 22:16:48.455 d373c7.engines.panda_numpy     INFO     Reshaping DataFrame to: raw
2022-06-27 22:16:48.495 d373c7.engines.common          INFO     Start Engine...
2022-06-27 22:16:48.495 d373c7.engines.panda_numpy     INFO     Pandas Version : 1

#### The customer node data 

In [5]:
df_cn

Unnamed: 0,customer_id,age__0,age__1,age__2,age__3,age__4,age__5,age__6,age__U,gender__E,gender__F,gender__M,gender__U
0,0,0,0,0,0,1,0,0,0,0,0,1,0
1,1,0,0,1,0,0,0,0,0,0,0,1,0
2,2,0,0,0,0,1,0,0,0,0,1,0,0
3,3,0,0,0,1,0,0,0,0,0,0,1,0
4,4,0,0,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
236475,4107,0,0,0,1,0,0,0,0,0,0,1,0
254227,4108,0,0,1,0,0,0,0,0,0,1,0,0
308714,4109,0,1,0,0,0,0,0,0,0,1,0,0
309490,4110,0,0,0,0,1,0,0,0,0,1,0,0


#### The Merchant node data

In [6]:
df_mn

Unnamed: 0,merchant_id,category__es_barsandrestaurants,category__es_contents,category__es_fashion,category__es_food,category__es_health,category__es_home,category__es_hotelservices,category__es_hyper,category__es_leisure,category__es_otherservices,category__es_sportsandtoys,category__es_tech,category__es_transportation,category__es_travel,category__es_wellnessandbeauty
0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
12,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
40,3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
42,4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
77,5,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
88,6,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
98,7,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
127,8,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
130,9,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Payment node data

In [7]:
df_pn

Unnamed: 0,payment_id,amount_scale
0,0,0.000546
1,1,0.004764
2,2,0.003228
3,3,0.002071
4,4,0.004288
...,...,...
594638,594638,0.002465
594639,594639,0.006090
594640,594640,0.002694
594641,594641,0.001736


### Define a network structure

In [9]:
# Define Nodes
customer_node = nw.NetworkNodeDefinitionPandas('customer', customer_id, customer_node_td, df_cn)
merchant_node = nw.NetworkNodeDefinitionPandas('merchant', merchant_id, merchant_node_td, df_mn)
payment_node = nw.NetworkNodeDefinitionPandas('payment', payment_id, payment_node_td, df_pn)

# Define Edges
customer_to_payment_edge = nw.NetworkEdgeDefinitionPandas(
    name = 'customer_to_payment',
    id_feature = payment_id,
    from_node = customer_node,
    from_node_id = customer_id,
    to_node = payment_node,
    to_node_id = payment_id,
    td = customer_to_payment_edge_td,
    df = df_cpe
)

payment_to_merchant_edge = nw.NetworkEdgeDefinitionPandas(
    name = 'payment_to_merchant ',
    id_feature = payment_id,
    from_node = payment_node,
    from_node_id = payment_id,
    to_node = merchant_node,
    to_node_id = merchant_id,
    td = payment_to_merchant_edge_td,
    df = df_pme
)

# Now define the network
network = nw.NetworkDefinitionPandas(
    'network', 
    [customer_node, merchant_node, payment_node], 
    [customer_to_payment_edge, payment_to_merchant_edge]
)

### Convert to a geometric dataset.

In [10]:
g = geo.GeometricData.get_hetero_data(network)
# Need to undirect?
g = T.ToUndirected()(g)

In [11]:
print(g)

HeteroData(
  [1mcustomer[0m={ x=[4112, 12] },
  [1mmerchant[0m={ x=[50, 15] },
  [1mpayment[0m={ x=[594643, 1] },
  [1m(customer, customer_to_payment, payment)[0m={ edge_index=[2, 594643] },
  [1m(payment, payment_to_merchant , merchant)[0m={ edge_index=[2, 594643] },
  [1m(payment, rev_customer_to_payment, customer)[0m={ edge_index=[2, 594643] },
  [1m(merchant, rev_payment_to_merchant , payment)[0m={ edge_index=[2, 594643] }
)


### Define a model
This model has 2 graph convolutional layers followed by a linear and sigmoid. The output is a score between 0 and 1 and evaluated using binary cross entropy loss.

In [13]:
class GNN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), 16)
        self.conv2 = SAGEConv((-1, -1), 8)
        self.lin = Linear(-1, 1)
        self.sig = torch.nn.Sigmoid()

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index).relu()
        x = self.lin(x)
        x = self.sig(x)
        return x

model = GNN()
model = to_hetero(model, g.metadata(), aggr='sum')

labels = torch.as_tensor(labels, dtype=torch.float32)

optimizer = torch.optim.AdamW(model.parameters(), lr=0.02)

### Run 10 epochs

In [15]:
num_epoch = 10

def train(mask):
    model.train()
    optimizer.zero_grad()
    out = model(g.x_dict, g.edge_index_dict)
    # mask = data['paper'].train_mask
    correct = torch.sum(torch.eq(torch.ge(out['payment'][mask], 0.5), labels[mask])).item()
    count = labels[mask].shape[0]
    loss = F.binary_cross_entropy(out['payment'][mask], labels[mask])
    loss.backward()
    optimizer.step()
    return float(loss), correct/count

for e in range(num_epoch):
    # Create new mask
    mask = torch.zeros((labels.shape[0], 1), dtype=torch.bool)
    # Select all fraud payments
    mask[torch.eq(labels, 1.0)] = 1.0
    # Select the same amount of random non-fraud records
    perm = torch.randperm(torch.count_nonzero(torch.eq(labels, 0.0)).item())
    mask[torch.argwhere(torch.eq(labels, 0.0))[perm[0:torch.count_nonzero(torch.eq(labels, 1.0)).item()-1]]] = 1.0
    l, a = train(mask)
    print(l, a)

0.7052758932113647 0.4842361111111111
0.6746311187744141 0.7482638888888888
0.6483069062232971 0.7824305555555555
0.6148912906646729 0.8785416666666667
0.5723647475242615 0.9322222222222222
0.5213258862495422 0.9389583333333333
0.462142676115036 0.9368055555555556
0.4000915586948395 0.9415277777777777
0.3612198829650879 0.9365972222222222
0.31196480989456177 0.95125
