In [253]:
!pip install git+https://github.com/pyg-team/pytorch_geometric.git

Collecting git+https://github.com/pyg-team/pytorch_geometric.git
  Cloning https://github.com/pyg-team/pytorch_geometric.git to /tmp/pip-req-build-ua7_wtp1
  Running command git clone --filter=blob:none --quiet https://github.com/pyg-team/pytorch_geometric.git /tmp/pip-req-build-ua7_wtp1
  Resolved https://github.com/pyg-team/pytorch_geometric.git to commit 46705844b39ededc0fcef1de90e73923480a6446
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [265]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import networkx as nx
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.utils import from_networkx
from torch_geometric.data import HeteroData
from torch_geometric.nn import SAGEConv
from tqdm import tqdm

In [272]:
df = pd.read_csv("preprocessed_transactions.csv")
df["card_id"] = df["User"].astype(str) + "_" + df["Card"].astype(str)
df = df.drop(["User","Card"],axis=1)
df["Errors?"]= df["Errors?"].fillna("No error")
df["Errors?"]= LabelEncoder().fit_transform(df["Errors?"])
df["Is Fraud?"] = df["Is Fraud?"].apply(lambda x: 1 if x == True else 0)
df = df.dropna(subset=['Is Fraud?'])

print(df['Is Fraud?'].dtypes)
print(df['Is Fraud?'].head())
df.head(5)

int64
0    0
1    0
2    0
3    0
4    0
Name: Is Fraud?, dtype: int64


Unnamed: 0,Year,Month,Day,Amount,Merchant Name,MCC,Errors?,Is Fraud?,Hour,Minute,card_id
0,2015,1,26,3109,4241336128694185533,4814,14,0,16,38,0_0
1,2015,3,22,4587,-4241409341442030551,7349,14,0,20,49,0_0
2,2015,3,26,6976,-9092677072201095172,4900,14,0,20,46,0_0
3,2015,4,13,14503,-9092677072201095172,4900,14,0,13,25,0_0
4,2015,5,1,2163,4241336128694185533,4814,14,0,16,40,0_0


In [267]:
class GraphConstruction:
    def __init__(self, nodes, edges, features=None):
        self.g_nx = nx.DiGraph()
        self.add_nodes(nodes)
        self.add_edges(edges)
        self.node_features = features if features is not None else {}

    def add_nodes(self, nodes):
        for ntype, nodelist in nodes.items():
            for node in nodelist:
                self.g_nx.add_node(node, ntype=ntype)

    def add_edges(self, edges):
        for edge in edges:
            self.g_nx.add_edges_from(edge)

    def to_pyg_data(self):
        data = HeteroData()
        node_id_mapping = {ntype: [] for ntype in set(nx.get_node_attributes(self.g_nx, 'ntype').values())}

        print("Processing nodes...")
        for node, ndata in tqdm(self.g_nx.nodes(data=True), desc="Nodes"):
            ntype = ndata['ntype']
            if 'x' not in data[ntype]:
                data[ntype].x = []
            node_id_mapping[ntype].append(node)
            if self.node_features and ntype in self.node_features:
                if node in self.node_features[ntype].index:
                    features = self.node_features[ntype].loc[node].values
                    features = [float(f) if isinstance(f, (int, float)) else 0.0 for f in features]
                    data[ntype].x.append(features)
                else:
                    data[ntype].x.append([0.0] * self.node_features[ntype].shape[1])

        print("Converting node features to tensors...")
        for ntype in tqdm(data.node_types, desc="Node Types"):
            data[ntype].x = torch.tensor(data[ntype].x, dtype=torch.float)

        print("Processing edges...")
        for u, v in tqdm(self.g_nx.edges(), desc="Edges"):
            u_type = self.g_nx.nodes[u]['ntype']
            v_type = self.g_nx.nodes[v]['ntype']
            edge_type = (u_type, 'to', v_type)
            if edge_type not in data.edge_types:
                data[edge_type].edge_index = [[], []]
            if u in node_id_mapping[u_type] and v in node_id_mapping[v_type]:
                u_index = node_id_mapping[u_type].index(u)
                v_index = node_id_mapping[v_type].index(v)
                data[edge_type].edge_index[0].append(u_index)
                data[edge_type].edge_index[1].append(v_index)

        print("Converting edge indices to tensors...")
        for edge_type in tqdm(data.edge_types, desc="Edge Types"):
            edge_index = data[edge_type].edge_index
            data[edge_type].edge_index = torch.tensor(edge_index, dtype=torch.long)

        return data

In [268]:
class GraphSAGE(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, num_layers):
        super(GraphSAGE, self).__init__()
        self.convs = torch.nn.ModuleList()
        self.convs.append(SAGEConv(in_channels, hidden_channels))
        for _ in range(num_layers - 2):
            self.convs.append(SAGEConv(hidden_channels, hidden_channels))
        self.convs.append(SAGEConv(hidden_channels, out_channels))

    def forward(self, x, edge_index):
        for conv in self.convs[:-1]:
            x = conv(x, edge_index)
            x = torch.relu(x)
        x = self.convs[-1](x, edge_index)
        return torch.log_softmax(x, dim=-1)

In [269]:
# Encode categorical features
le_card = LabelEncoder()
le_merchant = LabelEncoder()
df['card_id_enc'] = le_card.fit_transform(df['card_id'])
df['merchant_enc'] = le_merchant.fit_transform(df['Merchant Name'])

# Balancing the dataset without resetting the index
fraudulent = df[df["Is Fraud?"] == 1]
non_fraudulent = df[df["Is Fraud?"] == 0]

print(len(fraudulent), len(non_fraudulent))

desired_ratio = 2  # 1:1 balance
non_fraud_sample_size = round(len(fraudulent) * desired_ratio)
non_fraudulent_sampled = non_fraudulent.sample(n=non_fraud_sample_size, random_state=42)

balanced_df = pd.concat([fraudulent, non_fraudulent_sampled])
balanced_df = balanced_df.sample(frac=1, random_state=42)  # Do not reset index
print(f"Balanced dataset size: {len(balanced_df)}")
print(balanced_df["Is Fraud?"].value_counts())

df = balanced_df.copy()
# Transform labels using the same encoders
df['card_id_enc'] = le_card.transform(df['card_id'])
df['merchant_enc'] = le_merchant.transform(df['Merchant Name'])

# Split data into training and inductive sets
cutoff = round(0.7 * len(df))
train_data = df.iloc[:cutoff]
inductive_data = df.iloc[cutoff:]

# Prepare node features
transaction_features = train_data.drop(columns=['card_id', 'Merchant Name', 'Is Fraud?', 'card_id_enc', 'merchant_enc'])
transaction_features.index = train_data.index
client_features = pd.DataFrame(index=train_data['card_id_enc'].unique())
merchant_features = pd.DataFrame(index=train_data['merchant_enc'].unique())

train_mask = torch.zeros(len(df), dtype=torch.bool)
val_mask = torch.zeros(len(df), dtype=torch.bool)

train_mask[:cutoff] = True  # Training nodes
val_mask[cutoff:] = True

# Create nodes and edges
nodes = {
    'client': df['card_id_enc'].unique(),
    'merchant': df['merchant_enc'].unique(),
    'transaction': df.index
}
edges = [
    list(zip(df['card_id_enc'], df.index)),
    list(zip(df.index, df['merchant_enc']))
]
features = {
    'transaction': df.drop(columns=['card_id', 'Merchant Name', 'Is Fraud?', 'card_id_enc', 'merchant_enc']),
    'client': pd.DataFrame(index=df['card_id_enc'].unique()),
    'merchant': pd.DataFrame(index=df['merchant_enc'].unique())
}

# Build graph
graph = GraphConstruction(nodes, edges, features)
data = graph.to_pyg_data()

data['transaction'].train_mask = train_mask
data['transaction'].val_mask = val_mask
data['transaction'].y = torch.tensor(df['Is Fraud?'].values, dtype=torch.long)

5850 419614
Balanced dataset size: 17550
Is Fraud?
0    11700
1     5850
Name: count, dtype: int64
Processing nodes...


Nodes: 100%|██████████| 20599/20599 [00:01<00:00, 19252.72it/s]


Converting node features to tensors...


Node Types: 100%|██████████| 3/3 [00:00<00:00, 213.58it/s]


Processing edges...


Edges: 100%|██████████| 35100/35100 [00:10<00:00, 3211.96it/s]


Converting edge indices to tensors...


Edge Types: 100%|██████████| 4/4 [00:00<00:00, 217.37it/s]


In [270]:
import os

# Define model
in_channels = transaction_features.shape[1]
hidden_channels = 64
out_channels = 2  # Binary classification
num_layers = 2
model = GraphSAGE(in_channels, hidden_channels, out_channels, num_layers)

# Training setup
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
criterion = torch.nn.CrossEntropyLoss()

# Initialize variables for best model tracking
best_val_loss = float('inf')
best_model_path = "best_model.pth"

# Training loop
model.train()

for epoch in range(101):
    optimizer.zero_grad()

    # Forward pass
    out = model(data['transaction'].x, data[('transaction', 'to', 'merchant')].edge_index)
    train_loss = criterion(out[data['transaction'].train_mask], data['transaction'].y[data['transaction'].train_mask])

    # Backward pass
    train_loss.backward()
    optimizer.step()

    # Save the best model
    if train_loss.item() < best_val_loss:
        best_val_loss = train_loss.item()
        torch.save(model.state_dict(), best_model_path)

    if epoch % 10 == 0:
        print(f'Epoch {epoch+1}, Loss: {train_loss.item()}')

Epoch 1, Loss: 0.7456312775611877
Epoch 11, Loss: 0.741747260093689
Epoch 21, Loss: 0.73795485496521
Epoch 31, Loss: 0.7342621088027954
Epoch 41, Loss: 0.7306721806526184
Epoch 51, Loss: 0.7271859049797058
Epoch 61, Loss: 0.7238022685050964
Epoch 71, Loss: 0.7205193638801575
Epoch 81, Loss: 0.7173343896865845
Epoch 91, Loss: 0.7142446041107178
Epoch 101, Loss: 0.7112471461296082


In [271]:
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, recall_score as recall

model.eval()
with torch.no_grad():
    val_logits = out[data['transaction'].val_mask]
    val_preds = torch.argmax(val_logits, dim=1)
    val_labels = data['transaction'].y[data['transaction'].val_mask]
print(val_labels.bincount())
rec = recall(val_labels.cpu(), val_preds.cpu(), average='binary')
accuracy = accuracy_score(val_labels.cpu(), val_preds.cpu())
conf_matrix = confusion_matrix(val_labels.cpu(), val_preds.cpu())

print(f"Validation Recall: {rec:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")
print("Confusion Matrix:")
print(conf_matrix)

tensor([3554, 1711])
Validation Recall: 1.0000
Validation Accuracy: 0.3250
Confusion Matrix:
[[   0 3554]
 [   0 1711]]
