In [1]:
# !pip install git+https://github.com/pyg-team/pytorch_geometric.git

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import networkx as nx
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.utils import from_networkx
from torch_geometric.data import HeteroData
from torch_geometric.nn import SAGEConv
from tqdm import tqdm

In [3]:
df = pd.read_csv("dataset_files/preprocessed_transactions.csv")
df.drop(df[df['Year'] == 2015].index, axis=0, inplace=True, reset_index=True)
df["card_id"] = df["User"].astype(str) + "_" + df["Card"].astype(str)
df = df.drop(["User","Card"],axis=1)
df["Errors?"]= df["Errors?"].fillna("No error")
df["Errors?"]= LabelEncoder().fit_transform(df["Errors?"])
df["Is Fraud?"] = df["Is Fraud?"].astype('uint8')
df = df.dropna(subset=['Is Fraud?'])

print(df['Is Fraud?'].dtypes)
print(df['Is Fraud?'].head())
df.head(5)

uint8
29    0
30    0
31    0
32    0
33    0
Name: Is Fraud?, dtype: uint8


Unnamed: 0,Year,Month,Day,Amount,Merchant Name,MCC,Errors?,Is Fraud?,Hour,Minute,card_id
29,2016,1,18,10290,208649686760524778,5651,14,0,16,57,0_0
30,2016,2,6,3903,5805127065224074672,7349,14,0,20,18,0_0
31,2016,3,15,3258,4241336128694185533,4814,14,0,17,14,0_0
32,2016,8,3,14785,7035602569409149834,5311,14,0,5,2,0_0
33,2016,8,13,9550,-9092677072201095172,4900,14,0,13,29,0_0


In [4]:
class GraphConstruction:
    def __init__(self, nodes, edges, features=None):
        self.g_nx = nx.MultiDiGraph()
        self.add_nodes(nodes)
        self.add_edges(edges)
        self.node_features = features if features is not None else {}

    def add_nodes(self, nodes):
        for ntype, nodelist in nodes.items():
            for node in nodelist:
                self.g_nx.add_node(node, ntype=ntype)

    def add_edges(self, edges):
        for edge in edges:
            self.g_nx.add_edges_from(edge)

    def to_pyg_data(self):
        data = HeteroData()
        node_id_mapping = {ntype: [] for ntype in set(nx.get_node_attributes(self.g_nx, 'ntype').values())}

        print("Processing nodes...")
        for node, ndata in tqdm(self.g_nx.nodes(data=True), desc="Nodes"):
            ntype = ndata['ntype']
            if 'x' not in data[ntype]:
                data[ntype].x = []
            node_id_mapping[ntype].append(node)
            if self.node_features and ntype in self.node_features:
                if node in self.node_features[ntype].index:
                    features = self.node_features[ntype].loc[node].values
                    features = [float(f) if isinstance(f, (int, float)) else 0.0 for f in features]
                    data[ntype].x.append(features)
                else:
                    data[ntype].x.append([0.0] * self.node_features[ntype].shape[1])

        print("Converting node features to tensors...")
        for ntype in tqdm(data.node_types, desc="Node Types"):
            data[ntype].x = torch.tensor(data[ntype].x, dtype=torch.float)

        print("Processing edges...")
        for u, v in tqdm(self.g_nx.edges(), desc="Edges"):
            u_type = self.g_nx.nodes[u]['ntype']
            v_type = self.g_nx.nodes[v]['ntype']
            edge_type = (u_type, 'to', v_type)
            if edge_type not in data.edge_types:
                data[edge_type].edge_index = [[], []]
            if u in node_id_mapping[u_type] and v in node_id_mapping[v_type]:
                u_index = node_id_mapping[u_type].index(u)
                v_index = node_id_mapping[v_type].index(v)
                data[edge_type].edge_index[0].append(u_index)
                data[edge_type].edge_index[1].append(v_index)

        print("Converting edge indices to tensors...")
        for edge_type in tqdm(data.edge_types, desc="Edge Types"):
            edge_index = data[edge_type].edge_index
            data[edge_type].edge_index = torch.tensor(edge_index, dtype=torch.long)

        return data

In [19]:
class GraphSAGE(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, num_layers):
        super(GraphSAGE, self).__init__()
        self.convs = torch.nn.ModuleList()
        self.convs.append(SAGEConv(in_channels, hidden_channels))
        for _ in range(num_layers - 2):
            self.convs.append(SAGEConv(hidden_channels, hidden_channels))
        self.convs.append(SAGEConv(hidden_channels, out_channels))
        # self.affine = torch.nn.Linear(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        for conv in self.convs[:-1]:
            x = conv(x, edge_index)
            x = F.elu(x)
        x = self.convs[-1](x, edge_index)
        # x = self.affine(x)
        return torch.log_softmax(x, dim=-1)

In [6]:
# Encode categorical features
le_card = LabelEncoder()
le_merchant = LabelEncoder()
df['card_id_enc'] = le_card.fit_transform(df['card_id'])
df['merchant_enc'] = le_merchant.fit_transform(df['Merchant Name'])

# # Balancing the dataset without resetting the index
# fraudulent = df[df["Is Fraud?"] == 1]
# non_fraudulent = df[df["Is Fraud?"] == 0]
# 
# print(len(fraudulent), len(non_fraudulent))
# 
# desired_ratio = 1  # 1:1 balance
# non_fraud_sample_size = round(len(fraudulent) * desired_ratio)
# non_fraudulent_sampled = non_fraudulent.sample(n=non_fraud_sample_size, random_state=42)
# 
# balanced_df = pd.concat([fraudulent, non_fraudulent_sampled])
# balanced_df = balanced_df.sample(frac=1, random_state=42)  # Do not reset index
# print(f"Balanced dataset size: {len(balanced_df)}")
# print(balanced_df["Is Fraud?"].value_counts())
# 
# df = balanced_df.copy()
# # Transform labels using the same encoders
# df['card_id_enc'] = le_card.transform(df['card_id'])
# df['merchant_enc'] = le_merchant.transform(df['Merchant Name'])

# Split data into training and inductive sets
cutoff = round(0.7 * len(df))
train_data = df.iloc[:cutoff]
inductive_data = df.iloc[cutoff:]

# Prepare node features
transaction_features = train_data.drop(columns=['card_id', 'Merchant Name', 'Is Fraud?', 'card_id_enc', 'merchant_enc'], axis=1)
transaction_features.index = train_data.index
client_features = pd.DataFrame(index=train_data['card_id_enc'].unique())
merchant_features = pd.DataFrame(index=train_data['merchant_enc'].unique())

train_mask = torch.zeros(len(df), dtype=torch.bool)
val_mask = torch.zeros(len(df), dtype=torch.bool)

train_mask[:cutoff] = True  # Training nodes
val_mask[cutoff:] = True

# Create nodes and edges
nodes = {
    'client': df['card_id_enc'].unique(),
    'merchant': df['merchant_enc'].unique(),
    'transaction': df.index
}
edges = [
    list(zip(df['card_id_enc'], df.index)),
    list(zip(df.index, df['merchant_enc']))
]
features = {
    'transaction': df.drop(columns=['card_id', 'Merchant Name', 'Is Fraud?', 'card_id_enc', 'merchant_enc'], axis=1),
    'client': pd.DataFrame(index=df['card_id_enc'].unique()),
    'merchant': pd.DataFrame(index=df['merchant_enc'].unique())
}

# Build graph
graph = GraphConstruction(nodes, edges, features)
data = graph.to_pyg_data()

data['transaction'].train_mask = train_mask
data['transaction'].val_mask = val_mask
data['transaction'].y = torch.tensor(df['Is Fraud?'].values, dtype=torch.long)

Processing nodes...


Nodes: 100%|██████████| 215702/215702 [00:07<00:00, 27233.09it/s]


Converting node features to tensors...


Node Types: 100%|██████████| 3/3 [00:00<00:00, 33.42it/s]


Processing edges...


Edges: 100%|██████████| 427228/427228 [08:25<00:00, 844.61it/s] 


Converting edge indices to tensors...


Edge Types: 100%|██████████| 4/4 [00:00<00:00, 60.82it/s]


In [22]:
# Function to calculate the number of trainable parameters
def count_trainable_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Example: Calculate parameters for a given model
trainable_params = count_trainable_parameters(model)
print(f"Total trainable parameters: {trainable_params}")


Total trainable parameters: 9602


In [23]:
import os

# Define model
in_channels = transaction_features.shape[1]
hidden_channels = 64
out_channels = 2  # Binary classification
num_layers = 3
model = GraphSAGE(in_channels, hidden_channels, out_channels, num_layers)

# Training setup
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.NLLLoss()

# Initialize variables for best model tracking
best_val_loss = float('inf')
best_model_path = "best_model.pth"

# Training loop
model.train()

for epoch in range(100):
    optimizer.zero_grad()

    # Forward pass
    out = model(data['transaction'].x, data[('transaction', 'to', 'merchant')].edge_index)
    train_loss = criterion(out[data['transaction'].train_mask],
                           data['transaction'].y[data['transaction'].train_mask])

    # Backward pass
    train_loss.backward()
    optimizer.step()

    # Save the best model
    if train_loss.item() < best_val_loss:
        best_val_loss = train_loss.item()
        torch.save(model.state_dict(), best_model_path)

    print(f'Epoch {epoch}, Loss: {train_loss.item()}')

Epoch 0, Loss: 0.6711665391921997
Epoch 1, Loss: 0.3583564758300781
Epoch 2, Loss: 0.1860780119895935
Epoch 3, Loss: 0.10451924800872803
Epoch 4, Loss: 0.07699497044086456
Epoch 5, Loss: 0.07444208115339279
Epoch 6, Loss: 0.08078351616859436
Epoch 7, Loss: 0.08914981037378311
Epoch 8, Loss: 0.09693023562431335
Epoch 9, Loss: 0.10317942500114441
Epoch 10, Loss: 0.1075962707400322
Epoch 11, Loss: 0.11015015095472336
Epoch 12, Loss: 0.11093780398368835
Epoch 13, Loss: 0.11012117564678192
Epoch 14, Loss: 0.10789932310581207
Epoch 15, Loss: 0.1044979989528656
Epoch 16, Loss: 0.10018118470907211
Epoch 17, Loss: 0.09525524079799652
Epoch 18, Loss: 0.08992236852645874
Epoch 19, Loss: 0.08451299369335175
Epoch 20, Loss: 0.07954833656549454
Epoch 21, Loss: 0.07570618391036987
Epoch 22, Loss: 0.07378418743610382
Epoch 23, Loss: 0.07445169985294342
Epoch 24, Loss: 0.07757531106472015
Epoch 25, Loss: 0.08144396543502808
Epoch 26, Loss: 0.08341710269451141
Epoch 27, Loss: 0.08229731768369675
Epoch 2

In [24]:
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, recall_score as recall

model.eval()
with torch.no_grad():
    val_logits = out[data['transaction'].val_mask]
    print(val_logits)
    val_preds = torch.argmax(val_logits, dim=1)
    val_labels = data['transaction'].y[data['transaction'].val_mask]
print(val_labels.bincount())
rec = recall(val_labels.cpu(), val_preds.cpu(), average='binary')
accuracy = accuracy_score(val_labels.cpu(), val_preds.cpu())
conf_matrix = confusion_matrix(val_labels.cpu(), val_preds.cpu())

print(f"Validation Recall: {rec:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")
print("Confusion Matrix:")
print(conf_matrix)

tensor([[-0.0138, -4.2922],
        [-0.0138, -4.2922],
        [-0.0138, -4.2922],
        ...,
        [-0.0138, -4.2922],
        [-0.0138, -4.2922],
        [-0.0138, -4.2922]])
tensor([63105,   979])
Validation Recall: 0.0000
Validation Accuracy: 0.9847
Confusion Matrix:
[[63105     0]
 [  979     0]]
