In [None]:
import numpy as np
import pandas as pd
import networkx as nx
from networkx.algorithms import community
import matplotlib.pyplot as plt
import torch
import torch.optim as optim
from torch_geometric.nn import Node2Vec
from torch_geometric.utils.convert import from_networkx
from torch_geometric.utils import degree, to_networkx
from torch_geometric.data import Data
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors

In [None]:
# Input CSV locations (relative paths, resolved against the working directory)
kyc_path = 'kyc.csv'
cash_trxns_path = 'cash_trxns.csv'
emt_trxns_path = 'emt_trxns.csv'
wire_trxns_path = 'wire_trxns.csv'

# KYC table; the 'label' column is dropped on load, as instructed
kyc_data = pd.read_csv(kyc_path).drop(columns=['label'])

# One raw frame per transaction channel, read in the order cash, emt, wire
cash_trxns_data, emt_trxns_data, wire_trxns_data = (
    pd.read_csv(csv_path)
    for csv_path in (cash_trxns_path, emt_trxns_path, wire_trxns_path)
)

In [None]:
# Tag every row with the channel its frame came from
for frame, channel in (
    (cash_trxns_data, 'cash'),
    (emt_trxns_data, 'emt'),
    (wire_trxns_data, 'wire'),
):
    frame['transaction_type'] = channel

# Bring the three schemas onto shared column names:
# cust_id_sender / cust_id_receiver / amount.
# The emt and wire frames use the same sender/receiver headers and differ only
# in the name of their value column; the cash frame has a single customer column.
party_columns = {'id sender': 'cust_id_sender', 'id receiver': 'cust_id_receiver'}
emt_trxns_data.rename(columns={**party_columns, 'emt value': 'amount'}, inplace=True)
wire_trxns_data.rename(columns={**party_columns, 'wire value': 'amount'}, inplace=True)
cash_trxns_data.rename(columns={'cust_id': 'cust_id_sender', 'value': 'amount'}, inplace=True)

In [None]:
# Stack the three channels into one edge list. The cash selection has no
# 'cust_id_receiver' column, so concat leaves NaN in that column for cash rows.
transactions_combined = pd.concat([
    cash_trxns_data[['cust_id_sender', 'amount', 'transaction_type']],
    emt_trxns_data[['cust_id_sender', 'cust_id_receiver', 'amount', 'transaction_type']],
    wire_trxns_data[['cust_id_sender', 'cust_id_receiver', 'amount', 'transaction_type']]
], ignore_index=True)

# Replace missing receivers with a sentinel id. Assign the result back to the
# column instead of calling fillna(inplace=True) on the selected column: the
# in-place form acts on an intermediate object, which pandas warns about and
# which does not update the frame when Copy-on-Write is enabled.
# NOTE(review): every row without a receiver now points at the same
# 'NoReceiver' id, so they will all share one node in the graph built below.
transactions_combined['cust_id_receiver'] = (
    transactions_combined['cust_id_receiver'].fillna('NoReceiver')
)

In [None]:
# Node universe: every distinct id seen as a sender or as a receiver,
# in order of first appearance (senders are scanned first)
all_cust_ids = pd.concat(
    [transactions_combined[id_col] for id_col in ('cust_id_sender', 'cust_id_receiver')]
).unique()

# Assign consecutive integer node indices 0..N-1 to the ids
cust_id_to_index = dict(zip(all_cust_ids, range(len(all_cust_ids))))

# Translate both endpoint columns from customer ids to node indices
for id_col, idx_col in (
    ('cust_id_sender', 'sender_idx'),
    ('cust_id_receiver', 'receiver_idx'),
):
    transactions_combined[idx_col] = transactions_combined[id_col].map(cust_id_to_index)

In [None]:
# Edge list as a (2, num_edges) long tensor: row 0 = sender index, row 1 = receiver index
endpoint_pairs = transactions_combined[['sender_idx', 'receiver_idx']].to_numpy()
edge_index = torch.tensor(endpoint_pairs.T, dtype=torch.long)

# Transaction amount as a single-column edge feature, shape (num_edges, 1)
amounts = torch.tensor(transactions_combined['amount'].to_numpy(), dtype=torch.float)
edge_attr = amounts[:, None]

In [None]:
# Construct graph data object. No node-feature tensor is attached, so the node
# count is passed explicitly (one node per mapped customer id) instead of
# leaving data.num_nodes to be inferred from edge_index.
data = Data(
    edge_index=edge_index,
    edge_attr=edge_attr,
    num_nodes=len(cust_id_to_index),
)

In [None]:
# Undirected NetworkX view of the PyTorch Geometric graph
G_nx = to_networkx(data, to_undirected=True)

# Report whether every node can reach every other node
is_connected = nx.is_connected(G_nx)
print(f"Is the graph connected? {is_connected}")

# For a disconnected graph, also report how fragmented it is
if not is_connected:
    connected_components = list(nx.connected_components(G_nx))
    num_components = len(connected_components)
    largest_component_size = len(max(connected_components, key=len))
    print(f"Number of connected components: {num_components}")
    print(f"Largest component size: {largest_component_size}")

In [None]:
# Run the Node2Vec training cell below three times, once for each value of q: 0.5, 1, and 2.

In [None]:
# Train on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = data.to(device)

# Node2Vec over the transaction graph; q is the value to change between runs.
model = Node2Vec(
    edge_index=data.edge_index,
    embedding_dim=128,
    walk_length=20,
    context_size=10,
    walks_per_node=30,
    num_negative_samples=1,
    p=1.0,
    q=0.5,
    sparse=True,
).to(device)

# Build the random-walk loader once and reuse it for both iteration and the
# batch count, instead of constructing a second loader every epoch just to
# take its length.
loader = model.loader(batch_size=128, shuffle=True, num_workers=0)

# The model is created with sparse=True, hence the sparse optimizer.
optimizer = optim.SparseAdam(model.parameters(), lr=0.01)


def train():
    """Run one training epoch over the random-walk loader.

    Returns:
        float: the loss summed over all batches divided by the number of
        batches, i.e. the mean batch loss for this epoch.
    """
    model.train()
    total_loss = 0
    for pos_rw, neg_rw in loader:
        optimizer.zero_grad()
        loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)


# Train the model. range(1, 20) yields epochs 1..19, i.e. 19 passes over the data.
for epoch in range(1, 20):
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

In [None]:
# Look up the learned embedding of every node, in node-index order
model.eval()
all_node_ids = torch.arange(data.num_nodes, device=device)
with torch.no_grad():  # inference only, no gradient tracking
    embeddings = model(all_node_ids)

In [None]:
# Persist the embeddings as a CPU tensor so that the file does not depend on
# the device used for training (a tensor saved from CUDA needs map_location
# to be loaded on a machine without a GPU). `embeddings` itself is unchanged.
torch.save(embeddings.cpu(), 'node_embeddings.pt')

In [None]:
import pickle

# Persist the customer-id -> node-index mapping next to the embeddings; row i
# of the embedding matrix belongs to the id mapped to index i.
with open('cust_id_to_index_1.pkl', 'wb') as mapping_file:
    pickle.dump(cust_id_to_index, mapping_file)

In [None]:
# PCA by singular value decomposition of the column-centred embedding matrix
mean = torch.mean(embeddings, 0)
embeddings_centered = embeddings - mean

# torch.svd is deprecated; torch.linalg.svd is its replacement. It returns the
# right singular vectors already transposed (Vh), so transpose back to keep V
# with one principal direction per column. full_matrices=False requests the
# reduced decomposition, matching torch.svd's default.
U, S, Vh = torch.linalg.svd(embeddings_centered, full_matrices=False)
V = Vh.T

# Number of principal components
# NOTE(review): this rebinds `num_components`, which the connectivity cell
# also uses for the number of connected components.
num_components = 2
embeddings_reduced = torch.matmul(embeddings_centered, V[:, :num_components])
embeddings_reduced_np = embeddings_reduced.cpu().detach().numpy()
print(embeddings_reduced.shape)

In [None]:
def calculate_sse(X, labels, centroids):
    """Return the within-cluster sum of squared errors of a clustering.

    Args:
        X: (n_samples, n_features) tensor of points.
        labels: length-n_samples integer cluster index for each row of X.
        centroids: (n_clusters, n_features) tensor of cluster centres.

    Returns:
        float: sum over all points of the squared Euclidean distance between
        the point and the centroid of the cluster it is assigned to.
    """
    assignments = torch.as_tensor(labels, dtype=torch.long, device=centroids.device)
    # centroids[assignments] lines each point up with its own centroid
    residuals = X - centroids[assignments]
    # Square the residuals: summing plain distances would not be an SSE
    return residuals.pow(2).sum().item()

X = torch.tensor(embeddings_reduced_np, device='cuda')

sse_list = []
for n_clusters in range(1, 10):  # Example: trying 1 to 10 clusters
    labels, centroids = k_means(X, n_clusters=n_clusters)
    sse = calculate_sse(X, labels, centroids)
    sse_list.append(sse)

plt.figure(figsize=(10, 6))
plt.plot(range(1, 10), sse_list, marker='o')
plt.title('Elbow Method for Optimal Number of Clusters')
plt.xlabel('Number of clusters')
plt.ylabel('Sum of squared distances')
plt.show()

In [None]:
def k_means(X, n_clusters, n_iters=300, tol=1e-4):
    centroids = X[np.random.choice(X.shape[0], n_clusters, replace=False)]

    for _ in range(n_iters):
        distances = torch.cdist(X, centroids)
        labels = torch.argmin(distances, dim=1)

        new_centroids = torch.stack([X[labels == i].mean(0) for i in range(n_clusters)])
        
        # Check for convergence
        if torch.norm(centroids - new_centroids) < tol:
            break
        
        centroids = new_centroids

    return labels, centroids

X = torch.tensor(embeddings_reduced_np, device='cuda') 
n_clusters = 5
labels, centroids = k_means(X, n_clusters=n_clusters)

In [None]:
# Bring the cluster assignments back to the host as a NumPy array
labels_np = labels.cpu().numpy()

# Tally how many nodes landed in each cluster
unique_clusters, counts = np.unique(labels_np, return_counts=True)

print("Unique clusters:", unique_clusters)
print("Counts:", counts)