In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
import numpy as np
import pickle



# 불러올 파일 리스트
files = [
     "adj_rus (1).npy", "edge_feat_rus.npy",
    "label_bi_rus (1).npy",  "node_random (1).npy"
]

# 각 파일 로드 및 shape 확인
for file in files:
    try:
        data = np.load(f"{file}", allow_pickle=True)
        print(f"{file}: Shape={data.shape}, Type={type(data)}")
    except Exception as e:
        print(f"Error loading {file}: {e}")

# adj_random_list.dict 파일 로드
try:
    with open(f"adj_random_list (1).dict", "rb") as f:
        adj_random_list = pickle.load(f)
    print(f"adj_random_list.dict (1): Loaded successfully, Type={type(adj_random_list)}")
except Exception as e:
    print(f"Error loading adj_random_list(1).dict: {e}")

adj_rus (1).npy: Shape=(189216, 2), Type=<class 'numpy.ndarray'>
edge_feat_rus.npy: Shape=(189216, 63), Type=<class 'numpy.ndarray'>
label_bi_rus (1).npy: Shape=(189216,), Type=<class 'numpy.ndarray'>
node_random (1).npy: Shape=(19374,), Type=<class 'numpy.ndarray'>
adj_random_list.dict (1): Loaded successfully, Type=<class 'collections.defaultdict'>


In [10]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# ---------------------------
# Load Data
# ---------------------------
edge_feat = np.load("edge_feat_rus.npy", allow_pickle=True)
adj = np.load("adj_rus (1).npy", allow_pickle=True)
label = np.load("label_bi_rus (1).npy")


In [11]:
print("edge_feat shape:", edge_feat.shape)
print("adj shape:", adj.shape)
print("label shape:", label.shape)


edge_feat shape: (189216, 63)
adj shape: (189216, 2)
label shape: (189216,)


In [24]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.metrics import accuracy_score, cohen_kappa_score, roc_auc_score, classification_report

# ---------------------------
# Load Data
# ---------------------------
edge_feat = np.load("edge_feat_rus.npy", allow_pickle=True)
adj = np.load("adj_rus (1).npy", allow_pickle=True)
label = np.load("label_bi_rus (1).npy")

In [25]:

edge_feat = edge_feat.astype(np.float32)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
edge_feat = scaler.fit_transform(edge_feat)

# Reduce Dataset Size: Sample 5% of Data
subset_size = 0.05
sample_indices, _ = train_test_split(
    np.arange(len(label)), train_size=subset_size, stratify=label, random_state=42
)

edge_feat = edge_feat[sample_indices]
label = label[sample_indices]

# Fix Adjacency Index Mapping Issue
valid_nodes = set(np.unique(edge_feat))
node_mapping = {node: i for i, node in enumerate(valid_nodes)}

adj_remapped = []
for src, dst in adj[sample_indices]:
    if src in node_mapping and dst in node_mapping:
        adj_remapped.append([node_mapping[src], node_mapping[dst]])

adj_numeric = np.array(adj_remapped, dtype=np.int64)

# One-hot encode labels
num_classes = int(label.max()) + 1
label_one_hot = np.eye(num_classes)[label.astype(int)]
label = torch.tensor(label_one_hot, dtype=torch.float32)

# Convert features to PyTorch tensor
edge_feat = torch.tensor(edge_feat, dtype=torch.float32)

# ---------------------------
# Multi-Head GAGNN (based on GitHub repo)
# ---------------------------
class MultiHeadGAGNN(nn.Module):
    def __init__(self, in_feats, hidden_feats, out_feats, num_heads=8, dropout=0.5):
        super(MultiHeadGAGNN, self).__init__()
        self.num_heads = num_heads
        self.hidden_feats = hidden_feats


         # Adding more layers (e.g., two layers)
        self.fc_in = nn.Linear(in_feats, hidden_feats)
        self.attention1 = nn.MultiheadAttention(embed_dim=hidden_feats, num_heads=num_heads, dropout=dropout, batch_first=True)
        self.attention2 = nn.MultiheadAttention(embed_dim=hidden_feats, num_heads=num_heads, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(hidden_feats, out_feats)
        self.dropout = nn.Dropout(dropout)


    def forward(self, x):
        x = self.fc_in(x)
        x = x.unsqueeze(0)  # Add batch dimension for MultiheadAttention
        attn_output1, _ = self.attention1(x, x, x)
        attn_output2, _ = self.attention2(attn_output1, attn_output1, attn_output1)
        attn_output = attn_output2.squeeze(0)  # Remove batch dimension
        out = self.fc_out(attn_output)
        out = torch.sigmoid(out)  # Multi-label prediction
        return out
# ---------------------------
# Initialize Model, Loss, Optimizer
# ---------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_features = edge_feat.shape[1]
num_classes = label.shape[1]

model = MultiHeadGAGNN(in_feats=num_features, hidden_feats=128, out_feats=num_classes).to(device)

#---
class LabelSmoothingBCELoss(nn.Module):
    def __init__(self, smoothing=0.1):
        super(LabelSmoothingBCELoss, self).__init__()
        self.smoothing = smoothing

    def forward(self, y_pred, y_true):
        y_true = y_true * (1 - self.smoothing) + (1 - y_true) * self.smoothing
        return F.binary_cross_entropy(y_pred, y_true)

criterion = LabelSmoothingBCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay =1e-5)

edge_feat, label = edge_feat.to(device), label.to(device)

# ---------------------------
# Train-Test Split
# ---------------------------
train_idx, test_idx = train_test_split(
    np.arange(len(label)), test_size=0.2, random_state=42
)
train_idx, val_idx = train_test_split(
    train_idx, test_size=0.1, random_state=42
)

# ---------------------------
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=3, verbose=True)

# Training Loop
# ---------------------------
num_epochs = 20
batch_size = 64

train_data = TensorDataset(edge_feat[train_idx], label[train_idx])
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    correct_predictions = 0
    total_samples = 0

    for batch_feat, batch_label in train_loader:
        batch_feat = batch_feat.to(device)
        batch_label = batch_label.to(device)

        optimizer.zero_grad()
        outputs = model(batch_feat)
        loss = criterion(outputs, batch_label)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        predicted_labels = (outputs > 0.5).float()
        correct_predictions += (predicted_labels == batch_label).sum().item()
        total_samples += batch_label.numel()

    epoch_accuracy = correct_predictions / total_samples

    # Validation after each epoch
    model.eval()
    with torch.no_grad():
        val_outputs = model(edge_feat[val_idx])
        val_labels = label[val_idx]
        val_predicted = (val_outputs > 0.5).float()
        val_correct = (val_predicted == val_labels).sum().item()
        val_total = val_labels.numel()
        val_accuracy = val_correct / val_total
        val_loss = criterion(val_outputs, val_labels).item()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(train_idx):.6f}, "
          f"Train Accuracy: {epoch_accuracy:.4f}, Val Accuracy: {val_accuracy:.4f}")
    scheduler.step(val_loss)
# ---------------------------



Epoch 1/20, Loss: 0.010886, Train Accuracy: 0.5240, Val Accuracy: 0.5244
Epoch 2/20, Loss: 0.008931, Train Accuracy: 0.7565, Val Accuracy: 0.9894
Epoch 3/20, Loss: 0.006632, Train Accuracy: 0.9781, Val Accuracy: 0.9921
Epoch 4/20, Loss: 0.006115, Train Accuracy: 0.9817, Val Accuracy: 0.9934
Epoch 5/20, Loss: 0.006086, Train Accuracy: 0.9796, Val Accuracy: 0.9921
Epoch 6/20, Loss: 0.005866, Train Accuracy: 0.9807, Val Accuracy: 0.9934
Epoch 7/20, Loss: 0.005535, Train Accuracy: 0.9893, Val Accuracy: 0.9934
Epoch 8/20, Loss: 0.005516, Train Accuracy: 0.9899, Val Accuracy: 0.9934
Epoch 9/20, Loss: 0.005500, Train Accuracy: 0.9902, Val Accuracy: 0.9934
Epoch 10/20, Loss: 0.005507, Train Accuracy: 0.9890, Val Accuracy: 0.9934
Epoch 11/20, Loss: 0.005460, Train Accuracy: 0.9906, Val Accuracy: 0.9934
Epoch 12/20, Loss: 0.005464, Train Accuracy: 0.9899, Val Accuracy: 0.9934
Epoch 13/20, Loss: 0.005462, Train Accuracy: 0.9904, Val Accuracy: 0.9934
Epoch 14/20, Loss: 0.005434, Train Accuracy: 0.

In [26]:
from sklearn.metrics import accuracy_score, cohen_kappa_score, roc_auc_score, f1_score, classification_report

# ---------------------------
# Evaluation on Test Set
# ---------------------------
model.eval()
test_data = TensorDataset(edge_feat[test_idx], label[test_idx])
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

test_loss = 0.0
correct_predictions = 0
total_samples = 0
y_true = []
y_pred_bin = []
y_pred_prob = []

In [27]:
# Efficient evaluation loop
for batch_feat, batch_label in test_loader:
    batch_feat = batch_feat.to(device)
    batch_label = batch_label.to(device)

    with torch.no_grad():
        outputs = model(batch_feat)
        loss = criterion(outputs, batch_label)

    test_loss += loss.item()
    predicted_labels = (outputs > 0.5).float()
    correct_predictions += (predicted_labels == batch_label).sum().item()
    total_samples += batch_label.numel()

    # Collect true and predicted labels for metrics
    y_true.append(batch_label.cpu().numpy())
    y_pred_bin.append(predicted_labels.cpu().numpy())
    y_pred_prob.append(outputs.cpu().numpy())

In [28]:
test_accuracy = correct_predictions / total_samples
y_true = np.concatenate(y_true)
y_pred_bin = np.concatenate(y_pred_bin)
y_pred_prob = np.concatenate(y_pred_prob)

In [30]:
# Print test results
print(f"Test Loss: {test_loss / len(test_loader):.6f}, Test Accuracy: {test_accuracy:.4f}")

# ---------------------------
# Detailed Evaluation Metrics
# ---------------------------

print(f"\n[Detailed Test Metrics]")
print(f"Test Accuracy (sklearn): {accuracy_score(y_true.flatten(), y_pred_bin.flatten()):.4f}")
#print(f"Test F1 Score (Macro): {f1_score(y_true.flatten(), y_pred_bin.flatten(), average='macro'):.4f}")
print(f"Test Cohen's Kappa: {cohen_kappa_score(y_true.flatten(), y_pred_bin.flatten()):.4f}")
print(f"Test AUC-ROC (Macro): {roc_auc_score(y_true, y_pred_prob, average='macro'):.4f}")
from sklearn.metrics import classification_report

# Evaluate the model on test set
model.eval()
with torch.no_grad():
    test_outputs = model(edge_feat[test_idx])
    test_labels = label[test_idx]
    test_preds = (torch.sigmoid(test_outputs) > 0.5).float()

# Print classification report
#print(classification_report(test_labels.cpu().numpy(), test_preds.cpu().numpy(), target_names=[str(i) for i in range(num_classes)]))

# Classification report
print("\nClassification Report:")
print(classification_report(y_true, y_pred_bin, zero_division=0))

Test Loss: 0.393681, Test Accuracy: 0.9926

[Detailed Test Metrics]
Test Accuracy (sklearn): 0.9926
Test Cohen's Kappa: 0.9852
Test AUC-ROC (Macro): 0.9908

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1022
           1       0.99      0.99      0.99       870

   micro avg       0.99      0.99      0.99      1892
   macro avg       0.99      0.99      0.99      1892
weighted avg       0.99      0.99      0.99      1892
 samples avg       0.99      0.99      0.99      1892

