In [6]:
import numpy as np
import pickle



# 불러올 파일 리스트
files = [
     "adj_rus (1).npy", "edge_feat_rus.npy",
    "label_bi_rus (1).npy",  "node_random (1).npy"
]

# 각 파일 로드 및 shape 확인
for file in files:
    try:
        data = np.load(f"{file}", allow_pickle=True)
        print(f"{file}: Shape={data.shape}, Type={type(data)}")
    except Exception as e:
        print(f"Error loading {file}: {e}")

# adj_random_list.dict 파일 로드
try:
    with open(f"adj_random_list (1).dict", "rb") as f:
        adj_random_list = pickle.load(f)
    print(f"adj_random_list.dict (1): Loaded successfully, Type={type(adj_random_list)}")
except Exception as e:
    print(f"Error loading adj_random_list(1).dict: {e}")



adj_rus (1).npy: Shape=(189216, 2), Type=<class 'numpy.ndarray'>
edge_feat_rus.npy: Shape=(189216, 63), Type=<class 'numpy.ndarray'>
label_bi_rus (1).npy: Shape=(189216,), Type=<class 'numpy.ndarray'>
node_random (1).npy: Shape=(19374,), Type=<class 'numpy.ndarray'>
adj_random_list.dict (1): Loaded successfully, Type=<class 'collections.defaultdict'>


In [7]:
# Install PyTorch Geometric and its dependencies
!pip install torch-scatter torch-sparse torch-geometric torch-cluster torch-spline-conv -f https://data.pyg.org/whl/torch-2.0.0+cpu.html


Looking in links: https://data.pyg.org/whl/torch-2.0.0+cpu.html


In [8]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# ---------------------------
# Load Data
# ---------------------------
edge_feat = np.load("edge_feat_rus.npy", allow_pickle=True)
adj = np.load("adj_rus (1).npy", allow_pickle=True)
label = np.load("label_bi_rus (1).npy")

In [16]:
print("adj shape:", adj.shape)
print("adj sample:", adj[:5])
print("edge_feat shape:", edge_feat.shape)


adj shape: (189216, 2)
adj sample: [[-0.06148469 -0.64621176]
 [-0.15406769 -0.03041983]
 [-0.18179593 -0.50044396]
 [-0.02415821 -0.64520645]
 [-0.18746155 -0.59695588]]
edge_feat shape: (94608, 63)


In [25]:
pip install torch-geometric



In [26]:
pip install torch_geometric torch-scatter torch-sparse torch-cluster torch-spline-conv




In [27]:
from torch_geometric.nn import SAGEConv, SAGPooling, global_mean_pool

class GNN_MGPool(nn.Module):
    def __init__(self, in_feats, hidden_feats, out_feats):
        super(GNN_MGPool, self).__init__()
        self.conv1 = SAGEConv(in_feats, hidden_feats)
        self.pool1 = SAGPooling(hidden_feats, ratio=0.5)

        self.conv2 = SAGEConv(hidden_feats, hidden_feats)
        self.pool2 = SAGPooling(hidden_feats, ratio=0.5)

        self.lin = nn.Linear(hidden_feats, out_feats)

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch

        # Conv + Pool 1
        x = F.relu(self.conv1(x, edge_index))
        x, edge_index, _, batch, _, _ = self.pool1(x, edge_index, None, batch)

        # Conv + Pool 2
        x = F.relu(self.conv2(x, edge_index))
        x, edge_index, _, batch, _, _ = self.pool2(x, edge_index, None, batch)

        # Global Pooling
        x = global_mean_pool(x, batch)

        return torch.sigmoid(self.lin(x))


In [30]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

# ---------------------------
# GTN Model Definition
# ---------------------------
class GraphTransformer(nn.Module):
    def __init__(self, in_feats, hidden_feats, out_feats, num_layers=2, dropout=0.5):
        super(GraphTransformer, self).__init__()
        self.num_layers = num_layers
        self.hidden_feats = hidden_feats
        self.dropout = dropout

        # Graph Transformer layers (similar to GTN layers)
        self.gtn_layers = nn.ModuleList()
        for _ in range(num_layers):
            self.gtn_layers.append(self.create_gtn_layer(in_feats, hidden_feats))
            in_feats = hidden_feats  # Update in_feats for the next layer

        self.fc_out = nn.Linear(hidden_feats, out_feats)

    def create_gtn_layer(self, in_feats, hidden_feats):
        return nn.Sequential(
            nn.Linear(in_feats, hidden_feats),
            nn.ReLU(),
            nn.Dropout(self.dropout)
        )

    def forward(self, x, adj):
        h = x
        # Apply each GTN layer
        for layer in self.gtn_layers:
            h = layer(h)  # Apply GTN layer with learnable parameters

        # Apply the final output layer
        h = self.fc_out(h)
        return F.sigmoid(h)

# ---------------------------
# Initialize GTN Model, Loss, Optimizer
# ---------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_features = edge_feat.shape[1]
num_classes = label.shape[1]  # Correct way to get the number of labels

model = GraphTransformer(in_feats=num_features, hidden_feats=128, out_feats=num_classes, num_layers=2).to(device)
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss for multi-label classification
optimizer = optim.Adam(model.parameters(), lr=0.003)

# ---------------------------
# Train-Test Split on Reduced Dataset
# ---------------------------
train_idx, test_idx = train_test_split(np.arange(len(label)), test_size=0.2, random_state=42)
train_idx, val_idx = train_test_split(train_idx, test_size=0.1, random_state=42)

# Move tensors to GPU if available
edge_feat, adj_numeric, label = edge_feat.to(device), adj_numeric.to(device), label.to(device)

# ---------------------------
# Training Loop for GTN
# ---------------------------
num_epochs = 10
batch_size = 64

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    correct_predictions = 0
    total_samples = 0

    # Use tqdm for batch progress tracking
    with tqdm(total=len(train_idx) // batch_size, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch") as pbar:
        for i in range(0, len(train_idx), batch_size):
            batch_idx = train_idx[i:i + batch_size]
            batch_feat = edge_feat  # Use full node feature matrix
            batch_adj = adj_numeric  # Adjacency is global, not batch-specific
            batch_label = label[batch_idx]

            optimizer.zero_grad()
            outputs = model(batch_feat, batch_adj)  # Running the model
            loss = criterion(outputs[batch_idx], batch_label)  # Select only batch outputs
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

            # Compute Accuracy
            predicted_labels = (outputs[batch_idx] > 0.5).float()  # Convert to binary labels
            correct_predictions += (predicted_labels == batch_label).sum().item()
            total_samples += batch_label.numel()  # Total number of elements in labels

            # Update progress bar
            pbar.set_postfix(loss=f"{loss.item():.4f}")
            pbar.update(1)

    # Compute and print epoch accuracy
    epoch_accuracy = correct_predictions / total_samples
    print(f"Epoch {epoch + 1}/{num_epochs}, Avg Loss: {epoch_loss / len(train_idx):.6f}, Accuracy: {epoch_accuracy:.4f}")


Epoch 1/10: 426batch [01:30,  4.70batch/s, loss=0.1251]


Epoch 1/10, Avg Loss: 0.001712, Accuracy: 0.9856


Epoch 2/10: 426batch [01:30,  4.72batch/s, loss=0.1034]


Epoch 2/10, Avg Loss: 0.001312, Accuracy: 0.9885


Epoch 3/10: 426batch [01:26,  4.94batch/s, loss=0.1340]


Epoch 3/10, Avg Loss: 0.001258, Accuracy: 0.9885


Epoch 4/10: 426batch [01:27,  4.90batch/s, loss=0.1837]


Epoch 4/10, Avg Loss: 0.001227, Accuracy: 0.9887


Epoch 5/10: 426batch [01:29,  4.77batch/s, loss=0.1172]


Epoch 5/10, Avg Loss: 0.001214, Accuracy: 0.9887


Epoch 6/10: 426batch [01:28,  4.79batch/s, loss=0.1592]


Epoch 6/10, Avg Loss: 0.001188, Accuracy: 0.9884


Epoch 7/10: 426batch [01:28,  4.81batch/s, loss=0.0935]


Epoch 7/10, Avg Loss: 0.001148, Accuracy: 0.9887


Epoch 8/10: 426batch [01:31,  4.67batch/s, loss=0.1473]


Epoch 8/10, Avg Loss: 0.001139, Accuracy: 0.9888


Epoch 9/10: 426batch [01:28,  4.84batch/s, loss=0.1345]


Epoch 9/10, Avg Loss: 0.001116, Accuracy: 0.9888


Epoch 10/10: 426batch [01:29,  4.78batch/s, loss=0.0873]

Epoch 10/10, Avg Loss: 0.001087, Accuracy: 0.9886





In [31]:
from sklearn.metrics import accuracy_score, f1_score, classification_report, cohen_kappa_score, roc_auc_score

# ---------------------------
# Evaluation on Validation and Test Set
# ---------------------------

model.eval()
with torch.no_grad(): #no backpropagation turn off all memory for updating
    # Validation
    val_outputs = model(edge_feat, adj_numeric) #invoke the def forward function
    val_outputs = val_outputs[val_idx]  # Select only validation samples
    val_labels = label[val_idx]

    val_loss = criterion(val_outputs, val_labels)  # Compute validation loss

    val_outputs_binary = (val_outputs > 0.5).float()  # Threshold for metrics

    # Test
    test_outputs = model(edge_feat, adj_numeric)
    test_outputs = test_outputs[test_idx]  # Select only test samples
    test_labels = label[test_idx]

    test_loss = criterion(test_outputs, test_labels)  # Compute test loss

    test_outputs_binary = (test_outputs > 0.5).float()  # Threshold for metrics
    #float convert the boolean, True become 1, False become 0
# ---------------------------
# Validation Metrics
# ---------------------------
val_true = val_labels.cpu().numpy()
val_pred = val_outputs_binary.cpu().numpy()
val_outputs_proba = val_outputs.cpu().numpy()

val_accuracy = accuracy_score(val_true.flatten(), val_pred.flatten())
val_kappa = cohen_kappa_score(val_true.flatten(), val_pred.flatten())
val_auc = roc_auc_score(val_true.flatten(), val_outputs_proba.flatten())

# ---------------------------
# Test Metrics
# ---------------------------
test_true = test_labels.cpu().numpy()
test_pred = test_outputs_binary.cpu().numpy()
test_outputs_proba = test_outputs.cpu().numpy()

test_accuracy = accuracy_score(test_true.flatten(), test_pred.flatten())
test_kappa = cohen_kappa_score(test_true.flatten(), test_pred.flatten())
test_auc = roc_auc_score(test_true.flatten(), test_outputs_proba.flatten())

# ---------------------------
# Print Results
# ---------------------------
print("\n--- Validation Set ---")
print(f"Validation Loss: {val_loss.item():.4f}")
print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"Validation Cohen's Kappa: {val_kappa:.4f}")
print(f"Validation AUC: {val_auc:.4f}")

print("\n--- Test Set ---")
print(f"Test Loss: {test_loss.item():.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Cohen's Kappa: {test_kappa:.4f}")
print(f"Test AUC: {test_auc:.4f}")

print("\nTest Classification Report:")
print(classification_report(test_true, test_pred, zero_division=0))



--- Validation Set ---
Validation Loss: 0.0517
Validation Accuracy: 0.9911
Validation Cohen's Kappa: 0.9822
Validation AUC: 0.9908

--- Test Set ---
Test Loss: 0.0601
Test Accuracy: 0.9893
Test Cohen's Kappa: 0.9786
Test AUC: 0.9886

Test Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4046
           1       0.99      0.99      0.99      3523

   micro avg       0.99      0.99      0.99      7569
   macro avg       0.99      0.99      0.99      7569
weighted avg       0.99      0.99      0.99      7569
 samples avg       0.99      0.99      0.99      7569

