In [1]:
import numpy as np
import pickle



# 불러올 파일 리스트
files = [
     "adj_rus (1).npy", "edge_feat_rus.npy",
    "label_bi_rus (1).npy",  "node_random (1).npy"
]

# 각 파일 로드 및 shape 확인
for file in files:
    try:
        data = np.load(f"{file}", allow_pickle=True)
        print(f"{file}: Shape={data.shape}, Type={type(data)}")
    except Exception as e:
        print(f"Error loading {file}: {e}")

# adj_random_list.dict 파일 로드
try:
    with open(f"adj_random_list (1).dict", "rb") as f:
        adj_random_list = pickle.load(f)
    print(f"adj_random_list.dict (1): Loaded successfully, Type={type(adj_random_list)}")
except Exception as e:
    print(f"Error loading adj_random_list(1).dict: {e}")




adj_rus (1).npy: Shape=(189216, 2), Type=<class 'numpy.ndarray'>
edge_feat_rus.npy: Shape=(189216, 63), Type=<class 'numpy.ndarray'>
label_bi_rus (1).npy: Shape=(189216,), Type=<class 'numpy.ndarray'>
node_random (1).npy: Shape=(19374,), Type=<class 'numpy.ndarray'>
adj_random_list.dict (1): Loaded successfully, Type=<class 'collections.defaultdict'>


TRAINING/ ROUND 2

In [2]:
# Install PyTorch Geometric and its dependencies
!pip install torch-scatter torch-sparse torch-geometric torch-cluster torch-spline-conv -f https://data.pyg.org/whl/torch-2.0.0+cpu.html


Looking in links: https://data.pyg.org/whl/torch-2.0.0+cpu.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.0.0%2Bcpu/torch_scatter-2.1.2%2Bpt20cpu-cp311-cp311-linux_x86_64.whl (494 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.0/494.0 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-2.0.0%2Bcpu/torch_sparse-0.6.18%2Bpt20cpu-cp311-cp311-linux_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch-cluster
  Downloading https://data.pyg.org/whl/torch-2.0.0%2Bcpu/torch_cluster-1.6.3%2Bpt20cpu-cp311-cp311-linux_x86_64.whl (750 kB)
[2K     [9

In [5]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# ---------------------------
# Load Data
# ---------------------------
edge_feat = np.load("edge_feat_rus.npy", allow_pickle=True)
adj = np.load("adj_rus (1).npy", allow_pickle=True)
label = np.load("label_bi_rus (1).npy")

In [7]:

print("edge_feat shape:", edge_feat.shape)
print("adj shape:", adj.shape)
print("label shape:", label.shape)
# ---------------------------
# Load Data
# ---------------------------
edge_feat = np.load("edge_feat_rus.npy", allow_pickle=True)
adj = np.load("adj_rus (1).npy", allow_pickle=True)
label = np.load("label_bi_rus (1).npy")
# Ensure edge_feat contains only numerical values
edge_feat = edge_feat.astype(np.float32)  # Convert to float32

# Reduce Dataset Size: Sample 5% of Data
subset_size = 0.05  # Use only 5% of data
sample_indices, _ = train_test_split(np.arange(len(label)), train_size=subset_size, stratify=label, random_state=42)

# Apply sampling
edge_feat = edge_feat[sample_indices]
label = label[sample_indices]

# ---------------------------
# Fix Adjacency Index Mapping Issue
# ---------------------------
# Find unique nodes that remain in the sampled dataset
valid_nodes = set(np.unique(edge_feat))  # Nodes that exist in feature matrix
node_mapping = {node: i for i, node in enumerate(valid_nodes)}

# Re-map adjacency list indices to match the reduced dataset
adj_remapped = []
for src, dst in adj[sample_indices]:
    if src in node_mapping and dst in node_mapping:  # Only keep valid edges
        adj_remapped.append([node_mapping[src], node_mapping[dst]])

# Convert adj to NumPy array and PyTorch tensor
adj_numeric = np.array(adj_remapped, dtype=np.int64)
adj_numeric = torch.tensor(adj_numeric, dtype=torch.long)

# Convert Labels to One-Hot Encoding
num_classes = int(label.max()) + 1  # Get number of unique classes
label_one_hot = np.eye(num_classes)[label.astype(int)]  # Convert to one-hot encoding
label = torch.tensor(label_one_hot, dtype=torch.float32)  # Convert to PyTorch tensor

# Convert Node Features to PyTorch Tensor
edge_feat = torch.tensor(edge_feat, dtype=torch.float32)

# ---------------------------
# GNN Model Definition
# ---------------------------
class GraphSAGE(nn.Module):
    def __init__(self, in_feats, hidden_feats, out_feats, dropout=0.5):
        super(GraphSAGE, self).__init__()
        self.fc1 = nn.Linear(in_feats*2, hidden_feats)
        self.fc2 = nn.Linear(hidden_feats*2, out_feats)
        self.dropout = dropout

    def aggregate(self, x, edge_index):
        if edge_index.dim() == 1:
            edge_index = edge_index.view(-1, 2)

        src, dst = edge_index[:, 0], edge_index[:, 1]

        # Mask to avoid out-of-range indices
        valid_mask = (src < x.size(0)) & (dst < x.size(0))
        if valid_mask.sum() == 0:
            return x

        agg = torch.zeros_like(x)
        agg.index_add_(0, dst[valid_mask], x[src[valid_mask]])  # Sum of neighbor features
        return agg

    def forward(self, x, edge_index): #x is node feature, edge_index is edge list
        # First layer
        neighbor_agg = self.aggregate(x, edge_index)
        h = torch.cat([x, neighbor_agg], dim=1)
        h = self.fc1(h)
        h = F.relu(h)
        h = F.dropout(h, p=self.dropout, training=self.training)

        # Second layer
        neighbor_agg = self.aggregate(h, edge_index)
        h = torch.cat([h, neighbor_agg], dim=1)
        h = self.fc2(h)
        return torch.sigmoid(h)

# ---------------------------
# Initialize Model, Loss, Optimizer
# ---------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_features = edge_feat.shape[1]
num_classes = label.shape[1]  # Correct way to get the number of labels

model = GraphSAGE(in_feats=num_features, hidden_feats=128, out_feats=num_classes).to(device)
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss for multi-label classification
optimizer = optim.Adam(model.parameters(), lr=0.003)

# ---------------------------
# Train-Test Split on Reduced Dataset
# ---------------------------
train_idx, test_idx = train_test_split(np.arange(len(label)), test_size=0.2, random_state=42)
train_idx, val_idx = train_test_split(train_idx, test_size=0.1, random_state=42)

# Move tensors to GPU if available
edge_feat, adj_numeric, label = edge_feat.to(device), adj_numeric.to(device), label.to(device)

# ---------------------------
# Training Loop
# ---------------------------
num_epochs = 10
batch_size = 64

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    correct_predictions = 0
    total_samples = 0

    # Use tqdm for batch progress tracking
    with tqdm(total=len(train_idx) // batch_size, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch") as pbar:
        for i in range(0, len(train_idx), batch_size):
            batch_idx = train_idx[i:i + batch_size]
            batch_feat = edge_feat  # Use full node feature matrix
            batch_adj = adj_numeric  # Adjacency is global, not batch-specific
            batch_label = label[batch_idx]

            optimizer.zero_grad()
            outputs = model(batch_feat, batch_adj) #running the model
            loss = criterion(outputs[batch_idx], batch_label)  # Select only batch outputs
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

            # Compute Accuracy
            predicted_labels = (outputs[batch_idx] > 0.5).float()  # Convert to binary labels
            correct_predictions += (predicted_labels == batch_label).sum().item()
            total_samples += batch_label.numel()  # Total number of elements in labels

            # Update progress bar
            pbar.set_postfix(loss=f"{loss.item():.4f}")
            pbar.update(1)

    # Compute and print epoch accuracy
    epoch_accuracy = correct_predictions / total_samples
    print(f"Epoch {epoch + 1}/{num_epochs}, Avg Loss: {epoch_loss / len(train_idx):.6f}, Accuracy: {epoch_accuracy:.4f}")


edge_feat shape: torch.Size([9460, 63])
adj shape: (189216, 2)
label shape: torch.Size([9460, 2])


Epoch 1/10: 107batch [00:02, 47.10batch/s, loss=0.0062]


Epoch 1/10, Avg Loss: 0.001787, Accuracy: 0.9780


Epoch 2/10: 107batch [00:02, 48.03batch/s, loss=0.0070]


Epoch 2/10, Avg Loss: 0.001034, Accuracy: 0.9908


Epoch 3/10: 107batch [00:02, 45.67batch/s, loss=0.0072]


Epoch 3/10, Avg Loss: 0.001148, Accuracy: 0.9905


Epoch 4/10: 107batch [00:02, 44.32batch/s, loss=0.0041]                      


Epoch 4/10, Avg Loss: 0.001006, Accuracy: 0.9909


Epoch 5/10: 107batch [00:02, 47.33batch/s, loss=0.0058]                      


Epoch 5/10, Avg Loss: 0.000980, Accuracy: 0.9910


Epoch 6/10: 107batch [00:02, 48.12batch/s, loss=0.0109]                      


Epoch 6/10, Avg Loss: 0.001107, Accuracy: 0.9906


Epoch 7/10: 107batch [00:02, 48.46batch/s, loss=0.0033]                      


Epoch 7/10, Avg Loss: 0.000974, Accuracy: 0.9909


Epoch 8/10: 107batch [00:02, 36.81batch/s, loss=0.0071]


Epoch 8/10, Avg Loss: 0.001047, Accuracy: 0.9908


Epoch 9/10: 107batch [00:02, 41.93batch/s, loss=0.0051]                      


Epoch 9/10, Avg Loss: 0.001024, Accuracy: 0.9908


Epoch 10/10: 107batch [00:02, 46.94batch/s, loss=0.0029]                      

Epoch 10/10, Avg Loss: 0.001025, Accuracy: 0.9910





In [8]:
from sklearn.metrics import accuracy_score, f1_score, classification_report, cohen_kappa_score, roc_auc_score

# ---------------------------
# Evaluation on Validation and Test Set
# ---------------------------

model.eval()
with torch.no_grad(): #no backpropagation turn off all memory for updating
    # Validation
    val_outputs = model(edge_feat, adj_numeric) #invoke the def forward function
    val_outputs = val_outputs[val_idx]  # Select only validation samples
    val_labels = label[val_idx]

    val_loss = criterion(val_outputs, val_labels)  # Compute validation loss

    val_outputs_binary = (val_outputs > 0.5).float()  # Threshold for metrics

    # Test
    test_outputs = model(edge_feat, adj_numeric)
    test_outputs = test_outputs[test_idx]  # Select only test samples
    test_labels = label[test_idx]

    test_loss = criterion(test_outputs, test_labels)  # Compute test loss

    test_outputs_binary = (test_outputs > 0.5).float()  # Threshold for metrics
    #float convert the boolean, True become 1, False become 0
# ---------------------------
# Validation Metrics
# ---------------------------
val_true = val_labels.cpu().numpy()
val_pred = val_outputs_binary.cpu().numpy()
val_outputs_proba = val_outputs.cpu().numpy()

val_accuracy = accuracy_score(val_true.flatten(), val_pred.flatten())
val_kappa = cohen_kappa_score(val_true.flatten(), val_pred.flatten())
val_auc = roc_auc_score(val_true.flatten(), val_outputs_proba.flatten())

# ---------------------------
# Test Metrics
# ---------------------------
test_true = test_labels.cpu().numpy()
test_pred = test_outputs_binary.cpu().numpy()
test_outputs_proba = test_outputs.cpu().numpy()

test_accuracy = accuracy_score(test_true.flatten(), test_pred.flatten())
test_kappa = cohen_kappa_score(test_true.flatten(), test_pred.flatten())
test_auc = roc_auc_score(test_true.flatten(), test_outputs_proba.flatten())

# ---------------------------
# Print Results
# ---------------------------
print("\n--- Validation Set ---")
print(f"Validation Loss: {val_loss.item():.4f}")
print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"Validation Cohen's Kappa: {val_kappa:.4f}")
print(f"Validation AUC: {val_auc:.4f}")

print("\n--- Test Set ---")
print(f"Test Loss: {test_loss.item():.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Cohen's Kappa: {test_kappa:.4f}")
print(f"Test AUC: {test_auc:.4f}")

print("\nTest Classification Report:")
print(classification_report(test_true, test_pred, zero_division=0))



--- Validation Set ---
Validation Loss: 0.0536
Validation Accuracy: 0.9934
Validation Cohen's Kappa: 0.9868
Validation AUC: 0.9907

--- Test Set ---
Test Loss: 0.0528
Test Accuracy: 0.9926
Test Cohen's Kappa: 0.9852
Test AUC: 0.9927

Test Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1022
           1       0.99      0.99      0.99       870

   micro avg       0.99      0.99      0.99      1892
   macro avg       0.99      0.99      0.99      1892
weighted avg       0.99      0.99      0.99      1892
 samples avg       0.99      0.99      0.99      1892

