In [None]:
# Colab-only cell: files.upload() asks for a file through the notebook UI; the
# recorded output below shows balanced_dataset.csv being saved under that name,
# which is the path the training cell reads later.
from google.colab import files
uploaded = files.upload()

Saving balanced_dataset.csv to balanced_dataset.csv


In [6]:
import pandas as pd
import torch
import torch.nn as nn
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

class GNNOnlyDataset:
    """Dataset that turns each CSV row into a synthetic graph plus a class label.

    Each row must provide the columns ``ast_nodes``, ``ast_if_count``,
    ``ast_functions``, ``ast_depth``, ``token_count`` and ``label``. The graph
    topology is not read from the file: every sample is laid out as a binary
    tree whose node count comes from ``ast_nodes``.
    """

    # Row-level statistics that are scaled by the node count and copied into
    # every node's feature vector, in this order.
    FEATURE_COLUMNS = ('ast_if_count', 'ast_functions', 'ast_depth', 'token_count')

    def __init__(self, csv_path):
        """Load the CSV at ``csv_path`` into ``self.data`` as a DataFrame."""
        self.data = pd.read_csv(csv_path)

    def create_ast_graph(self, idx):
        """Build the graph and label for the row at position ``idx``.

        Args:
            idx: Positional row index into the loaded DataFrame.

        Returns:
            A ``(graph, label)`` tuple. ``graph.x`` is a float32 tensor of shape
            ``(num_nodes, 4)`` in which every row is identical,
            ``graph.edge_index`` is a contiguous long tensor of shape
            ``(2, 2 * (num_nodes - 1))`` and ``label`` is a 0-dim long tensor.
        """
        row = self.data.iloc[idx]
        # Clamp to two nodes so the graph always has at least one edge pair.
        num_nodes = max(2, int(row['ast_nodes']))

        # Heap-style binary tree: node i hangs under node (i - 1) // 2. Each
        # link is stored in both directions so message passing is symmetric.
        edge_pairs = []
        for child in range(1, num_nodes):
            parent = (child - 1) // 2
            edge_pairs.append([parent, child])
            edge_pairs.append([child, parent])

        # The feature vector does not depend on the node, so build it once and
        # repeat it instead of constructing a fresh tensor for every node.
        feature_row = torch.tensor(
            [row[column] / num_nodes for column in self.FEATURE_COLUMNS],
            dtype=torch.float32,
        )
        node_features = feature_row.repeat(num_nodes, 1)

        # .t() only returns a strided view; make it contiguous so downstream
        # graph operators receive a plain (2, num_edges) layout.
        edge_index = torch.tensor(edge_pairs, dtype=torch.long).t().contiguous()
        label = torch.tensor(int(row['label']), dtype=torch.long)
        return Data(x=node_features, edge_index=edge_index), label

    def __len__(self):
        """Return the number of rows (samples) in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``create_ast_graph(idx)`` so the object can be indexed like a list."""
        return self.create_ast_graph(idx)

# GNN
class GNNModel(nn.Module):
    """Three stacked GCN layers, a mean readout over nodes, and a linear head.

    Args:
        node_features: Width of each node's input feature vector.
        num_classes: Number of logits returned by ``forward``.
    """

    def __init__(self, node_features=4, num_classes=2):
        super().__init__()
        # Channel widths shrink 32 -> 16 -> 8 ahead of the classification head.
        self.conv1 = GCNConv(node_features, 32)
        self.conv2 = GCNConv(32, 16)
        self.conv3 = GCNConv(16, 8)
        self.fc = nn.Linear(8, num_classes)

    def forward(self, data):
        """Return class logits computed from ``data.x`` and ``data.edge_index``."""
        edges = data.edge_index
        hidden = self.conv1(data.x, edges).relu()
        hidden = self.conv2(hidden, edges).relu()
        hidden = self.conv3(hidden, edges)
        # Readout: average over the node axis (dim 0). Every row is pooled
        # together, so the input is treated as one single graph.
        graph_embedding = hidden.mean(dim=0)
        return self.fc(graph_embedding)

#Train
def train(model, dataset, optimizer, criterion):
    """Run one epoch of per-sample gradient updates and return the mean loss.

    Args:
        model: Module called as ``model(data)`` for each sample.
        dataset: Iterable of ``(data, label)`` pairs. It does not need to
            implement ``__len__``; samples are counted while iterating.
        optimizer: Optimizer whose ``zero_grad``/``step`` are invoked once per sample.
        criterion: Loss callable receiving the output and label, each with a
            leading axis of size one added.

    Returns:
        The average of the per-sample loss values as a float, or ``0.0`` when
        ``dataset`` yields no samples (instead of raising ``ZeroDivisionError``).
    """
    model.train()
    total_loss = 0.0
    num_samples = 0
    for data, label in dataset:
        optimizer.zero_grad()
        output = model(data)
        # Samples are processed one at a time, so add a size-1 leading axis to
        # both the prediction and the target before computing the loss.
        loss = criterion(output.unsqueeze(0), label.unsqueeze(0))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_samples += 1

    if num_samples == 0:
        # Nothing was iterated: there is no loss to average.
        return 0.0
    return total_loss / num_samples

#Evaluate
def evaluate(model, dataset):
    """Score ``model`` on ``dataset`` and return the accuracy.

    The model is switched to eval mode and run without gradient tracking. For
    each ``(data, label)`` pair the predicted class is the argmax of the model
    output; predictions and labels are then passed to ``accuracy_score``.
    """
    model.eval()
    targets = []
    predictions = []
    with torch.no_grad():
        for graph, target in dataset:
            logits = model(graph)
            predictions.append(logits.argmax().item())
            targets.append(target.item())
    return accuracy_score(targets, predictions)

if __name__ == "__main__":
    # One seed for both the split and PyTorch. Without seeding torch the layer
    # weights are initialised differently on every run, so the printed metrics
    # could not be reproduced after a kernel restart.
    SEED = 42
    torch.manual_seed(SEED)

    dataset = GNNOnlyDataset('balanced_dataset.csv')
    indices = list(range(len(dataset)))
    # 80/20 train/validation split over row positions.
    train_indices, val_indices = train_test_split(indices, test_size=0.2, random_state=SEED)

    # Materialise the graphs once up front; indexing the dataset rebuilds a
    # graph on every access, which would otherwise happen again each epoch.
    train_set = [dataset[i] for i in train_indices]
    val_set = [dataset[i] for i in val_indices]

    model = GNNModel(num_classes=2)
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    # Ten epochs; train_set is a plain list, so samples are visited in the
    # same order every epoch.
    for epoch in range(1, 11):
        loss = train(model, train_set, optimizer, criterion)
        acc = evaluate(model, val_set)
        print(f"[Epoch {epoch}] Loss: {loss:.4f} | Accuracy: {acc:.4f}")

[Epoch 1] Loss: 0.7066 | Accuracy: 0.4789
[Epoch 2] Loss: 0.6802 | Accuracy: 0.6684
[Epoch 3] Loss: 0.6389 | Accuracy: 0.6947
[Epoch 4] Loss: 0.6172 | Accuracy: 0.7026
[Epoch 5] Loss: 0.6102 | Accuracy: 0.6921
[Epoch 6] Loss: 0.6065 | Accuracy: 0.6974
[Epoch 7] Loss: 0.6027 | Accuracy: 0.6974
[Epoch 8] Loss: 0.5993 | Accuracy: 0.7026
[Epoch 9] Loss: 0.5967 | Accuracy: 0.7053
[Epoch 10] Loss: 0.5941 | Accuracy: 0.7184
