In [37]:
import torch
from torch_geometric.data import Data
import torch_geometric.nn as pyg_nn
import torch.nn.functional as F

### Below we specify arbitrary constants or hiperparams

In [38]:
import data_loader

test_train_split = 0.2 # 0.2 into test
rows_per_example = 20
mode = mode=data_loader.load_char_mode.DROP
NUM_HIDDEN_DIMS = 64
NUM_EPOCHS = 1000

In [39]:

# Define a simple GCN model
class LetterGNN(torch.nn.Module):
    def __init__(self, num_node_features, hidden_dim, num_classes, num_layers=2):
        super(LetterGNN, self).__init__()

        self.convs = torch.nn.ModuleList()
        self.convs.append(pyg_nn.GCNConv(num_node_features, hidden_dim))
        for _ in range(num_layers - 1):
            self.convs.append(pyg_nn.GCNConv(hidden_dim, hidden_dim))

        self.fc = torch.nn.Linear(hidden_dim, num_classes)

    def forward(self, x, edge_index, batch):
        for conv in self.convs:
            x = x.float()
            edge_index = edge_index.long()
            x = conv.forward(x, edge_index)
            x = F.relu(x)
        
        # idk maybe use a different pool method 
        x = pyg_nn.global_mean_pool(x, batch)

        # classify
        x = self.fc(x)

        return x
        


In [40]:
from torch_geometric.data import InMemoryDataset, Data
from sklearn.model_selection import train_test_split
import data_loader


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


data_list_pos = data_loader.load_data_object("../key_presses1.csv", mode=mode, y=torch.tensor([0]), rows_per_example=rows_per_example)
data_list_neg  = data_loader.load_data_object("../key_presses2.csv",  mode=mode, y=torch.tensor([1]), rows_per_example=rows_per_example)
num_features = data_list_pos[0].x.shape[1]

data_list_pos = [elem.to(device) for elem in data_list_pos]
data_list_neg = [elem.to(device) for elem in data_list_neg]

print("Number of examples: ", len(data_list_neg) + len(data_list_pos))

train_pos_list, test_pos_list = train_test_split(data_list_pos, test_size=test_train_split)
train_neg_list, test_neg_list = train_test_split(data_list_neg, test_size=test_train_split)

print("Number of train examples: ", len(train_neg_list) + len(train_pos_list))
print("Number of test examples: ", len(test_neg_list) + len(test_pos_list))

# # Move data to the specified device
class SimpleGraphDataset(InMemoryDataset):
    def __init__(self, data_list):
        super(SimpleGraphDataset, self).__init__('.', None, None, None)
        self.data, self.slices = self.collate(data_list)  # Collate all data objects

    def __len__(self):
        return len(self.data.y)  # Number of graphs in the dataset


# Create the dataset
dataset = SimpleGraphDataset(train_pos_list + train_neg_list)

# Create the dataset
test_dataset = SimpleGraphDataset(test_pos_list + test_neg_list)

# Check the dataset
print("Number of samples in the dataset:", len(dataset))

Number of examples:  38
Number of train examples:  30
Number of test examples:  8
Number of samples in the dataset: 30




In [41]:
# Train the model 

from torch_geometric.loader import DataLoader
# Q: How Do we choose hidden dims size ?? !!!!!!!!!!

# Define the model, loss, and optimizer
model = LetterGNN(num_node_features=dataset.num_node_features, hidden_dim=NUM_HIDDEN_DIMS, num_classes=dataset.num_classes).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

# Train loop
def train(model, data_loader):
    model.train()
    total_loss = 0
    for data in data_loader:
        optimizer.zero_grad()
        output = model(data.x, data.edge_index, data.batch)  # Forward pass
        loss = criterion(output, data.y)  # Compute the loss
        loss.backward()  # Backpropagation
        optimizer.step()  # Update the model parameters
        total_loss += loss.item()
    return total_loss / len(data_loader)


# Training over epochs
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)
for epoch in range(1, NUM_EPOCHS):
    loss = train(model, data_loader)
    if epoch % 50 == 0:
        print(f"Epoch: {epoch:03d}, Loss: {loss:.4f}")



Epoch: 050, Loss: 0.7761
Epoch: 100, Loss: 0.5487
Epoch: 150, Loss: 0.4366
Epoch: 200, Loss: 0.2191
Epoch: 250, Loss: 0.5726
Epoch: 300, Loss: 0.5499
Epoch: 350, Loss: 0.5057
Epoch: 400, Loss: 0.4493
Epoch: 450, Loss: 0.3911
Epoch: 500, Loss: 0.3412
Epoch: 550, Loss: 0.2988
Epoch: 600, Loss: 0.2619
Epoch: 650, Loss: 0.2301
Epoch: 700, Loss: 0.2036
Epoch: 750, Loss: 0.1795
Epoch: 800, Loss: 0.1594
Epoch: 850, Loss: 0.1420
Epoch: 900, Loss: 0.1269
Epoch: 950, Loss: 0.1137


In [42]:
# test 
def test(model, data_loader):
    model.eval()
    correct = 0
    for data in data_loader:
        output = model(data.x, data.edge_index, data.batch)
        pred = output.argmax(dim=1)  # Get the index of the max log-probability
        print(f"Correct {data.y[0]} Pred {pred[0]}")
        if pred != data.y:
            print(output[0])
        correct += (pred == data.y).sum().item()
    return correct / len(data_loader.dataset)

# Test the model
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)
accuracy = test(model, test_loader)
print(f"Test Accuracy: {accuracy:.4f}")

Correct 0 Pred 0
Correct 0 Pred 0
Correct 0 Pred 0
Correct 0 Pred 0
Correct 0 Pred 0
Correct 1 Pred 1
Correct 1 Pred 1
Correct 1 Pred 0
tensor([8.3764, 7.5730], device='cuda:0', grad_fn=<SelectBackward0>)
Test Accuracy: 0.8750
