In [28]:
import torch
from torch_geometric.data import Data
import torch_geometric.nn as pyg_nn
import torch.nn.functional as F

### Below we specify arbitrary constants and hyperparameters

In [29]:
import data_loader

# Fraction of each file's examples held out for testing (0.2 -> 20% into test).
test_train_split = 0.2
# Forwarded to data_loader.load_data_object as `rows_per_example`.
rows_per_example = 30
# Forwarded to data_loader.load_data_object as `mode`.
mode = data_loader.load_char_mode.DROP
# Output width of every GCN layer in the model.
NUM_HIDDEN_DIMS = 64
# Upper bound of the epoch counter used by the training loop.
NUM_EPOCHS = 1000

In [30]:

# Define a simple GCN model
class LetterGNN(torch.nn.Module):
    """Graph-level classifier: stacked GCN layers, mean pooling, linear head.

    Args:
        num_node_features: Size of each node's input feature vector.
        hidden_dim: Output width of every GCN layer.
        num_classes: Number of scores produced per graph.
        num_layers: Requested number of GCN layers. One layer is always
            built, so values below 1 behave like 1.
    """

    def __init__(self, num_node_features, hidden_dim, num_classes, num_layers=2):
        super().__init__()

        # First layer maps the input features to hidden_dim; the remaining
        # (num_layers - 1) layers keep the width at hidden_dim.
        self.convs = torch.nn.ModuleList()
        self.convs.append(pyg_nn.GCNConv(num_node_features, hidden_dim))
        for _ in range(num_layers - 1):
            self.convs.append(pyg_nn.GCNConv(hidden_dim, hidden_dim))

        self.fc = torch.nn.Linear(hidden_dim, num_classes)

    def forward(self, x, edge_index, batch):
        """Return one row of class scores per graph in the batch.

        Args:
            x: Node features; cast to float before the first layer.
            edge_index: Graph connectivity; cast to long before the first layer.
            batch: Assignment of each node to its graph, used for pooling.
        """
        # The casts do not depend on the layer, so do them once up front.
        x = x.float()
        edge_index = edge_index.long()

        for conv in self.convs:
            # Call the module itself rather than .forward() so that any
            # registered hooks are executed.
            x = F.relu(conv(x, edge_index))

        # Collapse node embeddings into one embedding per graph.
        # A different pooling method may be worth trying here.
        x = pyg_nn.global_mean_pool(x, batch)

        # Classify each pooled graph embedding.
        return self.fc(x)
        


In [31]:
# Input recordings. Each file is treated as a separate user, and a file's
# position in this list is the index its examples are labelled with on load.
file_names = [
    "../key_presses (1).tsv",
    "../key_presses (2).tsv",
    "../key_presses (3).tsv",
    "../key_presses (4).tsv",
]

In [None]:
from torch_geometric.data import InMemoryDataset, Data
from sklearn.model_selection import train_test_split
import data_loader
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# One entry per input file (a different user each); every entry is the
# collection of Data objects the loader returns for that file, labelled
# with the file's position in `file_names`.
list_of_datasets = [
    data_loader.load_data_object(
        path,
        mode=mode,
        y=torch.tensor([file_idx]),
        rows_per_example=rows_per_example,
    )
    for file_idx, path in enumerate(file_names)
]

# Second dimension of the node-feature matrix of the very first example.
num_features = list_of_datasets[0][0].x.shape[1]

print("Number of examples: ", sum(map(len, list_of_datasets)))


Number of examples:  367
Number of train examples:  291
Number of test examples:  76




In [None]:

# Which entry of `list_of_datasets` (i.e. which input file) receives the
# positive label; every other file is labelled negative. The choice is arbitrary.
positive_index = 0

# Fixed seed so the train/test partition is identical on every run.
SPLIT_SEED = 42

training_dataset_pos = []
testing_dataset_pos = []
training_dataset_neg = []
testing_dataset_neg = []

for i, dataset in enumerate(list_of_datasets):
    is_positive = i == positive_index

    # Overwrite each example's label with a binary one (1 = positive file).
    for data_obj in dataset:
        data_obj.y = torch.tensor([1 if is_positive else 0])

    # Split every file on its own so each one contributes to both partitions.
    # The halves are named `*_split` rather than `train` / `test`, which the
    # notebook also uses as function names.
    train_split, test_split = train_test_split(
        dataset, test_size=test_train_split, random_state=SPLIT_SEED
    )

    if is_positive:
        training_dataset_pos.extend(train_split)
        testing_dataset_pos.extend(test_split)
    else:
        training_dataset_neg.extend(train_split)
        testing_dataset_neg.extend(test_split)


class SimpleGraphDataset(InMemoryDataset):
    """In-memory dataset built directly from an already-loaded list of Data objects."""

    def __init__(self, data_list):
        # '.' is passed as the first positional argument and the remaining
        # three are left as None, exactly as the base constructor is called
        # everywhere in this notebook.
        super().__init__('.', None, None, None)
        # Merge the individual Data objects into one big object plus the
        # slice table needed to take it apart again.
        collated, slice_table = self.collate(data_list)
        self.data, self.slices = collated, slice_table

    def __len__(self):
        # Reported as the number of graphs in the dataset.
        # NOTE(review): assumes every graph carries exactly one label entry in `y`.
        graph_labels = self.data.y
        return len(graph_labels)



# Move every graph to the target device before collating. The lists are named
# `train_graphs` / `test_graphs` rather than `train` / `test`: the notebook
# defines functions with those names, and rebinding them to lists here breaks
# evaluation whenever this cell is re-run after the functions were defined.
train_graphs = [e.to(device) for e in (training_dataset_pos + training_dataset_neg)]
test_graphs = [e.to(device) for e in (testing_dataset_pos + testing_dataset_neg)]

dataset = SimpleGraphDataset(train_graphs)
test_dataset = SimpleGraphDataset(test_graphs)

print("Number of train examples: ", len(dataset))
print("Number of test examples: ", len(test_dataset))
# we need to create two datasets, one for training and one for testing

In [None]:
# Train the model 

from torch_geometric.loader import DataLoader
# TODO: decide how the hidden dimension size (NUM_HIDDEN_DIMS) should be chosen.

# Define the model, loss, and optimizer
# Input width and class count are taken from the training dataset itself.
model = LetterGNN(
    num_node_features=dataset.num_node_features,
    hidden_dim=NUM_HIDDEN_DIMS,
    num_classes=dataset.num_classes,
)
model = model.to(device)

# Adam over all model parameters, with cross-entropy on the per-graph scores.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.CrossEntropyLoss()

# Train loop
def train(model, data_loader):
    """Run one optimisation pass over `data_loader` and return the mean batch loss.

    Uses the module-level `optimizer` and `criterion`.

    Args:
        model: Network called as model(x, edge_index, batch).
        data_loader: Iterable of batches exposing x, edge_index, batch and y;
            must support len().

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    batch_losses = []
    for batch in data_loader:
        optimizer.zero_grad()
        logits = model(batch.x, batch.edge_index, batch.batch)  # forward pass
        batch_loss = criterion(logits, batch.y)
        batch_loss.backward()  # backpropagate
        optimizer.step()  # apply the parameter update
        batch_losses.append(batch_loss.item())
    return sum(batch_losses) / len(data_loader)


# Training over epochs
# The loader gets its own name so that it does not shadow the imported
# `data_loader` module, which the loading cell needs.
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)
# Not read anywhere in the visible cells; kept in case a later cell uses it.
prev_loss = 100000
# Epochs are numbered 1..NUM_EPOCHS inclusive, so exactly NUM_EPOCHS passes run.
for epoch in range(1, NUM_EPOCHS + 1):
    loss = train(model, train_loader)
    # Report every 50th epoch and always the final one.
    if epoch % 50 == 0 or epoch == NUM_EPOCHS:
        print(f"Epoch: {epoch:03d}, Loss: {loss:.4f}")

Epoch: 050, Loss: 0.5164
Epoch: 100, Loss: 0.2207
Epoch: 150, Loss: 0.1407
Epoch: 200, Loss: 0.1676
Epoch: 250, Loss: 0.0830
Epoch: 300, Loss: 0.0683
Epoch: 350, Loss: 0.1454
Epoch: 400, Loss: 0.1008
Epoch: 450, Loss: 0.0581
Epoch: 500, Loss: 0.0534
Epoch: 550, Loss: 0.0365
Epoch: 600, Loss: 1.1832
Epoch: 650, Loss: 0.0295
Epoch: 700, Loss: 0.0309
Epoch: 750, Loss: 0.0431
Epoch: 800, Loss: 0.0207
Epoch: 850, Loss: 0.0464
Epoch: 900, Loss: 0.0205
Epoch: 950, Loss: 0.0188


In [37]:
# test 
def test(model, data_loader):
    model.eval()
    correct = 0
    for data in data_loader:
        output = model(data.x, data.edge_index, data.batch)
        pred = output.argmax(dim=1)  # Get the index of the max log-probability
        # print(f"Correct {data.y[0]} Pred {pred[0]}")
        if pred != data.y:
            print(output[0])
        correct += (pred == data.y).sum().item()
    return correct / len(data_loader.dataset)

def test_acc():
    """Evaluate the global `model` on `test_dataset` and print the accuracy."""
    # One graph per batch, visited in dataset order.
    loader = DataLoader(test_dataset, batch_size=1, shuffle=False)
    print(f"Test Accuracy: {test(model, loader):.4f}")

test_acc()



tensor([-1.1199, -4.5237], device='cuda:0', grad_fn=<SelectBackward0>)
tensor([-5.2777, -4.3710], device='cuda:0', grad_fn=<SelectBackward0>)
Test Accuracy: 0.9737


