# Phase 1


In [28]:
#!pip install datasets
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from process_data import process
from datasets import load_dataset, load_from_disk

Load and Process Data

In [29]:
# dataset will be cached after loading the first time 


# try to load processed data from disk
try:
    train_ds = load_from_disk("data/train")
    test_ds = load_from_disk("data/test")
    
except: 
    # this dataset has splits for training and testing already 
    ds = load_dataset("mwritescode/slither-audited-smart-contracts", "big-plain-text")

    # data processing done in process_data.py 
    train_ds, test_ds = process(ds)
    

Visualize Data

In [30]:
# visualizing the data after processing 
train_ds[0]
# features represent frequency of a bigram appearing in the contract
# labels represent whether a vulnerability was found

{'features': tensor([0.0022, 0.0022, 0.0000, 0.0411, 0.0000, 0.0281, 0.0000, 0.0043, 0.0022,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0043, 0.0173, 0.0173, 0.0000, 0.0022,
         0.0000, 0.0130, 0.0108, 0.0043, 0.0130, 0.0043, 0.2078, 0.0000, 0.0087,
         0.0000, 0.0000, 0.0000, 0.0152, 0.0433, 0.0693, 0.0584, 0.0043, 0.0281,
         0.0000, 0.0000, 0.0087, 0.0411, 0.0000, 0.0000, 0.0000, 0.0022, 0.0476,
         0.0000, 0.0108, 0.0000, 0.0000, 0.0238, 0.0065, 0.0000, 0.0022, 0.0000,
         0.0000, 0.0000, 0.0065, 0.0043, 0.0000, 0.0000, 0.0000, 0.0000, 0.0065,
         0.0000, 0.0022, 0.0065, 0.0022, 0.0000, 0.0000, 0.0411, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0065, 0.0043, 0.0022, 0.0000, 0.0130, 0.0108, 0.0000,
         0.0022, 0.0000, 0.0022, 0.0152, 0.0498, 0.0000, 0.0000, 0.0000, 0.0043,
         0.0000, 0.0000, 0.0087, 0.0628, 0.0000, 0.0000, 0.0000, 0.0022, 0.0000,
         0.0022]),
 'labels': tensor([0., 0.])}

# Define Network

In [31]:
# definign input and output size
input_size = len(train_ds[0]['features'])
output_size = len(train_ds[0]['labels']) 

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.layer1 = nn.Linear(input_size, input_size//2)
        self.layer2 = nn.Linear(input_size//2, output_size)
        

    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return x

# Train Model

In [32]:
# Define the neural network
model = Net()

# Define the loss function and optimizer
criterion = nn.BCELoss() # Binary Cross Entropy for multilabels
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# define data 
features = train_ds['features']
labels = train_ds['labels']
labels = torch.argmax(labels, dim=1)
labels = F.one_hot(labels, num_classes=output_size).float()

# define test data
test_features = test_ds['features']
test_labels = test_ds['labels']

# Train the neural network
num_epochs = 10
for epoch in range(num_epochs):

    
    # Forward pass
    output = model(features)

    # Calculate the loss
    loss = criterion(output, labels)
    
    # Zero the gradients
    optimizer.zero_grad()

    # Backward pass
    loss.backward()

    # Update the weights
    optimizer.step()

    # check prediction 
    prediction = model(test_features)
    prediction_loss = criterion(prediction, test_labels)
    accuracy = (torch.argmax(prediction, 1 ) == torch.argmax(test_labels, 1)).float().mean() 
    # Print the loss
    print('Epoch [{}/{}], Loss: {:.4f} Accuracy: {}'.format(epoch+1, num_epochs, loss.item(), accuracy))


Epoch [1/10], Loss: 2.0692 Accuracy: 0.8567492961883545
Epoch [2/10], Loss: 0.6256 Accuracy: 0.8567492961883545
Epoch [3/10], Loss: 0.6149 Accuracy: 0.8567492961883545
Epoch [4/10], Loss: 0.6049 Accuracy: 0.8567492961883545
Epoch [5/10], Loss: 0.5955 Accuracy: 0.8567492961883545
Epoch [6/10], Loss: 0.5866 Accuracy: 0.8567492961883545
Epoch [7/10], Loss: 0.5783 Accuracy: 0.8567492961883545
Epoch [8/10], Loss: 0.5704 Accuracy: 0.8567492961883545
Epoch [9/10], Loss: 0.5629 Accuracy: 0.8567492961883545
Epoch [10/10], Loss: 0.5558 Accuracy: 0.8567492961883545
