In [1]:
#!pip install datasets
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from datasets import load_dataset, load_from_disk
from PIL import Image
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


Process Data

In [2]:
# dataset will be cached after loading the first time 

train_ds = load_dataset("mwritescode/slither-audited-smart-contracts", 'small-multilabel', split='train', ignore_verifications=True)
test_ds = load_dataset("mwritescode/slither-audited-smart-contracts", 'small-multilabel', split='test', ignore_verifications=True)


# filter out empty bytecode 
train_ds = train_ds.filter(lambda elem: elem['bytecode'] != '0x')
test_ds = test_ds.filter(lambda elem: elem['bytecode'] != '0x')

# extract number of unique classes 


# generate RGB image 
def generate_RGB_image(example):
    # generate RGB image
    bytecode_string = example['bytecode']
    bytecode_hex = bytes.fromhex(bytecode_string[2:])
    image = np.frombuffer(bytecode_hex, dtype=np.uint8)
    length = int(np.ceil(len(image)/3))
    image = np.pad(image, pad_width=(0, length*3 - len(image)))
    image = image.reshape((-1, 3))
    sqrt_len = int(np.ceil(np.sqrt(image.shape[0])))
    image = np.pad(image,  pad_width=((0, sqrt_len**2 - image.shape[0]),(0,0)))
    image = image.reshape((sqrt_len, sqrt_len, 3))
    image = Image.fromarray(image)
    example['image'] = image

    return example

# generate images, remove unneccessary columns 
train_ds = train_ds.map(generate_RGB_image, remove_columns=['address', 'source_code', 'bytecode', 'slither'])
test_ds = test_ds.map(generate_RGB_image, remove_columns=['address', 'source_code', 'bytecode', 'slither'])

# image after generating
train_ds[0]




Found cached dataset slither-audited-smart-contracts (/home/trbiv/.cache/huggingface/datasets/mwritescode___slither-audited-smart-contracts/small-multilabel/1.1.0/4cf503b59ce9d3157914e47f6253de773b7ab828f46642685d4b470b88ca1f13)
Found cached dataset slither-audited-smart-contracts (/home/trbiv/.cache/huggingface/datasets/mwritescode___slither-audited-smart-contracts/small-multilabel/1.1.0/4cf503b59ce9d3157914e47f6253de773b7ab828f46642685d4b470b88ca1f13)
Loading cached processed dataset at /home/trbiv/.cache/huggingface/datasets/mwritescode___slither-audited-smart-contracts/small-multilabel/1.1.0/4cf503b59ce9d3157914e47f6253de773b7ab828f46642685d4b470b88ca1f13/cache-f761296fcbda6e7a.arrow
Loading cached processed dataset at /home/trbiv/.cache/huggingface/datasets/mwritescode___slither-audited-smart-contracts/small-multilabel/1.1.0/4cf503b59ce9d3157914e47f6253de773b7ab828f46642685d4b470b88ca1f13/cache-0b09595cbf775804.arrow
Loading cached processed dataset at /home/trbiv/.cache/huggingfa

{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=50x50>}

Apply augmentations

In [3]:

# pytorch expect RGB inputs in range [0, 1], normalize:
mean_rgb = [0.485, 0.456, 0.406]
std_rgb = [0.229, 0.224, 0.225]

# transform images
transform = transforms.Compose([
    transforms.Resize(128),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean_rgb, std=std_rgb)
])

# apply transformations
transformed_train_ds = train_ds.map(lambda elem: {'image': transform(elem['image'])})
transformed_test_ds = test_ds.map(lambda elem: {'image': transform(elem['image'])})

batch_size =  8

# convert to pytorch format 
transformed_train_ds = transformed_train_ds.with_format("torch")
transformed_test_ds = transformed_test_ds.with_format("torch")

# initialize dataloaders
train_dataloader = DataLoader(transformed_train_ds, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(transformed_test_ds, batch_size=batch_size, shuffle=False)

print(transformed_train_ds[2]['image'].shape)

Loading cached processed dataset at /home/trbiv/.cache/huggingface/datasets/mwritescode___slither-audited-smart-contracts/small-multilabel/1.1.0/4cf503b59ce9d3157914e47f6253de773b7ab828f46642685d4b470b88ca1f13/cache-11f62f3998af50ea.arrow
Loading cached processed dataset at /home/trbiv/.cache/huggingface/datasets/mwritescode___slither-audited-smart-contracts/small-multilabel/1.1.0/4cf503b59ce9d3157914e47f6253de773b7ab828f46642685d4b470b88ca1f13/cache-4fd0e150a5ab9c83.arrow


torch.Size([3, 128, 128])


In [4]:
class Unsupervised(nn.Module): 
    def __init__(self):
        super(Unsupervised, self).__init__()

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(64)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(128)
        self.conv3 = nn.Conv2d(128, 128, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.conv4 = nn.Conv2d(128, 128, kernel_size=3, padding=1)
        self.bn4 = nn.BatchNorm2d(128)
        self.conv5 = nn.Conv2d(128, 3, kernel_size=3, padding=1)
        self.bn5 = nn.BatchNorm2d(3)


    def forward(self, x):

        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        x = F.relu(self.bn4(self.conv4(x)))
        x = F.relu(self.bn5(self.conv5(x)))

        return x

# testig with dummy input 
model = Unsupervised()
input_size = (8, 3, 128, 128) 
dummy_input = torch.randn(*input_size)
output = model(dummy_input)
print("Output shape: ", output.shape)


Output shape:  torch.Size([8, 3, 128, 128])


In [5]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Instantiate the model
model = Unsupervised()
model = model.to(device)

# Define the loss function
criterion = nn.MSELoss()

# Define the optimizer
learning_rate = 0.001
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# Training loop
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    for batch_data in train_dataloader:
        inputs = batch_data["image"].to(device)

        # Forward pass
        #print("inputs: ", inputs.shape)
        outputs = model(inputs)

        # Backpropagation and optimization
        optimizer.zero_grad()
        #loss.backward()
        optimizer.step()

    # Validation 
    model.eval()
    total_loss = 0.0
    num_samples = 0
    with torch.no_grad():
        for batch_data in test_dataloader:
            inputs = batch_data["image"].to(device)

            outputs = model(inputs)

            batch_loss = torch.mean((inputs - outputs) ** 2) # mean squared error 

            total_loss =+ batch_loss.item() * inputs.size(0)
            num_samples += inputs.size(0)

    # Print the loss or evaluation metric for the epoch
    average_loss = total_loss / num_samples
    print(f"Epoch [{epoch+1}/{num_epochs}], Average Loss: {average_loss}")

# Training complete


Epoch [1/10], Average Loss: 0.16149652004241943
Epoch [2/10], Average Loss: 0.17838510741358218
Epoch [3/10], Average Loss: 0.21482018802476965
Epoch [4/10], Average Loss: 0.24259775099547012
Epoch [5/10], Average Loss: 0.2520070801610532


KeyboardInterrupt: 