In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from PIL import Image
import os
from torchvision import transforms
import pandas as pd
from model import Model

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class TestSet(Dataset):
    """Dataset that yields (image, instruction, label) triples from a dataframe.

    Each row of ``dataframe`` must provide the columns ``img_filename``,
    ``instruction`` and ``label``. The image file is looked up under
    ``image_dir``, converted to RGB and passed through ``transform``.

    Args:
        dataframe: pandas DataFrame with one row per sample.
        image_dir: directory that contains the image files. Defaults to
            ``"screen_spot_images/"``.
        transform: callable applied to the RGB PIL image. When ``None``, the
            default pipeline is used: resize to 224x224, convert to a tensor,
            then normalize each channel with the mean/std values below.
    """

    # Per-channel normalization statistics used by the default transform
    # (the standard ImageNet values used with torchvision's pretrained models).
    _NORM_MEAN = [0.485, 0.456, 0.406]
    _NORM_STD = [0.229, 0.224, 0.225]

    def __init__(self, dataframe, image_dir="screen_spot_images/", transform=None):
        self.dataframe = dataframe
        self.image_dir = image_dir
        if transform is None:
            transform = transforms.Compose([
                transforms.Resize((224, 224)),
                transforms.ToTensor(),
                transforms.Normalize(mean=self._NORM_MEAN, std=self._NORM_STD),
            ])
        self.transform = transform

    def __len__(self):
        """Return the number of samples (rows in the dataframe)."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Return ``(image, instruction, label)`` for the row at position ``index``.

        ``index`` is positional (``iloc``), so it is independent of the
        dataframe's index labels.
        """
        row = self.dataframe.iloc[index]
        image_path = os.path.join(self.image_dir, row["img_filename"])
        # Use a context manager so the file handle is released even if decoding
        # fails; convert() loads the pixel data and returns an independent copy,
        # so the result stays valid after the file is closed.
        with Image.open(image_path) as raw_image:
            img = raw_image.convert("RGB")
        img = self.transform(img)
        return img, row["instruction"], row["label"]
    

In [3]:
# Load the test split and wrap it for batched evaluation.
df = pd.read_csv("seeclick_web_test.csv")
dataset = TestSet(df)
# Accuracy does not depend on sample order, so the evaluation loader is not
# shuffled; a fixed order keeps per-batch debugging output reproducible.
dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

# Preview the first rows to sanity-check the columns the dataset reads.
df.head(10)

Unnamed: 0,img_filename,bbox,instruction,data_type,data_source,normalized_bbox,label
0,web_213f816e-8e80-4d13-970d-1347bbc7a2a8.png,"[2321, 129, 208, 70]",create a new project,text,gitlab,"(0.906640625, 0.08958333333333333, 0.987890625...",1194
1,web_213f816e-8e80-4d13-970d-1347bbc7a2a8.png,"[2401, 14, 111, 68]",view my account,icon,gitlab,"(0.937890625, 0.009722222222222222, 0.98125, 0...",395
2,web_e40f1b3f-0f26-4313-a6a2-d79e1047951b.png,"[194, 15, 645, 66]",search in gitlab,text,gitlab,"(0.07578125, 0.010416666666666666, 0.327734375...",320
3,web_e40f1b3f-0f26-4313-a6a2-d79e1047951b.png,"[1753, 8, 112, 77]",add a new one,icon,gitlab,"(0.684765625, 0.005555555555555556, 0.72851562...",370
4,web_fd8d71f6-4229-4458-a77e-7d8a6347c8e9.png,"[2044, 96, 481, 187]",go to personal homepage,icon,gitlab,"(0.7984375, 0.06666666666666667, 0.986328125, ...",1389
5,web_fd8d71f6-4229-4458-a77e-7d8a6347c8e9.png,"[2043, 492, 483, 89]",sign out,text,gitlab,"(0.798046875, 0.3416666666666667, 0.98671875, ...",3789
6,web_4e1d5837-4731-43f3-8101-52375498c4ad.png,"[427, 234, 150, 96]",switch to explore projects,text,gitlab,"(0.166796875, 0.1625, 0.225390625, 0.229166666...",1919
7,web_4e1d5837-4731-43f3-8101-52375498c4ad.png,"[1601, 350, 116, 66]",star the project with 56 stars,icon,gitlab,"(0.625390625, 0.24305555555555555, 0.670703125...",2664
8,web_4e1d5837-4731-43f3-8101-52375498c4ad.png,"[1704, 678, 83, 59]",fork the a11y project,icon,gitlab,"(0.665625, 0.4708333333333333, 0.698046875, 0....",4968
9,web_bcce7aec-b36a-42c5-8beb-ead23f5ada2c.png,"[197, 232, 1279, 68]",view issues i've created,text,gitlab,"(0.076953125, 0.16111111111111112, 0.5765625, ...",1832


In [4]:
def evaluate(model, test_loader, device):
    """Compute the top-1 accuracy of ``model`` over ``test_loader``.

    The model is switched to evaluation mode (and left in that mode), and the
    forward passes run without gradient tracking.

    Args:
        model: callable module invoked as ``model(images, instructions)`` that
            returns one row of class scores per sample.
        test_loader: iterable of ``(images, instructions, labels)`` batches.
            ``images`` and ``labels`` are moved to ``device``; ``instructions``
            are passed to the model unchanged.
        device: device the image and label tensors are moved to.

    Returns:
        Accuracy as a percentage in ``[0, 100]``. The value is also printed.

    Raises:
        ValueError: if ``test_loader`` yields no samples, since accuracy is
            undefined in that case.
    """
    model.eval()  # Set the model to evaluation mode
    correct = 0
    total = 0
    with torch.no_grad():  # Disable gradient computation
        for images, instructions, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images, instructions)
            # Index of the highest score along the class dimension.
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total == 0:
        # Fail with a clear message instead of an opaque ZeroDivisionError.
        raise ValueError("test_loader yielded no samples; cannot compute accuracy")

    accuracy = 100 * correct / total
    print(f'Accuracy of the model on the test images: {accuracy:.2f}%')
    return accuracy



In [5]:
# Pick the best available backend: CUDA first, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

model = Model(device=device)
# SECURITY: torch.load unpickles the file. The checkpoint is only used as a
# state dict, so restrict loading to tensors/primitive containers rather than
# allowing arbitrary pickled objects. map_location places the tensors on
# `device` regardless of where they were saved from.
checkpoint = torch.load("model.ckpt", map_location=device, weights_only=True)
model.load_state_dict(checkpoint)
model.to(device)  # Ensure model parameters and buffers are on the right device.




Model(
  (cnn): CNN(
    (resnet50): ResNet(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (relu): ReLU(inplace=True)
   

In [6]:
# Score the restored model on the test loader; the returned accuracy
# (a percentage) is displayed as this cell's output.
evaluate(model=model, test_loader=dataloader, device=device)

outputs tensor([[ 0.7824, -1.4307, -1.8650,  ..., -5.9678, -5.9097, -5.9871],
        [ 0.7824, -1.4307, -1.8650,  ..., -5.9678, -5.9097, -5.9871],
        [ 0.7824, -1.4307, -1.8650,  ..., -5.9678, -5.9097, -5.9871],
        ...,
        [ 0.7824, -1.4307, -1.8650,  ..., -5.9678, -5.9097, -5.9871],
        [ 0.7824, -1.4307, -1.8650,  ..., -5.9678, -5.9097, -5.9871],
        [ 0.7824, -1.4307, -1.8650,  ..., -5.9678, -5.9097, -5.9871]],
       device='mps:0')
outputs.data tensor([[ 0.7824, -1.4307, -1.8650,  ..., -5.9678, -5.9097, -5.9871],
        [ 0.7824, -1.4307, -1.8650,  ..., -5.9678, -5.9097, -5.9871],
        [ 0.7824, -1.4307, -1.8650,  ..., -5.9678, -5.9097, -5.9871],
        ...,
        [ 0.7824, -1.4307, -1.8650,  ..., -5.9678, -5.9097, -5.9871],
        [ 0.7824, -1.4307, -1.8650,  ..., -5.9678, -5.9097, -5.9871],
        [ 0.7824, -1.4307, -1.8650,  ..., -5.9678, -5.9097, -5.9871]],
       device='mps:0')
predicted tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

0.0