In [2]:
import json
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer

# Load JSON data
file_path = "final_training_data.json"  # Update with correct file path
with open(file_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# Convert to DataFrame
df = pd.DataFrame(data)

# Select features
df = df[["tag", "id", "classes", "attributes", "bounding_x", "bounding_y", "bounding_width", "bounding_height", "cssSelector"]]

# Encode target variable (CSS Selector)
css_selector_encoder = LabelEncoder()
df["cssSelector"] = css_selector_encoder.fit_transform(df["cssSelector"])

# Combine textual features into a single string
df["text"] = df.apply(lambda row: f"{row['tag']} {row['id']} {row['classes']} {row['attributes']}", axis=1)

# Tokenize input text
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokens = tokenizer(df["text"].tolist(), padding=True, truncation=True, return_tensors="pt")

# Define features (X) and labels (Y)
X = tokens["input_ids"]
y = df["cssSelector"].values


In [3]:
print(X)

tensor([[ 101, 2538, 1012,  ..., 1012, 1014,  102],
        [ 101, 2539, 1012,  ..., 1014,  102,    0],
        [ 101, 2654, 1012,  ..., 1014,  102,    0],
        ...,
        [ 101, 2861, 1012,  ..., 1014,  102,    0],
        [ 101, 1018, 1012,  ..., 1014,  102,    0],
        [ 101, 2385, 1012,  ..., 1012, 1014,  102]])


In [4]:
print(y)

[1983 2053 2077 ... 1992 1993 2052]


In [5]:
print(df)

      tag  id  classes  attributes  bounding_x   bounding_y  bounding_width  \
0      21   0       49         841    0.000000  -810.000000      572.666687   
1      19   0        0           0    0.000000     0.000000        0.000000   
2      28   0        0           2    0.000000     0.000000        0.000000   
3      28   0        0         845    0.000000     0.000000        0.000000   
4      28   0        0         844    0.000000     0.000000        0.000000   
...   ...  ..      ...         ...         ...          ...             ...   
2102   10   0       53          56   16.000000  1270.479248      540.666687   
2103   10   0       52          55   16.000000  1270.479248      540.666687   
2104   31   0       33          35   16.000000  1270.479248      540.666687   
2105    4   0        0           0  461.229187  1272.479248        0.000000   
2106   16   0        0         792    0.000000  1312.875000        0.000000   

      bounding_height  cssSelector                 

In [6]:
from sklearn.model_selection import train_test_split
import torch

# Convert labels to Tensor format
y = torch.tensor(y)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Move data to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_train, X_test, y_train, y_test = X_train.to(device), X_test.to(device), y_train.to(device), y_test.to(device)

print(f"Training size: {len(X_train)}, Testing size: {len(X_test)}")


Training size: 1685, Testing size: 422


In [7]:
from transformers import BertModel, BertForSequenceClassification
import torch.nn as nn

# Define Transformer-based Model
class DOMSelectorModel(nn.Module):
    def __init__(self, num_classes):
        super(DOMSelectorModel, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)  # Fully connected layer for classification

    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # Take [CLS] token output
        return self.fc(cls_output)

# Initialize model
num_classes = len(css_selector_encoder.classes_)
model = DOMSelectorModel(num_classes).to(device)


In [8]:
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Define loss function & optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)

# Create DataLoaders
train_data = TensorDataset(X_train, y_train)
test_data = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
test_loader = DataLoader(test_data, batch_size=16, shuffle=False)

# Training loop
num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    
    for batch in train_loader:
        X_batch, y_batch = batch
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        
        # Backward pass
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(train_loader)}")


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch 1/5, Loss: 7.714084184394692
Epoch 2/5, Loss: 7.709784278329813
Epoch 3/5, Loss: 7.630587978183098
Epoch 4/5, Loss: 7.583169370327356
Epoch 5/5, Loss: 7.563081628871414


In [9]:
from sklearn.metrics import accuracy_score

model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        X_batch, y_batch = batch
        outputs = model(X_batch)
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(y_batch.cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
print(f"Model Accuracy: {accuracy:.2f}")


Model Accuracy: 0.00


In [10]:
torch.save(model.state_dict(), "dom_selector_transformer.pth")

In [11]:
model.load_state_dict(torch.load("dom_selector_transformer.pth"))
model.eval()


DOMSelectorModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementw

In [12]:
def predict_css_selector(tag, element_id, classes, attributes):
    text = f"{tag} {element_id} {classes} {attributes}"
    tokens = tokenizer(text, padding=True, truncation=True, return_tensors="pt").to(device)
    
    with torch.no_grad():
        output = model(tokens["input_ids"])
        pred = torch.argmax(output, dim=1).cpu().numpy()[0]
    
    return css_selector_encoder.inverse_transform([pred])[0]

# Example prediction
predicted_selector = predict_css_selector("li", "check-scoring", "", "[{'name': 'class', 'value': 'reference internal'}]")
print(f"Predicted CSS Selector: {predicted_selector}")


Predicted CSS Selector: #check-scoring > dl > dd > dl > dd:nth-child(1) > dl > dd:nth-child(4) > div
