In [1]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
file_path = "1739099940358_dom.json"
with open(file_path, "r", encoding="utf-8") as file:
    data = json.load(file)

dom_data = data["domData"]

df = pd.DataFrame(dom_data)

df = df[["tag", "id", "classes", "attributes", "xpath", "cssSelector", "boundingBox"]]

# Expand bounding box into separate columns
df["bounding_x"] = df["boundingBox"].apply(lambda x: x.get("x", None))
df["bounding_y"] = df["boundingBox"].apply(lambda x: x.get("y", None))
df["bounding_width"] = df["boundingBox"].apply(lambda x: x.get("width", None))
df["bounding_height"] = df["boundingBox"].apply(lambda x: x.get("height", None))

# Drop original bounding box column
df = df.drop(columns=["boundingBox"])

# Convert list attributes to string
df["classes"] = df["classes"].apply(lambda x: " ".join(x) if isinstance(x, list) else "")
df["attributes"] = df["attributes"].apply(lambda x: str(x) if isinstance(x, list) else "")

# Encode categorical features (e.g., tag, id, classes, attributes)
label_encoders = {}
for col in ["tag", "id", "classes", "attributes"]:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le  # Store encoder for later use


In [3]:
df

Unnamed: 0,tag,id,classes,attributes,xpath,cssSelector,bounding_x,bounding_y,bounding_width,bounding_height
0,21,0,49,841,"id(""undefined"")/html[1]",html,0.000000,-810.000000,572.666687,2122.875000
1,19,0,0,0,"id(""undefined"")/html[1]/head[1]",html > head,0.000000,0.000000,0.000000,0.000000
2,28,0,0,2,"id(""undefined"")/html[1]/head[1]/meta[1]",html > head > meta:nth-child(1),0.000000,0.000000,0.000000,0.000000
3,28,0,0,845,"id(""undefined"")/html[1]/head[1]/meta[2]",html > head > meta:nth-child(2),0.000000,0.000000,0.000000,0.000000
4,28,0,0,844,"id(""undefined"")/html[1]/head[1]/meta[3]",html > head > meta:nth-child(3),0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...
2102,10,0,53,56,/html/body/footer[1]/div[1]/div[1],html > body > footer > div > div,16.000000,1270.479248,540.666687,26.395834
2103,10,0,52,55,/html/body/footer[1]/div[1]/div[1]/div[1],html > body > footer > div > div > div,16.000000,1270.479248,540.666687,26.395834
2104,31,0,33,35,/html/body/footer[1]/div[1]/div[1]/div[1]/p[1],html > body > footer > div > div > div > p,16.000000,1270.479248,540.666687,26.395834
2105,4,0,0,0,/html/body/footer[1]/div[1]/div[1]/div[1]/p[1]...,html > body > footer > div > div > div > p > br,461.229187,1272.479248,0.000000,21.333334


In [4]:
# Define input features (X) and target variable (Y)
X = df.drop(columns=["cssSelector", "xpath"])  # Features
y = df["cssSelector"]  # Target: CSS Selector

# Encode target variable (CSS Selector)
y_encoder = LabelEncoder()
y = y_encoder.fit_transform(y)

# Split data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}")


Training samples: 1685, Testing samples: 422


In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Initialize model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")


Model Accuracy: 0.00


In [18]:
def predict_css_selector(sample):
    # Convert sample to DataFrame
    sample_df = pd.DataFrame([sample])

    # Encode categorical features using stored encoders
    for col in ["tag", "id", "classes", "attributes"]:
        if col in label_encoders:
            known_classes = set(label_encoders[col].classes_)
            if col == "id":
                print("id", known_classes)
            if col == "classes":
                print("classes", known_classes)
            if col == "attributes":
                print("attributes", known_classes)
            # sample_df[col] = sample_df[col].apply(lambda x: x if x in known_classes else "unknown")
            sample_df[col] = label_encoders[col].transform(sample_df[col])

    # Predict class
    pred = model.predict(sample_df)[0]

    # Convert prediction back to original CSS Selector
    predicted_selector = y_encoder.inverse_transform([pred])[0]
    return predicted_selector

# Example sample (fixing empty `id`)
sample_element = {
    "tag": "li",
    "id": "check-scoring",  # Set unknown for empty strings
    "classes": "",
    "attributes": "[{'name': 'class', 'value': 'reference internal'}, {'name': 'href', 'value': 'sklearn.gaussian_process.kernels.WhiteKernel.html'}]",
    "bounding_x": -413.5,
    "bounding_y": -5.3125,
    "bounding_width": 215.33334350585938,
    "bounding_height": 37.04166793823242
}

predicted_selector = predict_css_selector(sample_element)
print(f"Predicted CSS Selector: {predicted_selector}")

classes {'', 'bd-content', 'bd-header__inner bd-page-width', 'bd-docs-nav bd-links', 'nav-link', 'me-auto navbar-header-items__center', 'svg-inline--fa fa-chevron-down', 'nav-link dropdown-item nav-internal', 'svg-inline--fa fa-list', 'breadcrumb-item breadcrumb-home', 'fontawesome-i2svg-active fontawesome-i2svg-complete', 'header-article-items__start', 'bd-container__inner bd-page-width', 'sidebar-secondary-item', 'toctree-l1 current active has-children', 'mi', 'field-even', 'svg-inline--fa fa-magnifying-glass', 'versionadded', 'rubric', 'bd-main', 'toctree-toggle', 'header-article-items header-article__inner', 'svg-inline--fa fa-sun theme-switch fa-lg', 'bd-search d-flex align-items-center', 'form-control', 'go', 'bd-navbar-elements navbar-nav', 'reference internal nav-link', 'bd-header navbar navbar-expand-lg bd-navbar d-print-none', 'pst-navbar-icon sidebar-toggle secondary-toggle', 'svg-inline--fa fa-house', 'bd-toc-nav page-toc', 'docutils literal notranslate', 'doctest highlight

In [19]:
import joblib

# Save model and encoders
joblib.dump(model, "dom_selector_model.pkl")
joblib.dump(label_encoders, "label_encoders.pkl")
joblib.dump(y_encoder, "y_encoder.pkl")


['y_encoder.pkl']