## Necessary Imports

In [None]:
!python -m spacy download en_core_web_lg

In [1]:
import sys
from lxml import etree
import numpy as np
import spacy
import pickle
import re
import json
import os
import sklearn
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Load the English spaCy pipeline (en_core_web_md). The resulting `nlp` object
# is shared by the rest of the notebook to turn element text and the user
# query into vectors.
nlp = spacy.load("en_core_web_md")

# Node - class

In [3]:
class Node:
    """One HTML element together with its path, category and text embedding.

    Instances are created empty; the caller assigns ``xpath`` and ``element``
    and then calls :meth:`get_vector` to compute the embedding.

    Attributes:
        xpath: Slash-separated tag path of the element (set by the caller).
        element: The parsed HTML element. It must expose ``tag``, ``text``,
            ``get(name)`` and ``items()``.
        vector: Embedding produced by :meth:`get_vector`, or None before the
            first call.
        shape: Shape of ``vector``, or None before the first call.
    """

    # Everything that is not an ASCII letter, digit or whitespace is dropped
    # by preprocess_text. Compiled once instead of on every call.
    _STRIP_PATTERN = re.compile(r'[^a-zA-Z0-9\s]')

    # Tag groups and the category label reported for them. "input" and
    # "script" depend on attributes and are handled in get_type itself.
    # "legent" is kept next to the correct "legend" for backward compatibility.
    _TAG_GROUPS = (
        (("span", "i", "p", "h1", "h2", "h3", "h4", "h5", "h6", "strong",
          "em", "blockquote", "br", "hr"), "Text Elements"),
        (("header", "nav", "article", "section", "aside", "footer", "main",
          "div", "body", "time", "aria-*"), "Semantic Elements"),
        (("class", "id", "style", "data-*", "title"), "Attributes"),
        (("form",), "Form Elements"),
        (("textarea",), "Form Elements (Text Area)"),
        (("label",), "Form Elements (Label)"),
        (("a",), "Links"),
        (("img", "audio", "video", "figure", "picture", "source",
          "figcaption", "map", "area", "iframe", "canvas"), "Media Elements"),
        (("ul", "ol", "li"), "Lists"),
        (("select", "fieldset", "legend", "legent", "submit", "reset",
          "option", "radio", "checkbox"), "Form Tags"),
        (("table", "tr", "td", "th", "thead", "tbody", "tfoot"), "Tables"),
        (("code", "kbd", "samp", "sub", "sup"), "Code"),
        (("dfn", "pre", "var", "wbr"), "Formatting Tags"),
        (("del", "ins", "mark", "s"), "Editorial Tags"),
        (("dd", "dl", "dt"), "Defination Tags"),
        (("details", "summary", "progress", "meter"), "Interactive Elements"),
        (("caption", "colgroup", "col", "th scope="), "Table Enhancements"),
        (("base", "link rel", "stylesheet", "icon", "description"),
         "Meta Information"),
        (("noscript", "svg", "object"), "Scripting & APIs"),
        (("custom-element", "template", "shadow-dom"), "Web Components"),
        (("html",), "HTML Element"),
        (("head",), "Head Element"),
        (("meta",), "Meta Element"),
        (("link",), "Link Element"),
        (("hreflang",), "Hreflang Element"),
    )

    # Flat tag -> label lookup derived from the groups above.
    _TAG_CATEGORIES = {
        tag: label for tags, label in _TAG_GROUPS for tag in tags
    }

    def __init__(self):
        self.xpath = ""
        self.element = None
        self.vector = None
        self.shape = None

    def get_vector(self):
        """Compute the embedding, cache it on the instance and return it.

        Returns:
            The vector produced by :meth:`generate_vector`. It is also stored
            in ``self.vector`` and its shape in ``self.shape``.
        """
        self.vector = self.generate_vector()
        self.shape = np.shape(self.vector)
        return self.vector

    def preprocess_text(self, text):
        """Lowercase ``text`` and drop everything except ASCII letters,
        digits and whitespace.

        NOTE(review): non-ASCII letters (e.g. accented characters) are removed
        as well; confirm this is intended for non-English pages.
        """
        return self._STRIP_PATTERN.sub('', text.lower())

    def generate_vector(self):
        """Build the element's embedding from its text and its attributes.

        The element text (stripped and preprocessed) and one
        ``"<attribute> <value>"`` string per attribute are each passed
        through ``nlp``; the resulting vectors are summed.

        Returns:
            A float64 numpy array. For an element with neither text nor
            attributes this is a zero vector with the same length as a vector
            produced by ``nlp``.
        """
        snippets = []

        # Element text. Whitespace-only text still counts, as it is truthy
        # before stripping.
        text = self.element.text
        if text:
            snippets.append(self.preprocess_text(text.strip()))

        # Attribute names and values, one snippet per attribute.
        for attr, value in self.element.items():
            snippets.append(self.preprocess_text(f"{attr} {value}"))

        if not snippets:
            # Nothing to embed: only here do we need a probe call to learn
            # the vector length.
            return np.zeros(len(nlp("sample").vector))

        embeddings = [nlp(snippet).vector for snippet in snippets]

        # Accumulate in a float64 zero vector so the result dtype does not
        # depend on the dtype of the individual embeddings.
        vector = np.zeros(len(embeddings[0]))
        for embedding in embeddings:
            vector = np.add(vector, embedding)
        return vector

    def get_shape(self):
        """Return the embedding shape as a 1-tuple ``(length,)``.

        Uses the cached vector when there is one and computes it otherwise.
        """
        vector = self.vector if self.vector is not None else self.get_vector()
        return (len(vector),)

    def get_type(self):
        """Return the category label for the element's tag.

        ``input`` elements are split by their ``type`` attribute, ``script``
        elements are only categorised when they carry a ``data-delayed-url``
        attribute, and every other tag is looked up in the tag table.
        Unknown tags yield ``"Other"``.

        The more specific labels for "a" with href, "code" with id, "form",
        "input" or "div" with class/name, "icon" with class and "title" are
        never produced, because those tags already match a general category.
        """
        tag = self.element.tag

        if tag == "input":
            if self.element.get("type") == "button":
                return "Buttons and Interaction (Input Type Button)"
            return "Form Elements (Other Input Types)"

        category = self._TAG_CATEGORIES.get(tag)
        if category is not None:
            return category

        if tag == "script" and self.element.get("data-delayed-url"):
            return "Script Element with Data-delayed-url"
        return "Other"

    def get_actual_name(self):
        """Return the element's own text with surrounding whitespace removed,
        or None when the element has no text."""
        text = self.element.text
        if text:
            return text.strip()
        return None

    def __str__(self):
        return (
            f"Node(xpath: {self.xpath}, element: {self.element.tag}, "
            f"type: {self.get_type()})"
        )

# Html2Vec - class

In [4]:
class Html2Vec:
    """Parse an HTML document and turn its elements into vectorised Nodes."""

    # Tags for which fit() creates a Node. Hoisted to a class constant and
    # de-duplicated. Entries that are not real tag names never match a parsed
    # element and are kept only for compatibility. "legend" was added next to
    # the historical misspelling "legent".
    _VECTORIZED_TAGS = frozenset({
        # document structure / metadata
        "html", "head", "body", "meta", "link", "hreflang", "title", "base",
        "link rel", "stylesheet", "icon", "description", "script",
        "noscript", "svg", "object", "template", "custom-element",
        "shadow-dom",
        # text
        "span", "i", "p", "h1", "h2", "h3", "h4", "h5", "h6", "strong", "em",
        "blockquote", "br", "hr", "code", "kbd", "samp", "sub", "sup", "dfn",
        "pre", "var", "wbr", "del", "ins", "mark", "s", "dd", "dl", "dt",
        # sectioning
        "header", "nav", "article", "section", "aside", "footer", "main",
        "div", "time", "aria-*",
        # attribute-like names
        "class", "id", "style", "data-*", "name", "href", "Other",
        # forms and interaction
        "button", "a", "input", "form", "textarea", "label", "select",
        "fieldset", "legend", "legent", "submit", "reset", "option", "radio",
        "checkbox", "details", "summary", "progress", "meter",
        # media
        "img", "audio", "video", "figure", "picture", "source", "figcaption",
        "map", "area", "iframe", "canvas",
        # lists and tables
        "ul", "ol", "li", "table", "tr", "td", "th", "thead", "tbody",
        "tfoot", "caption", "colgroup", "col", "th scope=",
    })

    def __init__(self, html_content):
        """Parse ``html_content`` with lxml's HTML parser and keep the root."""
        self.tree = etree.HTML(html_content)

    def fit(self):
        """Create a Node with a computed vector for every whitelisted element.

        Returns:
            A list of Node objects in document order. The list is empty when
            there is no parsed tree (``self.tree`` is None), e.g. when the
            input held no parsable element.
        """
        if self.tree is None:
            return []

        nodes = []
        for element in self.tree.iter():
            if element.tag not in self._VECTORIZED_TAGS:
                continue
            node = Node()
            node.xpath = self.get_xpath(element)
            node.element = element
            node.get_vector()
            nodes.append(node)
        return nodes

    def get_xpath(self, element):
        """Return the slash-separated tag path from the root to ``element``.

        The path contains tag names only (no leading slash, no positional
        indices), so sibling elements with the same tag share one path.
        """
        # Collect tags bottom-up, then reverse once.
        tags = [element.tag]
        parent = element.getparent()
        while parent is not None:
            tags.append(parent.tag)
            parent = parent.getparent()
        return '/'.join(reversed(tags))


# UIMapper - class

In [5]:
class UIMapper:
    """Thin facade that turns raw HTML into vectorised nodes."""

    def map_ui_elements(self, html_content):
        """Build an Html2Vec for ``html_content`` and return what its
        ``fit()`` produces."""
        return Html2Vec(html_content).fit()

# ReportGenerator - class

In [6]:
class ReportGenerator:
    """Builds per-element report entries for an HTML document."""

    @staticmethod
    def generate_report(html_content, user_input_vector, similarity_threshold):
        """Describe every named UI element of ``html_content``.

        Args:
            html_content: HTML source handed to ``UIMapper``.
            user_input_vector: Query vector to compare each element against,
                or None to skip the similarity computation.
            similarity_threshold: Minimum cosine similarity an element must
                reach to be reported. None disables the filter. The filter is
                also skipped when ``user_input_vector`` is None, because no
                similarity exists to compare.

        Returns:
            A list of dicts, one per element that has a vector and non-empty
            text, with the keys "Node XPath", "Generated Vector",
            "Node Element Tag", "Node Type", "Actual Name", "Category" and
            "Cosine Similarity" (None when no query vector was given).
        """
        nodes = UIMapper().map_ui_elements(html_content)

        report_list = []
        for node in nodes:
            if node.vector is None:
                continue

            # Only elements with visible text are reported. Check this first
            # so no similarity is computed for entries that are thrown away.
            actual_name = node.get_actual_name()
            if not actual_name:
                continue

            similarity = None
            if user_input_vector is not None:
                # Plain float keeps the entry JSON-serialisable.
                similarity = float(
                    cosine_similarity([user_input_vector], [node.vector])[0][0]
                )
                if (similarity_threshold is not None
                        and similarity < similarity_threshold):
                    continue

            node_type = node.get_type()
            report_list.append({
                "Node XPath": node.xpath,
                # numpy array -> list for JSON serialisation
                "Generated Vector": node.vector.tolist(),
                "Node Element Tag": node.element.tag,
                "Node Type": node_type,
                "Actual Name": actual_name,
                "Category": node_type,  # category mirrors the node type
                "Cosine Similarity": similarity,
            })

        return report_list

# Generate vectors of UI elements & Cosine similarity with User query

In [15]:
from os.path import join
import os

# Function to take user input for UI element text
def get_user_input_text():
    """Return the UI text to search for (currently a fixed query)."""
    return "forgot password?"


def _print_match_details(entry):
    """Print the fields of one report entry shared by both summaries."""
    print("Best Matching Entry:")
    print(f"Node Type: {entry['Node Type']}")
    print(f"Actual Name: {entry['Actual Name']}")
    print(f"Category: {entry['Category']}")
    print(f"Node XPath: {entry['Node XPath']}")
    print(f"Generated Vector: {entry['Generated Vector']}")


# Define the directory path containing HTML files
html_folder_path = r"C:\Users\Ankit\Projects\OCR_AI\html2vec\Final\Hierarchy\linkedin"

# Increase spaCy's max_length limit
nlp.max_length = 2000000  # Set an appropriate value based on your HTML content length

# Initialize variables to store the overall best matching entry
overall_highest_cosine_similarity_global = -1
overall_best_matching_entry_global = None
overall_best_matching_html_file = None

# Get user input for UI element text
user_input_text = get_user_input_text()

# Convert user input text into vector using spaCy
user_input_vector = nlp(user_input_text).vector

# Process each HTML file in the directory and its subdirectories
for root, dirs, files in os.walk(html_folder_path):
    for html_file_name in files:
        if not html_file_name.endswith('.html'):
            continue

        # Construct the full path to the HTML file
        html_file_path = os.path.join(root, html_file_name)

        # Read HTML content from the file
        with open(html_file_path, "r", encoding="utf-8") as html_file:
            html_content = html_file.read()

        report_data = ReportGenerator.generate_report(html_content, user_input_vector, similarity_threshold=None)

        # A page without any named element yields an empty report; there is
        # nothing to rank, and max() on an empty sequence would raise.
        if not report_data:
            continue

        # Best entry of this file. max() returns the first entry among equal
        # maxima, the same entry a strict ">" scan would keep.
        best_matching_entry = max(report_data, key=lambda entry: entry["Cosine Similarity"])
        highest_cosine_similarity = best_matching_entry["Cosine Similarity"]

        # Print the details of the highest cosine similarity for the current HTML file
        print(f"Highest Cosine Similarity for {html_file_name}: {highest_cosine_similarity}")
        _print_match_details(best_matching_entry)
        print()

        # Update overall variables if the current entry has higher cosine similarity
        if highest_cosine_similarity > overall_highest_cosine_similarity_global:
            overall_highest_cosine_similarity_global = highest_cosine_similarity
            overall_best_matching_entry_global = best_matching_entry
            overall_best_matching_html_file = html_file_path

# Print the details of the overall best matching entry across all HTML files
if overall_best_matching_entry_global is not None:
    print(f"Overall Highest Cosine Similarity: {overall_highest_cosine_similarity_global}")
    _print_match_details(overall_best_matching_entry_global)
    print(f"Corresponding HTML File: {overall_best_matching_html_file}")
    print()

    # Open the HTML file corresponding to the overall best match
    print(f"Opening HTML file: {overall_best_matching_html_file}")

    # webbrowser.open expects a URL; handing it a bare file name is not
    # portable, so convert the path to a file:// URI first.
    import webbrowser
    from pathlib import Path
    webbrowser.open(Path(overall_best_matching_html_file).resolve().as_uri())


Highest Cosine Similarity for gmail.html: 0.7139961723700201
Best Matching Entry:
Node Type: Text Elements
Actual Name: Emails from social networks, media-sharing sites, dating services and other social sites will be shown here.
Category: Text Elements
Node XPath: html/body/div/div/div/div/div/div/div/div/div/div/div/div/div/div/div/div/div/div/div/div/p
Generated Vector: [0.6788442730903625, -0.5932208299636841, -1.6883869171142578, 1.1796618700027466, 3.6282622814178467, 1.120784044265747, -0.14288562536239624, 4.312306880950928, -1.6884236335754395, 0.042547762393951416, 5.71486234664917, 1.8804112672805786, -4.98381233215332, 1.5568100214004517, 1.6233388185501099, 1.7912724018096924, 1.8370131254196167, -0.10709978640079498, -3.3536736965179443, -2.0100762844085693, 2.352381944656372, -0.9482662677764893, -3.376128911972046, -0.5315375328063965, -1.3477814197540283, -1.5946191549301147, -1.270690679550171, -1.3487825393676758, -1.740720510482788, -0.4115949869155884, 1.56782746315