In [None]:
import pandas as pd
from dotenv import load_dotenv
from bs4 import BeautifulSoup
import os
import json

load_dotenv()

False

### 1. Get the form we think as search

In [47]:
class FormParser:
    def __init__(self, html_content, base_url=None):
        self.html_content = html_content
        self.base_url = base_url

    def strip_forms_and_inputs(self, forms):
        for form in forms:
            form.pop('css_path')
            for input in form['inputs']:
                input.pop('css_path')
        return forms

    def get_css_path(self, element):
        """
        Generate a CSS path for a given BeautifulSoup element
        """
        path = []
        
        while element and element.name:
            # Build selector for current element
            selector = element.name
            
            # Add ID if present
            if element.get('id'):
                selector += f"#{element['id']}"
                path.append(selector)
                break  # ID is unique, so we can stop here
            
            # Add classes if present
            if element.get('class'):
                classes = '.'.join(element['class'])
                selector += f".{classes}"
            
            # Add attribute selectors for uniqueness if needed
            parent = element.parent
            if parent:
                siblings = parent.find_all(element.name, recursive=False)
                if len(siblings) > 1:
                    # Add nth-child selector if there are multiple siblings
                    index = siblings.index(element) + 1
                    selector += f":nth-child({index})"
            
            path.append(selector)
            element = element.parent
        
        return ' > '.join(reversed(path))

    def parse_forms_from_html(self, html_content, base_url=None):
        """
        Parse HTML content and extract form details including input names, action, and method
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        forms_data = []
        
        # Find all forms
        forms = soup.find_all('form')
        
        for i, form in enumerate(forms):
            form_info = {
                'form_index': i + 1,
                'action': form.get('action', ''),
                'method': form.get('method', 'GET').upper(),
                'id': form.get('id', ''),
                'class': form.get('class', []),
                'inputs': [],
                'css_path': get_css_path(form)
            }
            
            # Make action URL absolute if base_url provided
            if base_url and form_info['action']:
                form_info['action'] = urljoin(base_url, form_info['action'])
            
            # Find all form elements (input, select, textarea)
            form_elements = form.find_all(['input', 'textarea'])
            
            for element in form_elements:
                element_info = {
                    'tag': element.name,
                    'name': element.get('name', ''),
                    'type': element.get('type', ''),
                    'value': element.get('value', ''),
                    'id': element.get('id', ''),
                    'required': element.has_attr('required'),
                    'placeholder': element.get('placeholder', ''),
                    'css_path': get_css_path(element)
                }
                
                form_info['inputs'].append(element_info)
            
            forms_data.append(form_info)
        
        return forms_data


In [48]:
from pydantic import BaseModel
from pydantic import BaseModel, Field
from openai import OpenAI
client = OpenAI()

from langchain_openai import ChatOpenAI
model = ChatOpenAI(model="gpt-4o", temperature=0)


class SearchInput(BaseModel):
    query: str

class MatchRequest(BaseModel):
    query: str
    result: str

class MatchResponse(BaseModel):
    match: bool


class QueryResultMatch:
    def __init__(self, prompt: str):
        self.prompt = prompt

    def check(self, req: MatchRequest) -> MatchResponse:
        prompt = f"""
        Query: {req.query}
        Result: {req.result}

        {self.prompt}
        """

        response = client.responses.create(
            model="gpt-4",  # or gpt-3.5-turbo
            input=prompt,
            temperature=0
        )

        answer = response.output_text.strip()
        match = answer.lower() == "true"
        return MatchResponse(match=match)


class YesNoMatch:
    def __init__(self, prompt: str):
        self.prompt = prompt

    def check(self, req: SearchInput) -> MatchResponse:
        prompt = f"""
        Query: {req.query}

        {self.prompt}
        """

        response = client.responses.create(
            model="gpt-4",  # or gpt-3.5-turbo
            input=prompt,
            temperature=0
        )

        answer = response.output_text.strip()
        match = answer.lower() == "true"
        return MatchResponse(match=match)

In [59]:
html_content = open('www.jiomart.com.html', 'r').read()
soup = BeautifulSoup(html_content, 'html.parser')

forms = soup.find_all('form')

print(forms)

[<form action="" class="aa-Form" novalidate="" role="search"><div class="aa-InputWrapperPrefix"><label aria-label="Submit" class="aa-Label" for="autocomplete-0-input" id="autocomplete-0-label"><button class="aa-SubmitButton" title="Submit" type="submit"><svg class="aa-SubmitIcon" fill="currentColor" height="20" viewbox="0 0 24 24" width="20"><path d="M16.041 15.856c-0.034 0.026-0.067 0.055-0.099 0.087s-0.060 0.064-0.087 0.099c-1.258 1.213-2.969 1.958-4.855 1.958-1.933 0-3.682-0.782-4.95-2.050s-2.050-3.017-2.050-4.95 0.782-3.682 2.050-4.95 3.017-2.050 4.95-2.050 3.682 0.782 4.95 2.050 2.050 3.017 2.050 4.95c0 1.886-0.745 3.597-1.959 4.856zM21.707 20.293l-3.675-3.675c1.231-1.54 1.968-3.493 1.968-5.618 0-2.485-1.008-4.736-2.636-6.364s-3.879-2.636-6.364-2.636-4.736 1.008-6.364 2.636-2.636 3.879-2.636 6.364 1.008 4.736 2.636 6.364 3.879 2.636 6.364 2.636c2.125 0 4.078-0.737 5.618-1.968l3.675 3.675c0.391 0.391 1.024 0.391 1.414 0s0.391-1.024 0-1.414z"></path></svg></button></label><div class

In [60]:
form_parser = FormParser(html_content)
forms = form_parser.parse_forms_from_html(html_content)
stripped_forms = form_parser.strip_forms_and_inputs(forms)
print(stripped_forms)

[{'form_index': 1, 'action': '', 'method': 'GET', 'id': '', 'class': ['aa-Form'], 'inputs': [{'tag': 'input', 'name': '', 'type': 'search', 'value': '', 'id': 'autocomplete-0-input', 'required': False, 'placeholder': 'Search in JioMart'}]}, {'form_index': 2, 'action': '', 'method': 'GET', 'id': 'rel_search_form', 'class': [], 'inputs': [{'tag': 'textarea', 'name': 'shopping-list', 'type': 'textarea', 'value': '', 'id': 'rel_search_val', 'required': False, 'placeholder': 'e.g. Milk, Bread, Fruit'}]}, {'form_index': 3, 'action': '', 'method': 'GET', 'id': 'delivery_pincode_form', 'class': [], 'inputs': [{'tag': 'input', 'name': 'rel_pincode', 'type': 'text', 'value': '', 'id': 'rel_pincode', 'required': False, 'placeholder': 'Enter your Pincode'}]}]


In [61]:
is_search_form = YesNoMatch(prompt="""
Given a form and its inputs in json parsed format, tell me if it's a search box of a website? Answer only with "True" or "False".
""")


In [62]:
import json

idx = 0
print(json.dumps(forms[idx]))
result = is_search_form.check(SearchInput(query=json.dumps(forms[idx])))
print(result.match)

{"form_index": 1, "action": "", "method": "GET", "id": "", "class": ["aa-Form"], "inputs": [{"tag": "input", "name": "", "type": "search", "value": "", "id": "autocomplete-0-input", "required": false, "placeholder": "Search in JioMart"}]}
True


### 2. Get the input we think as search