In [1]:
import pandas as pd
from dotenv import load_dotenv
from bs4 import BeautifulSoup
import os
import json

# Load variables from a .env file into the process environment.
# NOTE(review): presumably the helpers imported below read API keys from there - confirm.
load_dotenv()

# IPython autoreload in mode 2: modules are re-imported before each cell executes,
# so edits to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

### 1. Find the form we think is the search form

In [2]:
from helpers.forms import FormParser
from helpers.llm import YesNoMatch, QueryResultMatch, SearchInput, MatchRequest, MatchResponse
from importlib import reload


In [17]:
# Read the saved Smartprix page. The context manager closes the file handle as
# soon as the contents are in memory (the bare open(...).read() left it open).
with open('./html_files/www.smartprix.com.html', 'r') as html_file:
    html_content = html_file.read()
soup = BeautifulSoup(html_content, 'html.parser')

# Every <form> element on the page is a candidate for the site's search form.
forms = soup.find_all('form')

print(forms)

[<form action="/products" autocomplete="off" method="get"><div class="sm-input type-text no-label"><!-- --><input aria-label="Search" class="input empty" name="q" placeholder="Search with Smartprix" type="text" value=""/><!-- --><!-- --></div><svg class="icon i-search" style="" viewbox="0 0 24 24"><path d="M9.5,3A6.5,6.5 0 0,1 16,9.5C16,11.11 15.41,12.59 14.44,13.73L14.71,14H15.5L20.5,19L19,20.5L14,15.5V14.71L13.73,14.44C12.59,15.41 11.11,16 9.5,16A6.5,6.5 0 0,1 3,9.5A6.5,6.5 0 0,1 9.5,3M9.5,5C7,5 5,7 5,9.5C5,12 7,14 9.5,14C12,14 14,12 14,9.5C14,7 12,5 9.5,5Z"></path></svg></form>]


In [18]:
# Run the same HTML through the project's FormParser to get the forms as plain dicts.
form_parser = FormParser(html_content)
forms = form_parser.parse_forms_from_html(html_content)

# Second representation of the same forms; in the recorded output it lacks the
# 'css_path' entries that the full version carries.
stripped_forms = form_parser.strip_forms_and_inputs(forms)

# Full version first, stripped version second, one per line.
print(forms, stripped_forms, sep='\n')

[{'form_index': 1, 'action': '/products', 'method': 'GET', 'id': '', 'class': [], 'inputs': [{'tag': 'input', 'name': 'q', 'type': 'text', 'value': '', 'id': '', 'required': False, 'placeholder': 'Search with Smartprix', 'css_path': 'div#app > header > div.sm-search-header > div.sm-search-bar-wrap:nth-child(1) > div.sm-search-bar > form > div.sm-input.type-text.no-label > input.input.empty'}], 'css_path': 'div#app > header > div.sm-search-header > div.sm-search-bar-wrap:nth-child(1) > div.sm-search-bar > form'}]
[{'form_index': 1, 'action': '/products', 'method': 'GET', 'id': '', 'class': [], 'inputs': [{'tag': 'input', 'name': 'q', 'type': 'text', 'value': '', 'id': '', 'required': False, 'placeholder': 'Search with Smartprix'}]}]


In [19]:
# Yes/no matcher configured with a prompt asking whether a JSON-serialised form is a
# website's search box.
# NOTE(review): YesNoMatch comes from helpers.llm; how the prompt is combined with the
# query and sent to a model is not visible in this notebook - confirm there.
is_search_form = YesNoMatch(prompt="""
Given a form and its inputs in json parsed format, tell me if it's a search box of a website? Answer only with "True" or "False".
""")


In [20]:
import json

# Index of the parsed form to classify.
idx = 0

# Serialise the chosen form once; the same JSON text is shown and sent to the matcher.
form_json = json.dumps(forms[idx])
print(form_json)

result = is_search_form.check(SearchInput(query=form_json))
print(result.match)

{"form_index": 1, "action": "/products", "method": "GET", "id": "", "class": [], "inputs": [{"tag": "input", "name": "q", "type": "text", "value": "", "id": "", "required": false, "placeholder": "Search with Smartprix", "css_path": "div#app > header > div.sm-search-header > div.sm-search-bar-wrap:nth-child(1) > div.sm-search-bar > form > div.sm-input.type-text.no-label > input.input.empty"}], "css_path": "div#app > header > div.sm-search-header > div.sm-search-bar-wrap:nth-child(1) > div.sm-search-bar > form"}
True


### 2. Find the input we think is the search box

In [None]:
# Home pages of the shopping sites under consideration, one URL per line.
websites = [
    'https://www.tatacliq.com',
    'https://www.jiomart.com',
    'https://www.boat-lifestyle.com',
    'https://www.moglix.com',
    'https://www.reliancedigital.in',
    'https://play.google.com',
    'https://www.smartprix.com',
    'https://www.swiggy.com',
    'https://www.flipkart.com',
    'https://blinkit.com',
    'https://www.bigbasket.com',
    'https://www.91mobiles.com',
    'https://www.poorvika.com',
    'https://www.zeptonow.com',
    'https://dir.indiamart.com',
    'https://www.ajio.com',
    'https://www.myntra.com',
    'https://www.vijaysales.com',
    'https://m.snapdeal.com',
    'https://www.croma.com',
    'https://www.nykaafashion.com',
]

## 3. Get product highlights

In [23]:
from helpers.products import ProductParser

In [64]:
# Read the saved Amazon search-results page. The context manager closes the file
# handle once the contents are in memory (the bare open(...).read() left it open).
with open('./html_files/www.amazon.in.search_results.html', 'r') as html_file:
    html_content = html_file.read()
products_parser = ProductParser(html_content)

# Each entry is a dict with the link's 'href' and its 'css_path' (see the recorded output).
products = products_parser.parse_all_links_from_html(html_content)

print(products)
len(products)

[{'href': None, 'css_path': 'a#nav-top'}, {'href': '#skippedLink', 'css_path': 'a#nav-assist-skip-to-main-content'}, {'href': '.s-asin a:has(h2)', 'css_path': 'a#nav-assist-skip-to-results'}, {'href': '#s-refinements > div.a-section > div:first-child', 'css_path': 'a#nav-assist-skip-to-filters'}, {'href': '#twotabsearchtextbox', 'css_path': 'a#nav-assist-search'}, {'href': '/gp/cart/view.html/?ref_=nav_assist', 'css_path': 'a#nav-assist-cart'}, {'href': '/?ref_=nav_assist', 'css_path': 'a#nav-assist-home'}, {'href': '/gp/css/order-history/?ref_=nav_assist', 'css_path': 'a#nav-assist-your-orders'}, {'href': '/ref=nav_logo', 'css_path': 'a#nav-logo-sprites'}, {'href': '', 'css_path': 'a#nav-global-location-popover-link'}, {'href': '/customer-preferences/edit?ie=UTF8&preferencesReturnUrl=%2F&ref_=topnav_lang', 'css_path': 'div#icp-nav-flyout > a.nav-a.nav-a-2.icp-link-style-2'}, {'href': 'https://www.amazon.in/ap/signin?openid.pape.max_auth_age=0&openid.return_to=https%3A%2F%2Fwww.amazon.

539

In [65]:
from locale import currency
from urllib.parse import urlparse
from pydantic import BaseModel
from typing import List

class Product(BaseModel):
    """A link extracted from the page, with its CSS path in both split and joined form."""
    # Position of the link in the parsed `products` list (set from enumerate()).
    index: int
    # Link target; this notebook stores the output of get_raw_link() here.
    href: str
    # CSS path split into segments on ' > '.
    css_path: List[str]
    # The same CSS path as a single ' > '-joined selector string.
    path: str
    # Always set to '' in the visible code.
    # NOTE(review): intended meaning is not shown in this notebook - confirm.
    max_common: str

def get_raw_link(href: str) -> str:
    """Return only the host and path of ``href``.

    The scheme, query string and fragment are dropped. A site-relative href
    such as ``/a/b?x=1`` has no host, so just its path is returned.
    """
    parts = urlparse(href)
    return parts.netloc + parts.path


# URL of the site being analysed; passed to if_external_link() as the reference host.
current_url = 'https://www.amazon.in/'


def if_external_link(href: str, current_url: str) -> bool:
    """Tell whether ``href`` points to a different host than ``current_url``.

    Rules, applied in order:

    * ``//host/path`` (protocol-relative) and ``http...`` hrefs are external
      when their host differs from the host of ``current_url``.
    * Any other href starting with ``/`` is site-relative and never external.
    * Everything else is parsed as if prefixed with ``https://``; it counts as
      internal only when the resulting host equals the current host, so
      fragment-only or scheme-less relative hrefs are reported as external.

    Args:
        href: The raw ``href`` attribute value; must be a non-empty string.
        current_url: Absolute URL of the page the link was found on.

    Returns:
        True if the link leaves the current host, False otherwise.
    """
    current_host = urlparse(current_url).netloc

    # Protocol-relative links also begin with '/', so they must be tested before
    # the site-relative rule; otherwise '//other.host/x' is misread as internal.
    if href.startswith('//') or href.startswith('http'):
        return urlparse(href).netloc != current_host

    if href.startswith('/'):
        return False

    # No scheme and no leading slash: treat the first segment as a host name.
    return urlparse('https://' + href).netloc != current_host


# Keep only links that have a non-empty href and stay on the current site.
# `index` is the link's position in the unfiltered `products` list, and the CSS
# path is stored both split on ' > ' and in its original joined form.
products_split: list[Product] = [
    Product(
        index=position,
        href=get_raw_link(link['href']),
        css_path=link['css_path'].split(' > '),
        path=link['css_path'],
        max_common='',
    )
    for position, link in enumerate(products)
    if link['href'] and not if_external_link(link['href'], current_url)
]

In [66]:
# Intended to map the first CSS-path segment to its links; it stays empty
# because the code that fills it is commented out below.
temp_mapping = {}

# Print every retained link together with its position in products_split.
for idx, ps in enumerate(products_split):
    print(idx, ps)
# Disabled experiment: bucket each link under css_path[0].
#     if ps.css_path[0] not in temp_mapping:
#         temp_mapping[ps.css_path[0]] = []
#     temp_mapping[ps.css_path[0]].append(ps)

# print(temp_mapping)

0 index=5 href='/gp/cart/view.html/' css_path=['a#nav-assist-cart'] path='a#nav-assist-cart' max_common=''
1 index=6 href='/' css_path=['a#nav-assist-home'] path='a#nav-assist-home' max_common=''
2 index=7 href='/gp/css/order-history/' css_path=['a#nav-assist-your-orders'] path='a#nav-assist-your-orders' max_common=''
3 index=8 href='/ref=nav_logo' css_path=['a#nav-logo-sprites'] path='a#nav-logo-sprites' max_common=''
4 index=10 href='/customer-preferences/edit' css_path=['div#icp-nav-flyout', 'a.nav-a.nav-a-2.icp-link-style-2'] path='div#icp-nav-flyout > a.nav-a.nav-a-2.icp-link-style-2' max_common=''
5 index=11 href='www.amazon.in/ap/signin' css_path=['div#nav-link-accountList', 'a.nav-a.nav-a-2.nav-progressive-attribute'] path='div#nav-link-accountList > a.nav-a.nav-a-2.nav-progressive-attribute' max_common=''
6 index=12 href='/gp/css/order-history' css_path=['a#nav-orders'] path='a#nav-orders' max_common=''
7 index=13 href='/gp/cart/view.html' css_path=['a#nav-cart'] path='a#nav-c

In [67]:
# Number of segments in the longest CSS path among the retained links.
max_length = max(len(p.css_path) for p in products_split)

In [68]:
# Display the longest CSS-path length as the cell's output.
max_length

20

In [69]:
# Iterate path lengths from longest to shortest. The unconditional `break` ends
# the loop after the first pass, so only the links whose css_path has exactly
# max_length segments are collected and printed.
for l in range(max_length, 0, -1):
    filtered_products = [p for p in products_split if len(p.css_path) == l]
    print(filtered_products)
    break


[Product(index=379, href='/s', css_path=['div#search', 'div.s-desktop-width-max.s-desktop-content.s-opposite-dir.s-wide-grid-style.sg-row:nth-child(1)', 'div.sg-col-4-of-4.sg-col-20-of-24.s-matching-dir.sg-col-16-of-20.sg-col.sg-col-12-of-12.sg-col-8-of-8.sg-col-12-of-16:nth-child(1)', 'div.sg-col-inner', 'span.rush-component.s-latency-cf-section:nth-child(1)', 'div.s-main-slot.s-result-list.s-search-results.sg-row:nth-child(1)', 'div.sg-col-4-of-4.sg-col-20-of-24.s-result-item.sg-col-16-of-20.s-widget.sg-col.sg-col-12-of-12.sg-col-8-of-8.sg-col-12-of-16.s-widget-spacing-large:nth-child(29)', 'div.sg-col-inner', 'div.s-widget-container.s-spacing-medium.s-widget-container-height-medium.celwidget.slot=MAIN.template=TEXT_REFORMULATION.widget=loom-desktop-bottom-slot_related-searches.pf_rd_p=b5876297-ded6-4475-876b-d315610bb575.pf_rd_r=07KWF3YJNNKEW45GWFKY.pd_rd_wg-LdaVL.pd_rd_w-zXsu2.content-id=amzn1.sym.b5876297-ded6-4475-876b-d315610bb575:amzn1.sym.b5876297-ded6-4475-876b-d315610bb575.p

In [70]:
# Bucket the retained links by href: each key maps to the list of Product
# entries that share that href, in their original order.
grouped = {}

for p in products_split:
    grouped.setdefault(p.href, []).append(p)

print(grouped)

{'/gp/cart/view.html/': [Product(index=5, href='/gp/cart/view.html/', css_path=['a#nav-assist-cart'], path='a#nav-assist-cart', max_common='')], '/': [Product(index=6, href='/', css_path=['a#nav-assist-home'], path='a#nav-assist-home', max_common='')], '/gp/css/order-history/': [Product(index=7, href='/gp/css/order-history/', css_path=['a#nav-assist-your-orders'], path='a#nav-assist-your-orders', max_common='')], '/ref=nav_logo': [Product(index=8, href='/ref=nav_logo', css_path=['a#nav-logo-sprites'], path='a#nav-logo-sprites', max_common='')], '/customer-preferences/edit': [Product(index=10, href='/customer-preferences/edit', css_path=['div#icp-nav-flyout', 'a.nav-a.nav-a-2.icp-link-style-2'], path='div#icp-nav-flyout > a.nav-a.nav-a-2.icp-link-style-2', max_common=''), Product(index=526, href='/customer-preferences/edit', css_path=['div#icp-touch-link-language', 'a.icp-language-link'], path='div#icp-touch-link-language > a.icp-language-link', max_common='')], 'www.amazon.in/ap/signin

In [None]:
def get_most_common_parent(v):
    """Find the CSS-path prefix shared by all links in ``v``.

    Args:
        v: Non-empty sequence of objects exposing ``css_path`` (list of
            selector segments), ``index`` and ``href``.

    Returns:
        A ``Product`` copied from ``v[0]`` whose ``css_path``/``path`` hold the
        segments before the first position where the paths disagree. ``None``
        when the paths already differ at the first segment, or when no
        disagreement is found within the length of the shortest path.
    """
    shortest = min(len(item.css_path) for item in v)

    # Different root segments: there is no shared ancestor to report.
    roots = {item.css_path[0] for item in v}
    if len(roots) > 1:
        return None

    for depth in range(shortest):
        segments_at_depth = {item.css_path[depth] for item in v}
        if len(segments_at_depth) <= 1:
            continue
        # First position where the paths diverge; everything before it is shared.
        shared_prefix = v[0].css_path[:depth]
        return Product(
            index=v[0].index,
            href=v[0].href,
            css_path=shared_prefix,
            path=' > '.join(shared_prefix),
            max_common=''
        )

    return None

# Build the candidate list: a href seen once contributes its single link; a href
# seen several times contributes the shared-parent entry when one is found,
# otherwise all of its links.
candidates = []

for v in grouped.values():
    if len(v) == 1:
        candidates.append(v[0])
        continue
    parent = get_most_common_parent(v)
    candidates.extend([parent] if parent else v)

for c in candidates:
    print(c)

0 /customer-preferences/edit
['div#icp-nav-flyout', 'div#icp-touch-link-language']
0 /customer-preferences/edit
['div#icp-nav-flyout', 'div#icp-touch-link-language']
index=10 href='/customer-preferences/edit' css_path=[] path='' max_common=''
0 www.amazon.in/ap/signin
['div#nav-link-accountList', 'div#nav-flyout-ya-signin']
0 www.amazon.in/ap/signin
['div#nav-link-accountList', 'div#nav-flyout-ya-signin']
index=11 href='www.amazon.in/ap/signin' css_path=[] path='' max_common=''
0 /gp/css/order-history
['a#nav-orders', 'a#nav_prefetch_yourorders']
0 /gp/css/order-history
['a#nav-orders', 'a#nav_prefetch_yourorders']
index=12 href='/gp/css/order-history' css_path=[] path='' max_common=''
0 /hz/wishlist/ls
['div#nav-al-wishlist', 'div#nav-al-your-account']
0 /hz/wishlist/ls
['div#nav-al-wishlist', 'div#nav-al-your-account']
index=16 href='/hz/wishlist/ls' css_path=[] path='' max_common=''
0 /gp/css/homepage.html
['div#nav-al-your-account', 'div#navFooter']
0 /gp/css/homepage.html
['div#na