# Load libraries

In [3]:
from htmlrag import clean_html
from htmlrag import EmbedHTMLPruner
from htmlrag import build_block_tree
from htmlrag import GenHTMLPruner
from htmlrag import BM25HTMLPruner
import torch
from transformers import AutoTokenizer
import os

# Clean HTML

In [4]:
# Kaggle dataset path of the raw HTML page used throughout this notebook.
path = "/kaggle/input/kyanon-data/data_kyanon/datahtml/anhone.html"

# Read through a context manager so the file handle is closed as soon as the
# content is loaded, and decode explicitly instead of relying on the platform
# default encoding (the page contains non-ASCII characters such as arrows).
# NOTE(review): assumes the file is UTF-8 encoded — confirm for other inputs.
with open(path, encoding="utf-8") as html_file:
    html = html_file.read()

# clean_html (from htmlrag) produces the simplified markup that the later
# cells pass to build_block_tree.
simplified_html = clean_html(html)
print(simplified_html)


<div>
<div><div>2018 Baseball Hall of Fame balloting</div><div>1 language</div><div>Article<span>Talk<span>Read</span>EditView history</span></div><div>Tools</div><div><div>←<span>20172019</span>→</div><div>2018 Baseball Hall of Fame</div><div>balloting</div><div>   National Baseball   </div><div>Hall of Fame and Museum</div><div>New inductees<span>6</span></div><div>via BBWAA<span>4</span></div><div>via Modern Baseball Era</div><div>Committee</div><div>2</div><div>Total inductees<span>323</span></div><div>Induction date<span>July 29, 2018</span></div><div>Key</div><div></div><div>Elected to the Hall of Fame on this ballot (named</div><div>in <span>bold italics</span>).</div><div></div><div>Elected subsequently, as of 2025 (named in <span>plain</span></div><div>italics<span>).</span></div><div></div><div>Renominated for the <span>2019 BBWAA election<span> by</span></span></div><div>adequate performance on this ballot and has not</div><div>subsequently been eliminated.</d

In [5]:
# Budgets for the two pruning stages. The active values are the ones the
# original comments describe as a recommended setting for real-world HTML
# documents; the much smaller alternatives that had been left commented out
# are listed in parentheses.

# Embedding-model stage
#   MAX_NODE_WORDS_EMBED     - maximum number of words in a node when
#                              constructing the block tree (alternative: 10)
#   MAX_CONTEXT_WINDOW_EMBED - maximum number of tokens in the output HTML
#                              document after pruning (alternative: 60)
MAX_NODE_WORDS_EMBED, MAX_CONTEXT_WINDOW_EMBED = 256, 6144

# Generative-model stage
#   MAX_NODE_WORDS_GEN       - maximum number of words in a node when
#                              constructing the block tree (alternative: 5)
#   MAX_CONTEXT_WINDOW_GEN   - maximum number of tokens in the output HTML
#                              document after pruning (alternative: 32)
MAX_NODE_WORDS_GEN, MAX_CONTEXT_WINDOW_GEN = 128, 4096

# Build block tree

In [6]:
# Split the cleaned HTML into blocks, capping each node at
# MAX_NODE_WORDS_EMBED words. build_block_tree returns the block list together
# with an HTML string (kept as html_1) that the ranking and pruning cells take
# alongside block_1. For Chinese text, additionally pass zh_char=True.
block_1, html_1 = build_block_tree(
    simplified_html,
    max_node_words=MAX_NODE_WORDS_EMBED,
)

# Every block is read positionally: 0 = content, 1 = tag path, 2 = leaf flag.
labels = ("Block Content: ", "Block Path: ", "Is Leaf: ")
for block in block_1:
    for position, label in enumerate(labels):
        print(label, block[position])
    print("")

# Block Content:  <h1>Bellagio Hotel in Las</h1>
# Block Path:  ['html', 'title']
# Is Leaf:  True
# 
# Block Content:  <div>
# <p>Some other text</p>
# <p>Some other text</p>
# </div>
# Block Path:  ['html', 'div']
# Is Leaf:  True
# 
# Block Content:  <p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>
# Block Path:  ['html', 'p']
# Is Leaf:  True

Block Content:  <div1><div>ballots.</div><div></div><div>Eliminated from annual BBWAA consideration by</div><div>poor performance or expiration on this ballot.</div><div>†First time on the BBWAA ballot.</div><div>*</div><div>Eliminated from annual BBWAA consideration by</div><div>poor performance on this ballot (not expiration).</div><div>Chipper Jones</div><div>Another recent rules change, announced in 2015,</div><div>tightened the qualifications for the BBWAA</div><div>electorate. Beginning with the 2016 election,</div><div>eligible voters must not only have 10years of</div><div>continuous BBWAA membership, but also be</div><div>currently active members, or have held active</div><div>status within the 10years prior to the election. A</div><div>BBWAA member who has not been active for more</div><div>than 10years can regain voting status by covering MLB in the year preceding the election.</div><div>[7]</div><div> As a result of</div><div>the new rule, the vote total in 2016 decr

In [7]:
# Display the HTML string that build_block_tree returned alongside block_1; in
# the recorded output its outer tags carry numeric suffixes (div0, div1, ...).
html_1

'\n<div>\n<div0><div0>2018 Baseball Hall of Fame balloting</div0><div1>1 language</div1><div2>Article<span>Talk<span>Read</span>EditView history</span></div2><div3>Tools</div3><div4><div>←\ue662<span>20172019</span>\ue662→</div><div>2018 Baseball Hall of Fame</div><div>balloting</div><div>   National Baseball   </div><div>Hall of Fame and Museum</div><div>New inductees<span>6</span></div><div>via BBWAA<span>4</span></div><div>via Modern Baseball Era</div><div>Committee</div><div>2</div><div>Total inductees<span>323</span></div><div>Induction date<span>July 29, 2018</span></div><div>Key</div><div>\ue662\ue662\ue662\ue662</div><div>Elected to the Hall of Fame on this ballot (named</div><div>in <span>bold italics</span>).</div><div>\ue662\ue662\ue662\ue662</div><div>Elected subsequently, as of 2025 (named in <span>plain</span></div><div>italics<span>).</span></div><div>\ue662\ue662\ue662\ue662</div><div>Renominated for the <span>2019 BBWAA election<span> by</span></span></div><div>adequat

# Prune HTML Blocks with Embedding Model

In [15]:
# Identifier of the embedding model handed to EmbedHTMLPruner in the next cell.
embed_model="BAAI/bge-large-en"
# Instruction prefix passed to EmbedHTMLPruner as query_instruction_for_retrieval.
query_instruction_for_retrieval = "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: "
# Earlier, shorter wording of the question, kept for reference:
# question = "Given the following HTML content, extract the Hall of Fame voting data as a structured table (Player, Votes, Percent, Change, Year):"

# Query text passed to calculate_block_rankings by the embedding, BM25 and
# generative pruners in the cells below. It embeds two sample snippets of the
# voting table; only "HTML 1" contains the column-header row.
question = """Given two HTML snippets, HTML 1 and HTML 2, do they represent the same table format with the columns "Player", "Votes", "Percent", "Change", and "Year"? If they do, print HTML 1

HTML 1:
<div>PlayerVotesPercentChangeYear</div>
<div>Chipper Jones<span>†<span>41097.2%–1st</span></span></div>
<div>Vladimir Guerrero<span>39292.9%21.2%2nd</span></div>
<div>Jim Thome<span>†<span>37989.8%–1st</span></span></div>
<div>Trevor Hoffman<span>33779.9%5.9%3rd</span></div>
<div>Edgar Martínez<span>29770.4%11.8%9th</span></div>
<div>Mike Mussina<span>26863.5%11.7%5th</span></div>
<div>Roger Clemens<span>24257.3%3.2%6th</span></div>
<div>Barry Bonds<span>23856.4%2.6%6th</span></div>
<div>Curt Schilling<span>21651.2%6.2%6th</span></div>
<div>Omar Vizquel<span>†<span>15637.0%–1st</span></span></div>
<div>Larry Walker<span>14434.1%12.2%8th</span></div>
<div>Fred McGriff<span>9823.2%1.5%9th</span></div>
<div>Manny Ramírez<span>9322.0%1.8%2nd</span></div>
<div>Jeff Kent<span>6114.5%2.2%5th</span></div>
<div>Gary Sheffield<span>4711.1%2.2%4th</span></div>

HTML 2:
<div>Billy Wagner<span>4711.1%0.9%3rd</span></div>
<div>Scott Rolen<span>†<span>4310.2%–1st</span></span></div>
<div>Sammy Sosa<span>337.8%0.8%6th</span></div>
<div>Andruw Jones<span>†<span>317.3%–1st</span></span></div>
<div>Jamie Moyer<span>†*<span>102.4%–1st</span></span></div>
<div>Johan Santana<span>†*<span>102.4%–1st</span></span></div>
<div>Johnny Damon<span>†*<span>81.9%–1st</span></span></div>
<div>Hideki Matsui<span>†*<span>40.9%–1st</span></span></div>
<div>Chris Carpenter<span>†*<span>20.5%–1st</span></span></div>
<div>Kerry Wood<span>†*<span>20.5%–1st</span></span></div>
<div>Liván Hernández<span>†*<span>10.2%–1st</span></span></div>
<div>Carlos Lee<span>†*<span>10.2%–1st</span></span></div>
<div>Orlando Hudson<span>†*<span>00%–1st</span></span></div>
<div>Aubrey Huff<span>†*<span>00%–1st</span></span></div>
<div>Jason Isringhausen<span>†*<span>00%–1st</span></span></div>
<div>Brad Lidge<span>†*<span>00%–1st</span></span></div>
<div>Kevin Millwood<span>†*<span>00%–1st</span></span></div>
<div>Carlos Zambrano<span>†*<span>00%–1st</span></span></div>
"""

In [16]:
# embed_model and query_instruction_for_retrieval come from the previous cell,
# which is their single point of definition; repeating the assignments here
# would let the two copies drift apart.
embed_html_pruner = EmbedHTMLPruner(
    embed_model=embed_model,
    local_inference=True,
    query_instruction_for_retrieval=query_instruction_for_retrieval,
)
# Alternatively, initialise the pruner against a remote TEI model, see
# https://github.com/huggingface/text-embeddings-inference.
# tei_endpoint="http://YOUR_TEI_ENDPOINT"
# embed_html_pruner = EmbedHTMLPruner(embed_model=embed_model, local_inference=False, query_instruction_for_retrieval = query_instruction_for_retrieval, endpoint=tei_endpoint)

# Rank the blocks of block_1 against the question with the embedding model;
# the recorded output is a list of block indices.
block_rankings = embed_html_pruner.calculate_block_rankings(question, html_1, block_1)
print(block_rankings)

[1, 132, 0, 109, 99, 138, 128, 97, 92, 103, 129, 140, 144, 112, 145, 127, 2, 101, 96, 78, 102, 95, 146, 76, 3, 143, 93, 68, 126, 147, 12, 39, 17, 29, 86, 22, 133, 7, 84, 117, 5, 73, 74, 82, 89, 115, 136, 50, 122, 98, 104, 107, 24, 4, 8, 55, 125, 38, 20, 121, 131, 49, 31, 120, 85, 118, 62, 28, 75, 83, 90, 116, 137, 66, 79, 80, 33, 18, 52, 64, 23, 53, 123, 71, 60, 94, 70, 119, 81, 134, 44, 139, 142, 124, 58, 114, 72, 54, 6, 130, 88, 135, 65, 105, 14, 9, 91, 41, 45, 100, 87, 110, 111, 46, 113, 10, 35, 106, 13, 30, 56, 26, 108, 36, 63, 57, 59, 25, 11, 34, 37, 77, 141, 61, 42, 21, 43, 27, 32, 47, 19, 40, 15, 51, 16, 69, 48, 67]


In [17]:
# Rank the same question / html_1 / block_1 inputs with BM25HTMLPruner.
# NOTE(review): this rebinds block_rankings, so when the cells run top to
# bottom the prune_HTML call further down consumes this ranking rather than
# the one computed by the embedding pruner — confirm that is intended.
bm25_html_pruner = BM25HTMLPruner()
block_rankings = bm25_html_pruner.calculate_block_rankings(question, html_1, block_1)
print(block_rankings)

[4, 48, 37, 15, 43, 130, 61, 113, 9, 47, 11, 10, 79, 49, 108, 14, 67, 2, 98, 7, 0, 92, 86, 62, 125, 138, 34, 112, 36, 128, 32, 27, 8, 66, 41, 12, 69, 22, 18, 65, 51, 1, 13, 26, 145, 107, 144, 21, 16, 42, 25, 35, 40, 95, 68, 38, 39, 114, 115, 116, 117, 118, 119, 120, 33, 44, 45, 46, 50, 52, 53, 54, 55, 121, 94, 96, 97, 3, 5, 6, 143, 146, 17, 131, 132, 30, 134, 122, 123, 124, 126, 127, 129, 28, 29, 56, 31, 133, 78, 80, 81, 82, 83, 84, 85, 87, 88, 89, 90, 57, 142, 136, 137, 19, 20, 23, 24, 147, 93, 135, 139, 140, 91, 141, 58, 59, 60, 63, 64, 106, 109, 110, 111, 70, 77, 72, 73, 99, 100, 101, 102, 103, 104, 105, 74, 75, 76, 71]


In [None]:
# Read the access token from the HF_TOKEN environment variable instead of
# writing a literal into os.environ: a hard-coded value leaks the credential
# when the notebook is shared, and a placeholder string makes the check below
# succeed unconditionally. Set HF_TOKEN outside the notebook (shell export,
# platform secrets, ...) before running this cell.
# Blank or whitespace-only values are treated as "not set".
token = os.environ.get("HF_TOKEN", "").strip() or None

if token:
    print("Token successfully retrieved!")
else:
    print("Token not found!")

Token successfully retrieved!


In [19]:
# Tokenizer handed to prune_HTML together with a token budget; the model
# repository is accessed with the token obtained in the previous cell.
chat_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-70B-Instruct",  token=token)

# Prune html_1 down to at most MAX_CONTEXT_WINDOW_EMBED tokens (per the config
# comments) using the current block_rankings.
# NOTE(review): block_rankings is whatever the most recently executed ranking
# cell assigned; in top-to-bottom order that is the BM25 ranking — verify.
pruned_html = embed_html_pruner.prune_HTML(html_1, block_1, block_rankings, chat_tokenizer, MAX_CONTEXT_WINDOW_EMBED)
print(pruned_html)

<div>
<div0><div0>2018 Baseball Hall of Fame balloting</div0><div1>1 language</div1><div4><div>←<span>20172019</span>→</div><div>2018 Baseball Hall of Fame</div><div>balloting</div><div>   National Baseball   </div><div>Hall of Fame and Museum</div><div>New inductees<span>6</span></div><div>via BBWAA<span>4</span></div><div>via Modern Baseball Era</div><div>Committee</div><div>2</div><div>Total inductees<span>323</span></div><div>Induction date<span>July 29, 2018</span></div><div>Key</div><div></div><div>Elected to the Hall of Fame on this ballot (named</div><div>in <span>bold italics</span>).</div><div></div><div>Elected subsequently, as of 2025 (named in <span>plain</span></div><div>italics<span>).</span></div><div></div><div>Renominated for the <span>2019 BBWAA election<span> by</span></span></div><div>adequate performance on this ballot and has not</div><div>subsequently been eliminated.</div><div>Eliminated from annual BBWAA consideration by</div><div>poor perfor

# Prune HTML Blocks with Generative Model

In [20]:
# Build a finer block tree over the already-pruned HTML, capping each node at
# MAX_NODE_WORDS_GEN words. Note that pruned_html is rebound to the HTML string
# returned by build_block_tree, so this cell consumes its own previous output
# if it is run a second time. For Chinese text, additionally pass zh_char=True.
block_2, pruned_html = build_block_tree(
    pruned_html,
    max_node_words=MAX_NODE_WORDS_GEN,
)

# (label, position) pairs describing how each block is printed.
fields = (("Block Content: ", 0), ("Block Path: ", 1), ("Is Leaf: ", 2))
for block in block_2:
    for label, index in fields:
        print(label, block[index])
    print("")

Block Content:  <div0>2018 Baseball Hall of Fame balloting</div0>
Block Path:  ['div', 'div0', 'div0']
Is Leaf:  True

Block Content:  <div1>1 language</div1>
Block Path:  ['div', 'div0', 'div1']
Is Leaf:  True

Block Content:  <div4><div>←<span>20172019</span>→</div><div>2018 Baseball Hall of Fame</div><div>balloting</div><div>   National Baseball   </div><div>Hall of Fame and Museum</div><div>New inductees<span>6</span></div><div>via BBWAA<span>4</span></div><div>via Modern Baseball Era</div><div>Committee</div><div>2</div><div>Total inductees<span>323</span></div><div>Induction date<span>July 29, 2018</span></div><div>Key</div><div></div><div>Elected to the Hall of Fame on this ballot (named</div><div>in <span>bold italics</span>).</div><div></div><div>Elected subsequently, as of 2025 (named in <span>plain</span></div><div>italics<span>).</span></div><div></div><div>Renominated for the <span>2019 BBWAA election<span> by</span></span></div><div>adequate performance on t

In [22]:
# Checkpoint identifier passed to GenHTMLPruner as gen_model.
ckpt_path = "zstanjj/HTML-Pruner-Phi-3.8B"
# Use the GPU when torch reports one as available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

gen_embed_pruner = GenHTMLPruner(gen_model=ckpt_path, device=device)

# Rank the finer blocks (block_2) against the question with the generative model.
block_rankings = gen_embed_pruner.calculate_block_rankings(
    question,
    pruned_html,
    block_2,
)
print(block_rankings)

# Example ranking output noted by the original author: [1, 0]

# Second pruning pass, limited to MAX_CONTEXT_WINDOW_GEN tokens per the config
# comments; pruned_html is overwritten with the result.
pruned_html = gen_embed_pruner.prune_HTML(
    pruned_html,
    block_2,
    block_rankings,
    chat_tokenizer,
    MAX_CONTEXT_WINDOW_GEN,
)
print(pruned_html)