In [2]:
import sys

from IPython import embed

sys.path.append('../toolkit/')
from htmlrag import clean_html

question="When was the bellagio in las vegas built?"
html="""
<html>
<head>
<title>When was the bellagio in las vegas built?</title>
</head>
<body>
<p class="class0">The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>
</body>
<div>
<div>
<p>Some other text</p>
<p>Some other text</p>
</div>
</div>
<p class="class1"></p>
<!-- Some comment -->
<script type="text/javascript">
document.write("Hello World!");
</script>
</html>
"""


simplified_html = clean_html(html)
print(simplified_html)


# <html>
# <title>When was the bellagio in las vegas built?</title>
# <p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>
# <div>
# <p>Some other text</p>
# <p>Some other text</p>
# </div>
# </html>

<html>
<title>When was the bellagio in las vegas built?</title>
<p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>
<div>
<p>Some other text</p>
<p>Some other text</p>
</div>
</html>


In [3]:
from htmlrag import build_block_tree

# if you have mutiple HTML documents, merge them using the following code
# simplified_html = "\n".join([clean_html(html) for html in htmls])
block_tree, simplified_html=build_block_tree(simplified_html, max_node_words=10)
for block in block_tree:
    print("Block Content: ", block[0])
    print("Block Path: ", block[1])
    print("Is Leaf: ", block[2])
    print("")

# Block Content:  <title>When was the bellagio in las vegas built?</title>
# Block Path:  ['html', 'title']
# Is Leaf:  True
# 
# Block Content:  <div>
# <p>Some other text</p>
# <p>Some other text</p>
# </div>
# Block Path:  ['html', 'div']
# Is Leaf:  True
# 
# Block Content:  <p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>
# Block Path:  ['html', 'p']
# Is Leaf:  True


Block Content:  <title>When was the bellagio in las vegas built?</title>
Block Path:  ['html', 'title']
Is Leaf:  True

Block Content:  <div>
<p>Some other text</p>
<p>Some other text</p>
</div>
Block Path:  ['html', 'div']
Is Leaf:  True

Block Content:  <p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>
Block Path:  ['html', 'p']
Is Leaf:  True



In [5]:
from htmlrag import EmbedHTMLPruner

embed_model="/train_data_load/tjj_wst_hf/tjj_hf/bge-large-en/"
query_instruction_for_retrieval = "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: "
embed_html_pruner = EmbedHTMLPruner(embed_model=embed_model, local_inference=True, query_instruction_for_retrieval = query_instruction_for_retrieval)
# alternatively you can init a remote TEI model, refer to https://github.com/huggingface/text-embeddings-inference.
# tei_endpoint="http://YOUR_TEI_ENDPOINT"
# embed_html_pruner = EmbedHTMLPruner(embed_model=embed_model, local_inference=False, query_instruction_for_retrieval = query_instruction_for_retrieval, endpoint=tei_endpoint)
block_rankings=embed_html_pruner.calculate_block_rankings(question, simplified_html, block_tree)
print(block_rankings)

# [0, 2, 1]

[0, 2, 1]


In [7]:
#. alternatively you can use bm25 to rank the blocks
from htmlrag import BM25HTMLPruner
bm25_html_pruner = BM25HTMLPruner()
assert len(block_rankings)==len(block_tree), f"The number of block rankings {len(block_rankings)} should be equal to the number of blocks {len(block_tree)}"
block_rankings=bm25_html_pruner.calculate_block_rankings(question, simplified_html, block_tree)
print(block_rankings)

# [0, 2, 1]

[0, 2, 1]


In [6]:
from transformers import AutoTokenizer

chat_tokenizer=AutoTokenizer.from_pretrained("../../../huggingface/Meta-Llama-3.1-70B-Instruct")

max_context_window=60
pruned_html=embed_html_pruner.prune_HTML(simplified_html, block_tree, block_rankings, chat_tokenizer, max_context_window)
print(pruned_html)

# <html>
# <title>When was the bellagio in las vegas built?</title>
# <p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>
# </html>

OSError: Incorrect path_or_model_id: '../../../huggingface/Meta-Llama-3.1-70B-Instruct'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [5]:
block_tree, pruned_html=build_block_tree(pruned_html, max_node_words=10)
for block in block_tree:
    print("Block Content: ", block[0])
    print("Block Path: ", block[1])
    print("Is Leaf: ", block[2])
    print("")
    
# Block Content:  <title>When was the bellagio in las vegas built?</title>
# Block Path:  ['html', 'title']
# Is Leaf:  True
# 
# Block Content:  <p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>
# Block Path:  ['html', 'p']
# Is Leaf:  True

Block Content:  <title>When was the bellagio in las vegas built?</title>
Block Path:  ['html', 'title']
Is Leaf:  True

Block Content:  <p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>
Block Path:  ['html', 'p']
Is Leaf:  True



In [11]:
from htmlrag import GenHTMLPruner

ckpt_path = "../../../model/train-tree-rerank-llama32/v1008/checkpoint-381/"
gen_embed_pruner = GenHTMLPruner(gen_model=ckpt_path, max_node_words=5)
block_rankings=gen_embed_pruner.calculate_block_rankings(question, pruned_html, block_tree)
print(block_rankings)

# [1, 0]

[0, 1]


In [12]:
max_context_window=32
pruned_html=gen_embed_pruner.prune_HTML(pruned_html, block_tree, block_rankings, chat_tokenizer, max_context_window)
print(pruned_html)

# <p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>

<title>When was the bellagio in las vegas built?</title>


In [13]:
import numpy as np
list(np.argsort([1,2])[::-1])

[1, 0]