In [None]:
# process websrc dataset
import random
from tqdm import tqdm
import csv
import json
import argparse
import os.path as osp
import os
from operator import itemgetter


def parse_args():
    """Parse the command-line options for the WebSRC conversion.

    Returns:
        argparse.Namespace: with the attributes ``root_dir``, ``version``,
        ``suffix`` and ``f`` (all strings, ``f`` defaults to ``None``).
    """
    parser = argparse.ArgumentParser()
    # The two help literals are concatenated by the parser, so the first one
    # must end with a space (it previously rendered as "alsobe").
    parser.add_argument("--root_dir", default="../html_data/websrc/release/", type=str,
                        help="The root directory of the raw WebSRC dataset; The output SQuAD-style json file will also "
                             "be placed here.")
    parser.add_argument("--version", default="websrc1.0", type=str,
                        help="The version of the generating dataset, which will also be the name of the json file.")
    parser.add_argument("--suffix", default="", type=str,
                        help="Other suffix to distinguish different dataset.")
    # NOTE(review): presumably present to absorb the "-f <connection file>"
    # argument a Jupyter kernel is started with -- confirm before removing.
    parser.add_argument("-f", default=None, type=str)
    return parser.parse_args()


def convert_csv_to_dict(args):
    """Convert every ``dataset.csv`` below ``args.root_dir`` into one nested dict.

    The result has the shape
    ``{'version': args.version, 'data': [{'domain': ..., 'websites': [...]}]}``
    where each website is ``{'qas': [...], 'page_id': ...}`` and each qa is
    ``{'question': ..., 'id': ..., 'answers': [...]}``.

    Rows are sorted by their ``id`` column. Every CSV row becomes its own qa
    entry holding a single answer dict. Rows are grouped into one website as
    long as ``id[:-5]`` stays the same; ``page_id`` is ``id[2:-5]``.

    Args:
        args: object exposing ``root_dir`` and ``version``.

    Returns:
        dict: the converted dataset.
    """
    print('Start Converting')

    data, websites, qas, answers = [], [], [], []
    last_domain = None

    for d, _, fs in tqdm(os.walk(args.root_dir)):
        for f in fs:
            if f != 'dataset.csv':
                continue
            # print('Now converting', d + '/' + f)
            # Close the CSV handle as soon as the rows are materialised.
            with open(osp.join(d, f)) as csv_file:
                raw_data = list(csv.DictReader(csv_file))
            if not raw_data:
                # A header-only file has no rows; raw_data[0] below would raise.
                continue
            # The domain is the grandparent directory of the CSV file; using
            # os.path keeps this independent of the path separator.
            curr_domain = osp.basename(osp.dirname(d))
            if last_domain != curr_domain and last_domain is not None:
                data.append({'domain': last_domain, 'websites': websites})
                websites = []
            last_domain = curr_domain

            raw_data.sort(key=itemgetter('id'))

            last = raw_data[0]
            for i, current in enumerate(raw_data):
                if i != 0:
                    # Flush the previous row as a finished qa entry.
                    qas.append({'question': last['question'],
                                'id'      : last['id'],
                                'answers' : answers})  # , 'type': last['type']}
                    answers = []
                if last['id'][:-5] != current['id'][:-5]:
                    # The page prefix changed: close the previous website.
                    websites.append({'qas': qas, 'page_id': last['id'][2:-5]})
                    qas = []
                if 'answer' in current:
                    answer = {'text'        : current['answer'],
                              'element_id'  : int(current['element_id']),
                              'answer_start': int(current['answer_start'])}
                else:
                    answer = {}
                answers.append(answer)
                last = current

            # Flush whatever is still pending for this file.
            if len(answers) > 0:
                qas.append({'question': last['question'],
                            'id'      : last['id'],
                            'answers' : answers})  # , 'type'    : last['type']}
                answers = []
            if len(qas) > 0:
                websites.append({'qas': qas, 'page_id': last['id'][2:-5]})
                qas = []

    # Only emit the trailing domain if at least one non-empty CSV was seen.
    if last_domain is not None:
        data.append({'domain': last_domain, 'websites': websites})
    dataset = {'version': args.version, 'data': data}
    print('Converting Finished\n')

    return dataset


def dataset_split(args, dataset, test_size=400):
    """Build the flat test/dev sample lists from the websites listed as ``dev``.

    ``dataset_split.json`` under ``args.root_dir`` supplies the ``dev`` and
    ``test`` website names (domain prefix + page prefix, two characters each).
    For every page of a dev website that has at least one qa, the first qa is
    paired with the page's HTML read from
    ``<root_dir>/<domain>/<page_id[:2]>/processed_data/<page_id>.html``.
    The resulting samples are shuffled; the first ``test_size`` are returned as
    the test set and the remainder as the dev set.

    A website listed under both ``test`` and ``dev`` is treated as test and
    therefore contributes no samples.

    Args:
        args: object exposing ``root_dir``.
        dataset: dict as produced by ``convert_csv_to_dict``.
        test_size: number of shuffled samples kept as the test set.

    Returns:
        tuple[list[dict], list[dict]]: ``(final_test_data, final_dev_data)``;
        each sample has the keys ``domain``, ``website``, ``question``,
        ``answers`` and ``html``.
    """
    with open(osp.join(args.root_dir, 'dataset_split.json')) as split_file:
        split = json.load(split_file)
    data = dataset['data']
    count_website = set()
    for domain in data:
        for website in domain['websites']:
            count_website.add(domain['domain'][0:2] + website['page_id'][0:2])
    print('The number of total websites is', len(count_website))

    dev_list, test_list = split['dev'], split['test']
    train_list = [name for name in count_website
                  if name not in dev_list and name not in test_list]
    print('The train websites list is', train_list)
    print('The test websites list is', test_list)
    print('The dev websites list is', dev_list)

    dev_data = []
    cnt = 0
    for domain in data:
        dev_websites = []
        last = None
        for website in domain['websites']:
            # Count each run of consecutive pages sharing a two-character prefix once.
            prefix = website['page_id'][0:2]
            if last != prefix:
                last = prefix
                cnt += 1
            name = domain['domain'][0:2] + prefix
            if name in test_list:
                continue
            if name in dev_list:
                dev_websites.append(website)
        if len(dev_websites) != 0:
            dev_data.append({'domain': domain['domain'], 'websites': dev_websites})
    print('The number of processed websites is', cnt)

    #  take the first qa pair of each dev page; one qa pair forms one sample
    final_test_data = []
    for domain in dev_data:
        for website in domain['websites']:
            if len(website['qas']) > 0:
                html_file = os.path.join(args.root_dir, domain['domain'], website['page_id'][:2],
                                         "processed_data", website['page_id'] + '.html')
                with open(html_file, 'r') as html_fp:
                    html_text = html_fp.read()
                qa = website['qas'][0]
                answers = [a['text'] for a in qa['answers']]
                final_test_data.append({'domain': domain['domain'], 'website': website['page_id'],
                                        'question': qa['question'], 'answers': answers, 'html': html_text})
    print('The number of final test samples is', len(final_test_data))
    #  shuffle, then split into the first `test_size` samples and the remainder
    random.shuffle(final_test_data)
    final_dev_data = final_test_data[test_size:]
    final_test_data = final_test_data[:test_size]
    return final_test_data, final_dev_data
   
dataset = "websrc"
args = parse_args()
full_dataset = convert_csv_to_dict(args)
final_test_data, final_dev_data = dataset_split(args, full_dataset)
#  write the test split first, then the dev split, one JSON object per line
for split_name, split_samples in (("test", final_test_data), ("dev", final_dev_data)):
    with open(f'../html_data/{dataset}/{dataset}-{split_name}.jsonl', 'w') as f:
        for sample in tqdm(split_samples):
            f.write(json.dumps(sample, ensure_ascii=False) + '\n')

In [None]:
# process wstqa dataset
import json

dataset = "wstqa"

# Read the source file through a context manager so the handle is closed.
with open(f"../html_data/{dataset}/origin-wstqa-test.jsonl") as f:
    data_lines = [json.loads(l) for l in f]
for line in data_lines:
    # "answers" is a list of lists in the source file; flatten it one level.
    line["answers"] = [item for sublist in line.pop("answers") for item in sublist]
    # Rename the positive document fields to the keys used by the other datasets.
    line["html"] = line.pop("positive_html")
    line["raw_text"] = line.pop("positive_reference")

data_file = f"../html_data/{dataset}/wstqa-test.jsonl"
with open(data_file, "w") as f:
    for line in data_lines:
        f.write(json.dumps(line, ensure_ascii=False) + "\n")

In [None]:
import random
def sample_text_from_html(soup):
    """Pick a random snippet of up to three words from the text of ``soup``.

    Text nodes longer than 20 characters are preferred; if there are none, all
    non-empty text nodes are used. Nodes containing a backslash are skipped.
    One node is sampled, split on single spaces, and a random window of three
    consecutive words is taken (the whole text if it has fewer than three).

    The snippet is returned with regular-expression metacharacters
    backslash-escaped, so it can be handed to ``re.compile`` and match itself.

    Args:
        soup: object whose ``find_all(string=True)`` yields text nodes that
            expose a ``.text`` attribute (e.g. a BeautifulSoup document).

    Returns:
        str: the escaped snippet.

    Raises:
        ValueError: if no candidate text is available to sample from.
    """
    # Local import: the cell defining this function only imports ``random``.
    import re

    texts = [node.text.strip() for node in soup.find_all(string=True) if node.text.strip()]
    #  filter texts with length > 20
    raw_texts = [text for text in texts if len(text) > 20]
    if len(raw_texts) == 0:
        raw_texts = texts
    #  skip text containing "\"
    raw_texts = [text for text in raw_texts if "\\" not in text]
    if not raw_texts:
        raise ValueError("no text without a backslash is available to sample a snippet from")
    # random.seed(2023)
    words = random.sample(raw_texts, 1)[0].split(" ")
    if len(words) < 3:
        snippet = " ".join(words)
    else:
        # random window of 3 consecutive words
        start_word = random.randint(0, len(words) - 3)
        snippet = " ".join(words[start_word:start_word + 3])
    #  add a \ before special characters; "$" and "|" are included because an
    #  unescaped "$" or "|" changes what the pattern matches
    snippet = re.sub(r"([\\`*{}[\]()#+\-^.!?$|])", r"\\\1", snippet)

    return snippet



In [2]:
#  init tokenizer
from transformers import AutoTokenizer

# Relative path handed to AutoTokenizer.from_pretrained below.
# NOTE(review): looks like a local checkpoint directory -- confirm it exists on the machine running this cell.
tokenizer_path = "../../../huggingface/Baichuan2-7B-Chat/"
# `tokenizer` is reused by later cells (tokenize probe, token-length statistics).
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)

In [3]:
# Probe how the tokenizer splits a few common HTML tags (see the output below).
tokenizer.tokenize("<p> </p> <div></div> <h1></h1> <title> </title> <li>")

['<p>',
 '▁',
 '</p>',
 '▁<',
 'div',
 '></',
 'div',
 '>',
 '▁',
 '<h1>',
 '</h1>',
 '▁<',
 'title',
 '>',
 '▁</',
 'title',
 '>',
 '▁',
 '<li>']

In [None]:
#  simplify html text for single html text
from bs4 import Comment
#  remove css and js tags
import re
from bs4 import BeautifulSoup as bs
import json
import os
from tqdm import tqdm

dataset = "arxiv"
segment = "cs"
# dataset = "sinafinance"
# segment = "01"
data_file = f"../html_data/{dataset}/{dataset}-{segment}.jsonl"
# Read the input through a context manager so the handle is closed.
with open(data_file) as f:
    data_lines = [json.loads(l) for l in f]

original_token_lens = []
simple_token_lens = []
for data_line in tqdm(data_lines):
    html_text = data_line["html"]
    # original_token_lens.append(len(tokenizer.encode(html_text)))
    soup = bs(html_text, "html.parser")
    #  remove css and js tags
    for script in soup(["script", "style"]):
        script.decompose()
    #  remove all attributes
    for tag in soup.find_all(True):
        tag.attrs = {}
    #  remove tags without any visible text
    for tag in soup.find_all():
        if not tag.text.strip():
            tag.decompose()
    #  remove href attributes (attributes were already cleared above; kept as a safeguard)
    for tag in soup.find_all("a"):
        del tag["href"]
    #  remove comments
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()
    #  strip surrounding whitespace inside leaf tags
    for tag in soup.find_all():
        #  select leaf nodes
        if not tag.findChild():
            tag.string = tag.text.strip()
    simple_html_text = str(soup)
    #  collapse runs of newlines
    simple_html_text = re.sub(r"\n+", "\n", simple_html_text)
    #  remove leading and trailing spaces and concatenate into a single line
    simple_html_text = "".join([line.strip() for line in simple_html_text.split("\n") if line.strip()])
    # The simplified markup replaces the original "html" field in place.
    data_line["html"] = simple_html_text
    # simple_token_lens.append(len(tokenizer.encode(simple_html_text)))

# average_original_token_len = sum(original_token_lens) / len(original_token_lens)
# average_simple_token_len = sum(simple_token_lens) / len(simple_token_lens)
# print(f"original token lens: {average_original_token_len} -> simple token lens: {average_simple_token_len}")
with open(f"../html_data/{dataset}/{dataset}-{segment}-simple.jsonl", "w") as f:
    for line in data_lines:
        f.write(json.dumps(line, ensure_ascii=False) + "\n")

In [None]:
#  simplify html text for multiple html texts
from bs4 import Comment
#  remove css and js tags
import re
from bs4 import BeautifulSoup as bs
import json
import os
from tqdm import tqdm

dataset = "hotpot-qa"
split="test"
rewrite_method="slimplmqr"

data_file = f"../html_data/{dataset}/bing/binghtml-{rewrite_method}-{dataset}-{split}.jsonl"
# Read the input through a context manager so the handle is closed.
with open(data_file) as f:
    data_lines = [json.loads(l) for l in f]

original_token_lens = []
simple_token_lens = []
for data_line in tqdm(data_lines):
    # Each sample holds several retrieved documents; simplify each one in place.
    for ref in data_line[f'{rewrite_method}_results']:
        html_text=ref["html"]
        # original_token_lens.append(len(tokenizer.encode(html_text)))
        soup = bs(html_text, "html.parser")
        #  remove css and js tags
        for script in soup(["script", "style"]):
            script.decompose()
        #  remove all attributes
        for tag in soup.find_all(True):
            tag.attrs = {}
        #  remove tags without any visible text
        for tag in soup.find_all():
            if not tag.text.strip():
                tag.decompose()
        #  remove href attributes (attributes were already cleared above; kept as a safeguard)
        for tag in soup.find_all("a"):
            del tag["href"]
        #  remove comments
        comments = soup.find_all(string=lambda text: isinstance(text, Comment))
        for comment in comments:
            comment.extract()
        #  strip surrounding whitespace inside leaf tags
        for tag in soup.find_all():
            #  select leaf nodes
            if not tag.findChild():
                tag.string = tag.text.strip()
        simple_html_text = str(soup)
        #  collapse runs of newlines
        simple_html_text = re.sub(r"\n+", "\n", simple_html_text)
        #  remove leading and trailing spaces and concatenate into a single line
        simple_html_text = "".join([line.strip() for line in simple_html_text.split("\n") if line.strip()])
        ref["html"] = simple_html_text
    # simple_token_lens.append(len(tokenizer.encode(simple_html_text)))

with open(f"../html_data/{dataset}/bing/binghtml-{rewrite_method}-{dataset}-simple-{split}.jsonl", "w") as f:
    for line in data_lines:
        f.write(json.dumps(line, ensure_ascii=False) + "\n")

In [None]:
# Scratch check of the leaf-stripping step; relies on `soup` left over from a previous cell.
tag = soup.find_all()[0]
print(tag)
# Assigning .string replaces the tag's contents with the stripped text ...
tag.string=tag.text.strip()
# ... after which the tag is printed as having no child tag.
print(tag.findChild())

In [None]:
from html.parser import HTMLParser
class MLStripper(HTMLParser):
    """HTML parser that drops all markup and collects only the text data."""

    def __init__(self):
        # HTMLParser.__init__ already calls reset(); convert_charrefs=True makes
        # the parser decode character references before handle_data is called.
        super().__init__(convert_charrefs=True)
        self.strict = False
        self.fed = []

    def handle_data(self, d):
        """Collect one chunk of text data."""
        self.fed.append(d)

    def get_data(self):
        """Return all collected text joined without a separator."""
        return "".join(self.fed)


def strip_tags(html: str) -> str:
    """Return ``html`` with every tag removed and character references decoded.

    Args:
        html: markup to strip.

    Returns:
        The concatenated text content.
    """
    s = MLStripper()
    s.feed(html)
    # close() flushes buffered input; without it, trailing text that contains
    # an unterminated "&" is held back by the parser and lost.
    s.close()
    return s.get_data()

In [None]:
def random_neighbor_tag(tag, max_jump=2, max_neighbor_tags=20):
    """Pick a random node reachable from ``tag`` within ``max_jump`` steps.

    Candidates are collected breadth-first, starting with ``tag`` itself. One
    step goes to the parent, to the next/previous sibling (only if it has
    non-blank ``.string``), or to a child with non-blank ``.string``.
    Expansion stops once more than ``max_neighbor_tags`` candidates exist.

    Args:
        tag: start node (a bs4 element or anything exposing ``parent``,
            ``next_sibling``, ``previous_sibling`` and optionally ``children``).
        max_jump: maximum number of navigation steps away from ``tag``.
        max_neighbor_tags: candidate-count threshold that stops the expansion.

    Returns:
        list: ``[relative_path, node, flag]``. ``relative_path`` is attribute
        access code relative to ``tag`` (``""`` for ``tag`` itself, e.g.
        ``".parent.next_sibling"`` or ``".contents[2]"``). ``flag`` is ``0`` if
        the node was expanded and ``1`` otherwise.
    """
    # Local import so the function does not rely on another cell's import.
    import random

    neighbor_tags = [["", tag, 1]]
    # Step count per candidate, kept in parallel. The path string cannot serve
    # as a depth measure: its length is a character count, not a jump count.
    depths = [0]

    def _add(path, node, depth):
        neighbor_tags.append([path, node, 1])
        depths.append(depth)

    pos = 0
    while pos < len(neighbor_tags):
        if len(neighbor_tags) > max_neighbor_tags:
            break
        entry, depth = neighbor_tags[pos], depths[pos]
        pos += 1
        if depth >= max_jump:
            continue
        path, node = entry[0], entry[1]
        # add parent tag
        if node.parent:
            _add(path + ".parent", node.parent, depth + 1)
        # add next sibling tag
        if node.next_sibling and node.next_sibling.string and node.next_sibling.string.strip():
            _add(path + ".next_sibling", node.next_sibling, depth + 1)
        # add previous sibling tag
        if node.previous_sibling and node.previous_sibling.string and node.previous_sibling.string.strip():
            _add(path + ".previous_sibling", node.previous_sibling, depth + 1)
        # Candidates reached in an earlier step may have no children at all.
        if hasattr(node, "children"):
            for ci, child in enumerate(node.children):
                if len(neighbor_tags) > max_neighbor_tags:
                    break
                if child.string and child.string.strip():
                    # `.children` is an iterator and cannot be indexed; `.contents`
                    # is the list it iterates over, so the index is the same.
                    _add(f"{path}.contents[{ci}]", child, depth + 1)
        entry[2] = 0
    #  randomly select one neighbor tag
    return random.choice(neighbor_tags)
    
    

In [None]:
#  randomly construct prompt
import random

def random_bs_prompt_zh(html_text, target_str, relative_path):
    """Assemble a randomly worded Chinese prompt asking for the output of a bs4 snippet.

    The prompt has five newline-joined parts, each drawn independently with
    ``random.choice``: the question, the import line, the ``html_text``
    assignment, the parse line and the lookup statement.

    Args:
        html_text: HTML embedded verbatim in the ``html_text=...`` line.
        target_str: pattern embedded in ``re.compile('...')``.
        relative_path: attribute path appended after ``.parent``.

    Returns:
        str: the assembled prompt.
    """
    lookup = f"soup.find(string=re.compile('{target_str}')).parent"
    question_variants = [
        "请问以下关于beautiful soup的代码段的执行结果是什么？",
        "输出一下关于beautiful soup的代码的执行结果。",
        "以下代码的执行结果是什么？",
        "如果执行以下代码，会输出什么？",
    ]
    import_variants = ["from bs4 import BeautifulSoup as bs", ""]
    html_variants = [
        f"html_text='{html_text}'",
        f'html_text="{html_text}"',
        f'html_text="""{html_text}"""',
    ]
    parse_variants = [
        "soup=bs(html_text, 'html.parser')",
        "soup=bs(html_text, 'html.parser') # 解析html文本",
    ]
    query_variants = [
        f"print({lookup}{relative_path})",
        f"{lookup}{relative_path}",
        f"tag={lookup}\nprint(tag{relative_path})",
    ]
    pieces = []
    for variants in (question_variants, import_variants, html_variants, parse_variants, query_variants):
        pieces.append(random.choice(variants))
    return "\n".join(pieces)

def random_bs_prompt_en(html_text, target_str, relative_path):
    """Assemble a randomly worded English prompt asking for the output of a bs4 snippet.

    The prompt has five newline-joined parts, each drawn independently with
    ``random.choice``: the question, the import line, the ``html_text``
    assignment, the parse line and the lookup statement.

    Args:
        html_text: HTML embedded verbatim in the ``html_text=...`` line.
        target_str: pattern embedded in ``re.compile('...')``.
        relative_path: attribute path appended after ``.parent``.

    Returns:
        str: the assembled prompt.
    """
    lookup = f"soup.find(string=re.compile('{target_str}')).parent"
    question_variants = [
        "What's the execution result of the following code snippet?",
        "What's the output of the following code snippet?",
        "What's the result of the following code snippet?",
        "What's the result of the following code snippet?",
    ]
    import_variants = [
        "from bs4 import BeautifulSoup as bs",
        "from bs4 import BeautifulSoup",
        "import bs4",
        "import BeautifulSoup as bs",
    ]
    html_variants = [
        f"html_text='{html_text}'",
        f'html_text="{html_text}"',
        f'html_text="""{html_text}"""',
    ]
    parse_variants = [
        "soup=bs(html_text, 'html.parser')",
        "soup=bs(html_text, 'html.parser') # parse the html text",
    ]
    query_variants = [
        f"print({lookup}{relative_path})",
        f"{lookup}{relative_path}",
        f"tag={lookup}\nprint(tag{relative_path})",
    ]
    pieces = []
    for variants in (question_variants, import_variants, html_variants, parse_variants, query_variants):
        pieces.append(random.choice(variants))
    return "\n".join(pieces)

In [None]:
#  construct training data with beautiful soup
from tqdm import tqdm
import json
from bs4 import BeautifulSoup as bs
import re

dataset="sinafinance"
segment="01"
# dataset="arxiv"
# segment="cs"
task="bs4"
lang="zh"
data_file=f"../html_data/{dataset}/{dataset}-{segment}-simple.jsonl"
# Read the input through a context manager so the handle is closed.
with open(data_file) as f:
    data_lines=[json.loads(l) for l in f]

#  sample the first 1000 samples
# data_lines=data_lines[:1000]

print(len(data_lines))
training_data_lines=[]
for i, data_line in tqdm(enumerate(data_lines), total=len(data_lines)):
    html_text=data_line["html"]
    soup=bs(html_text, "html.parser")
    #  sample a short (regex-escaped) text snippet from the document
    target_str=sample_text_from_html(soup)
    # print(target_str)
    #  locate the text node that matches the snippet
    target_tag=soup.find(string=re.compile(target_str))
    # Check before dereferencing .parent; otherwise a miss surfaces as an
    # AttributeError and this message is never shown.
    assert target_tag is not None, f"target tag is None, target_str: {target_str}"
    #  randomly switch to the parent tag, a child tag, the next sibling tag, or the previous sibling tag
    relative_path, target_tag, _ = random_neighbor_tag(target_tag.parent)

    if lang=="zh":
        question=random_bs_prompt_zh(html_text, target_str, relative_path)
    else:
        question=random_bs_prompt_en(html_text, target_str, relative_path)
    # The expected answer is the string form of the selected node.
    answers=str(target_tag)
    training_data_lines.append({"id": f"{dataset}_{segment}_{task}_{i}", "messages": [{"role": "user", "content": question}, {"role": "assistant", "content": answers}]})

with open(f"../html_data/{dataset}/{dataset}-{segment}-{task}-sft.jsonl", "w") as f:
    for line in training_data_lines:
        f.write(json.dumps(line, ensure_ascii=False) + "\n")
        

In [None]:
# Scratch inspection; relies on `target_tag` left over from the last iteration of the bs4 data-construction cell.
target_tag.parent

In [None]:
#  construct training data with markdownify
from markdownify import markdownify as md
from tqdm import tqdm
import json

def random_markdown_prompt_zh(html_text):
    """Assemble a randomly worded Chinese HTML-to-markdown conversion prompt.

    The first line is ``html_text`` in one of four wrappers (bare, triple
    backticks, triple double quotes, triple parentheses); the second line is
    one of four instruction sentences. Both are drawn with ``random.choice``.

    Args:
        html_text: HTML to embed in the prompt.

    Returns:
        str: ``"<wrapped html>\\n<instruction>"``.
    """
    wrapped_variants = [
        f"{html_text}",
        f"```{html_text}```",
        f'"""{html_text}"""',
        f"((({html_text})))",
    ]
    instruction_variants = [
        "请问以上html文本对应的markdown文本是什么？",
        "请写出以上html文本对应的markdown文本。",
        "请将以上html文本转换为markdown文本。",
        "请将以上html文本转换为markdown格式。",
    ]
    wrapped = random.choice(wrapped_variants)
    instruction = random.choice(instruction_variants)
    return "\n".join([wrapped, instruction])

def random_markdown_prompt_en(html_text):
    """Assemble a randomly worded English HTML-to-markdown conversion prompt.

    The first line is ``html_text`` in one of four wrappers (bare, triple
    backticks, triple double quotes, triple parentheses); the second line is
    one of four instruction sentences. Both are drawn with ``random.choice``.

    Args:
        html_text: HTML to embed in the prompt.

    Returns:
        str: ``"<wrapped html>\\n<instruction>"``.
    """
    wrapped_variants = [
        f"{html_text}",
        f"```{html_text}```",
        f'"""{html_text}"""',
        f"((({html_text})))",
    ]
    instruction_variants = [
        "What's the markdown text corresponding to the above html text?",
        "Please write the markdown text corresponding to the above html text.",
        "Please convert the above html text to markdown text.",
        "Please convert the above html text to markdown format.",
    ]
    wrapped = random.choice(wrapped_variants)
    instruction = random.choice(instruction_variants)
    return "\n".join([wrapped, instruction])



dataset="sinafinance"
segment="01"
# dataset="arxiv"
# segment="cs"
lang="zh"
task="markdownify"
data_file=f"../html_data/{dataset}/{dataset}-{segment}-simple.jsonl"

# Read the input through a context manager so the handle is closed.
with open(data_file) as f:
    data_lines=[json.loads(l) for l in f]

#  sample the first 1000 samples
# data_lines=data_lines[:1000]

print(len(data_lines))
training_data_lines=[]
for i, data_line in tqdm(enumerate(data_lines), total=len(data_lines)):
    html_text=data_line["html"]
    # The markdownify conversion of the document is the expected answer.
    markdown=md(html_text)
    if lang=="zh":
        question=random_markdown_prompt_zh(html_text)
    else:
        question=random_markdown_prompt_en(html_text)
    answers= markdown
    training_data_lines.append({"id": f"{dataset}_{segment}_{task}_{i}", "messages": [{"role": "user", "content": question}, {"role": "assistant", "content": answers}]})

with open(f"../html_data/{dataset}/{dataset}-{segment}-{task}-sft.jsonl", "w") as f:
    for line in training_data_lines:
        f.write(json.dumps(line, ensure_ascii=False) + "\n")

In [None]:
# Scratch inspection of one record; relies on `data_lines` from a previous cell.
# NOTE(review): the records must carry "question" and "answers" keys -- confirm which file was loaded.
idx=23
question=data_lines[idx]["question"]
answers=data_lines[idx]["answers"]


In [None]:
# Scratch check: character offset of a citation string in the last `html_text` (-1 if absent).
html_text.find("and Vaz [2022]")

In [None]:
# Scratch exploration of bs4 navigation; relies on `data_lines`, `bs` and `re` from previous cells.
idx=117
print(data_lines[idx].keys())
html_text=data_lines[idx]["html"]
soup=bs(html_text, "html.parser")
#  find the first text node matching the phrase
res=soup.find(string=re.compile("and two computer scientists"))
print(res)
#  names of all ancestors, innermost first
for p in res.parents:
    print(p.name)
#  every sibling that follows the matched text node
for s in res.next_siblings:
    print(s)

In [None]:
# Print the source URL of the record selected by `idx` in a previous cell.
print(f"{data_lines[idx]['url']}")
# print(f"html text: {html_text}")


In [None]:
dataset="arxiv"
# Build the path from `dataset` so the directory cannot be misspelled (was "arixv").
data_file=f"../html_data/{dataset}/{dataset}-test.jsonl"
# str.replace returns a new string; the result must be assigned to take effect.
data_file = data_file.replace("test", "train")
print(data_file)

In [None]:
#  collect html sft dataset
import json
import os


version="0510"
datset_segments=[
    ("sinafinance", "01", "bs4"),
    ("sinafinance", "01", "markdownify"),
    ("arxiv", "cs", "bs4"),
    ("arxiv", "cs", "markdownify"),
]

# Make sure the versioned output directory exists before writing into it.
sft_dir=f"../html_data/sft/{version}"
os.makedirs(sft_dir, exist_ok=True)

for dataset, segment, task in datset_segments:
    data_file=f"../html_data/{dataset}/{dataset}-{segment}-{task}-train.jsonl"
    # Read the input through a context manager so the handle is closed.
    with open(data_file) as f:
        segment_data_lines=[json.loads(l) for l in f]
    # Convert each question/answers record into the chat "messages" format.
    for i, line in enumerate(segment_data_lines):
        segment_data_lines[i]={"id": f"{dataset}_{segment}_{task}_{i}", "messages": [{"role": "user", "content": line["question"]}, {"role": "assistant", "content": line["answers"]}]}

    with open(f"{sft_dir}/{dataset}-{segment}-{task}-train.jsonl", "w") as f:
        for line in segment_data_lines:
            f.write(json.dumps(line, ensure_ascii=False) + "\n")
        

In [None]:
#  open https://dl.acm.org/doi/10.1145/3631938
import requests

# Direct fetch of the page; the response object is inspected in the next cell.
url="https://dl.acm.org/doi/10.1145/3631938"
# NOTE(review): no timeout is passed, so this call can block indefinitely.
response=requests.get(url)

In [None]:
# Show the body returned by the direct fetch in the previous cell.
response.text

In [None]:
import requests

# Fetch the same page through the HTML proxy service, which takes a JSON list of URLs.
# NOTE(review): plain HTTP and no timeout -- confirm this endpoint is still the intended one.
result=requests.post("http://tx.plageon.cn:5005/search/html_proxy/", json={"urls": ["https://dl.acm.org/doi/10.1145/3631938"]})

print(result.json())

In [7]:
from bs4 import BeautifulSoup as bs
def bs_parse(html_content):
    """Return the visible text of ``html_content`` as one space-joined string.

    The markup is parsed with ``html.parser``; every text node is stripped and
    empty ones are dropped.

    Args:
        html_content: HTML markup to parse.

    Returns:
        str: the stripped text nodes joined by single spaces.
    """
    soup = bs(html_content, 'html.parser')
    # A title used to be searched for here but was never used or returned.
    return " ".join(node.text.strip() for node in soup.find_all(string=True) if node.text.strip())


def html2raw_text(html_content):
    """Convert one HTML document, or a list of them, to plain text via ``bs_parse``.

    Args:
        html_content: a single HTML string, or a list of HTML strings.

    Returns:
        str: the parsed text; for a list, the per-document texts joined by a space.
    """
    if not isinstance(html_content, list):
        return bs_parse(html_content)
    parsed_docs = []
    for document in html_content:
        parsed_docs.append(bs_parse(document))
    return " ".join(parsed_docs)


def html2markdown(html_content):
    """Convert one HTML document, or a list of them, to markdown.

    Each document is converted with ``markdownify``. If that raises, the error
    is printed and the document falls back to plain text via ``bs_parse``.

    Args:
        html_content: a single HTML string, or a list of HTML strings.

    Returns:
        str: the converted text; for a list, the per-document results joined by a space.
    """
    from markdownify import markdownify as md

    def _convert(document):
        # One document: markdown if possible, raw text otherwise.
        try:
            return md(document)
        except Exception as e:
            print(f"convert html to markdown failed: {str(e)}")
            print("convert to raw text instead")
            return bs_parse(document)

    if not isinstance(html_content, list):
        return _convert(html_content)
    return " ".join([_convert(html_content[i]) for i in range(len(html_content))])


In [None]:
dataset="musique"
chat_model="bc34b192k"
search_engine="bing"
reference_format="html-simple"
multi_docs="top10"
split="test"
rewrite_method="slimplmqr"
output_dir = f"../html_data/{dataset}/{chat_model}/{search_engine}"
file = f"{output_dir}/{chat_model}-{reference_format}-{rewrite_method}-{multi_docs}-{dataset}-{split}.jsonl"
# Read the results through a context manager so the handle is closed.
with open(file) as f:
    data_lines = [json.loads(l) for l in f]

In [None]:
#  every sample must have a non-empty model output that does not mention "timeout"
for idx, result_entry in enumerate(data_lines):
    res = result_entry[f"{chat_model}_{reference_format}"]
    assert not (res == "" or "timeout" in res), f"idx: {idx}, res: {res}"

In [None]:
# Scratch check that a "<4 digits>-ep<digit>" prefix pattern matches "2022-ep1".
# Relies on `re` having been imported by an earlier cell.
if re.match(r"\d{4}-ep\d{1}", "2022-ep1"):
    print("match")

In [None]:
import requests
import random
import os

# Credentials must not live in the notebook. Supply them as a comma-separated
# list in the MOONSHOT_API_KEYS environment variable. The keys that were
# previously committed here should be treated as leaked and revoked.
api_keys = [key.strip() for key in os.environ.get("MOONSHOT_API_KEYS", "").split(",") if key.strip()]
if not api_keys:
    raise RuntimeError("Set MOONSHOT_API_KEYS to a comma-separated list of Moonshot API keys.")
token_api = "https://api.moonshot.cn/v1/tokenizers/estimate-token-count"
res = requests.post(
    token_api,
    # The Authorization header is an HTTP header; it was previously nested
    # inside the JSON body, where the server does not look for it.
    headers={"Authorization": f"Bearer {random.choice(api_keys)}"},
    json={
        "model": "moonshot-v1-128k",
        "messages": [
            {
                "role": "system",
                "content": "你是 Kimi，由 Moonshot AI 提供的人工智能助手，你更擅长中文和英文的对话。你会为用户提供安全，有帮助，准确的回答。同时，你会拒绝一切涉及恐怖主义，种族歧视，黄色暴力等问题的回答。Moonshot AI 为专有名词，不可翻译成其他语言。"
            },
            { "role": "user", "content": "你好，我叫李雷，1+1等于多少？" }
        ]
    },
)

In [None]:
# Show the decoded JSON body of the request made in the previous cell.
res.json()

In [None]:
# curl http://localhost:8000/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{
#     "model": "Qwen2-72B-Instruct",
#     "messages": [
#         {"role": "system", "content": "You are a helpful assistant."},
#         {"role": "user", "content": "Your Long Input Here."}
#     ]
#     }'
import requests

# Chat requests go to the /v1/chat/completions route, as in the curl example
# above; posting to the bare /v1/ prefix does not reach that route.
# NOTE(review): another cell names this model "qwen2-72b-instruct" (hyphens) -- confirm which name the server registers.
res = requests.post("http://vllm-qwen2-72b-instruct.gw-gqqd25no78ncp72xfw-1151584402193309.cn-wulanchabu.pai-eas.aliyuncs.com/v1/chat/completions", json={
    "model": "qwen2_72b_instruct",
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Your Long Input Here."}
    ]
})


In [None]:
#  count average document number and document length (token level)
import json
from transformers import AutoTokenizer
from tqdm import tqdm

datasets = ["asqa", "hotpot-qa", "nq", "trivia-qa", "musique"]
search_engine="bing"
split="test"
rewrite_method="slimplmqr"

# The tokenizer does not depend on the dataset, so it is loaded once.
tokenizer = AutoTokenizer.from_pretrained(f"../../../huggingface/Baichuan2-7B-Chat/", trust_remote_code=True)


def _average(total, count, ndigits):
    """Rounded ``total / count``; 0 when there is nothing to average over."""
    return round(total / count, ndigits) if count else 0


markdown_table="| Dataset | Average URL Number | Average Document Number | Average Document Length | Average Simple Document Length | Average Raw Text Length | Average Markdown Length |\n"
markdown_table+="| --- | --- | --- | --- | --- | --- | --- |\n"
for dataset in datasets:
    # Files and counters are per dataset. They used to be loaded/initialised
    # once before the loop, so every row reported the same file and the
    # totals accumulated across datasets.
    html_file = f"../html_data/{dataset}/{search_engine}/{search_engine}html-{rewrite_method}-{dataset}-{split}.jsonl"
    simple_html_file = html_file.replace(f"{dataset}-{split}", f"{dataset}-simple-{split}")
    with open(html_file) as f:
        data_lines = [json.loads(l) for l in f]
    with open(simple_html_file) as f:
        simple_html_data_lines = [json.loads(l) for l in f]

    url_num = 0
    doc_num = 0
    # doc_len = 0
    simple_doc_len = 0
    raw_text_len = 0
    markdown_len = 0
    sample_num = len(data_lines)

    for i in tqdm(range(sample_num)):
        data_line = data_lines[i]
        simple_data_line = simple_html_data_lines[i]
        search_res = data_line[f'{rewrite_method}_results']
        for res in search_res:
            html_text = res["html"]
            doc_num += 1
            # doc_len += len(tokenizer.encode(html_text))
            raw_text = html2raw_text(html_text)
            try:
                markdown = html2markdown(html_text)
            except Exception:
                # Fall back to raw text, but do not swallow KeyboardInterrupt/SystemExit.
                markdown = raw_text
            raw_text_len += len(tokenizer.encode(raw_text))
            markdown_len += len(tokenizer.encode(markdown))
        for res in simple_data_line[f'{rewrite_method}_results']:
            html_text = res["html"]
            simple_doc_len += len(tokenizer.encode(html_text))

        #  unique urls
        urls = set([res["url"] for res in search_res])
        url_num += len(urls)

    average_url_num = _average(url_num, sample_num, 2)
    average_doc_num = _average(doc_num, sample_num, 2)
    # The full-document token count is disabled above, so this column stays 0.
    average_doc_len = 0
    average_simple_doc_len = _average(simple_doc_len, doc_num, 0)
    average_raw_text_len = _average(raw_text_len, doc_num, 0)
    average_markdown_len = _average(markdown_len, doc_num, 0)

    print(f"{dataset} average url number: {average_url_num}")
    # print(f"{dataset} average document number: {average_doc_num}, average document length: {average_doc_len}")
    # print(f"{dataset} average simple document length: {average_simple_doc_len}")
    # print(f"{dataset} average raw text length: {average_raw_text_len}")
    # print(f"{dataset} average markdown length: {average_markdown_len}")
    markdown_table+=f"| {dataset} | {average_url_num} | {average_doc_num} | {average_doc_len} | {average_simple_doc_len} | {average_raw_text_len} | {average_markdown_len} |\n"

print(markdown_table)

In [None]:
# Scratch inspection of whatever `res` was last bound to by a previous cell.
res

In [None]:
from openai import OpenAI
# Set OpenAI's API key and API base to use vLLM's API server.
# Placeholder key; the client below is pointed at the base URL that follows rather than at OpenAI.
openai_api_key = "EMPTY"
openai_api_base = "http://vllm-qwen2-72b-instruct.gw-gqqd25no78ncp72xfw-1151584402193309.cn-wulanchabu.pai-eas.aliyuncs.com/v1/"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

# Single chat-completion request with a system and a user message.
chat_response = client.chat.completions.create(
    model="qwen2-72b-instruct",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Tell me something about large language models."},
    ]
)
print("Chat response:", chat_response)

In [15]:
dataset = "asqa"
search_engine = "bing"
split = "test"
rewrite_method = "slimplmqr"
rerank_model = "bgelargeen"
node_file=f"../html_data/{dataset}/{search_engine}/{search_engine}node-{rewrite_method}-{rerank_model}-{dataset}-simple-{split}.jsonl"
# Read the node file through a context manager so the handle is closed.
with open(node_file) as f:
    node_lines=[json.loads(line) for line in f]



In [None]:
#  construct prompt

In [14]:
import anthropic

import os

# The API key is read from the environment instead of being committed to the
# notebook. The key that was previously hard-coded here should be treated as
# leaked and revoked.
client = anthropic.Anthropic(
    api_key=os.environ.get("ANTHROPIC_API_KEY"),
    base_url="http://claude-oneapi.baichuan.svc"
)
message = client.messages.create(
    model="claude-3-opus-20240229",
    max_tokens=1024,
    messages=[
        {"role": "user", "content": "Hello, Claude"}
    ]
)
print(message.content)


APIConnectionError: Connection error.

In [None]:
# curl http://claude-oneapi.baichuan.svc/v1/messages \
#      --header "x-api-key: $ANTHROPIC_API_KEY" \
#      --header "anthropic-version: 2023-06-01" \
#      --header "content-type: application/json" \
#      --data \
# '{
#     "model": "claude-3-opus-20240229",
#     "max_tokens": 1024,
#     "messages": [
#         {"role": "user", "content": "Hello, world"}
#     ]
# }'
import os
import requests

# Send the same headers as the curl example above. The key comes from the
# environment; the literal key previously shown in the comment should be
# treated as leaked and revoked. requests sets the JSON content type itself
# when `json=` is used.
res = requests.post(
    "http://claude-oneapi.baichuan.svc/v1/messages",
    headers={
        "x-api-key": os.environ["ANTHROPIC_API_KEY"],
        "anthropic-version": "2023-06-01",
    },
    json={
        "model": "claude-3-opus-20240229",
        "max_tokens": 1024,
        "messages": [
            {"role": "user", "content": "Hello, Claude"}
        ]
    },
)