In [1]:
import asyncio
import json
import os
import random
import time

import aiohttp
import nest_asyncio
import nltk
import pandas as pd
import requests
from rouge import Rouge

# Patch asyncio so that asyncio.run() can be called later in this notebook even
# if an event loop is already running in the kernel.
nest_asyncio.apply()

# Seed tasks that are sampled as few-shot examples when prompts are built.
# The encoding is given explicitly so loading does not depend on the platform's
# default locale encoding.
with open("development.json", "r", encoding="utf-8") as file:
    seed_data = json.load(file)

# Previously generated dataset that this notebook extends. On a first run the
# file does not exist yet (it is only written by a later cell), so start empty.
try:
    with open("shopping_chatglm_data.json", "r", encoding="utf-8") as file:
        dataset_list = json.load(file)
except FileNotFoundError:
    dataset_list = []

# Plain texts of the existing entries; used as the reference pool for
# ROUGE-L de-duplication of newly generated tasks.
data_str = [item["text"] for item in dataset_list]
df_seed = pd.DataFrame(seed_data)

def split_string(gpt_output: str):
    """Break a raw model response into fragments at every ``###`` delimiter.

    Fragments are returned exactly as they appear between delimiters (nothing
    is stripped), so the list may contain empty or whitespace-only strings.
    """
    return gpt_output.split('###')

def delete_repeat(threshold, dataset_list: list, tar):
    """Check whether ``tar`` is sufficiently different from every existing text.

    Args:
        threshold: ROUGE-L F-score above which two texts count as duplicates.
        dataset_list: Existing texts to compare against.
        tar: Candidate text.

    Returns:
        True if ``dataset_list`` is empty or no existing text scores strictly
        above ``threshold`` against ``tar``; False as soon as one does.
    """
    if not dataset_list:
        return True
    # The scorer is always built with default arguments, so one instance is
    # shared by all comparisons instead of constructing it per item.
    rouge = Rouge()
    return not any(
        rouge.get_scores([item], [tar])[0]['rouge-l']['f'] > threshold
        for item in dataset_list
    )

In [2]:
# Not referenced by any other cell visible in this notebook; kept in case
# code outside this view relies on it.
flag = 0
# Number of seed tasks embedded as examples in each prompt.
final_seed_len = 8
# Number of prompts (and therefore API requests) built per run.
generate_time = 100
# Row positions of the seed DataFrame that can be sampled.
num_list = list(range(len(df_seed)))
# Seed combinations already used, so no prompt repeats the same example set.
selected_set = []
# Instruction header placed before the sampled examples in every prompt.
# The requirement numbering runs 0-7 without repeats.
prefix_prompt = """
You are asked to come up with a set of 20 diverse task Instructions&Input/Questions/Problems. These task instructions will be given to a GPT model and we will evaluate the GPT model for completing the instructions.

Here are the requirements:
0. The theme of this task is about Shopping Concept Understanding, aims to evaluate the model's ability to understand shopping entities in the form of texts, such as product names, product categories, attributes, product descriptions, reviews, etc
1. Try not to repeat the verb for each instruction to maximize diversity.
2. The language used for the Instructions&Input/Questions/Problems also should be diverse.
3. In order to make the type of task be diverse, trying to generate task types different from the examples. The list should include diverse types of tasks like open-ended generation, classification, ranking, multi-choice etc.
4. A GPT language model should be able to complete the Instructions&Input/Questions/Problems.
5. The Instructions&Input/Questions/Problems should be 1 to 4 sentences long. Either an imperative sentence or a question is permitted.
6. The Output/Answer should be an appropriate and correct response to the Instructions&Input/Questions/Problems. Make sure the Output/Answer is less than 100 words.
7. Each task needs to be enclosed with “###” just like the examples.
List of 8 tasks:
"""

In [3]:
prompt_gpt_input_list = []

# Draw `generate_time` seed combinations that have not been used before.
# NOTE: random.sample raises ValueError when fewer than `final_seed_len` seeds
# exist, and the retry loop cannot terminate once every possible combination
# is already in `selected_set`.
for _ in range(generate_time):
    while True:
        selected_numbers = set(random.sample(num_list, final_seed_len))
        if selected_numbers not in selected_set:
            selected_set.append(selected_numbers)
            break

# Only the combinations appended by this run are turned into prompts;
# `selected_set` may already hold entries from earlier executions of this cell.
for set_item in selected_set[-generate_time:]:
    # Read the two needed fields straight from the seed frame instead of
    # growing a temporary DataFrame row by row.
    item_list = "".join(
        '\n###\n'
        + str(df_seed.loc[j, 'input_field'])
        + '\n'
        + str(df_seed.loc[j, 'output_field'])
        + '\n'
        for j in set_item
    )
    prompt_gpt_input_list.append(prefix_prompt + item_list)

In [4]:
async def get_access_token():
    """
    Obtain an access_token from the Baidu (aip.baidubce.com) OAuth endpoint
    using an API Key and Secret Key.

    The credentials are read from the ``BAIDU_API_KEY`` and
    ``BAIDU_SECRET_KEY`` environment variables so that they never have to be
    written into the notebook. If a variable is unset, the original ``###``
    placeholder is sent instead and the endpoint will reject the request.

    Returns:
        The value of the ``access_token`` field of the JSON response.

    Raises:
        RuntimeError: If the response carries no ``access_token``.
    """
    url = "https://aip.baidubce.com/oauth/2.0/token"
    # Passed as query parameters so the values are URL-encoded; a literal '#'
    # inside a hand-built URL would start a fragment and never reach the server.
    params = {
        "grant_type": "client_credentials",
        "client_id": os.environ.get("BAIDU_API_KEY", "###"),
        "client_secret": os.environ.get("BAIDU_SECRET_KEY", "###"),
    }

    payload = json.dumps("")
    headers = {
        'Content-Type': 'application/json',
        'Accept': 'application/json'
    }

    def _post():
        # A timeout keeps a stalled connection from hanging the run forever.
        return requests.request(
            "POST", url, params=params, headers=headers, data=payload, timeout=30
        )

    # `requests` is blocking; run it in a worker thread so this coroutine does
    # not stall the event loop while waiting for the network.
    response = await asyncio.get_running_loop().run_in_executor(None, _post)

    token = response.json().get("access_token")
    if not token:
        # Fail here with the server's answer instead of returning None, which
        # would only surface later as a TypeError during URL concatenation.
        raise RuntimeError(
            "No access_token in response (HTTP %s): %s"
            % (response.status_code, response.text[:200])
        )
    return token
  
async def create_request(input, session, access_token=None):
    """Send one prompt to the wenxinworkshop ``completions_pro`` chat endpoint.

    Args:
        input: Prompt text, sent as the content of a single "user" message.
            (The name shadows the builtin but is kept for compatibility.)
        session: Client session used to issue the POST request.
        access_token: Optional token to reuse. When None, a new token is
            requested via ``get_access_token()`` for this call.

    Returns:
        The response body as text.
    """
    if access_token is None:
        access_token = await get_access_token()
    url = "https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/completions_pro?access_token=" + access_token
    payload = json.dumps({
        "messages": [
            {
                "role": "user",
                "content": input
            }
        ]
    })
    headers = {
        'Content-Type': 'application/json'
    }
    async with session.post(url, headers=headers, data=payload.encode('utf-8')) as response:
        return await response.text()


async def main():
    """Send every prepared prompt concurrently and collect the novel tasks.

    Each response's "result" text is split on ``###``; fragments that are not
    blank and pass the ROUGE-L duplicate check (threshold 0.7) are appended to
    the module-level ``dataset_list`` and ``data_str``.
    """
    async with aiohttp.ClientSession() as session:
        # One token is fetched up front and shared by all requests instead of
        # requesting a new one per prompt.
        access_token = await get_access_token()
        tasks = [
            create_request(prompt, session, access_token)
            for prompt in prompt_gpt_input_list
        ]
        # return_exceptions=True keeps one failed request from discarding the
        # results of all the others.
        results = await asyncio.gather(*tasks, return_exceptions=True)
        print("Results accepted")

        unusable = 0
        for result in results:
            if isinstance(result, BaseException):
                unusable += 1
                continue
            try:
                s = json.loads(result)
            except json.JSONDecodeError:
                unusable += 1
                continue
            # Responses without a "result" field carry no generated text.
            if not isinstance(s, dict) or "result" not in s:
                unusable += 1
                continue
            for tar in split_string(s["result"]):
                # Splitting on the delimiter leaves empty and whitespace-only
                # fragments between markers; those are not tasks.
                if tar.strip() and delete_repeat(0.7, data_str, tar):
                    dataset_list.append({"text": tar})
                    data_str.append(tar)
        if unusable:
            print(f"{unusable} of {len(results)} requests produced no usable result")
  
# Run the main coroutine (nest_asyncio.apply() was called in the first cell)
asyncio.run(main())

Results accepted


In [5]:
# Target is the same file the dataset was loaded from, so it is overwritten
# with the extended list.
filename = 'shopping_chatglm_data.json'
json_data = json.dumps(dataset_list, indent=4, separators=(',', ': '))

# Write the serialized JSON string to the file
with open(filename, 'w') as out_file:
    out_file.write(json_data)

In [9]:
# Scratch cell: small ad-hoc dict, probed by the membership check in the next cell.
a = {2: "df"}

In [11]:
# Scratch cell: key membership test on the dict (same result as checking .keys()).
2 in a

True