In [39]:
import json 
import os 
import pandas as pd 
import importlib 
import preprocess_utils 
importlib.reload(preprocess_utils)
from preprocess_utils import OneRoundEvaluationDataset

import torch 
from transformers import AutoTokenizer, AutoModel, AutoConfig

from openai_api.task_decompostion import TaskDecomposer
from openai_api import gpt 

from preprocess_utils import sanity_check

In [29]:
def cal_match(prediction, label):
    """Score one model prediction against the ground-truth tool call.

    Args:
        prediction: Either a ``str`` (the model answered in plain text, i.e. it
            called no tool) or a ``dict`` describing a tool call. Supported
            dict layouts are ``{'function' | 'name': tool, 'arguments' |
            'parameters': args}`` and the single-entry form ``{tool: args}``.
        label: Sequence of ground-truth ``(tool_name, arguments)`` pairs. An
            empty sequence means that no tool should be called; otherwise only
            the first pair is scored.

    Returns:
        ``[intent_hit, slot_hit]`` where ``intent_hit`` is 1 when the tool name
        matches and ``slot_hit`` is 1 when, in addition, the arguments are
        equal. ``slot_hit`` is always 0 when ``intent_hit`` is 0.

    Raises:
        TypeError: If ``prediction`` is neither a ``str`` nor a ``dict``.
    """
    no_tool_intent = 'no corresponding tools.'

    # An empty label list means the query should not trigger any tool.
    if len(label) == 0:
        target_intent, target_method = no_tool_intent, ''
    # Otherwise only the first tool call (a (name, arguments) pair) is scored.
    else:
        first_call = label[0]
        target_intent, target_method = first_call[0], first_call[1]

    if isinstance(prediction, str):
        # A plain-text answer means the model did not call a tool.
        prediction_intent, prediction_method = no_tool_intent, ''
    elif isinstance(prediction, dict):
        if 'function' in prediction or 'name' in prediction:
            prediction_intent = prediction.get('function', prediction.get('name'))
            prediction_method = prediction.get('arguments', prediction.get('parameters'))
        elif prediction:
            # Fallback layout {tool_name: arguments}. The original try/except
            # never reached this case because dict.get() does not raise.
            prediction_intent = next(iter(prediction))
            prediction_method = prediction[prediction_intent]
        else:
            # Empty dict: nothing to compare, counts as a miss.
            prediction_intent, prediction_method = None, None
    else:
        raise TypeError(
            'prediction must be a str or a dict, got {}'.format(type(prediction).__name__)
        )

    if prediction_intent != target_intent:
        return [0, 0]
    return [1, 1 if prediction_method == target_method else 0]

In [58]:
def cal_chatglm(train_data, train_dataset, model, tokenizer, set_name, model_name, limit=100):
    """Evaluate a ChatGLM-style chat model on single-turn tool-calling samples.

    Args:
        train_data: Sequence of raw records; each must provide the keys
            ``'num_decomposition'`` and ``'hit'``.
        train_dataset: Indexable dataset aligned with ``train_data``; each item
            must provide ``'query'``, ``'query_target'`` and ``'history'``.
        model: Object exposing ``chat(tokenizer, query, history=..., role=...)``
            that returns a ``(response, history)`` pair.
        tokenizer: Tokenizer passed through to ``model.chat``.
        set_name: Label stored in the ``set_name`` column.
        model_name: Label stored in the ``model_name`` column.
        limit: Maximum number of samples to process. ``None`` or a negative
            value processes the whole dataset.

    Returns:
        ``pandas.DataFrame`` indexed by sample position with the columns
        ``intent``, ``slot``, ``error``, ``query``, ``response``, ``target``,
        ``type``, ``set_name`` and ``model_name``. Samples whose
        ``num_decomposition`` is not 1 are not sent to the model and are
        recorded with ``-1`` scores and type ``'multi_turn'``.
    """
    print(set_name, model_name)
    columns = ['index', 'intent', 'slot', 'error', 'query', 'response', 'target', 'type']

    total = len(train_dataset)
    if limit is not None and limit >= 0:
        total = min(total, limit)

    results = []
    role = "user"
    for i in range(total):
        print(i)
        data = train_dataset[i]
        query = data["query"]
        # An empty target means the query triggers no tool ('no corresponding
        # tools.'); otherwise only the first target entry is analysed.
        target = data['query_target']
        num_decomposition = train_data[i]['num_decomposition']
        hit = train_data[i]['hit']
        if num_decomposition == 1:
            history = [data["history"]]
            try:
                response, _ = model.chat(tokenizer, query, history=history, role=role)
            except Exception as exc:
                # Generation failures are recorded as error rows, not raised.
                print('try failed', query, repr(exc))
                result = [-1, -1, 1, query, None]
            else:
                result = cal_match(response, target) + [0, query, response]
            if hit and len(target) > 0:
                result.append(target[0])
            else:
                result.append('no corresponding tools.')
            result.append('single_turn')
            result = [i] + result
        else:
            # No model call is made here, so there is no response to record.
            # The original code read an unbound or stale `response` variable.
            result = [i, -1, -1, -1, query, None, target, 'multi_turn']
        results.append(result)

    # Passing `columns` keeps the schema valid even when no row was produced.
    results = pd.DataFrame(results, columns=columns).set_index('index')
    results['set_name'] = set_name
    results['model_name'] = model_name
    return results

In [31]:
def cal_openai(train_data, train_dataset, set_name, model_name, limit=100,
               config_path='openai_api/configs/azure_openai_config_4.0.json'):
    """Evaluate the OpenAI-backed ``TaskDecomposer`` on single-turn samples.

    Args:
        train_data: Sequence of raw records; each must provide the keys
            ``'num_decomposition'`` and ``'hit'``.
        train_dataset: Indexable dataset aligned with ``train_data``; each item
            must provide ``'query'`` and ``'query_target'``.
        set_name: Label stored in the ``set_name`` column.
        model_name: Label stored in the ``model_name`` column.
        limit: Maximum number of samples to process. ``None`` or a negative
            value processes every record of ``train_data``.
        config_path: JSON file whose content is passed to ``gpt.openai_init``
            and which provides ``'model_name'`` and ``'deployment_name'``.

    Returns:
        ``pandas.DataFrame`` indexed by sample position with the columns
        ``intent``, ``slot``, ``error``, ``query``, ``response``, ``target``,
        ``type``, ``set_name`` and ``model_name``. Samples whose
        ``num_decomposition`` is not 1 are not sent to the model and are
        recorded with ``-1`` scores and type ``'multi_turn'``.
    """
    print(set_name, model_name)
    with open(config_path, 'r', encoding='utf-8') as f:
        config = json.load(f)
    gpt.openai_init(**config)

    llm = gpt.Gpt(config['model_name'], config['deployment_name'])
    columns = ['index', 'intent', 'slot', 'error', 'query', 'response', 'target', 'type']

    total = len(train_data)
    if limit is not None and limit >= 0:
        total = min(total, limit)

    results = []
    for i in range(total):
        print(i)
        data = train_dataset[i]
        query = data["query"]
        # An empty target means the query triggers no tool ('no corresponding
        # tools.'); otherwise only the first target entry is analysed.
        target = data['query_target']
        num_decomposition = train_data[i]['num_decomposition']
        hit = train_data[i]['hit']
        if num_decomposition == 1:
            # Each sample has its own tool description file, keyed by position.
            task_decomposer = TaskDecomposer(llm,
                                             'openai_api/prompts/task_decomposition.txt',
                                             'openai_api/prompts/tools/toolapaca/{}.json'.format(i),
                                             'openai_api/few_shot_examples/task_decomposition.json',
                                             "User Requests:\n {user_request}\nAI: (The JSON (array) format output): ",
                                             )
            try:
                response = task_decomposer.run(query)
                if isinstance(response, list):
                    # Only the first decomposed step is scored.
                    response = response[0]
            except Exception as exc:
                print('try failed', query, repr(exc))
                result = [-1, -1, 1, query, None]
            else:
                if response is None:
                    # The decomposer produced no usable output; count as error.
                    result = [-1, -1, 1, query, None]
                else:
                    result = cal_match(response, target) + [0, query, response]
            if hit and len(target) > 0:
                result.append(target[0])
            else:
                result.append('no corresponding tools.')
            result.append('single_turn')
            result = [i] + result
        else:
            # No model call is made here, so there is no response to record.
            # The original code read an unbound or stale `response` variable.
            result = [i, -1, -1, -1, query, None, target, 'multi_turn']
        results.append(result)

    # Passing `columns` keeps the schema valid even when no row was produced.
    results = pd.DataFrame(results, columns=columns).set_index('index')
    results['set_name'] = set_name
    results['model_name'] = model_name
    return results

In [70]:
def cal_accuracy(results):
    """Summarise per-sample evaluation rows into accuracy metrics.

    Args:
        results: ``pandas.DataFrame`` with at least the columns ``intent``,
            ``slot``, ``error`` and ``type`` (as produced by ``cal_chatglm`` /
            ``cal_openai``).

    Returns:
        ``pandas.Series`` with:
            num: total number of rows.
            zero_error_count: single-turn rows with ``error == 0``.
            zero_error_rate: ``zero_error_count / num``.
            intent_sum, intent_mean: sum / mean of ``intent`` over those rows.
            slot_sum, slot_mean: sum / mean of ``slot`` over the rows whose
                intent was also correct.
            both_mean: ``slot_sum / zero_error_count``.
        Ratios with an empty denominator are reported as NaN.
    """
    nan = float('nan')
    num = results.shape[0]

    # Only error-free single-turn rows take part in the accuracy figures.
    valid = results[(results['error'] == 0) & (results['type'] == 'single_turn')]
    intent = valid['intent']
    zero_error_count = intent.shape[0]
    zero_error_rate = zero_error_count / num if num else nan
    intent_sum, intent_mean = intent.sum(), intent.mean()

    # Slot accuracy is conditional on the intent being correct.
    slot = valid.loc[valid['intent'] == 1, 'slot']
    slot_sum, slot_mean = slot.sum(), slot.mean()
    both_mean = slot_sum / zero_error_count if zero_error_count else nan

    return pd.Series([num, zero_error_count, zero_error_rate, intent_sum, intent_mean, slot_sum, slot_mean, both_mean],
                     index=['num', 'zero_error_count', 'zero_error_rate', 'intent_sum', 'intent_mean', 'slot_sum', 'slot_mean', 'both_mean'])

In [3]:
# Load the tokenizer from the local ChatGLM3-6B snapshot; the snapshot ships
# its own tokenizer code, hence trust_remote_code=True.
tokenizer = AutoTokenizer.from_pretrained(
    "/data/dataset/huggingface/hub/models--THUDM--chatglm3-6b/snapshots/456aa875cf1f46623006edaa23103774ea9c0eae",
    trust_remote_code=True,
)

In [9]:
# Fine-tuned model

In [6]:
# Load the base ChatGLM3-6B model with a prefix encoder of length PRE_SEQ_LEN
# and restore the prefix-encoder weights from the fine-tuning checkpoint.
pt_checkpoint = "/root/PycharmProjects/ChatGLM3/finetune_chatmodel_demo/output/tool_alpaca_pt-20240103-113633-128-2e-2/checkpoint-1000"   
PRE_SEQ_LEN = int(os.environ.get("PRE_SEQ_LEN", 128))
base_model_path = "/data/dataset/huggingface/hub/models--THUDM--chatglm3-6b/snapshots/456aa875cf1f46623006edaa23103774ea9c0eae"

# Fail loudly: without the checkpoint `model` would stay undefined (or stale
# from an earlier run) and the evaluation below would silently use the wrong
# weights or crash much later with a NameError.
if pt_checkpoint is None or not os.path.exists(pt_checkpoint):
    raise FileNotFoundError("pt checkpoint not found: {}".format(pt_checkpoint))

print('pt point found')
config = AutoConfig.from_pretrained(
    base_model_path,
    trust_remote_code=True,
    pre_seq_len=PRE_SEQ_LEN
)
model = AutoModel.from_pretrained(
    base_model_path,
    trust_remote_code=True,
    config=config,
    device_map="auto"
).eval()

# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
# map_location="cpu" avoids depending on the device the checkpoint was saved
# from; load_state_dict copies the tensors onto the model's own device.
prefix_state_dict = torch.load(os.path.join(pt_checkpoint, "pytorch_model.bin"), map_location="cpu")

# Keep only the prefix-encoder entries and strip their module prefix so the
# keys match the sub-module's own state dict.
key_prefix = "transformer.prefix_encoder."
new_prefix_state_dict = {
    k[len(key_prefix):]: v
    for k, v in prefix_state_dict.items()
    if k.startswith(key_prefix)
}
print("Loaded from pt checkpoints", new_prefix_state_dict.keys())
model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)

pt point found


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

Some weights of ChatGLMForConditionalGeneration were not initialized from the model checkpoint at /data/dataset/huggingface/hub/models--THUDM--chatglm3-6b/snapshots/456aa875cf1f46623006edaa23103774ea9c0eae and are newly initialized: ['transformer.prefix_encoder.embedding.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded from pt checkpoints dict_keys(['embedding.weight'])


In [10]:
# Original (base) model

In [5]:
# Load the untouched base model as the comparison baseline, move it to
# device index 2 and switch it to eval mode.
model_raw = (
    AutoModel.from_pretrained(
        "/data/dataset/huggingface/hub/models--THUDM--chatglm3-6b/snapshots/456aa875cf1f46623006edaa23103774ea9c0eae",
        trust_remote_code=True,
    )
    .to(2)
    .eval()
)

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [19]:
# Dataset overview

In [11]:
# Load the formatted ToolAlpaca records; both a single JSON document and a
# JSON Lines file (one record per line) are supported.
data_path = 'formatted_data/tool_alpaca.jsonl'
with open(data_path, "r", encoding="utf-8") as f:
    if data_path.endswith(".json"):
        train_data = json.load(f)
    elif data_path.endswith(".jsonl"):
        # Skip blank lines (e.g. a trailing newline) instead of failing on them.
        train_data = [json.loads(line) for line in f if line.strip()]
    else:
        # Without this branch `train_data` would silently stay undefined.
        raise ValueError("unsupported data file extension: {}".format(data_path))

In [40]:
# Wrap the raw records in the evaluation dataset.
# NOTE(review): 2048 is presumably the maximum sequence length — confirm
# against OneRoundEvaluationDataset in preprocess_utils.
train_dataset = OneRoundEvaluationDataset(train_data, tokenizer, 2048)

In [41]:
# Show the first processed sample (a dict whose output starts with 'input_ids').
train_dataset[0]

{'input_ids': [64790,
  64792,
  64794,
  30910,
  13,
  20115,
  267,
  1762,
  2554,
  362,
  1077,
  362,
  344,
  457,
  30930,
  809,
  431,
  1675,
  289,
  267,
  1762,
  4159,
  30954,
  13,
  30995,
  13,
  296,
  30955,
  12185,
  14428,
  7936,
  30954,
  15007,
  284,
  22764,
  2555,
  356,
  267,
  9377,
  1969,
  30932,
  21427,
  30932,
  293,
  1166,
  289,
  267,
  23933,
  8449,
  11313,
  332,
  5274,
  5659,
  14079,
  30916,
  25533,
  30954,
  6342,
  30955,
  12159,
  30974,
  2932,
  701,
  30955,
  2383,
  15830,
  30930,
  3806,
  30930,
  1618,
  290,
  30954,
  790,
  15352,
  30932,
  24838,
  30932,
  346,
  4059,
  30932,
  6745,
  2341,
  7461,
  30932,
  10422,
  3117,
  30932,
  346,
  20264,
  2769,
  353,
  22764,
  1969,
  289,
  792,
  359,
  15352,
  30932,
  24838,
  30932,
  346,
  4059,
  30932,
  6745,
  2341,
  7461,
  30932,
  10422,
  3117,
  30932,
  400,
  346,
  20264,
  649,
  30974,
  1252,
  701,
  30955,
  4223,
  30974,
  2932,
  7

In [6]:
# Show the first raw record as loaded from the data file.
train_data[0]

{'tools': ['sendHttpRequest: Send an HTTP request with the specified method, headers, and data to the Httpbin API for testing purposes.\nParameters: {"method": "Required. string. One of: [GET, POST, PUT, DELETE, HEAD, PATCH]. The HTTP method to use (GET, POST, PUT, DELETE, HEAD, or PATCH).", "url": "Required. string. The endpoint URL to send the request to.", "headers": "Object.  A key-value pair of headers to include in the request.", "data": "Object.  A key-value pair of data to include in the request body."}\nOutput: Successful response.\n - Format: application/json\n - Structure: Object{response: Object{status_code, headers: Object, body}}\ngetClientRequestData: Retrieve the client\'s request data, including headers, form data, uploaded files, and cookies.\nParameters: {"url": "Required. string. The endpoint URL to send the request to."}\nOutput: Successful response.\n - Format: application/json\n - Structure: Object{requestData: Object{headers: Object, form: Object, files: Object,

In [11]:
# Print, token by token, the input id and the label assigned to it for the first sample.
sanity_check(train_dataset[0]['input_ids'], train_dataset[0]['labels'], tokenizer)  

Sanity Check >>>>>>>>>>>>>
           '[gMASK]':  64790 ->   -100
               'sop':  64792 ->   -100
        '<|system|>':  64794 ->   -100
                  '':  30910 ->   -100
                '\n':     13 ->   -100
            'Answer':  20115 ->   -100
               'the':    267 ->   -100
         'following':   1762 ->   -100
         'questions':   2554 ->   -100
                'as':    362 ->   -100
              'best':   1077 ->   -100
                'as':    362 ->   -100
               'you':    344 ->   -100
               'can':    457 ->   -100
                 '.':  30930 ->   -100
               'You':    809 ->   -100
              'have':    431 ->   -100
            'access':   1675 ->   -100
                'to':    289 ->   -100
               'the':    267 ->   -100
         'following':   1762 ->   -100
             'tools':   4159 ->   -100
                 ':':  30954 ->   -100
                '\n':     13 ->   -100
                 '[':  30995 ->   -10

In [42]:
# Annotate every record in place with the number of 'tool' turns in its
# conversation ('num_decomposition') and whether it has at least one ('hit').
for record in train_data:
    tool_turns = sum(1 for turn in record['conversations'] if turn['role'] == 'tool')
    record['num_decomposition'] = tool_turns
    record['hit'] = tool_turns >= 1

In [18]:
# Write each record's tool descriptions to its own JSON file, named after the
# record's position; cal_openai later reads the file with the same index.
for i, data in enumerate(train_data):
    # Path to the JSON file
    file_path = "openai_api/prompts/tools/toolapaca/{}.json".format(i)
    # Create the target directory on first use so a fresh checkout works.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    # Save the tool list as JSON in the file
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data['tools'], file)

In [43]:
# Build a per-record overview frame from the annotations added to train_data.
df_train = pd.DataFrame({
    'hit': [record['hit'] for record in train_data],
    'num_decomposition': [record['num_decomposition'] for record in train_data],
})
df_train['set'] = '1train'
# `df` is an alias of df_train (same object), not a copy.
df = df_train
# More than one tool turn marks a multi-turn sample; no tool turn at all
# marks a sample that needs no tools.
df['multi_turn'] = df['num_decomposition'].gt(1)
df['no_tools'] = df['hit'].eq(False)

In [28]:
# Per-set summary: row count, number and share of multi-turn samples, and the
# min / max / median number of tool turns.
df.groupby('set').agg({'multi_turn': [len, 'sum', 'mean'], 'num_decomposition': ['min', 'max', 'median']})

Unnamed: 0_level_0,multi_turn,multi_turn,multi_turn,num_decomposition,num_decomposition,num_decomposition
Unnamed: 0_level_1,len,sum,mean,min,max,median
set,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1train,4048,1535,0.3792,1,14,1.0


In [33]:
# Statistical analysis

In [69]:
# Evaluate the fine-tuned model, the raw base model and the OpenAI pipeline on
# the first 100 samples; one result frame per model.
final = [
    cal_chatglm(train_data, train_dataset, model, tokenizer, set_name='1train', model_name='1finetune', limit=100),
    cal_chatglm(train_data, train_dataset, model_raw, tokenizer, set_name='1train', model_name='2raw', limit=100),
    cal_openai(train_data, train_dataset, set_name='1train', model_name='3openai', limit=100),
]

1train 1finetune
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
1train 2raw
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
1train 3openai
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33


[ERROR]	TaskDecomposer: Expecting value: line 1 column 1 (char 0)


34
35
36
37
38


[ERROR]	TaskDecomposer: Expecting value: line 1 column 1 (char 0)


39
40
41
42
43
44


[ERROR]	TaskDecomposer: Expecting value: line 1 column 1 (char 0)


45
46
47
48
49
50
51
52
53
54
55
56
57
58
59


[ERROR]	TaskDecomposer: Expecting value: line 1 column 1 (char 0)


60


[ERROR]	TaskDecomposer: Expecting value: line 1 column 1 (char 0)


61
62
63


[ERROR]	TaskDecomposer: Expecting value: line 1 column 1 (char 0)


64
65
66


[ERROR]	TaskDecomposer: Expecting value: line 1 column 1 (char 0)


67
68
69
70
71
72
73
74
75


[ERROR]	TaskDecomposer: Expecting value: line 1 column 1 (char 0)


76


[ERROR]	TaskDecomposer: Expecting value: line 1 column 1 (char 0)


77


[ERROR]	TaskDecomposer: Expecting value: line 1 column 1 (char 0)


78
79


[ERROR]	TaskDecomposer: Expecting value: line 1 column 1 (char 0)


80


[ERROR]	TaskDecomposer: Expecting value: line 1 column 1 (char 0)


81


[ERROR]	TaskDecomposer: Expecting value: line 1 column 1 (char 0)


82
83
84
85
86
87
88
89
90
91


[ERROR]	TaskDecomposer: Expecting value: line 1 column 1 (char 0)


92
93
94
95


[ERROR]	TaskDecomposer: Expecting value: line 1 column 1 (char 0)


96


[ERROR]	TaskDecomposer: Expecting value: line 1 column 1 (char 0)


97
98
99
100


In [71]:
# Stack the per-model result frames into one frame. `final` is rebound from a
# list to a DataFrame here, so the guard keeps the cell safe to re-run.
# reset_index(drop=True) discards the per-sample 'index' set by the cal_* helpers.
if not isinstance(final, pd.DataFrame):
    final = pd.concat(final, axis=0).reset_index(drop=True)

In [72]:
# Persist the combined results without writing the row index.
final.to_csv('toolapaca.csv', index=False)

In [73]:
# Compute the accuracy summary for every (model, set) pair.
final.groupby(['model_name', 'set_name']).apply(cal_accuracy)

Unnamed: 0_level_0,Unnamed: 1_level_0,num,zero_error_count,zero_error_rate,intent_sum,intent_mean,slot_sum,slot_mean,both_mean
model_name,set_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1finetune,1train,100.0,68.0,0.68,58.0,0.852941,34.0,0.586207,0.5
2raw,1train,100.0,66.0,0.66,49.0,0.742424,24.0,0.489796,0.363636
3openai,1train,100.0,54.0,0.54,50.0,0.925926,26.0,0.52,0.481481


In [75]:
# Inspect one arbitrary example

In [77]:
# Take the first sample and prepare the arguments for a manual model.chat call.
data = train_dataset[0]
role = "user"
query, history = data["query"], [data["history"]]

In [79]:
# The user query of the selected sample.
query 

'I\'m troubleshooting some requests, so can you help me send a POST request to https://httpbin.org/post with the header "Content-Type: application/json" and the data \'{"name": "John Doe", "email": "john.doe@example.com"}\'? Let me know the response details.'

In [78]:
# Ground-truth tool call(s) for the selected sample, as (tool name, arguments) pairs.
data['query_target']

[('sendHttpRequest',
  {'method': 'POST',
   'url': 'https://httpbin.org/post',
   'headers': {'Content-Type': 'application/json'},
   'data': {'name': 'John Doe', 'email': 'john.doe@example.com'}})]

In [None]:
# Query the fine-tuned model; `history` (rebound to the updated history
# returned by the call) initially holds only the sample's history entry.
response, history = model.chat(tokenizer, query, history=history, role=role)

In [13]:
# The model's reply; here a dict with 'name' and 'parameters', i.e. a tool call.
response 

{'name': 'sendHttpRequest',
 'parameters': {'method': 'POST',
  'url': 'https://httpbin.org/post',
  'headers': {'Content-Type': 'application/json'},
  'data': {'name': 'John Doe', 'email': 'john.doe@example.com'}}}

In [14]:
# The conversation history returned by model.chat.
history 

[{'role': 'system',
  'content': 'Answer the following questions as best as you can. You have access to the following tools:\n',
  'tools': ['sendHttpRequest: Send an HTTP request with the specified method, headers, and data to the Httpbin API for testing purposes.\nParameters: {"method": "Required. string. One of: [GET, POST, PUT, DELETE, HEAD, PATCH]. The HTTP method to use (GET, POST, PUT, DELETE, HEAD, or PATCH).", "url": "Required. string. The endpoint URL to send the request to.", "headers": "Object.  A key-value pair of headers to include in the request.", "data": "Object.  A key-value pair of data to include in the request body."}\nOutput: Successful response.\n - Format: application/json\n - Structure: Object{response: Object{status_code, headers: Object, body}}\ngetClientRequestData: Retrieve the client\'s request data, including headers, form data, uploaded files, and cookies.\nParameters: {"url": "Required. string. The endpoint URL to send the request to."}\nOutput: Success