## Generate Request Object

In [5]:
import json

# Evaluation rubric sent verbatim as the system message. This string is never
# passed through str.format, so the JSON skeleton must use single braces;
# doubled braces would reach the model literally.
system_prompt = """You are a professional evaluator of language model outputs. Your task is to score a model's response against a reference answer, based on the given instruction and input. Rate the response on a **0–5 scale** for each of the following:

1. **Correctness**: Is the information accurate and logical?
2. **Completeness**: Does it cover all required points?
3. **Relevance**: Is all content relevant to the task?
4. **Fluency**: Is the language natural, grammatically correct, and well-structured?

Return only the result in **strict JSON** format:

```json
{
  "Correctness": ?,
  "Completeness": ?,
  "Relevance": ?,
  "Fluency": ?
}
```"""

# User-message template. The four positional slots are filled, in order, with
# the instruction, the input, the reference output and the generated output.
template_user = """
### Instruction:
{}
 
### Input:
{}
 
### Reference Output:
{}
 
### Generated Output:
{}
"""


import os
# Directory holding the raw generation results (one JSON file per model).
RAW_DIR = './generate/raw'
# Directory that receives one batch-request JSONL file per raw file.
REQUEST_DIR = './generate/request'


def clean_field(item, key):
    """Return ``item[key]`` as stripped text.

    A missing key or an explicit ``None`` value yields ``""`` so that a null
    field in the raw data does not abort the whole export.
    """
    value = item.get(key)
    if value is None:
        return ""
    return str(value).strip()


def build_request(idx, item, system_text, user_template):
    """Build the batch request dict for one evaluation sample.

    Args:
        idx: 1-based position of the sample, used in ``custom_id``.
        item: Mapping with the optional keys ``instruction``, ``input``,
            ``reference_output`` and ``generated_output``.
        system_text: Content of the system message.
        user_template: ``str.format`` template with four positional slots.

    Returns:
        A dict ready to be serialised as one JSONL line.
    """
    user_text = user_template.format(
        clean_field(item, "instruction"),
        clean_field(item, "input"),
        clean_field(item, "reference_output"),
        clean_field(item, "generated_output"),
    )
    return {
        "custom_id": f"request-{idx}",
        "method": "POST",
        "url": "/v4/chat/completions",
        "body": {
            "model": "glm-4-flash",
            "messages": [
                {"role": "system", "content": system_text},
                {"role": "user", "content": user_text},
            ],
            "temperature": 0.0,
        },
    }


if os.path.isdir(RAW_DIR):
    # Sorted so that re-runs process the files in a stable order.
    file_list = sorted(os.listdir(RAW_DIR))
else:
    print(f'Raw directory not found: {RAW_DIR}')
    file_list = []

for file_name in file_list:
    file_path = os.path.join(RAW_DIR, file_name)
    # Skip sub-directories such as .ipynb_checkpoints, which cannot be opened.
    if not os.path.isfile(file_path):
        continue

    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Write one request per sample to the target file.
    os.makedirs(REQUEST_DIR, exist_ok=True)
    with open(f'{REQUEST_DIR}/{file_name}.jsonl', 'w', encoding='utf-8') as out_f:
        for idx, item in enumerate(data, 1):
            req = build_request(idx, item, system_prompt, template_user)
            out_f.write(json.dumps(req, ensure_ascii=False) + '\n')


## Calculate Model Score

In [None]:
import os
import json

# Directory holding the batch responses (one JSONL file per evaluated model).
RESPONSE_DIR = './generate/response'
dimensions = ["Correctness", "Completeness", "Relevance", "Fluency"]


def extract_json_text(content):
    """Return the JSON object text embedded in a model reply.

    The reply may be wrapped in a Markdown code fence or surrounded by prose,
    so everything from the first ``{`` to the last ``}`` is kept. When no such
    pair exists the stripped reply is returned unchanged.
    """
    text = content.strip()
    start = text.find('{')
    end = text.rfind('}')
    if start != -1 and end > start:
        return text[start:end + 1]
    return text


def parse_score(line):
    """Parse one response JSONL line into a score dict.

    Returns:
        The decoded score object, or ``None`` when the line is not valid
        JSON, lacks the ``response.body.choices[0].message.content`` path,
        or the reply does not decode to a JSON object.
    """
    try:
        data = json.loads(line)
        content = data['response']['body']['choices'][0]['message']['content']
        obj = json.loads(extract_json_text(content))
    except (ValueError, KeyError, IndexError, TypeError, AttributeError):
        return None
    return obj if isinstance(obj, dict) else None


def average_scores(scores, dims):
    """Average each dimension over the score dicts that provide a number.

    Args:
        scores: Iterable of score dicts.
        dims: Dimension names to aggregate.

    Returns:
        ``(avg_scores, valid_counts)``: the per-dimension mean rounded to two
        decimals (``None`` when no usable value exists) and the per-dimension
        number of usable values.
    """
    avg_scores = {}
    valid_counts = {}
    for dim in dims:
        values = []
        for score in scores:
            value = score.get(dim) if isinstance(score, dict) else None
            # bool is a subclass of int but is not a meaningful rating.
            if isinstance(value, (int, float)) and not isinstance(value, bool):
                values.append(value)
        valid_counts[dim] = len(values)
        avg_scores[dim] = round(sum(values) / len(values), 2) if values else None
    return avg_scores, valid_counts


if os.path.isdir(RESPONSE_DIR):
    # Sorted so that the report lists the files in a stable order.
    file_list = sorted(os.listdir(RESPONSE_DIR))
else:
    print(f'Response directory not found: {RESPONSE_DIR}')
    file_list = []

# NOTE(review): an earlier debugging note here read "requestid-574 ---- none";
# presumably request 574 of the "none" run has no usable score. Confirm.
for file in file_list:
    file_path = os.path.join(RESPONSE_DIR, file)
    # Skip sub-directories such as .ipynb_checkpoints, which cannot be opened.
    if not os.path.isfile(file_path):
        continue

    scores = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for idx, line in enumerate(f):
            if not line.strip():
                continue  # tolerate blank lines in the JSONL file
            obj = parse_score(line)
            if obj is None:
                print(f'Error in line {idx+1} of {file}')
                continue
            scores.append(obj)

    avg_scores, valid_counts = average_scores(scores, dimensions)
    # Report the smallest per-dimension count, since dimensions may differ.
    valid_total = min(valid_counts.values(), default=0)
    print(f'{file} valid scores: {valid_total}----{avg_scores}')

fkl.jsonl valid scores: 2046----{'Correctness': 3.53, 'Completeness': 3.54, 'Relevance': 4.46, 'Fluency': 4.37}
Error in line 1206 of jskl.jsonl
jskl.jsonl valid scores: 2045----{'Correctness': 3.55, 'Completeness': 3.57, 'Relevance': 4.49, 'Fluency': 4.4}
none.jsonl valid scores: 2045----{'Correctness': 3.52, 'Completeness': 3.53, 'Relevance': 4.44, 'Fluency': 4.38}
rkl.jsonl valid scores: 2046----{'Correctness': 3.53, 'Completeness': 3.55, 'Relevance': 4.46, 'Fluency': 4.38}
seqkl.jsonl valid scores: 2046----{'Correctness': 3.51, 'Completeness': 3.55, 'Relevance': 4.43, 'Fluency': 4.38}
teacher.jsonl valid scores: 2046----{'Correctness': 4.0, 'Completeness': 4.0, 'Relevance': 4.83, 'Fluency': 4.73}
