# Driving Exam Auto Tagging
Direct tagging with a VLM

## A. Format Question Data

## 0. Set up the environment

Set the working directory to the project's `src` root so that the local imports and the relative data paths used below resolve.

In [1]:
import json
import os

In [2]:
# Root of the project's source tree. Every relative path and local import in
# this notebook is resolved against it after the chdir below.
# The DRIVETEST_SRC_PATH environment variable overrides the location so the
# notebook can run on another machine; the original path is the fallback.
SRC_PATH = os.environ.get(
    "DRIVETEST_SRC_PATH",
    "/Users/simonxu/Files/Projects/Drivetest_App/2_NLP Tag Creation/drivetest_tag_extraction/src")
os.chdir(SRC_PATH)

### 1. Set up the question bank

In [3]:
from entities.question_bank import QuestionBank
from data_access.local_json_db import LocalJsonDB
from data_formatting.data_formatter import DataFormatter, DataFormat

### i) Load the question bank

In [4]:
RAW_DATA_FILE = "data_storage/raw_database/data.json"
RAW_IMG_DIR = "data_storage/raw_database/images"

def load_data() -> QuestionBank:
    """ Load the raw question bank.

    Opens a LocalJsonDB on RAW_DATA_FILE / RAW_IMG_DIR and returns whatever
    its ``load()`` call produces.
    """
    return LocalJsonDB(RAW_DATA_FILE, RAW_IMG_DIR).load()

In [5]:
# Load the raw question bank and print its question count as a sanity check.
raw_qb = load_data()
print(raw_qb.question_count())

2836


### ii) Preprocessing

Images are resized to a standard shape (256×256) and converted from `webp` to `jpg`.

In [6]:
FORMATTED_IMG_DIR = "data_storage/formatted_database/images"
def format_data(raw_qb: QuestionBank, data_format: DataFormat) -> QuestionBank:
    """ Run the raw question bank through a DataFormatter.

    :param raw_qb: the question bank to format.
    :param data_format: the DataFormat handed to the DataFormatter.
    :return: the question bank returned by ``DataFormatter.format_data``,
        which is called with FORMATTED_IMG_DIR as the new image directory.
    """
    formatter = DataFormatter(data_format=data_format)
    return formatter.format_data(question_bank=raw_qb,
                                 new_img_dir=FORMATTED_IMG_DIR)

In [7]:
%%time
# Image extensions handed to DataFormat: the extension of the raw images and
# the extension requested for the formatted images.
INPUT_IMG_EXTENSION = "webp"
OUTPUT_IMG_EXTENSION = "jpg"

# Describe the target format (256x256 images, webp in, jpg out), then format
# the raw question bank with it.
data_format = DataFormat(image_shape=(256, 256),
                         input_image_extension=INPUT_IMG_EXTENSION,
                         output_image_extension=OUTPUT_IMG_EXTENSION)
qb = format_data(raw_qb=raw_qb, data_format=data_format)
# Sanity check: print the question count of the formatted bank.
print(qb.question_count())

2836
CPU times: user 14 s, sys: 2.8 s, total: 16.8 s
Wall time: 17.4 s


### iii) Save the formatted question bank

In [8]:
FORMATTED_DB_FILE_PATH = "data_storage/formatted_database/data.json"
def save_formatted_data(question_bank: QuestionBank) -> None:
    """ Persist ``question_bank`` through a LocalJsonDB.

    The database is opened on FORMATTED_DB_FILE_PATH / FORMATTED_IMG_DIR and
    its ``save()`` method is called with the given question bank.
    """
    LocalJsonDB(FORMATTED_DB_FILE_PATH, FORMATTED_IMG_DIR).save(question_bank)

In [9]:
# Persist the formatted question bank produced above.
save_formatted_data(qb)

## B. Question Bank to Batch Request File

Convert the question bank into a JSONL file of batch requests that follows the OpenAI-compatible batch request format.

In [10]:
import datetime
import logging
from logging import Logger

from label_generator.batch_request_factory import BatchRequestFactory

In [11]:
def load_prompt() -> str:
    """ Return the entire contents of PROMPT_FILE_PATH, read as UTF-8 text. """
    with open(PROMPT_FILE_PATH, 'r', encoding='utf-8') as prompt_file:
        return prompt_file.read()

In [12]:
def make_logger(logging_directory: str, verbose: bool=False, debug: bool=False) -> Logger:
    """ Create a logger that writes to a timestamped file in ``logging_directory``.

    :param logging_directory: directory that receives the log file; it is
        created (including parents) when it does not exist yet.
    :param verbose: when True, records are also echoed to a console handler.
    :param debug: when True the logger and its file handler use DEBUG,
        otherwise INFO.
    :return: the logger named ``batch_request_<timestamp>``.
    """
    # FileHandler opens the file immediately and does not create missing
    # parent directories, so make sure the directory exists first.
    if logging_directory:
        os.makedirs(logging_directory, exist_ok=True)
    log_filename, timestamp = _make_logger_name(logging_directory)
    logger = logging.getLogger(f"batch_request_{timestamp}")
    logger.setLevel(logging.DEBUG if debug else logging.INFO)
    _add_handlers(log_filename, logger, verbose, debug)
    return logger

def _make_logger_name(logging_directory):
    """ Return ``(log file path, timestamp)``; the timestamp has minute resolution. """
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M")
    log_filename = os.path.join(logging_directory,
                                f"batch_request_{timestamp}.log")
    return log_filename, timestamp

def _add_handlers(log_filename, logger, verbose, debug):
    """ Replace the logger's handlers with a file handler (plus console if verbose). """
    # logging.getLogger returns the same object for the same name, so a second
    # call within the same minute would otherwise stack duplicate handlers.
    # Close each replaced handler so its open log file is released.
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)
        handler.close()
    formatter = _add_file_handler(log_filename, logger, debug)
    if verbose:
        _add_console_handler(formatter, logger)

def _add_file_handler(log_filename, logger, debug):
    """ Attach a file handler for ``log_filename`` and return the formatter it uses. """
    # Explicit UTF-8: the prompts and questions are Chinese text, and the
    # platform default encoding is not guaranteed to be able to encode them.
    file_handler = logging.FileHandler(log_filename, encoding="utf-8")
    file_handler.setLevel(logging.DEBUG if debug else logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    return formatter

def _add_console_handler(formatter, logger):
    """ Attach a DEBUG-level stream handler that shares the file handler's formatter. """
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.DEBUG)
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

Specify model information and request URL.

In [20]:
# Directory passed to make_logger for the batch-request log file.
LOGGING_DIRECTORY = "my_logs"
# Text file read by load_prompt().
PROMPT_FILE_PATH = "data_storage/prompt_file/prompt.txt"
# Model name handed to BatchRequestFactory.
MODEL_NAME = "qwen-vl-max"
# Endpoint used both as the per-request URL and as the batch job endpoint.
REQUEST_URL = "/v1/chat/completions"

In [21]:
%%time
# Build the batch request from the formatted question bank, the prompt text,
# and the model/endpoint settings above. The logger writes DEBUG-level
# records to a file in LOGGING_DIRECTORY (no console echo).
batch_maker = BatchRequestFactory(
    question_bank=qb,
    prompt=load_prompt(),
    url=REQUEST_URL,
    model_name=MODEL_NAME,
    logger=make_logger(LOGGING_DIRECTORY, verbose=False, debug=True))
batch_request = batch_maker.make_batch_request()

CPU times: user 858 ms, sys: 395 ms, total: 1.25 s
Wall time: 1.26 s


In [22]:
# JSONL file the batch request is written to and later uploaded from.
REQUEST_FILE_PATH = "data_storage/batch_request_file/tagging_request.jsonl"

In [23]:
def clear_request_file(file_path: "str | None" = None) -> None:
    """ Truncate the batch request file to zero bytes.

    :param file_path: file to clear; defaults to REQUEST_FILE_PATH.

    The file is JSONL, so "cleared" means empty. Writing ``{}`` (as done
    previously) leaves one line that is not a valid batch request.
    """
    target_path = REQUEST_FILE_PATH if file_path is None else file_path
    # Opening in 'w' mode truncates the file; nothing needs to be written.
    with open(target_path, 'w', encoding='utf-8'):
        pass

In [24]:
def count_lines_in_file(file_path: str) -> int:
    """ Return how many lines the UTF-8 text file at ``file_path`` contains.

    An empty file yields 0; a final line without a trailing newline still
    counts as a line.
    """
    line_total = 0
    with open(file_path, 'r', encoding='utf-8') as handle:
        # enumerate leaves the 1-based index of the last line in line_total.
        for line_total, _ in enumerate(handle, start=1):
            pass
    return line_total

In [43]:
# Write the batch request to the JSONL request file, then report how many
# lines the file contains.
batch_request.to_jsonl_file(REQUEST_FILE_PATH)
print(f"Number of lines in the request file: {count_lines_in_file(REQUEST_FILE_PATH)}")

Number of lines in the request file: 2836


# 2. Generate the Labels

In [26]:
from pathlib import Path
from openai import OpenAI

In [27]:
# Read the DashScope key explicitly and fail fast when it is missing.
# Passing api_key=None would make the OpenAI client fall back to the
# OPENAI_API_KEY environment variable, which would then be sent to the
# DashScope endpoint configured below.
_dashscope_api_key = os.getenv("DASHSCOPE_API_KEY")
if not _dashscope_api_key:
    raise RuntimeError("The DASHSCOPE_API_KEY environment variable is not set.")

# OpenAI-compatible client pointed at DashScope's compatible-mode endpoint.
client = OpenAI(
    api_key=_dashscope_api_key,
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)

## a) Upload batch file

In [28]:
# REQUEST_TEST_FILE_PATH = "data_storage/batch_request_file/tagging_request_test.jsonl"

In [29]:
%%time
# Upload the JSONL request file with purpose "batch"; the returned object's
# id is used below as the batch job's input file.
file_object = client.files.create(file=Path(REQUEST_FILE_PATH), purpose="batch")

CPU times: user 432 ms, sys: 227 ms, total: 659 ms
Wall time: 36.5 s


In [30]:
# Show the uploaded file's metadata as JSON.
print(file_object.model_dump_json())

{"id":"file-batch-77ec329f500a4ecd9de16a3d","bytes":69592805,"created_at":1753690121,"filename":"tagging_request.jsonl","object":"file","purpose":"batch","status":"processed","expires_at":null,"status_details":null}


## b) Create batch job

In [31]:
# Metadata passed as `metadata` when the batch job is created.
# The values are Chinese: ds_name reads "Subject One tag generation";
# ds_description says the job auto-generates "tags" (the knowledge points a
# question tests) and "keywords" (explicit or implied terms used to retrieve
# the question) for Subject One driving-exam questions.
REQUEST_METADATA = {'ds_name':"科目一标签生成",
                    'ds_description':'为驾考科目一题目自动生成 "tags" 和 "keywords"。 其中"tags" 需要深入理解问题的测试内容，代表问题的知识点与考点。“keywords”需要提取问题中明确或隐含的关键词， 用来检索问题内容。'}

In [32]:
%%time
# Despite its name, request_id holds the id of the uploaded input file.
request_id = file_object.id
# Create the batch job for the uploaded file against the chat-completions
# endpoint, with a 24h completion window and the metadata defined above.
batch = client.batches.create(
    input_file_id=request_id,
    endpoint=REQUEST_URL,
    completion_window="24h",
    metadata=REQUEST_METADATA
)
print(batch)

Batch(id='batch_61bc1296-7776-416f-9552-36a3593295be', completion_window='24h', created_at=1753690123, endpoint='/v1/chat/completions', input_file_id='file-batch-77ec329f500a4ecd9de16a3d', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=None, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'ds_name': '科目一标签生成', 'ds_description': '为驾考科目一题目自动生成 "tags" 和 "keywords"。 其中"tags" 需要深入理解问题的测试内容，代表问题的知识点与考点。“keywords”需要提取问题中明确或隐含的关键词， 用来检索问题内容。'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
CPU times: user 4.38 ms, sys: 8.1 ms, total: 12.5 ms
Wall time: 187 ms


Periodically check the status of the batch job.

In [33]:
from time import sleep
# Give the freshly created job a moment before the first status check.
sleep(30) # Wait for 30 seconds before checking the first status

In [36]:
# Seconds to sleep between two status checks of the batch job.
WAIT_TIME = 900 # 15 Minutes
# Statuses for which the polling loop keeps waiting.
IN_PROGRESS_STATUS_CODES = ["validating", "in_progress", "finalizing", "cancelling"]
# Statuses treated as unsuccessful outcomes.
ERROR_STATUS_CODES = ["failed", "expired", "cancelled"]

In [37]:
# Poll the batch job until it leaves every in-progress state, sleeping
# WAIT_TIME seconds between checks, then print the final job object.
while True:
    batch_status = client.batches.retrieve(batch.id)
    if batch_status.status not in IN_PROGRESS_STATUS_CODES:
        break
    print(f"Batch job status: {batch_status.status}")
    sleep(WAIT_TIME)
print(f"Final batch job status:\n{batch_status}")

Final batch job status:
Batch(id='batch_61bc1296-7776-416f-9552-36a3593295be', completion_window='24h', created_at=1753690123, endpoint='/v1/chat/completions', input_file_id='file-batch-77ec329f500a4ecd9de16a3d', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1753690864, error_file_id='file-batch_output-f15a4cfe77224367a082c084', errors=None, expired_at=None, expires_at=None, failed_at=None, finalizing_at=1753690860, in_progress_at=1753690129, metadata={'ds_name': '科目一标签生成', 'ds_description': '为驾考科目一题目自动生成 "tags" 和 "keywords"。 其中"tags" 需要深入理解问题的测试内容，代表问题的知识点与考点。“keywords”需要提取问题中明确或隐含的关键词， 用来检索问题内容。'}, output_file_id='file-batch_output-668c17e08d304ba59883ebc4', request_counts=BatchRequestCounts(completed=1812, failed=1024, total=2836))


## c) Error handling

In [70]:
ERROR_FILE_PATH = "data_storage/tagging_results/error.jsonl"
# Guard on the error file id rather than on the job status: a job can finish
# as "completed" and still have failed requests, while a job with no failed
# requests has no error file, so error_file_id is None and the download
# call would fail.
if batch_status.error_file_id:
    content = client.files.content(batch_status.error_file_id)
    content.write_to_file(ERROR_FILE_PATH)
    print(f"完整的请求失败信息已保存至本地错误文件: {ERROR_FILE_PATH}")
else:
    print("No error file was produced for this batch job.")

完整的请求失败信息已保存至本地错误文件: data_storage/tagging_results/error.jsonl


## d) Retrieve result

In [39]:
# Local JSONL file that receives the batch job's output file.
RESULT_OUTPUT_PATH = "data_storage/tagging_results/result.jsonl"

In [40]:
# Fetch the content of the batch job's output file.
# NOTE(review): output_file_id is None on a job that has not produced output
# yet (see the freshly created batch above) — confirm it is set before
# relying on this cell.
output_file_id = batch_status.output_file_id
output_file = client.files.content(file_id=output_file_id)

In [41]:
# Save the downloaded output to the local result file.
output_file.write_to_file(RESULT_OUTPUT_PATH)

In [42]:
# Number of lines in the saved result file.
count_lines_in_file(RESULT_OUTPUT_PATH)

1812

# 3) Parse the results

In [47]:
# Parse the result file: one JSON object per line. Blank lines (e.g. a
# trailing empty line) are skipped so they do not raise JSONDecodeError.
with open(RESULT_OUTPUT_PATH, 'r', encoding='utf-8') as file:
    results = [json.loads(line) for line in file if line.strip()]
print(f"Number of results: {len(results)}")

Number of results: 1812


In [48]:
# Collect the custom_id of every result; per the request file these are the
# question ids that came back with an answer.
processed_questions = [labeling_result["custom_id"] for labeling_result in results]
print(f"Number of processed questions: {len(processed_questions)}")

Number of processed questions: 1812


In [49]:
# Question ids in the bank that have no result, in question-bank order.
# Membership is checked against a set: testing each id against the list of
# processed ids would rescan that list for every question.
processed_ids = set(processed_questions)
unprocessed_questions = [qid for qid in qb.get_qid_list()
                         if qid not in processed_ids]
print(f"Number of unprocessed questions: {len(unprocessed_questions)}")

Number of unprocessed questions: 1024


In [72]:
# Count the unprocessed questions that have an image, and list the ids of
# those that do not.
count = 0
for qid in unprocessed_questions:
    if not qb.get_question(qid).has_img():
        print(f"Unprocessed question: {qid}")
        continue
    count += 1
print(count)

Unprocessed question: 23f56
Unprocessed question: e386c
1022


In [73]:
# Reload the request file and print the requests for the two unprocessed
# questions that have no image, to inspect what was actually sent.
with open(REQUEST_FILE_PATH, 'r', encoding='utf-8') as file:
    request_data = list(map(json.loads, file))
for request in request_data:
    if request["custom_id"] not in ["23f56", "e386c"]:
        continue
    print(request)

{'custom_id': '23f56', 'method': 'POST', 'url': '/v1/chat/completions', 'body': {'model': 'qwen-vl-max', 'messages': [{'role': 'system', 'content': '<SYSTEM_PROMPT>\n你是一位拥有超过15年经验的中国资深驾驶教练和考官。你对中国的交通法规，特别是科目一（理论考试）的场景有深入的了解，尤其擅长分析题目背后所考察的核心知识点、潜在陷阱以及学员的常见错误认知。\n你的任务是分析给定的驾考题目（可包含图片）并提取两种类型的信息：‘tags’（代表题目背后所考察的深层交通规则或安全概念）和‘keywords’（代表题目中直接出现的或隐含的关键物体、标志或术语)。你必须输出一个包含两个键的JSON对象：\'keywords\' 和 \'tags\' 。\n</SYSTEM_PROMPT>\n\n<INSTRUCTIONS>\n请遵循以下步骤生成你的回答：\n1. **第一步：生成推理依据**。你需要生成一个结构化的推理过程，该过程将直接导向最终的JSON输出。请遵循以下工作流程：\n    * **步骤1：视觉观察与文本提取 (Keywords)**。首先，请详细描述并列出题目图像中所有关键的视觉元素（如物体、标志、标线等），并结合题目文本信息，识别出所有关键术语。这是你的`keywords`来源。\n    * **步骤2：分析与关联**。接着，分析这些关键词之间的关系，并将这些观察结果与具体的中国交通法规或安全原则联系起来。明确指出该场景具体对应哪条法规或安全理念。\n    * **步骤3：归纳与生成 (Tags)**。最后，基于第二步的分析，从预定义的标签分类体系中，生成一组高度概括的、抽象的语义标签。这些标签应总结该题目所考察的核心交通规则或安全概念。这是你的`tag`来源。\n2. **第二步：生成JSON**。 仅根据你的推理过程，生成最终的JSON对象。\n    * `keywords`: 一个字符串数组，内容来自你在“步骤1：视觉观察与文本提取”中识别出的关键词。\n    * `tags`: 一个字符串数组，内容来自你在“步骤3：归纳与生成”中得出的核心概念标签。\n</INSTRUCTIONS