# Driving Exam Auto Tagging
Direct tagging with a VLM

## A. Format Question Data

## 0. Set up the environment

Set the working directory to the project's `src` directory so that the relative data paths used below resolve correctly.

In [1]:
import json
import os

In [2]:
# Path of the project's `src` directory. The default is the original author's
# local checkout; set the DRIVETEST_SRC_PATH environment variable to run the
# notebook on another machine without editing this cell.
SRC_PATH = os.environ.get(
    "DRIVETEST_SRC_PATH",
    "/Users/simonxu/Files/Projects/Drivetest_App/2_NLP Tag Creation/drivetest_tag_extraction/src",
)
# Every relative path used later in the notebook (data_storage/..., my_logs)
# is resolved against this working directory.
os.chdir(SRC_PATH)

### 1. Set up the question bank

In [3]:
from entities.question_bank import QuestionBank
from data_access.local_json_db import LocalJsonDB
from data_formatting.data_formatter import DataFormatter, DataFormat

### i) Load the question bank

In [4]:
RAW_DATA_FILE = "data_storage/raw_database/data.json"
RAW_IMG_DIR = "data_storage/raw_database/images"

def load_data() -> QuestionBank:
    """ Open a LocalJsonDB over the raw data file and raw image directory
    and return whatever its load() call produces. """
    return LocalJsonDB(RAW_DATA_FILE, RAW_IMG_DIR).load()

In [5]:
# Load the raw question bank and print its question count as a sanity check.
raw_qb = load_data()
print(raw_qb.question_count())

2836


### ii) Preprocessing

Images are resized to a standard shape (256×256) and converted from WebP to JPEG.

In [6]:
FORMATTED_IMG_DIR = "data_storage/formatted_database/images"
def format_data(raw_qb: QuestionBank, data_format: DataFormat) -> QuestionBank:
    """ Pass raw_qb through a DataFormatter configured with data_format,
    giving it FORMATTED_IMG_DIR as the new image directory, and return the
    question bank the formatter hands back. """
    formatter = DataFormatter(data_format=data_format)
    return formatter.format_data(question_bank=raw_qb,
                                 new_img_dir=FORMATTED_IMG_DIR)

In [7]:
%%time
# Image file extensions handed to DataFormat as the input and output formats.
INPUT_IMG_EXTENSION = "webp"
OUTPUT_IMG_EXTENSION = "jpg"

# Requested format: image_shape of (256, 256) with the extensions above.
data_format = DataFormat(image_shape=(256, 256),
                         input_image_extension=INPUT_IMG_EXTENSION,
                         output_image_extension=OUTPUT_IMG_EXTENSION)
qb = format_data(raw_qb=raw_qb, data_format=data_format)
# Sanity check: print the question count of the formatted bank.
print(qb.question_count())

2836
CPU times: user 13 s, sys: 2.51 s, total: 15.5 s
Wall time: 15.7 s


### iii) Save the formatted question bank

In [8]:
FORMATTED_DB_FILE_PATH = "data_storage/formatted_database/data.json"
def save_formatted_data(question_bank: QuestionBank) -> None:
    """ Hand question_bank to the save() method of a LocalJsonDB bound to the
    formatted data file and the formatted image directory. """
    LocalJsonDB(FORMATTED_DB_FILE_PATH, FORMATTED_IMG_DIR).save(question_bank)

In [9]:
save_formatted_data(qb)

## C. Question Bank to Batch Request File

Convert the question bank into a JSONL file of batch requests that follows the OpenAI-compatible batch request format.

In [10]:
import datetime
import logging
from logging import Logger

from label_generator.batch_request_factory import BatchRequestFactory

In [11]:
def load_prompt(prompt_file_path=None) -> str:
    """ Load the prompt text from a UTF-8 encoded file.

    Args:
        prompt_file_path: Path of the prompt file. When omitted (None), the
            notebook-level PROMPT_FILE_PATH constant is used; it must be
            defined before this function is called.

    Returns:
        The entire file content as one string.

    Raises:
        OSError: If the file cannot be opened (for example, it is missing).
    """
    if prompt_file_path is None:
        # Looked up at call time, so the constant may be defined in a cell
        # that runs after this definition.
        prompt_file_path = PROMPT_FILE_PATH
    with open(prompt_file_path, 'r', encoding='utf-8') as file:
        return file.read()

In [12]:
def make_logger(logging_directory: str, verbose: bool=False, debug: bool=False) -> Logger:
    """ Create a logger that writes to a timestamped file in a directory.

    Args:
        logging_directory: Directory that receives the log file. It is
            created (including parents) when it does not exist yet.
        verbose: When True, a console handler is attached in addition to the
            file handler.
        debug: When True, the logger level is DEBUG; otherwise INFO.

    Returns:
        The configured logger, named "batch_request_<timestamp>".
    """
    if logging_directory:
        # logging.FileHandler does not create missing directories, so a fresh
        # checkout without the log directory would otherwise fail here.
        os.makedirs(logging_directory, exist_ok=True)
    log_filename, timestamp = _make_logger_name(logging_directory)
    logger = logging.getLogger(f"batch_request_{timestamp}")
    logger.setLevel(logging.DEBUG if debug else logging.INFO)
    _add_handlers(log_filename, logger, verbose, debug)
    return logger

def _make_logger_name(logging_directory):
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M")
    log_filename = os.path.join(logging_directory,
                                f"batch_request_{timestamp}.log")
    return log_filename, timestamp

def _add_handlers(log_filename, logger, verbose, debug):
    """ Replace the logger's handlers with a fresh file handler and, when
    verbose is true, a console handler sharing the same formatter.

    Handlers already attached to the logger are detached and closed first, so
    re-running the cell does not duplicate log lines or leave the previous
    log file handle open.
    """
    # Iterate over a copy because removeHandler mutates logger.handlers.
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)
        # Detaching alone leaves the underlying file open; close it explicitly.
        handler.close()
    formatter = _add_file_handler(log_filename, logger, debug)
    if verbose:
        _add_console_handler(formatter, logger)

def _add_file_handler(log_filename, logger, debug):
    file_handler = logging.FileHandler(log_filename)
    if debug:
        file_handler.setLevel(logging.DEBUG)
    else:
        file_handler.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    return formatter

def _add_console_handler(formatter, logger):
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.DEBUG)
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

Specify model information and request URL.

In [13]:
# Directory handed to make_logger() for the batch-request log file.
LOGGING_DIRECTORY = "my_logs"
# Text file read by load_prompt().
PROMPT_FILE_PATH = "data_storage/prompt_file/prompt.txt"
# Model name passed to BatchRequestFactory.
MODEL_NAME = "qwen-vl-max"
# Endpoint passed to BatchRequestFactory and reused when creating the batch job.
REQUEST_URL = "/v1/chat/completions"

In [14]:
%%time
# Build the batch request from the formatted question bank, the prompt text,
# and the model/endpoint settings. The logger writes DEBUG-level records to a
# file only (verbose=False adds no console handler).
batch_maker = BatchRequestFactory(
    question_bank=qb,
    prompt=load_prompt(),
    url=REQUEST_URL,
    model_name=MODEL_NAME,
    logger=make_logger(LOGGING_DIRECTORY, verbose=False, debug=True))
batch_request = batch_maker.make_batch_request()

CPU times: user 808 ms, sys: 360 ms, total: 1.17 s
Wall time: 1.17 s


In [15]:
# JSONL file the batch request is written to and that is later uploaded.
REQUEST_FILE_PATH = "data_storage/batch_request_file/tagging_request.jsonl"

In [16]:
def clear_request_file(file_path=None):
    """ Truncate the batch request file so that it contains no lines.

    Args:
        file_path: File to clear. When omitted (None), the notebook-level
            REQUEST_FILE_PATH constant is used.

    The file is created if it does not exist. Nothing is written to it: a
    JSONL file must hold one request object per line, so an empty file (rather
    than a placeholder "{}" record) is the correct cleared state.
    """
    if file_path is None:
        file_path = REQUEST_FILE_PATH
    # Opening in 'w' mode truncates the file; there is nothing to write.
    with open(file_path, 'w', encoding='utf-8'):
        pass

In [17]:
def count_lines_in_file(file_path: str) -> int:
    """ Return how many lines the UTF-8 text file at file_path contains. """
    line_total = 0
    with open(file_path, 'r', encoding='utf-8') as handle:
        for _line in handle:
            line_total += 1
    return line_total

In [18]:
# Write the batch request to the JSONL request file, then report the file's
# line count as a sanity check.
batch_request.to_jsonl_file(REQUEST_FILE_PATH)
print(f"Number of lines in the request file: {count_lines_in_file(REQUEST_FILE_PATH)}")

Number of lines in the request file: 2836


# 2. Generate the Labels

In [19]:
from pathlib import Path
from openai import OpenAI

In [20]:
# OpenAI SDK client pointed at DashScope's OpenAI-compatible endpoint.
# The key is read from the DASHSCOPE_API_KEY environment variable; os.getenv
# returns None when the variable is unset.
# NOTE(review): confirm how OpenAI() behaves when api_key is None.
client = OpenAI(
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)

## a) Upload batch file

In [21]:
%%time
# Upload the JSONL request file with purpose "batch"; the returned object's id
# is used as the batch job's input file.
file_object = client.files.create(file=Path(REQUEST_FILE_PATH), purpose="batch")

CPU times: user 287 ms, sys: 200 ms, total: 488 ms
Wall time: 12 s


In [22]:
print(file_object.model_dump_json())

{"id":"file-batch-0312f2b5ab6d4c74abd5687a","bytes":67210233,"created_at":1753837727,"filename":"tagging_request.jsonl","object":"file","purpose":"batch","status":"processed","expires_at":null,"status_details":null}


## b) Create batch job

In [23]:
# Metadata attached to the batch job. The values are in Chinese: ds_name is a
# short task name (tag generation for driving-exam Subject One) and
# ds_description explains that "tags" capture each question's knowledge points
# while "keywords" are explicit or implied terms used to retrieve questions.
REQUEST_METADATA = {'ds_name':"科目一标签生成",
                    'ds_description':'为驾考科目一题目自动生成 "tags" 和 "keywords"。 其中"tags" 需要深入理解问题的测试内容，代表问题的知识点与考点。“keywords”需要提取问题中明确或隐含的关键词， 用来检索问题内容。'}

In [24]:
%%time
# The uploaded file's id is the batch job's input file.
request_id = file_object.id
# Create the batch job against the same endpoint the requests were built for,
# with a 24-hour completion window and the metadata defined in the cell above.
batch = client.batches.create(
    input_file_id=request_id,
    endpoint=REQUEST_URL,
    completion_window="24h",
    metadata=REQUEST_METADATA
)
print(batch)

Batch(id='batch_940aebec-4478-404b-a4da-2802a1b3a316', completion_window='24h', created_at=1753837729, endpoint='/v1/chat/completions', input_file_id='file-batch-0312f2b5ab6d4c74abd5687a', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=None, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'ds_name': '科目一标签生成', 'ds_description': '为驾考科目一题目自动生成 "tags" 和 "keywords"。 其中"tags" 需要深入理解问题的测试内容，代表问题的知识点与考点。“keywords”需要提取问题中明确或隐含的关键词， 用来检索问题内容。'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
CPU times: user 3.47 ms, sys: 1.22 ms, total: 4.68 ms
Wall time: 127 ms


Periodically check the status of the batch job.

In [25]:
from time import sleep

In [26]:
# Seconds to sleep between two status checks of the batch job.
WAIT_TIME = 300 # 5 Minutes
# Statuses for which the polling loop keeps waiting.
IN_PROGRESS_STATUS_CODES = ["validating", "in_progress", "finalizing", "cancelling"]
# Terminal failure statuses.
# NOTE(review): not referenced by any cell visible in this notebook.
ERROR_STATUS_CODES = ["failed", "expired", "cancelled"]

Alternatively, it may be easier to check the batch job's status in the DashScope web console.

In [28]:
# Poll the batch job until it leaves the in-progress statuses, sleeping
# WAIT_TIME seconds between checks and printing each intermediate status.
while True:
    batch_status = client.batches.retrieve(batch.id)
    if batch_status.status not in IN_PROGRESS_STATUS_CODES:
        break
    print(f"Batch job status: {batch_status.status}")
    sleep(WAIT_TIME)
print(f"Final job status: {batch_status.status}")

Final job status: completed


## c) Error handling

In [29]:
# Local file that receives the batch job's error file, when one exists.
ERROR_FILE_PATH = "data_storage/tagging_results/error.jsonl"

### i) Clear the error file

In [30]:
# Truncate the error file left by a previous run so stale errors are not
# mistaken for new ones. If the file does not exist, nothing is created.
if os.path.exists(ERROR_FILE_PATH):
    with open(ERROR_FILE_PATH, 'w', encoding='utf-8') as file:
        file.write("")

In [33]:
print(batch_status)

Batch(id='batch_940aebec-4478-404b-a4da-2802a1b3a316', completion_window='24h', created_at=1753837729, endpoint='/v1/chat/completions', input_file_id='file-batch-0312f2b5ab6d4c74abd5687a', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1753841531, error_file_id=None, errors=None, expired_at=None, expires_at=None, failed_at=None, finalizing_at=1753841528, in_progress_at=1753837786, metadata={'ds_name': '科目一标签生成', 'ds_description': '为驾考科目一题目自动生成 "tags" 和 "keywords"。 其中"tags" 需要深入理解问题的测试内容，代表问题的知识点与考点。“keywords”需要提取问题中明确或隐含的关键词， 用来检索问题内容。'}, output_file_id='file-batch_output-6da8a02ca98d49139e6d5d21', request_counts=BatchRequestCounts(completed=2836, failed=0, total=2836))


### ii) Save the new errors

In [38]:
# Download the error file only when the batch job reports one.
# The printed message (in Chinese) says that the complete request-failure
# details were saved to the local error file.
if batch_status.error_file_id is not None:
    content = client.files.content(batch_status.error_file_id)
    content.write_to_file(ERROR_FILE_PATH)
    print(f"完整的请求失败信息已保存至本地错误文件: {ERROR_FILE_PATH}")

## d) Retrieve result

In [39]:
# Local file that receives the batch job's output file.
RESULT_OUTPUT_PATH = "data_storage/tagging_results/result.jsonl"

### i) Clear the result file

In [40]:
# Truncate the result file left by a previous run. If the file does not
# exist, nothing is created.
if os.path.exists(RESULT_OUTPUT_PATH):
    with open(RESULT_OUTPUT_PATH, 'w', encoding='utf-8') as file:
        file.write("")

### ii) Save the result file

In [41]:
# A finished batch job carries no output file when it produced no successful
# responses. Fail with an explicit message here instead of passing None on to
# the SDK call below.
if batch_status.output_file_id is None:
    raise RuntimeError(
        f"Batch {batch_status.id} ended with status '{batch_status.status}' "
        "but has no output file; inspect the error file for details.")
# Download the batch output and store it as the local result file.
output_file = client.files.content(file_id=batch_status.output_file_id)
output_file.write_to_file(RESULT_OUTPUT_PATH)

In [42]:
count_lines_in_file(RESULT_OUTPUT_PATH)

2836

## e) Archive result

In [43]:
# Directories that receive timestamped copies of the result and error files.
RESULT_ARCHIVE_PATH = "data_storage/tagging_results/result_archive"
ERROR_ARCHIVE_PATH = "data_storage/tagging_results/error_archive"

In [44]:
def get_archive_paths() -> "tuple[str, str]":
    """ Build timestamped archive paths for the result and error files.

    Both archive directories are created if they do not exist. The timestamp
    is the current local time formatted as YYYYMMDD_HHMM, so two calls within
    the same minute return the same paths.

    Returns:
        A (result archive path, error archive path) pair.
    """
    os.makedirs(RESULT_ARCHIVE_PATH, exist_ok=True)
    os.makedirs(ERROR_ARCHIVE_PATH, exist_ok=True)
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M")
    archive_path = os.path.join(RESULT_ARCHIVE_PATH, f"result_{timestamp}.jsonl")
    error_archive_path = os.path.join(ERROR_ARCHIVE_PATH, f"error_{timestamp}.jsonl")
    return archive_path, error_archive_path

In [45]:
import shutil

In [46]:
def archive_files():
    """ Copy the result file, and the error file if present, into the
    timestamped archive locations returned by get_archive_paths().

    Raises:
        FileNotFoundError: If the result file does not exist.
    """
    result_path, error_path = get_archive_paths()
    shutil.copy(RESULT_OUTPUT_PATH, result_path)
    # The error file is only written when the batch job reports an error file,
    # so it may legitimately be absent after a fully successful run.
    if os.path.exists(ERROR_FILE_PATH):
        shutil.copy(ERROR_FILE_PATH, error_path)

In [47]:
archive_files()

# 3) Parse the results

In [None]:
from label_generator.response_parsing_pipeline import ResponseParsingPipeline

In [None]:
# Parse every archived result file into the question bank.
# os.listdir returns entries in arbitrary order; sorting the names makes the
# processing order reproducible, and because archive names embed a
# YYYYMMDD_HHMM timestamp it is also oldest-first.
for file in sorted(os.listdir(RESULT_ARCHIVE_PATH)):
    if file.endswith(".jsonl"):
        print(f"Archive file: {file}")
        file_path = os.path.join(RESULT_ARCHIVE_PATH, file)
        pipe = ResponseParsingPipeline(question_bank=qb, result_path=file_path)
        pipe.parse_and_load()

In [None]:
# Example of a raw model reply (in Chinese) kept for reference: a <THINKING>
# section with step-by-step reasoning and a draft JSON object, followed by an
# <OUTPUT> section holding a fenced JSON object with "keywords" and "tags"
# lists. The sample contains no closing </OUTPUT> tag.
SAMPLE_MESSAGE = ("<THINKING>\n"
                  "**第一步：生成推理依据**\n"
                  "\n"
                  "  * **步骤1：视觉观察与文本提取 (Keywords)**\n"
                  "      * **视觉元素**: 无图像。\n"
                  "      * **文本信息**: 题目陈述“谨慎驾驶的三个原则是集中注意力、仔细观察和提前预防。”，答案为“对”。\n"
                  "      * **关键词**: 谨慎驾驶, 集中注意力, 仔细观察, 提前预防\n"
                  "  * **步骤2：分析与关联**\n"
                  "      * **分析**: 此题考察的是安全驾驶的基本原则。谨慎驾驶是防御性驾驶的核心理念之一，它要求驾驶员在驾驶过程中始终保持高度的警觉性和预见性。题目中的三个原则（集中注意力、仔细观察和提前预防）正是谨慎驾驶的关键要素。\n"
                  "      * **关联法规**: 这些原则虽然不是具体的交通法规条款，但它们是《道路交通安全法》及其实施条例所倡导的安全驾驶行为的重要组成部分。\n"
                  "  * **步骤3：归纳与生成 (Tags)**\n"
                  "      * **核心概念**: 题目考察的是安全驾驶的基本理念和原则，特别是谨慎驾驶的三个核心要素。这属于安全驾驶知识的一部分，强调了驾驶员的行为规范和心理准备。\n"
                  "      * **标签**: 安全驾驶-基本原则-谨慎驾驶, 安全意识-防御性驾驶\n"
                  "\n"
                  "**第二步：生成JSON**\n"
                  "\n"
                  "```json\n"
                  "{\n"
                  " \"keywords\": [\n"
                  "  \"谨慎驾驶\",\n"
                  "  \"集中注意力\",\n"
                  "  \"仔细观察\",\n"
                  "  \"提前预防\"\n"
                  " ],\n"
                  " \"tags\": [\n"
                  "  \"安全驾驶-基本原则-谨慎驾驶\",\n"
                  "  \"安全意识-防御性驾驶\"\n"
                  " ]\n"
                  "}\n"
                  "```\n"
                  "\n"
                  "</THINKING>\n"
                  "<OUTPUT>\n"
                  "```json\n"
                  "{\n"
                  "\"keywords\": ["
                  "\n\"谨慎驾驶\","
                  "\n\"集中注意力\","
                  "\n\"仔细观察\","
                  "\n\"提前预防\""
                  "\n],"
                  "\n\"tags\": ["
                  "\n\"安全驾驶-基本原则-谨慎驾驶\","
                  "\n\"安全意识-防御性驾驶\""
                  "\n]"
                  "\n}"
                  "\n```")