# Driving Exam Auto Tagging
Direct tagging with a VLM

## A. Format Question Data

## 0. Set up the environment

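Change the working directory to the project's `src` directory, so that the project modules imported below and the relative `data_storage/...` paths resolve.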

In [None]:
import json
import os

In [None]:
# The default is the original author's checkout; set the DRIVETEST_SRC_PATH
# environment variable to run the notebook from another location without editing it.
SRC_PATH = os.environ.get(
    "DRIVETEST_SRC_PATH",
    "/Users/simonxu/Files/Projects/Drivetest_App/2_NLP Tag Creation/drivetest_tag_extraction/src",
)
# The relative "data_storage/..." paths used by later cells are resolved against this directory.
os.chdir(SRC_PATH)

### 1. Set up the question bank

In [None]:
from entities.question_bank import QuestionBank
from data_access.local_json_db import LocalJsonDB
from data_formatting.data_formatter import DataFormatter, DataFormat

### i) Load the question bank

In [None]:
RAW_DATA_FILE = "data_storage/raw_database/data.json"
RAW_IMG_DIR = "data_storage/raw_database/images"

def load_data() -> QuestionBank:
    """ Load the raw question bank.

    Opens a LocalJsonDB over RAW_DATA_FILE and RAW_IMG_DIR and returns
    whatever its load() call produces.
    """
    return LocalJsonDB(RAW_DATA_FILE, RAW_IMG_DIR).load()

In [None]:
# Load the raw question bank and print its question count as a quick sanity check.
raw_qb = load_data()
print(raw_qb.question_count())

### ii) Preprocessing

Images are resized to a standard shape (256×256) and converted to a standard format (WebP → JPG).

In [None]:
FORMATTED_IMG_DIR = "data_storage/formatted_database/images"
def format_data(raw_qb: QuestionBank, data_format: DataFormat) -> QuestionBank:
    """ Format a question bank according to the given data format.

    Builds a DataFormatter for ``data_format`` and runs it over ``raw_qb``,
    passing FORMATTED_IMG_DIR as the directory for the new images.
    Returns the question bank produced by the formatter.
    """
    formatter = DataFormatter(data_format=data_format)
    return formatter.format_data(question_bank=raw_qb,
                                 new_img_dir=FORMATTED_IMG_DIR)

In [None]:
%%time
INPUT_IMG_EXTENSION = "webp"  # passed as the input image extension
OUTPUT_IMG_EXTENSION = "jpg"  # passed as the output image extension

# Target format: image shape (256, 256), webp input, jpg output.
data_format = DataFormat(image_shape=(256, 256),
                         input_image_extension=INPUT_IMG_EXTENSION,
                         output_image_extension=OUTPUT_IMG_EXTENSION)
qb = format_data(raw_qb=raw_qb, data_format=data_format)
# Print the formatted bank's question count (compare with the raw count printed earlier).
print(qb.question_count())

### iii) Save the formatted question bank

In [None]:
FORMATTED_DB_FILE_PATH = "data_storage/formatted_database/data.json"
def save_formatted_data(question_bank: QuestionBank) -> None:
    """ Save the question bank to the formatted database.

    The target is a LocalJsonDB built from FORMATTED_DB_FILE_PATH and
    FORMATTED_IMG_DIR.
    """
    LocalJsonDB(FORMATTED_DB_FILE_PATH, FORMATTED_IMG_DIR).save(question_bank)

In [None]:
# Persist the formatted question bank built above.
save_formatted_data(qb)

## C. Question Bank to Batch Request File

Turn the question bank into a JSONL file that can be submitted as a batch request in the OpenAI-compatible format.

In [None]:
import datetime
import logging
from logging import Logger

from label_generator.batch_request_factory import BatchRequestFactory

In [None]:
def load_prompt() -> str:
    """ Read the prompt file at PROMPT_FILE_PATH (UTF-8) and return its full text. """
    with open(PROMPT_FILE_PATH, mode='r', encoding='utf-8') as prompt_file:
        return prompt_file.read()

In [None]:
def make_logger(logging_directory: str, verbose: bool=False, debug: bool=False) -> Logger:
    """ Build a logger named ``batch_request_<timestamp>`` that logs to a file in ``logging_directory``.

    The logger level is DEBUG when ``debug`` is true and INFO otherwise.
    ``verbose`` and ``debug`` are forwarded to the handler setup.
    """
    log_path, stamp = _make_logger_name(logging_directory)
    batch_logger = logging.getLogger(f"batch_request_{stamp}")
    batch_logger.setLevel(logging.DEBUG if debug else logging.INFO)
    _add_handlers(log_path, batch_logger, verbose, debug)
    return batch_logger

def _make_logger_name(logging_directory):
    """ Return ``(log_filename, timestamp)`` for a new batch-request log.

    The timestamp is the current time formatted as YYYYmmdd_HHMM, and the
    file name is ``batch_request_<timestamp>.log`` inside ``logging_directory``.
    """
    now = datetime.datetime.now()
    stamp = now.strftime("%Y%m%d_%H%M")
    file_name = f"batch_request_{stamp}.log"
    return os.path.join(logging_directory, file_name), stamp

def _add_handlers(log_filename, logger, verbose, debug):
    """ Replace the logger's handlers with a fresh file handler and, optionally, a console handler.

    Args:
        log_filename: path of the log file for the new file handler.
        logger: logger whose existing handlers are removed and replaced.
        verbose: when true, a console handler using the file handler's
            formatter is attached as well.
        debug: forwarded to the file-handler setup.
    """
    # Iterate over a copy because removeHandler mutates logger.handlers.
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)
        # Detaching a handler does not close it; close it explicitly so that
        # re-running the cell does not leak the previous log file handle.
        handler.close()
    formatter = _add_file_handler(log_filename, logger, debug)
    if verbose:
        _add_console_handler(formatter, logger)

def _add_file_handler(log_filename, logger, debug):
    """ Attach a file handler writing to ``log_filename`` and return its formatter.

    The parent directory of ``log_filename`` is created when it is missing,
    because ``logging.FileHandler`` opens the file immediately and would
    otherwise raise FileNotFoundError.

    Args:
        log_filename: path of the log file.
        logger: logger that receives the new handler.
        debug: when true the handler level is DEBUG, otherwise INFO.

    Returns:
        The ``logging.Formatter`` installed on the file handler.
    """
    log_directory = os.path.dirname(log_filename)
    if log_directory:  # empty when the log file lives in the current directory
        os.makedirs(log_directory, exist_ok=True)
    file_handler = logging.FileHandler(log_filename)
    file_handler.setLevel(logging.DEBUG if debug else logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    return formatter

def _add_console_handler(formatter, logger):
    """ Attach a DEBUG-level stream handler that uses ``formatter`` to ``logger``. """
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(formatter)
    stream_handler.setLevel(logging.DEBUG)
    logger.addHandler(stream_handler)

Specify the logging directory, the prompt file, the model name, and the request URL.

In [None]:
LOGGING_DIRECTORY = "my_logs"  # directory handed to make_logger for the batch-request log file
PROMPT_FILE_PATH = "data_storage/prompt_file/prompt.txt"  # read by load_prompt()
MODEL_NAME = "qwen-vl-max"  # model name handed to BatchRequestFactory
# Passed to BatchRequestFactory as `url` and to client.batches.create as `endpoint`.
# NOTE(review): OpenAI-style batch APIs document this endpoint with a leading
# slash ("/v1/chat/completions") — confirm the slash-less form is accepted.
REQUEST_URL = "v1/chat/completions"

In [None]:
%%time
# Build the batch request from the formatted question bank, the prompt text and
# the model/URL settings. The logger writes debug-level output to a file under
# LOGGING_DIRECTORY; verbose=False means no console handler is attached.
batch_maker = BatchRequestFactory(
    question_bank=qb,
    prompt=load_prompt(),
    url=REQUEST_URL,
    model_name=MODEL_NAME,
    logger=make_logger(LOGGING_DIRECTORY, verbose=False, debug=True))
batch_request = batch_maker.make_batch_request()

In [None]:
# JSONL file the batch request is written to and that is later uploaded as the batch input.
REQUEST_FILE_PATH = "data_storage/batch_request_file/tagging_request.jsonl"

In [None]:
def clear_request_file(file_path: "str | None" = None) -> None:
    """ Truncate the batch request file so that it contains no request lines.

    A JSONL request file must hold one request object per line, so an
    empty file is the cleared state. Writing a placeholder such as ``{}``
    would leave an invalid request line without a trailing newline.

    Args:
        file_path: file to clear; defaults to REQUEST_FILE_PATH when omitted.
    """
    target_path = REQUEST_FILE_PATH if file_path is None else file_path
    # Opening in 'w' mode truncates (or creates) the file; nothing is written.
    with open(target_path, 'w', encoding='utf-8'):
        pass

In [None]:
def count_lines_in_file(file_path: str) -> int:
    """ Return how many lines the UTF-8 text file at ``file_path`` contains. """
    line_total = 0
    with open(file_path, mode='r', encoding='utf-8') as text_file:
        for _line in text_file:
            line_total += 1
    return line_total

In [None]:
# Write the batch request to the JSONL file, then report how many lines the file contains.
batch_request.to_jsonl_file(REQUEST_FILE_PATH)
print(f"Number of lines in the request file: {count_lines_in_file(REQUEST_FILE_PATH)}")

# 2. Generate the Labels

In [None]:
from pathlib import Path
from openai import OpenAI

In [None]:
# OpenAI SDK client pointed at DashScope's OpenAI-compatible endpoint.
# The API key is read from the DASHSCOPE_API_KEY environment variable
# (os.getenv returns None when the variable is not set).
client = OpenAI(
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)

## a) Upload batch file

In [None]:
# REQUEST_TEST_FILE_PATH = "data_storage/batch_request_file/tagging_request_test.jsonl"

In [None]:
%%time
# Upload the request JSONL file with purpose "batch"; the returned object's id is used to create the batch job.
file_object = client.files.create(file=Path(REQUEST_FILE_PATH), purpose="batch")

In [None]:
# Show the uploaded file object serialized as JSON.
print(file_object.model_dump_json())

## b) Create batch job

In [None]:
# Metadata attached to the batch job: a dataset name and a description of the
# tagging task (generate "tags" and "keywords" for each exam question).
REQUEST_METADATA = {
    'ds_name': "科目一标签生成",
    'ds_description': (
        '为驾考科目一题目自动生成 "tags" 和 "keywords"。 '
        '其中"tags" 需要深入理解问题的测试内容，代表问题的知识点与考点。'
        '“keywords”需要提取问题中明确或隐含的关键词， 用来检索问题内容。'
    ),
}

In [None]:
%%time
# The uploaded file's id serves as the batch job's input file id.
request_id = file_object.id
# Create the batch job against REQUEST_URL with a 24h completion window and the metadata above.
batch = client.batches.create(
    input_file_id=request_id,
    endpoint=REQUEST_URL,
    completion_window="24h",
    metadata=REQUEST_METADATA
)
print(batch)

Poll the status of the batch job every `WAIT_TIME` seconds (30 minutes) until it is no longer in progress.

In [None]:
from time import sleep
sleep(30) # Pause for 30 seconds before the first status check in the polling cell below

In [None]:
WAIT_TIME = 1800 # Seconds between status polls (half an hour)
# Statuses for which the polling loop keeps waiting.
IN_PROGRESS_STATUS_CODES = ["validating", "in_progress", "finalizing", "cancelling"]
# Statuses that the error-handling cell treats as a failed job.
ERROR_STATUS_CODES = ["failed", "expired", "cancelled"]

In [None]:
# Fetch the job status, and keep re-fetching every WAIT_TIME seconds for as
# long as the job is still in one of the in-progress states.
while True:
    batch_status = client.batches.retrieve(batch.id)
    if batch_status.status not in IN_PROGRESS_STATUS_CODES:
        break
    print(f"Batch job status: {batch_status.status}")
    sleep(WAIT_TIME)
# batch_status now holds the last retrieved (no longer in-progress) state.
print(f"Final batch job status:\n"
      f"{batch_status}")

## c) Error handling

In [None]:
ERROR_FILE_PATH = "data_storage/tagging_results/error.jsonl"
if batch_status.status in ERROR_STATUS_CODES:
    # A job that ends in an error status does not necessarily come with an
    # error file, so report the status and the batch-level errors first.
    print(f"Batch job ended with status '{batch_status.status}': {batch_status.errors}")
# Download the error file whenever one exists: this skips the call when
# error_file_id is None (which would raise), and also saves per-request
# failures of a job that otherwise completed.
if batch_status.error_file_id:
    content = client.files.content(batch_status.error_file_id)
    content.write_to_file(ERROR_FILE_PATH)
    print(f"完整的请求失败信息已保存至本地错误文件: {ERROR_FILE_PATH}")

## d) Retrieve result

In [None]:
# Local JSONL file that the batch output is written to.
RESULT_OUTPUT_PATH = "data_storage/tagging_results/result.jsonl"

In [None]:
# Fetch the content of the batch job's output file.
# NOTE(review): output_file_id is presumably None when the job ended without
# producing output (e.g. a failed job) — confirm before running this cell.
output_file_id = batch_status.output_file_id
output_file = client.files.content(file_id=output_file_id)

In [None]:
# Save the downloaded batch output to RESULT_OUTPUT_PATH.
output_file.write_to_file(RESULT_OUTPUT_PATH)