# Drive Test Tag Generation
Generate tags for the written portion of the Chinese driving exam using a process similar to BERTopic.

## 1. Load Data
Loading data from a local database into a question bank class.

In [1]:
from src.qb.question_bank import QuestionBank
from data_storage.database.json_database import LocalJsonDB

In [2]:
# Open the local JSON database (constructed from the data file path and the
# image directory path) and load it into a QuestionBank.
db = LocalJsonDB("data_storage/database/json_db/data.json",
                 "data_storage/database/json_db/images")
qb : QuestionBank = db.load()
# Sanity check: report how many questions were loaded.
print(qb.question_count())

2836


## 2. Format Data
Although the Siglip2 model can handle images of different sizes, we still resize every image to the same size (256×256) to avoid unnecessary complications.

In [3]:
from data_cleaning.img_reshaper import ImgSquarer

In [4]:
# Directory that receives the resized images.
IMG_DIR_256 = "data_cleaning/resized_imgs/img256"
# Squarer constructed with 256; used below to produce the 256x256 images.
squarer_256 = ImgSquarer(256)

In [5]:
def resize_images(qb: QuestionBank, squarer: ImgSquarer, new_dir: str) -> None:
    """
    Reshape the image of every question that has one and repoint the question at it.

    Questions whose image path is None are left untouched. For every other
    question the image path is replaced with the value returned by
    squarer.reshape for that question id.

    Args:
        qb: Question bank whose questions are updated in place.
        squarer: Object whose reshape(qid, source_dir, target_dir) is called per image.
        new_dir: Target directory passed through to squarer.reshape.
    """
    for chapter in qb.get_all_chapter_num():
        for question_id in qb.get_qids_by_chapter(chapter):
            current = qb.get_question(question_id)
            if current.get_img_path() is None:
                continue
            reshaped_path = squarer.reshape(question_id, qb.get_img_dir(), new_dir)
            current.set_img_path(reshaped_path)

In [6]:
import os

In [7]:
# Resize only when the target directory is missing or empty.
# os.listdir raises FileNotFoundError for a missing directory, so check for
# existence first and create the directory on a fresh checkout.
# NOTE(review): when resizing is skipped, the questions in `qb` keep their
# original image paths (set_img_path is only called inside resize_images) --
# confirm that later steps do not rely on the resized paths.
if not os.path.isdir(IMG_DIR_256) or not os.listdir(IMG_DIR_256):
    os.makedirs(IMG_DIR_256, exist_ok=True)
    print("Resizing images to 256x256...")
    resize_images(qb, squarer_256, IMG_DIR_256)
else:
    print("Images already resized to 256x256, skipping...")

Images already resized to 256x256, skipping...


## 3. Create Multimodal Embeddings
Create multimodal embeddings for the questions using a Siglip2 model.

In [8]:
# Library Imports
from transformers import AutoModel, AutoProcessor

# Local Imports
from embedder.siglip2_qb_embedder import Siglip2QBEmbedder

### a) Load/Download the Siglip2 Model
We will be using "google/siglip2-base-patch16-256" for this task.

In [9]:
import torch

In [10]:
# Hugging Face checkpoint name shared by the model and its processor.
MODEL_NAME = "google/siglip2-base-patch16-256"
# Load the model in float32 with the "sdpa" attention implementation.
model = AutoModel.from_pretrained(MODEL_NAME, torch_dtype=torch.float32, attn_implementation="sdpa")
# Load the matching processor, requesting the fast implementation.
processor = AutoProcessor.from_pretrained(MODEL_NAME, use_fast=True)

### b) Create embeddings

#### i) Define a logger

In [11]:
import logging
from logging import Logger
from datetime import datetime

LOGGING_PATH = "logs"

def get_logger(name: str) -> Logger:
    """
    Return an INFO-level logger that writes to a timestamped file under LOGGING_PATH.

    The log directory is created when it is missing. The file handler is
    attached only the first time a given name is requested: later calls (for
    example when this notebook cell is re-run) return the same logger unchanged
    instead of stacking another handler, which would write every record
    several times.

    Args:
        name: Logger name; also used as the prefix of the log file name.

    Returns:
        The configured logger.
    """
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)

    # logging.getLogger returns the same object for the same name, so an
    # earlier call may already have attached the file handler.
    if any(isinstance(handler, logging.FileHandler) for handler in logger.handlers):
        return logger

    # FileHandler does not create missing parent directories.
    os.makedirs(LOGGING_PATH, exist_ok=True)

    # One log file per logger name and start minute.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    file_handler = logging.FileHandler(
        os.path.join(LOGGING_PATH, f"{name}_{timestamp}.log")
    )
    file_handler.setFormatter(
        logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    )
    logger.addHandler(file_handler)

    return logger

embedder_logger = get_logger("embedder")

#### ii) Create the embedder

In [12]:
# Build the question-bank embedder from the loaded model, processor and file logger.
custom_embedder = Siglip2QBEmbedder(model, processor, embedder_logger)

#### iii) Generate embeddings

In [13]:
import numpy as np

In [14]:
def save_embeddings(embeddings, file_path):
    """
    Write a mapping of question id -> embedding to a NumPy .npz archive.

    Each id is converted with str() and used as the array name inside the archive.

    Args:
        embeddings: Mapping from question id to embedding array.
        file_path: Destination passed to np.savez.
    """
    arrays_by_name = {}
    for qid in embeddings:
        arrays_by_name[str(qid)] = embeddings[qid]
    np.savez(file_path, **arrays_by_name)

In [15]:
EMBEDDING_FILE_NAME = "data_storage/embedding_dir/siglip2_embeddings.npz"
# Skip regeneration when a saved archive already exists.
if os.path.exists(EMBEDDING_FILE_NAME):
    print(f"Embeddings already exist at {EMBEDDING_FILE_NAME}, skipping generation.")
else:
    # Create the output directory before encoding: np.savez does not create
    # missing directories, and failing at save time would discard the result.
    os.makedirs(os.path.dirname(EMBEDDING_FILE_NAME), exist_ok=True)

    print("Generating embeddings...")
    embeddings = custom_embedder.encode_qb(qb)

    print(f"Saving embeddings to {EMBEDDING_FILE_NAME}...")
    save_embeddings(embeddings, EMBEDDING_FILE_NAME)

Generating embeddings...
Saving embeddings to data_storage/embedding_dir/siglip2_embeddings.npz...


## 4. Dimension Reduction

### a) Load Embeddings

In [16]:
def load_embeddings(file_path):
    """
    Read every array from an .npz archive into a plain dict.

    Args:
        file_path: Path to an archive such as one written by np.savez.

    Returns:
        dict mapping each array name in the archive to its array.
    """
    # np.load on an .npz returns a lazy NpzFile that keeps the file open; the
    # context manager closes it once every array has been read into memory.
    with np.load(file_path) as archive:
        return {name: archive[name] for name in archive.files}

In [17]:
# Read the saved embeddings back from disk; keys are the array names stored in the archive.
id_to_embedding = load_embeddings(EMBEDDING_FILE_NAME)

In [18]:
from typing import List, Tuple

from embedder.siglip2_qb_embedder import format_question

In [19]:
def format_for_clustering(id_to_embedding: dict, qb: QuestionBank) -> Tuple[List[str], List[str], np.ndarray]:
    """
    Build aligned question ids, documents and embeddings for clustering.

    Question ids are gathered chapter by chapter and then sorted, so index i of
    each returned sequence refers to the same question.

    Args:
        id_to_embedding: Mapping from question id to its embedding.
        qb: Question bank supplying the questions and chapter descriptions.

    Returns:
        (qid_lst, documents, embedding_array): the sorted question ids, the
        format_question text for each id, and the embeddings combined with
        np.array in the same order.

    Raises:
        KeyError: If a question id has no entry in id_to_embedding.
    """
    qid_lst: List[str] = []
    qid_to_chapter = {}
    for chapter_id in qb.get_all_chapter_num():
        for qid in qb.get_qids_by_chapter(chapter_id):
            qid_lst.append(qid)
            qid_to_chapter[qid] = chapter_id
    # Sort so the output order does not depend on chapter iteration order.
    qid_lst.sort()

    documents: List[str] = [
        format_question(qb.get_question(qid), qb.describe_chapter(qid_to_chapter[qid]))
        for qid in qid_lst
    ]
    embedding_array = np.array([id_to_embedding[qid] for qid in qid_lst])
    return qid_lst, documents, embedding_array

In [20]:
# Build the aligned ids / documents / embedding matrix and report their sizes.
qid_lst, documents, embeds = format_for_clustering(id_to_embedding, qb)
print(f"Number of questions: {len(qid_lst)} "
      f"Embedding shape: {embeds.shape}")

Number of questions: 2836 Embedding shape: (2836, 768)


In [21]:
documents[:2]  # Display the first two documents to verify the formatting

['章节:交通信号 题目:这个标志是何含义？ 答案:禁止直行和向右转弯', '章节:交通信号 题目:这个标志是何含义？ 答案:十字交叉路口预告']

### b) Set up dimension reduction model
We will be using UMAP for dimension reduction.

In [22]:
from umap import UMAP
def dim_reduction(dimensions: int, n_neighbors: int, embeddings: np.ndarray, random_state=None) -> np.ndarray:
    """
    Project embeddings into a lower-dimensional space with UMAP (cosine metric, min_dist=0.0).

    Args:
        dimensions: Number of output dimensions (UMAP n_components).
        n_neighbors: UMAP n_neighbors.
        embeddings: Array to reduce.
        random_state: Optional seed forwarded to UMAP. The default (None) keeps
            the original unseeded behaviour; pass an int to make the
            projection reproducible.

    Returns:
        The array returned by UMAP.fit_transform.
    """
    umap_model = UMAP(
        n_components=dimensions,
        metric='cosine',
        n_neighbors=n_neighbors,
        min_dist=0.0,
        random_state=random_state,
    )
    return umap_model.fit_transform(embeddings)

In [23]:
def make_reduced_embeddings(dimension, n_neighbors, embeddings):
    """
    Return the reduced embeddings for (dimension, n_neighbors), cached on disk.

    On a cache hit the stored array is returned and `embeddings` is not used.
    On a miss the reduction is computed with dim_reduction and saved.

    NOTE: the cache file name depends only on n_neighbors and dimension, so a
    file computed from a different embedding matrix is reused as-is.

    Args:
        dimension: Target dimensionality.
        n_neighbors: UMAP n_neighbors.
        embeddings: Array to reduce when no cached result exists.

    Returns:
        The reduced embedding array.
    """
    path = f"data_storage/embedding_dir/rdc_embeds_n{n_neighbors}d{dimension}.npz"
    if os.path.exists(path):
        # np.load returns an NpzFile that keeps the archive open; close it
        # as soon as the array has been read.
        with np.load(path) as cached:
            return cached['embeddings']

    # np.savez does not create missing directories; make sure the target
    # exists before spending time on the reduction.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    rdc_embeds = dim_reduction(dimension, n_neighbors, embeddings)
    np.savez(path, embeddings=rdc_embeds)
    return rdc_embeds

## 5. Clustering


Generate a small representative sample of the question bank by clustering the questions and selecting representative questions from each cluster.

### a) Set up clustering model

In [24]:
from sklearn.cluster import HDBSCAN
def cluster_embeddings(embeddings: np.ndarray) -> np.ndarray:
    """
    Assign a cluster label to every row of `embeddings` with HDBSCAN.

    Args:
        embeddings: Array with shape (n_samples, n_features).

    Returns:
        The label array produced by HDBSCAN.fit_predict, one entry per sample.
    """
    hdbscan_options = dict(
        min_cluster_size=2,
        min_samples=2,
        metric='cosine',
        cluster_selection_method="leaf",
        allow_single_cluster=True,
    )
    model = HDBSCAN(**hdbscan_options)
    labels = model.fit_predict(embeddings)
    return labels

### b) Tune Hyperparameters
Find the optimal dimension-reduction hyperparameters (`n_neighbors` and the number of output dimensions).
#### i) Set up experiments trying a range of hyperparameters

In [25]:
from pandas import DataFrame

In [26]:
def make_topics(dimension: int, n_neighbors: int, qid_lst: List[str], docs: List[str], embeddings: np.ndarray) -> DataFrame:
    """
    Reduce and cluster `embeddings` for one (dimension, n_neighbors) setting.

    Returns:
        DataFrame with columns "topic" (cluster label), "qid" and "question",
        built from the label array, `qid_lst` and `docs` respectively.
    """
    reduced = make_reduced_embeddings(dimension, n_neighbors, embeddings)
    labels = cluster_embeddings(reduced)
    columns = {"topic": labels, "qid": qid_lst, "question": docs}
    return DataFrame(columns)

In [27]:
from typing import Tuple
from ipywidgets import IntProgress, Label, HBox
from IPython.display import display

In [28]:
def dimred_experiment(qid_lst: List[str], questions: List[str], embeddings: np.ndarray, n_d_pairs: List[Tuple[int, int]]) -> DataFrame:
    """
    Cluster the embeddings once per (n_neighbors, dimension) pair.

    The clustering method is fixed; only the dimension-reduction
    hyperparameters vary, so the label columns can be compared (for example by
    outlier count). Progress is shown with an ipywidgets progress bar.

    Args:
        qid_lst: Question ids, stored in the "qid" column.
        questions: Question texts, stored in the "question" column.
        embeddings: Embedding matrix handed to make_reduced_embeddings.
        n_d_pairs: (n_neighbors, dimension) pairs to evaluate, in order.

    Returns:
        DataFrame with "qid", "question" and one label column per pair, named
        "n{n_neighbors}d{dimension}".
    """
    total = len(n_d_pairs)
    progress_bar = IntProgress(min=0, max=total, description='Progress:')
    progress_label = Label(f'0/{total} complete')
    display(HBox([progress_bar, progress_label]))

    result_dict = {
        "qid": qid_lst,
        "question": questions
    }
    # `done` is the number of pairs already finished when this one starts.
    for done, (n_neighbors, dimension) in enumerate(n_d_pairs):
        progress_label.value = f"{done}/{total} complete. Calculating n = {n_neighbors}, d = {dimension}..."

        reduced = make_reduced_embeddings(dimension, n_neighbors, embeddings)
        result_dict[f"n{n_neighbors}d{dimension}"] = cluster_embeddings(reduced)

        progress_bar.value += 1
    progress_label.value = "Experiment complete!"
    return DataFrame(result_dict)

In [29]:
import pandas as pd
import warnings

In [30]:
DIMRED_EXPR1_PATH = "data_storage/experiments/dimension_reduction_experiment1.csv"
if not os.path.exists(DIMRED_EXPR1_PATH):
    # Create the results directory before the run: to_csv does not create
    # missing directories, and failing at save time would discard the results.
    os.makedirs(os.path.dirname(DIMRED_EXPR1_PATH), exist_ok=True)

    # Warnings are silenced for the whole experiment, including the save.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        # Coarse grid: n_neighbors and dimensions both run from 2 in steps of 50.
        n_d_pairs = [(n, d) for n in range(2, 767, 50) for d in range(2, 767, 50)]
        results = dimred_experiment(qid_lst=qid_lst,
                                    questions=documents,
                                    embeddings=embeds,
                                    n_d_pairs=n_d_pairs)
        results.to_csv(DIMRED_EXPR1_PATH, index=False)
        dimred_exp1_results = results
else:
    print(f"Results already exist at {DIMRED_EXPR1_PATH}. Loading results from file...")
    dimred_exp1_results = pd.read_csv(DIMRED_EXPR1_PATH)

HBox(children=(IntProgress(value=0, description='Progress:', max=256), Label(value='0/256 complete')))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [31]:
# Start the combined table from experiment 1: "qid", "question" and one label
# column per (n_neighbors, dimension) pair.
combined_results = dimred_exp1_results
print(combined_results.shape)

(2836, 258)


#### ii) Analyze Results

Extract key metrics from the data into a dataframe.

In [32]:
def extract_n_d(col_name: str) -> Tuple[int, int]:
    """
    Parse a result column name of the form "n<n_neighbors>d<dimensions>".

    Args:
        col_name: Column name such as "n52d102".

    Returns:
        (n_neighbors, dimensions) as integers.

    Raises:
        ValueError: If col_name does not have exactly that form.
    """
    import re  # local import: `re` is not imported in the visible notebook cells

    # fullmatch rejects names with extra parts (e.g. "n5d3d7") that a plain
    # split on "d" would silently truncate.
    match = re.fullmatch(r"n(\d+)d(\d+)", col_name)
    if match is None:
        raise ValueError(f"Column name {col_name!r} is not of the form 'n<int>d<int>'")
    return int(match.group(1)), int(match.group(2))

In [33]:
def analyze_dimred_data(data: DataFrame) -> DataFrame:
    """
    Summarise every "n<n_neighbors>d<dimensions>" label column of an experiment table.

    Columns whose name does not match that pattern exactly (such as "qid" and
    "question") are ignored.

    Args:
        data: Experiment results with one cluster-label column per
            hyperparameter pair; the label -1 marks an outlier.

    Returns:
        DataFrame with one row per label column and the columns
        "n_neighbors", "dimensions", "num_clusters" (distinct labels excluding
        the outlier label -1) and "num_outliers" (rows labelled -1).
    """
    import re  # local import: `re` is not imported in the visible notebook cells

    column_pattern = re.compile(r"n(\d+)d(\d+)")
    records = []
    for col in data.columns:
        match = column_pattern.fullmatch(str(col))
        if match is None:
            continue
        labels = data[col]
        is_outlier = labels == -1
        records.append({
            "n_neighbors": int(match.group(1)),
            "dimensions": int(match.group(2)),
            # -1 is the noise label, not a cluster, so it is excluded here.
            "num_clusters": labels[~is_outlier].nunique(),
            "num_outliers": is_outlier.sum(),
        })
    return DataFrame(
        records,
        columns=["n_neighbors", "dimensions", "num_clusters", "num_outliers"],
    )

In [34]:
import plotly.express as px

In [37]:
def interactive_plot3d(df, z_metric, title=None, size=(1600, 800)):
    """
    Build an interactive Plotly 3D scatter of a metric over the hyperparameter grid.

    Parameters:
    -----------
    df : DataFrame
        Must contain 'n_neighbors', 'dimensions' and the z_metric column.
    z_metric : str
        Column plotted on the z-axis and used for the marker colour.
    title : str, optional
        Plot title; a default mentioning z_metric is used when None.
    size : tuple, optional
        (width, height) of the figure in pixels.

    Returns:
    --------
    The Plotly figure (not yet shown).
    """
    plot_title = title if title is not None else f'3D Plot of {z_metric} by n_neighbors and dimensions'

    # Colour scale spans exactly the observed range of the metric.
    z_values = df[z_metric]
    colour_range = (z_values.min(), z_values.max())

    fig = px.scatter_3d(
        df,
        x='n_neighbors',
        y='dimensions',
        z=z_metric,
        color=z_metric,
        color_continuous_scale='Viridis',
        range_color=colour_range,
        title=plot_title,
    )
    fig.update_traces(marker={"size": 3})

    axis_titles = {
        "xaxis_title": 'n_neighbors',
        "yaxis_title": 'dimensions',
        "zaxis_title": z_metric,
    }
    colorbar_options = {"title": z_metric, "len": 0.75}
    fig.update_layout(
        width=size[0],
        height=size[1],
        scene=axis_titles,
        coloraxis_colorbar=colorbar_options,
    )
    return fig

Examine number of clusters

In [38]:
# Summarise each hyperparameter column, then plot the cluster count over the grid.
metrics = analyze_dimred_data(combined_results)
fig = interactive_plot3d(metrics, z_metric='num_clusters')
fig.show()

Examine number of outliers

In [39]:
# Same grid, coloured by the number of points labelled -1 (outliers).
fig = interactive_plot3d(metrics, z_metric='num_outliers')
fig.show()

#### iii) Refine Experiment Based on Results
##### Experiment 2

In [None]:
def get_nd_experiment2() -> List[Tuple[int, int]]:
    """
    Return the (n_neighbors, dimensions) pairs for the second experiment.

    The grid refines the first experiment: n_neighbors runs from 2 to 752 in
    steps of 25 and dimensions from 2 to 10. Pairs are unique by construction
    and are returned in ascending (n_neighbors, dimensions) order, so the
    result columns have a stable, readable order.
    """
    return [(n, d) for n in range(2, 767, 25) for d in range(2, 11)]

In [None]:
def run_experiment(qid_lst: List[str], documents: List[str], embeds: np.ndarray, n_d_pairs: List[Tuple[int, int]], path: str) -> DataFrame:
    """
    Run a dimension-reduction experiment, or load its saved results.

    If `path` exists, the CSV is read and returned without running anything.
    Otherwise dimred_experiment is run with warnings suppressed and its
    result is written to `path`.

    Args:
        qid_lst: Question ids.
        documents: Question texts.
        embeds: Embedding matrix.
        n_d_pairs: (n_neighbors, dimension) pairs to evaluate.
        path: CSV file used as the results cache.

    Returns:
        The experiment results (freshly computed, or as parsed from the CSV).
    """
    if os.path.exists(path):
        return pd.read_csv(path)

    # Create the output directory before the run: to_csv does not create
    # missing directories, and failing at save time would discard the results.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Suppress all warnings raised while the experiment runs and is saved.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        results = dimred_experiment(qid_lst=qid_lst, questions=documents, embeddings=embeds, n_d_pairs=n_d_pairs)
        results.to_csv(path, index=False)
    return results

In [None]:
# CSV cache for the results of the second (refined) experiment.
DIMRED_EXPR2_PATH = "data_storage/experiments/dimension_reduction_experiment2.csv"

In [None]:
# Run experiment 2 on its refined grid, or load it from DIMRED_EXPR2_PATH if already saved.
dimred_exp2_results = run_experiment(qid_lst=qid_lst, documents=documents, embeds=embeds, n_d_pairs=get_nd_experiment2(), path=DIMRED_EXPR2_PATH)

In [None]:
# Merge both experiments column-wise.
# - Build from dimred_exp1_results (not combined_results) so re-running this
#   cell gives the same table instead of stacking experiment 2 again.
# - Keep only the experiment 1 columns that experiment 2 does not provide.
#   That removes "qid" and "question" (the id column is "qid", not "id") and
#   any hyperparameter column present in both grids (e.g. "n2d2"), which
#   would otherwise appear twice and break column lookups.
exp1_only_columns = [
    col for col in dimred_exp1_results.columns
    if col not in dimred_exp2_results.columns
]
combined_results = pd.concat(
    [dimred_exp1_results[exp1_only_columns], dimred_exp2_results],
    axis=1,
)
combined_results.shape

In [None]:
# Recompute the per-column metrics over the combined results of both experiments.
metrics = analyze_dimred_data(combined_results)

In [None]:
# Cluster count over the combined hyperparameter grid.
fig = interactive_plot3d(metrics, z_metric='num_clusters')
fig.show()

In [None]:
# Outlier count over the combined hyperparameter grid.
fig = interactive_plot3d(metrics, z_metric='num_outliers')
fig.show()

Inspect selected hyperparameter combinations manually to judge the quality of their clusters.

In [None]:
def inspect_topic(data: DataFrame, n_neighbors: int, dimensions: int, sort_df=False, ascending=False) -> DataFrame:
    """
    Return the cluster assignment for one (n_neighbors, dimensions) setting.

    Args:
        data: Experiment results containing "qid", "question" and the label
            column "n{n_neighbors}d{dimensions}".
        n_neighbors: n_neighbors value of the column to inspect.
        dimensions: dimensions value of the column to inspect.
        sort_df: When True, sort the rows by topic.
        ascending: Sort direction used when sort_df is True.

    Returns:
        DataFrame with columns "qid", "question" and "topic".

    Raises:
        ValueError: If no column exists for the requested combination.
    """
    col_name = f"n{n_neighbors}d{dimensions}"
    if col_name not in data.columns:
        raise ValueError(f"No results found for n_neighbors={n_neighbors} and dimensions={dimensions}")

    selected = data[["qid", "question", col_name]].rename(columns={col_name: "topic"})
    if not sort_df:
        return selected
    return selected.sort_values(by="topic", ascending=ascending)