# Drive Test Tag Generation With BERTopic
Generate tags for the written portion of the Chinese driving exam using BERTopic.

## 1. Load Data
Load data from the local database into a pandas DataFrame.

### a) Load data into question bank class

In [None]:
from qb.question import Question
from qb.question_bank import QuestionBank
from data_storage.database.json_database import LocalJsonDB

# Open the local JSON database (a data file path plus an images directory path)
# and load it into a QuestionBank.
db = LocalJsonDB(
    "data_storage/database/json_db/data.json",
    "data_storage/database/json_db/images",
)
qb: QuestionBank = db.load()
# Sanity check: report how many questions were loaded.
print(qb.question_count())

### b) Fill in questions without images with a blank image

In [None]:
from PIL import Image

def make_blank_img(path: str) -> None:
    """Write a 10x10 solid-white RGB placeholder image to ``path``.

    Args:
        path: Destination file path handed to the image's ``save`` method.
    """
    Image.new('RGB', (10, 10), color='white').save(path)

In [None]:
def get_blank_img_path() -> str:
    """Return the fixed path of the blank placeholder image.

    The path is a constant; calling this function does not create the file.
    """
    return "data_storage/database/json_db/images/00blank.webp"
# Write the blank placeholder image to its fixed path.
make_blank_img(get_blank_img_path())

### c) Convert question bank to pandas dataframe

In [None]:
import pandas as pd
from pandas import DataFrame
def qb_to_df(qb: QuestionBank) -> DataFrame:
    """Flatten a question bank into a DataFrame with one row per question.

    Chapters are visited in the order returned by ``qb.get_all_chapter_num()``
    and, within a chapter, questions follow the order returned by
    ``qb.get_qids_by_chapter()``.

    Args:
        qb: Question bank to convert.

    Returns:
        A DataFrame with the columns ``ID``, ``Question``, ``Answer Choices``
        (the answers joined with ``", "``), ``Answer`` (the correct answer),
        ``Chapter`` (the chapter description) and ``Image Path``. Questions
        whose image path is falsy get the blank placeholder path instead.
    """
    data = {
        "ID": [],
        "Question": [],
        "Answer Choices": [],
        "Answer": [],
        "Chapter": [],
        "Image Path": [],
    }
    for chapter_id in qb.get_all_chapter_num():
        chapter_description = qb.describe_chapter(chapter_id)
        for qid in qb.get_qids_by_chapter(chapter_id):
            question: Question = qb.get_question(qid)
            data["ID"].append(qid)
            data["Question"].append(question.get_question())
            data["Answer Choices"].append(", ".join(question.get_answers()))
            data["Answer"].append(question.get_correct_answer())
            data["Chapter"].append(chapter_description)
            # Look the image path up once; fall back to the placeholder when
            # the question has none.
            data["Image Path"].append(question.get_img_path() or get_blank_img_path())
    return pd.DataFrame(data)
# Build the DataFrame view of the loaded question bank.
question_bank = qb_to_df(qb)

In [None]:
# Sanity check: print the DataFrame's shape, then preview its first rows.
print(question_bank.shape)
question_bank.head()

## 2. Format Data
Convert Question Bank to a form suitable for BERTopic

In [None]:
from typing import List, Optional, Tuple
def make_docs_images(question_bank: DataFrame) -> Tuple[List[str], List[Optional[str]]]:
    """Build the parallel document and image lists for topic modelling.

    Each row of ``question_bank`` yields one text document that combines the
    chapter, question, answer choices and answer, and one image entry.

    Args:
        question_bank: DataFrame with the columns ``Question``,
            ``Answer Choices``, ``Answer``, ``Chapter`` and ``Image Path``.

    Returns:
        A ``(docs, images)`` pair of equal-length lists in row order.
        ``images[i]`` is the row's image path, or ``None`` when that value is
        falsy (e.g. an empty string).
    """
    # Iterate the columns positionally rather than via ``.loc[label, column]``:
    # a label lookup returns a Series when index labels are duplicated.
    docs = [
        # Combine all parts into a single document
        f"章节: {chapter}\n 题目: {question}\n 选项: {answer_choices}\n 答案: {answer}"
        for question, answer_choices, answer, chapter in zip(
            question_bank["Question"],
            question_bank["Answer Choices"],
            question_bank["Answer"],
            question_bank["Chapter"],
        )
    ]
    images = [img_path if img_path else None for img_path in question_bank["Image Path"]]
    return docs, images
# Build the documents and matching image entries from the question bank DataFrame.
docs, images = make_docs_images(question_bank)

In [None]:
# Display the first 5 documents and images (fewer if less than 5 are available)
for doc, image in zip(docs[:5], images[:5]):
    print(doc, "\n")
    print(image, "\n")

## 3. Naive Processing

### a) Set up model
#### Set up the visual component

In [None]:
# Imports
from bertopic import BERTopic
from bertopic.representation import VisualRepresentation

In [None]:
# Set up the visual component: a VisualRepresentation created with its default arguments
visual_model = VisualRepresentation()

In [None]:
# Representation models keyed by aspect name; the only entry registers the
# visual model under "Visual_Aspect".
representation_model = {
    "Visual_Aspect": visual_model,
}

#### Set up the embedding model

In [None]:
# Name of the embedding model, given to BERTopic as a plain string.
embedding_model = "distiluse-base-multilingual-cased-v1"

In [None]:
# Put the model together: BERTopic receives the embedding model name and the
# representation-model mapping, with verbose output enabled.
topic_model = BERTopic(embedding_model=embedding_model,
                       representation_model=representation_model,
                       verbose=True)

### b) Fit the model

In [None]:
# Fit the topic model on the text documents, passing the per-document image
# entries through the `images` argument.
topic_model.fit(docs, images=images)

### c) Save the model

In [None]:
import os
from datetime import datetime

# Timestamp (local time, second resolution) that names this run's save directory.
time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# NOTE(review): the directory name contains a space and colons, which some
# filesystems (e.g. on Windows) reject; confirm the target platform.
model_save_path = f"data_storage/model_dir/{time}"
# exist_ok=True makes re-running the cell within the same second harmless.
os.makedirs(model_save_path, exist_ok=True)
print(model_save_path)

In [None]:
# Save the model into model_save_path using the "pytorch" serialization, with
# save_ctfidf enabled and the embedding model name passed as save_embedding_model.
topic_model.save(model_save_path, serialization="pytorch", save_ctfidf=True, save_embedding_model=embedding_model)

### d) Inspect the model

In [None]:
# Show the model's built-in topic visualization as the cell output.
topic_model.visualize_topics()

In [None]:
# View a sample of the topics and their representative documents
def view_topic_samples(topic_model, n_topics=5, n_docs_per_topic=5):
    """
    Print a sample of topics together with their representative documents.

    Topic ids ``0`` to ``n_topics - 1`` are shown. For each one the result of
    ``topic_model.get_topic`` is printed, followed by at most
    ``n_docs_per_topic`` of its representative documents. A topic for which
    the model returns no representative documents is printed without any.

    Args:
        topic_model: Object exposing ``get_topic(topic_id)`` and
            ``get_representative_docs(topic_id)``, such as the fitted model.
        n_topics: Number of topic ids to display, starting at 0.
        n_docs_per_topic: Maximum number of documents printed per topic.
    """
    for topic_id in range(n_topics):
        print(f"Topic {topic_id}:")
        print(f"Topics: {topic_model.get_topic(topic_id)}")
        # A topic id the model does not know may yield None instead of a list;
        # treat that as "no documents" rather than failing on the slice.
        representative_docs = topic_model.get_representative_docs(topic_id) or []
        # Slicing already clamps to the list length.
        for doc in representative_docs[:n_docs_per_topic]:
            print(f"- {doc}")
        print("\n")

# Show topics 0-4 of the trained model with up to 5 representative documents each
view_topic_samples(topic_model, n_topics=5, n_docs_per_topic=5)

In [None]:
# Inspect one specific topic (hard-coded id 36).
topic_model.get_topic(36)

In [None]:
# Representative documents of one specific topic (hard-coded id 36).
topic_model.get_representative_docs(36)

## 4. Multimodal Topic Modeling Through Multimodal Embeddings

### a) Create custom embeddings

To improve upon the quality of the tags, we need to first make sure we are taking both the image and the text into account when generating the embeddings. This can be done by using a model that embeds images and text into the same space. For questions that have images, we will embed both the text and the image and take the average.

In [1]:
# Load model directly
from transformers import AutoProcessor, AutoModelForZeroShotImageClassification

# Load the processor and the model from the same pretrained checkpoint
# ("google/siglip-base-patch16-256-multilingual"); both calls must use the same id.
processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-256-multilingual")
model = AutoModelForZeroShotImageClassification.from_pretrained("google/siglip-base-patch16-256-multilingual")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


IndentationError: expected an indented block after class definition on line 3 (3875396116.py, line 5)