In [19]:
from typing import Optional

import json
import numpy as np
import pandas as pd

from datasets import load_dataset


from tqdm import tqdm

In [20]:
# Load the "lmms-lab/Video-MME" dataset with `datasets.load_dataset`, convert
# its "test" split to a pandas DataFrame and preview the first rows.
video_mme = load_dataset("lmms-lab/Video-MME")
df = video_mme["test"].to_pandas()
df.head()

Unnamed: 0,video_id,duration,domain,sub_category,url,videoID,question_id,task_type,question,options,answer
0,1,short,Knowledge,Humanity & History,https://www.youtube.com/watch?v=fFjv93ACGo8,fFjv93ACGo8,001-1,Counting Problem,When demonstrating the Germany modern Christma...,"[A. Apples., B. Candles., C. Berries., D. The ...",C
1,1,short,Knowledge,Humanity & History,https://www.youtube.com/watch?v=fFjv93ACGo8,fFjv93ACGo8,001-2,Information Synopsis,What is the genre of this video?,[A. It is a news report that introduces the hi...,A
2,1,short,Knowledge,Humanity & History,https://www.youtube.com/watch?v=fFjv93ACGo8,fFjv93ACGo8,001-3,Counting Problem,How many red socks are above the fireplace at ...,"[A. 1., B. 4., C. 2., D. 3.]",D
3,2,short,Knowledge,Humanity & History,https://www.youtube.com/watch?v=N1cdUjctpG8,N1cdUjctpG8,002-1,Object Recognition,Which of the following features/items is not d...,"[A. Inkstone., B. Niche., C. Jade., D. Sacrifi...",C
4,2,short,Knowledge,Humanity & History,https://www.youtube.com/watch?v=N1cdUjctpG8,N1cdUjctpG8,002-2,Action Reasoning,Which of the following reasons motivated the a...,[A. Because it's from Ming Dynasty and of spec...,D


In [60]:
def parse_options(options_str):
    """Normalise multiple-choice options into a list of option strings.

    Supported inputs:

    * a ``list`` -- returned unchanged;
    * a bracketed or bare comma-separated string, e.g.
      ``"[A. Apples., B. Candles.]"``;
    * the ``str()`` form of a list, where every element is quoted, e.g.
      ``"['A. Apples.', 'B. Candles.']"``;
    * a multi-line string with one option per line.

    Options are recognised by a leading letter ``A``-``D`` (either case)
    followed by ``.`` or ``:``.

    Args:
        options_str: The options, as a list or as a string encoding of one.

    Returns:
        list: The individual option strings, without surrounding brackets,
        separating commas or enclosing quotes.

    Raises:
        ValueError: If ``options_str`` is neither a ``str`` nor a ``list``.
    """
    import re

    if isinstance(options_str, list):
        # Already a list
        return options_str

    if not isinstance(options_str, str):
        raise ValueError(f"Input must be a string or list, got {type(options_str)}")

    def _unquote(text):
        # Drop exactly one pair of *matching* enclosing quotes, so an option
        # that merely ends with a quotation mark is left untouched.
        if len(text) >= 2 and text[0] == text[-1] and text[0] in "\"'":
            return text[1:-1].strip()
        return text

    # Remove leading/trailing whitespace and brackets
    s = options_str.strip()
    if s.startswith("[") and s.endswith("]"):
        s = s[1:-1].strip()

    # Try to split by newlines if present
    if "\n" in s:
        # Remove empty lines and strip each line
        lines = [line.strip() for line in s.splitlines() if line.strip()]
        # Remove trailing commas
        lines = [line.rstrip(",") for line in lines]
        # Remove enclosing quotes if present
        lines = [line.strip('"').strip("'") for line in lines]
        # Remove empty lines again
        lines = [line for line in lines if line]
        # If each line looks like an option (starts with e.g. "A." or "A:"), return as is
        if all(re.match(r"^[A-Da-d][\.\:]", line) for line in lines):
            return lines
        # Otherwise, try to join and split by comma
        s = " ".join(lines)

    # Split only on commas that introduce a new option: a comma, optional
    # whitespace, an optional opening quote, then an option letter followed by
    # "." or ":". Allowing the quote is what makes `str(list)` input such as
    # "'A. x', 'B. y'" split correctly; commas inside an option's text are kept.
    parts = re.split(r",\s*(?=[\"']?[A-Da-d][.:])", s)
    # Trim whitespace / trailing commas, then peel off enclosing quotes
    parts = [_unquote(part.strip().rstrip(",").strip()) for part in parts if part.strip()]
    # Drop fragments that became empty after cleaning (e.g. a lone ",")
    return [part for part in parts if part]


def qa_row_to_sharegpt(
    row: pd.Series,
    system: str = None,
    tools: str = None,
    question_col: str = "question",
    answer_col: str = "answer",
    options_col: Optional[str] = "options",
    # option_choice_text = "Please select the correct answer from the following options:\n",
    option_choice_text = "Options:\n",
    post_prompt_text = "Answer with the option's letter from the given choices directly.",
    full_option_answer: bool = False,
    extras: Optional[dict] = None,  # eg {"idx": i}
) -> dict:
    """Turn one question/answer row into a ShareGPT-style entry.

    The returned dict always has a "conversations" list holding a "human"
    turn (the prompt) followed by a "gpt" turn (the answer). The keys
    "system", "tools" and "extras" are added, in that order, only for the
    arguments that are not None.

    When ``options_col`` is given, the prompt becomes the question, then
    ``option_choice_text``, then the options one per line, then
    ``post_prompt_text``. The options may be a list, a NumPy array, or a
    string (which is handed to ``parse_options``).

    Args:
        row: Mapping-like row indexed by column name.
        system: Optional system prompt stored under "system".
        tools: Optional tool description stored under "tools".
        question_col: Column holding the question text.
        answer_col: Column holding the answer.
        options_col: Column holding the options, or None for a free-form
            question without options.
        option_choice_text: Text placed before the option list.
        post_prompt_text: Text placed after the option list.
        full_option_answer: If True, answer with the first option that starts
            with the row's answer instead of the raw answer value.
        extras: Optional extra payload stored under "extras".

    Returns:
        dict: The ShareGPT entry.

    Raises:
        ValueError: If the options are None or empty, if
            ``full_option_answer`` is set without ``options_col``, or if no
            option starts with the answer while ``full_option_answer`` is set.
        AssertionError: If the options are not a list of strings (only the
            first element is checked).
    """
    prompt = row[question_col]
    reply = row[answer_col]

    if options_col is None:
        # Without options there is no full option text to answer with.
        if full_option_answer:
            raise ValueError("Options must be provided if full_option_answer is True")
    else:
        options = row[options_col]
        if options is None or len(options) == 0:
            raise ValueError(f"Options must be a non-empty list, got {options}")

        # Bring every supported container to a plain list of strings.
        if isinstance(options, str):
            options = parse_options(options)
        if isinstance(options, np.ndarray):
            options = options.tolist()
        assert isinstance(options, list), f"Options must be a list, got {type(options)}"
        assert isinstance(options[0], str), f"Options must be a list of strings, got {type(options[0])}"

        option_block = "\n".join(options)

        # First option whose text begins with the answer (e.g. "C" -> "C. Berries.").
        matching_option = None
        for candidate in options:
            if candidate.startswith(reply):
                matching_option = candidate
                break

        if full_option_answer:
            if matching_option is None:
                raise ValueError(f"No correct option found for answer {reply}")
            reply = matching_option

        prompt = f"{prompt}\n{option_choice_text}{option_block}\n{post_prompt_text}"

    entry = {
        "conversations": [
            {"from": "human", "value": prompt},
            {"from": "gpt", "value": reply},
        ]
    }
    # Optional fields are only emitted when the caller supplied them.
    for key, value in (("system", system), ("tools", tools), ("extras", extras)):
        if value is not None:
            entry[key] = value
    return entry

def bold(text: str):
    """Wrap ``text`` in the ANSI escape codes that switch bold on and back off."""
    return "\033[1m{}\033[0m".format(text)

def pretty_print_sharegpt(entry: dict):
    """
    Print a ShareGPT entry as a readable transcript.

    The optional "system", "tools" and "extras" fields are printed first (only
    those present in ``entry``), followed by a 100-character dashed rule and
    one line per conversation turn, prefixed with a bold tag: [Q] for "human",
    [A] for "gpt", [Function Call] and [Observation] for the tool turns.

    Raises:
        ValueError: If a turn's "from" value is not one of the four known roles.
    """
    header_fields = (
        ("system", "System: "),
        ("tools", "Tools: "),
        ("extras", "Extras: "),
    )
    role_tags = (
        ("human", "[Q]"),
        ("gpt", "[A]"),
        ("function_call", "[Function Call]"),
        ("observation", "[Observation]"),
    )

    for key, prefix in header_fields:
        if key in entry:
            print(f"{prefix}{entry[key]}")

    print("-" * 100)

    for turn in entry["conversations"]:
        # Linear lookup keeps the same equality checks, in the same order.
        tag = next((label for role, label in role_tags if turn["from"] == role), None)
        if tag is None:
            raise ValueError(f"Unknown from: {turn['from']}")
        print(f"{bold(tag)} {turn['value']}")


# Smoke-test the conversion on the first row of the Video-MME dataframe:
# show the raw JSON entry, then the human-readable rendering.
row = df.iloc[0]
entry = qa_row_to_sharegpt(row, extras={"idx": 0})
print(f"Entry: {json.dumps(entry, indent=2)}")
print("-" * 100)
pretty_print_sharegpt(entry)

Entry: {
  "conversations": [
    {
      "from": "human",
      "value": "When demonstrating the Germany modern Christmas tree is initially decorated with apples, candles and berries, which kind of the decoration has the largest number?\nOptions:\nA. Apples.\nB. Candles.\nC. Berries.\nD. The three kinds are of the same number.\nAnswer with the option's letter from the given choices directly."
    },
    {
      "from": "gpt",
      "value": "C"
    }
  ],
  "extras": {
    "idx": 0
  }
}
----------------------------------------------------------------------------------------------------
Extras: {'idx': 0}
----------------------------------------------------------------------------------------------------
[1m[Q][0m When demonstrating the Germany modern Christmas tree is initially decorated with apples, candles and berries, which kind of the decoration has the largest number?
Options:
A. Apples.
B. Candles.
C. Berries.
D. The three kinds are of the same number.
Answer with the option'

In [61]:
import os

def df_to_sharegpt(df: pd.DataFrame, **kwargs):
    """
    Convert a dataframe to a list of sharegpt entries.

    Args:
        df: Dataframe with one question per row.
        **kwargs: Keyword arguments forwarded unchanged to
            ``qa_row_to_sharegpt`` for every row (e.g. ``system=...``,
            ``options_col=...``).

    Returns:
        list: One ShareGPT entry (dict) per row, in row order.
    """
    # Forward as keywords (`**kwargs`): `*kwargs` would pass the option *names*
    # as positional arguments.
    return [qa_row_to_sharegpt(row, **kwargs) for _, row in df.iterrows()]

def df_to_sharegpt_jsonl(df: pd.DataFrame, path: str, **kwargs):
    """
    Convert a dataframe to sharegpt entries and save them to a jsonl file.

    Each row becomes one JSON object on its own line. Every entry carries an
    "extras" dict whose "idx" is the row's index label in ``df``.

    Args:
        df: Dataframe with one question per row.
        path: Destination file. Missing parent directories are created; a bare
            file name (no directory part) writes to the current directory.
        **kwargs: Keyword arguments forwarded to ``qa_row_to_sharegpt``. An
            ``extras`` dict may be supplied; it is merged into each row's
            extras, with the per-row "idx" taking precedence.
    """
    # `os.path.dirname` returns "" for a bare file name and `os.makedirs("")`
    # raises, so only create the directory when there is one.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Pull out caller-supplied extras so they can be merged with the row index
    # instead of colliding with the `extras=` keyword below.
    caller_extras = kwargs.pop("extras", None) or {}

    # save each line as it is created, more efficient
    with open(path, "w", encoding="utf-8") as f:
        # `total` lets tqdm render a real progress bar for the row iterator.
        for i, row in tqdm(df.iterrows(), total=len(df)):
            entry = qa_row_to_sharegpt(row, **kwargs, extras={**caller_extras, "idx": i})
            f.write(json.dumps(entry) + "\n")

    print(f"Saved {len(df)} entries to {path}")

# Export every row of `df` to a single JSONL file.
# NOTE(review): the file name says "mmmu", but `df` was loaded from
# "lmms-lab/Video-MME" -- confirm the intended output name.
path = "../data/mmmu_sharegpt.jsonl"
df_to_sharegpt_jsonl(df, path)


2700it [00:00, 22778.58it/s]

Saved 2700 entries to ../data/mmmu_sharegpt.jsonl





In [62]:
# Generate a sample 80/20 train/test split of the question rows and export
# each part to its own JSONL file.
# NOTE(review): the split is made per row, and the preview of `df` shows
# several questions sharing one video_id, so questions about the same video
# can end up in both files -- confirm this is acceptable, or split by video.

from sklearn.model_selection import train_test_split

# Fixed random_state so the same rows are selected on every run.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# NOTE(review): file names say "mmmu" although `df` holds Video-MME data.
df_to_sharegpt_jsonl(train_df, "../data/mmmu_sharegpt_train_80.jsonl")
df_to_sharegpt_jsonl(test_df, "../data/mmmu_sharegpt_test_20.jsonl")

2160it [00:00, 23649.87it/s]


Saved 2160 entries to ../data/mmmu_sharegpt_train_80.jsonl


540it [00:00, 23637.28it/s]

Saved 540 entries to ../data/mmmu_sharegpt_test_20.jsonl



