Deduplication

In [None]:
import json

def find_high_similarity_questions(json_data, threshold=0.9):
    """
    Collect, per image, the generated-question keys that look like duplicates.

    For every image in ``json_data`` the 'generated_vs_original' and
    'generated_vs_paraphrased' sections (when present) are scanned. A
    generated-question key is flagged as soon as any of its comparison entries
    has a 'similarity_score' strictly greater than ``threshold``.

    Args:
        json_data (dict): image_id -> {category -> {generated key -> {compared key -> details}}},
            where each ``details`` mapping carries a 'similarity_score'.
        threshold (float): Scores must exceed this value to count as a duplicate.

    Returns:
        dict: image_id -> list of flagged generated-question keys, in first-seen
        order and without repeats. Images with nothing flagged are omitted.
    """
    categories = ('generated_vs_original', 'generated_vs_paraphrased')
    flagged = {}

    for image_id, image_data in json_data.items():
        for category in categories:
            if category not in image_data:
                continue
            for gen_q_key, comparisons in image_data[category].items():
                # Evaluate every comparison (no short-circuit) so malformed
                # entries surface the same way regardless of their position.
                exceeds = [
                    details['similarity_score'] > threshold
                    for details in comparisons.values()
                ]
                if not any(exceeds):
                    continue
                keys_for_image = flagged.setdefault(image_id, [])
                if gen_q_key not in keys_for_image:
                    keys_for_image.append(gen_q_key)

    return flagged

def remove_duplicates(origin_json, duplicated_json):
    """
    Return a copy of ``origin_json`` with the flagged generated questions removed.

    Each image entry is deep-copied through a JSON round trip, so the input is
    never modified. If the copy has a 'qwenvl_generated' -> 'question_generated'
    mapping and the image appears in ``duplicated_json``, every listed key that
    exists in that mapping is deleted.

    Args:
        origin_json (dict): image_id -> image entry.
        duplicated_json (dict): image_id -> iterable of generated-question keys to drop.

    Returns:
        dict: image_id -> filtered copy of the entry (all images are kept).
    """
    cleaned = {}

    for image_id, content in origin_json.items():
        # JSON round trip = cheap deep copy for JSON-compatible data.
        entry_copy = json.loads(json.dumps(content))
        cleaned[image_id] = entry_copy

        has_generated = (
            "qwenvl_generated" in entry_copy
            and "question_generated" in entry_copy["qwenvl_generated"]
        )
        if not has_generated or image_id not in duplicated_json:
            continue

        generated = entry_copy["qwenvl_generated"]["question_generated"]
        for flagged_key in duplicated_json[image_id]:
            if flagged_key in generated:
                del generated[flagged_key]

    return cleaned

def main(
    score_path='data/duplicated/score_train.json',
    duplicated_path='data/duplicated/duplicated_train.json',
    origin_path='data/qwenvl_openvivqa/qwenvl_train.json',
    output_path='data/qwenvl_openvivqa/qwenvl_train_filtered.json',
    threshold=0.9,
):
    """
    Run the deduplication step end to end.

    Loads the similarity scores, flags generated questions whose score exceeds
    ``threshold``, stores the flagged keys, removes them from the original
    dataset and writes the filtered dataset.

    All parameters default to the paths/threshold that were previously
    hard-coded, so calling ``main()`` with no arguments behaves as before;
    passing arguments lets the same code process other splits.

    Args:
        score_path (str): JSON file with similarity scores per image.
        duplicated_path (str): Where the flagged keys are written.
        origin_path (str): JSON file with the dataset to filter.
        output_path (str): Where the filtered dataset is written.
        threshold (float): Forwarded to find_high_similarity_questions.

    Returns:
        tuple: (flagged keys per image, filtered dataset).
    """
    with open(score_path, 'r', encoding='utf-8') as f:
        scores = json.load(f)
    high_similarity = find_high_similarity_questions(scores, threshold)

    # Persist the flagged keys for inspection. The in-memory result is used
    # directly afterwards: it contains only string keys and lists of strings,
    # so reading the file back would yield exactly the same data.
    with open(duplicated_path, 'w', encoding='utf-8') as f:
        json.dump(high_similarity, f, indent=4, ensure_ascii=False)

    with open(origin_path, 'r', encoding='utf-8') as f:
        origin_json = json.load(f)

    filtered_json = remove_duplicates(origin_json, high_similarity)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(filtered_json, f, indent=4, ensure_ascii=False)

    return high_similarity, filtered_json

# Entry point: run the deduplication when this code executes as the top-level module.
if __name__ == "__main__":
    main()


q_extraction

In [None]:
import argparse
import json

def extract_questions_by_image_id(data):
    """
    Group the annotated questions by image and number them per image.

    Every annotation under ``data["annotations"]`` contributes its question to
    the image it references. Questions keep the order in which they are
    encountered and are stored under the keys "question_1", "question_2", ...
    There is one slot per question: the number of slots is not capped and no
    empty placeholder slots are added.

    Args:
        data (dict): Must contain an 'annotations' mapping whose values each
            provide 'image_id' and 'question'.

    Returns:
        dict: Keyed by ``str(image_id)``; each value has the form
              {
                  "image_id": <image_id exactly as found in the annotation>,
                  "original_question": {
                      "question_1": "...",
                      "question_2": "...",
                      ...
                  }
              }

    Raises:
        KeyError: If 'annotations' is missing, or an annotation lacks
            'image_id' or 'question'.
    """
    # Step 1: collect the questions of each image in encounter order.
    questions_by_image = {}
    for annotation in data["annotations"].values():
        image_id = annotation["image_id"]
        question = annotation["question"]
        questions_by_image.setdefault(image_id, []).append(question)

    # Step 2: number the questions from 1 and key the result by the stringified id.
    return {
        str(image_id): {
            "image_id": image_id,
            "original_question": {
                f"question_{position}": question
                for position, question in enumerate(questions, 1)
            },
        }
        for image_id, questions in questions_by_image.items()
    }

def main():
    """
    Command-line entry point for the question extraction step.

    Reads the annotations file given by --input_json, regroups its questions
    per image with extract_questions_by_image_id and writes the result to
    --output_json.

    Raises:
        SystemExit: With status 1 when the input file is missing or is not
            valid JSON, or when the output file cannot be written.
    """
    # Setup argument parser
    parser = argparse.ArgumentParser(description='Process input and output JSON files for question extraction')
    parser.add_argument('--input_json', type=str, required=True,
                        help='Path to the input JSON file with annotations')
    parser.add_argument('--output_json', type=str, required=True,
                        help='Path to the output JSON file for processed questions')

    # Parse arguments
    args = parser.parse_args()

    # Step 1: Load the input JSON file containing question annotations.
    # SystemExit is raised directly instead of calling exit(): that helper is
    # installed by the `site` module for interactive use and is not guaranteed
    # to exist (or to behave like sys.exit) in every runtime.
    try:
        with open(args.input_json, 'r', encoding='utf-8') as f:
            input_data = json.load(f)
    except FileNotFoundError:
        print(f"Error: Input file {args.input_json} not found. Please check the file path.")
        raise SystemExit(1)
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in the input file {args.input_json}.")
        raise SystemExit(1)

    # Step 2: Process the data to extract and structure questions by image_id
    output_json = extract_questions_by_image_id(input_data)

    # Step 3: Save the processed data to a new JSON file
    try:
        with open(args.output_json, 'w', encoding='utf-8') as f:
            json.dump(output_json, f, ensure_ascii=False, indent=4)
    except OSError:
        print(f"Error: Unable to write to output file {args.output_json}. Please check permissions and path.")
        # A failed write must not look like success to the calling shell.
        raise SystemExit(1)

    print(f"Successfully processed and saved data for {len(output_json)} images.")

# Entry point: run the extraction CLI when this code executes as the top-level module.
if __name__ == "__main__":
    main()


Filtering

In [None]:
import argparse
import yaml
import json
import os

def remove_unqualified_questions(unqualified_questions, origin_data, target_image_id):
    """
    Drop the generated questions of one image that were marked as unqualified.

    The texts to remove are taken from every item of ``unqualified_questions``
    whose 'ID' refers to ``target_image_id``: all values stored under keys
    starting with 'generated_question_'. Any generated question of that image
    whose text is in this set is removed.

    The 'ID' comparison tolerates a type mismatch between the two files
    (e.g. 12 versus "12"); a strict comparison would silently filter nothing
    when one side stores ids as strings.

    Args:
        unqualified_questions (iterable of dict): Items with an 'ID' and
            'generated_question_*' entries.
        origin_data (dict): Dataset keyed by ``str(image_id)``. It is modified
            in place.
        target_image_id: Id of the image to filter (int or str).

    Returns:
        dict: The same ``origin_data`` object. It is returned unchanged when
        the image is unknown or has no 'qwenvl_generated' -> 'question_generated'
        mapping.
    """
    image_key = str(target_image_id)

    # Nothing to filter: skip the scan of the unqualified list altogether.
    if image_key not in origin_data:
        return origin_data

    entry = origin_data[image_key]
    if 'qwenvl_generated' not in entry or 'question_generated' not in entry['qwenvl_generated']:
        return origin_data

    def refers_to_target(item_id):
        # Exact match first (previous behaviour), then a string-form match.
        if item_id == target_image_id:
            return True
        return item_id is not None and str(item_id) == image_key

    # Texts of the unqualified generated questions for this image.
    unqualified_texts = {
        text
        for item in unqualified_questions
        if refers_to_target(item.get('ID'))
        for key, text in item.items()
        if key.startswith('generated_question_')
    }

    generated_questions = entry['qwenvl_generated']['question_generated']
    entry['qwenvl_generated']['question_generated'] = {
        key: text
        for key, text in generated_questions.items()
        if text not in unqualified_texts
    }

    return origin_data

def main() -> None:
    """
    Command-line entry point for the filtering step.

    Options can come from the command line or from a YAML file (--config);
    command-line values take precedence. The dataset in ``input_json`` is
    loaded, the questions listed in ``unqualified_json`` are removed image by
    image, and the result is written as JSON.

    ``output_dir`` may be either a file path (previous behaviour) or a
    directory. For a directory (an existing one, or a path ending with a
    separator) the result is written inside it as
    ``filtered_<input file name>``. Missing parent directories are created.
    """
    parser = argparse.ArgumentParser(description='Qwen2.5-VL Question Evaluation')

    parser.add_argument('--config', type=str, help='Path to YAML config file')
    parser.add_argument('--input_json', type=str, help='Path to input JSON file with questions')
    parser.add_argument('--unqualified_json', type=str, help='Path to JSON file with unqualified questions')
    parser.add_argument('--output_dir', type=str, help='Directory to save output JSON files')

    args = parser.parse_args()

    # Process config and arguments
    config = {}
    if args.config:
        with open(args.config, 'r', encoding='utf-8') as f:
            # safe_load returns None for an empty file; fall back to {} so the
            # lookups below do not fail on `None`.
            config = yaml.safe_load(f) or {}

    merged_config = {}
    for key, cli_value in vars(args).items():
        if key == 'config':
            continue
        # CLI wins; a null value in the YAML file counts as "not provided".
        value = cli_value if cli_value is not None else config.get(key)
        if value is not None:
            merged_config[key] = value

    # Validate required arguments
    required_args = ['input_json', 'unqualified_json', 'output_dir']
    missing_args = [arg for arg in required_args if arg not in merged_config]
    if missing_args:
        print(f"Missing required arguments: {', '.join(missing_args)}")
        parser.print_help()
        return

    # Load input JSON
    with open(merged_config['input_json'], 'r', encoding='utf-8') as f:
        origin_data = json.load(f)

    # Load unqualified questions JSON
    with open(merged_config['unqualified_json'], 'r', encoding='utf-8') as f:
        unqualified_questions = json.load(f)

    # Remove unqualified questions for each image_id in origin_data.
    # Iterate over a snapshot of the keys because the dataset is updated in place.
    for image_id in list(origin_data):
        try:
            target_id = int(image_id)
        except ValueError:
            # Non-numeric key: pass it through unchanged instead of crashing.
            target_id = image_id
        origin_data = remove_unqualified_questions(unqualified_questions, origin_data, target_id)

    # Resolve where to write: the option is named/documented as a directory
    # but was always opened as a file, so both forms are accepted.
    output_target = str(merged_config['output_dir'])
    if os.path.isdir(output_target) or output_target.endswith(('/', os.sep)):
        os.makedirs(output_target, exist_ok=True)
        input_name = os.path.basename(str(merged_config['input_json']))
        output_path = os.path.join(output_target, 'filtered_' + input_name)
    else:
        parent_dir = os.path.dirname(output_target)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        output_path = output_target

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(origin_data, f, ensure_ascii=False, indent=4)

    print(f"Filtered questions saved to {output_path}")

# Entry point: run the filtering CLI when this code executes as the top-level module.
if __name__ == '__main__':
    main()


merge_json

In [None]:
import json
import argparse

def merge_json(orig_path, gen_path, paraphr_path, output_path):
    """
    Combine original Q&A, generated questions and paraphrased questions per image.

    The three inputs are joined on ``image_id``. Every image found in any of
    them gets one entry in the output, keyed by ``str(image_id)``, with as
    many numbered slots as there are items (no fixed slot count).

    Args:
        orig_path (str): JSON file whose 'annotations' mapping holds entries
            with 'image_id', 'question' and 'answer'.
        gen_path (str): JSON file whose values hold 'image_id' and a
            'question_generated' mapping with keys ending in ``_<number>``.
        paraphr_path (str): JSON file whose values hold 'image_id' and
            'question_generated' -> 'question_paraphrased'.
        output_path (str): Destination of the merged JSON file.
    """
    # Read the three sources in the order: original, generated, paraphrased.
    loaded = []
    for source_path in (orig_path, gen_path, paraphr_path):
        with open(source_path, 'r', encoding='utf-8') as handle:
            loaded.append(json.load(handle))
    originals, generated, paraphrased = loaded

    # Original questions and answers, grouped per image in file order.
    questions_by_image = {}
    answers_by_image = {}
    for annotation in originals.get("annotations", {}).values():
        image_id = annotation.get("image_id")
        if image_id is not None:
            questions_by_image.setdefault(image_id, []).append(annotation.get("question", ""))
            answers_by_image.setdefault(image_id, []).append(annotation.get("answer", ""))

    def trailing_number(slot_key):
        # "generated_question_12" -> 12, so slots sort numerically, not lexically.
        return int(slot_key.rsplit("_", 1)[-1])

    # Generated questions; a later record for the same image replaces an earlier one.
    generated_by_image = {}
    for record in generated.values():
        image_id = record.get("image_id")
        if image_id is not None:
            slots = record.get("question_generated", {})
            generated_by_image[image_id] = [
                slots[slot_key] for slot_key in sorted(slots.keys(), key=trailing_number)
            ]

    # Paraphrased questions; records for the same image accumulate.
    paraphrased_by_image = {}
    for record in paraphrased.values():
        image_id = record.get("image_id")
        if image_id is not None:
            paraphrase = record.get("question_generated", {}).get("question_paraphrased", "")
            paraphrased_by_image.setdefault(image_id, []).append(paraphrase)

    def numbered(prefix, items):
        # ["x", "y"] -> {"<prefix>_1": "x", "<prefix>_2": "y"}
        return {f"{prefix}_{position}": item for position, item in enumerate(items, 1)}

    # One output entry per image id seen in any source, in sorted id order.
    merged = {}
    every_id = set(questions_by_image) | set(generated_by_image) | set(paraphrased_by_image)
    for image_id in sorted(every_id):
        merged[str(image_id)] = {
            "image_id": image_id,
            "original_question": numbered("question", questions_by_image.get(image_id, [])),
            "original_answer": numbered("answer", answers_by_image.get(image_id, [])),
            "qwenvl_generated": {
                "question_paraphrased": numbered(
                    "paraphrased_question", paraphrased_by_image.get(image_id, [])
                ),
                "question_generated": numbered(
                    "generated_question", generated_by_image.get(image_id, [])
                ),
            },
        }

    with open(output_path, 'w', encoding='utf-8') as handle:
        json.dump(merged, handle, ensure_ascii=False, indent=4)

    print(f"Saved merged data to {output_path}")


def main():
    """Parse the four required path options and hand them to merge_json."""
    parser = argparse.ArgumentParser(
        description="Merge original Q&A, generated, and paraphrased questions by image_id"
    )

    # (flag, help text) — every option is mandatory.
    required_options = (
        ("--orig", "Path to original Q&A JSON file"),
        ("--gen", "Path to generated questions JSON file"),
        ("--paraphr", "Path to paraphrased questions JSON file"),
        ("--output", "Path to output merged JSON file"),
    )
    for flag, help_text in required_options:
        parser.add_argument(flag, required=True, help=help_text)

    cli = parser.parse_args()
    merge_json(
        orig_path=cli.orig,
        gen_path=cli.gen,
        paraphr_path=cli.paraphr,
        output_path=cli.output,
    )

# Entry point: run the merge CLI when this code executes as the top-level module.
if __name__ == "__main__":
    main()


merge_final

In [None]:
import json
import argparse
import yaml
import os

def merge_json_data(original_file, external_file, output_file):
    """
    Attach one paraphrased and one generated question to every annotation.

    The external file is indexed by 'image_id'. Annotations of the original
    file are visited in file order; the k-th annotation of an image (counting
    from zero) receives the k-th paraphrased and the k-th generated question
    of that image, or an empty string when the image has fewer of them. The
    pair is stored under the annotation's 'qwenvl_generated' key and the whole
    original document is written to ``output_file``.

    Args:
        original_file (str): JSON file with an 'annotations' mapping.
        external_file (str): JSON file whose values carry 'image_id' and
            'qwenvl_generated' -> 'question_paraphrased' / 'question_generated'.
        output_file (str): Destination path; its directory is created if needed.
    """
    with open(original_file, 'r', encoding='utf-8') as source:
        annotated = json.load(source)
    with open(external_file, 'r', encoding='utf-8') as source:
        external = json.load(source)

    # Index external records by image; the last record for an image wins.
    record_by_image = {}
    for record in external.values():
        record_by_image[record['image_id']] = record

    # Per image: the question texts in the order they are stored.
    paraphrased_by_image = {}
    generated_by_image = {}
    for image_id, record in record_by_image.items():
        paraphrased_by_image[image_id] = list(
            record.get('qwenvl_generated', {}).get('question_paraphrased', {}).values()
        )
    for image_id, record in record_by_image.items():
        generated_by_image[image_id] = list(
            record.get('qwenvl_generated', {}).get('question_generated', {}).values()
        )

    # How many annotations of each image have been served so far.
    served = {}
    for annotation in annotated.get('annotations', {}).values():
        image_id = annotation.get('image_id')
        position = served.get(image_id, 0)

        paraphrases = paraphrased_by_image.get(image_id, [])
        generations = generated_by_image.get(image_id, [])

        if position < len(paraphrases):
            paraphrased_question = paraphrases[position]
        else:
            paraphrased_question = ""
        if position < len(generations):
            generated_question = generations[position]
        else:
            generated_question = ""

        annotation['qwenvl_generated'] = {
            'paraphrased_question': paraphrased_question,
            'generated_question': generated_question
        }
        served[image_id] = position + 1

    # Create the destination directory when the path has one.
    destination_dir = os.path.dirname(output_file)
    if destination_dir:
        os.makedirs(destination_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as sink:
        json.dump(annotated, sink, ensure_ascii=False, indent=4)
    print(f"Merged data saved to {output_file}")


def main():
    """
    Command-line entry point for the final merge step.

    The three paths ('original', 'external', 'output') can be given on the
    command line or in a YAML file (--config); command-line values take
    precedence. When any of them is missing, the problem is reported, the
    usage is printed and nothing is merged.
    """
    parser = argparse.ArgumentParser(description='Merge JSON data based on image_id')
    parser.add_argument('--config', type=str, help='Path to YAML config file')
    parser.add_argument('--original', help='Path to original JSON file')
    parser.add_argument('--external', help='Path to external JSON file')
    parser.add_argument('--output', help='Path to output merged JSON file')
    args = parser.parse_args()

    # Load config if provided
    config = {}
    if args.config:
        with open(args.config, 'r', encoding='utf-8') as f:
            # safe_load returns None for an empty file; fall back to {} so the
            # lookups below do not fail on `None`.
            config = yaml.safe_load(f) or {}

    # Merge CLI args and YAML configuration (CLI wins). A null value in the
    # YAML file counts as "not provided" so it is reported as missing below
    # instead of being passed on as a path.
    required_keys = ['original', 'external', 'output']
    merged_config = {}
    for key in required_keys:
        val = getattr(args, key)
        if val is None:
            val = config.get(key)
        if val is not None:
            merged_config[key] = val

    # Validate required params
    missing = [k for k in required_keys if k not in merged_config]
    if missing:
        print(f"Missing required arguments: {', '.join(missing)}")
        parser.print_help()
        return

    merge_json_data(
        merged_config['original'],
        merged_config['external'],
        merged_config['output']
    )

# Entry point: run the final-merge CLI when this code executes as the top-level module.
if __name__ == '__main__':
    main()


Top 10 objects extracted by Faster R-CNN

In [3]:
from __future__ import annotations
import csv, sys, base64, json, os, time
from pathlib import Path
from typing import List, Dict, Union, Optional
import numpy as np

# Raise the csv module's maximum field size to sys.maxsize.
# NOTE(review): presumably needed because the base64-encoded array columns
# read by load_obj_tsv exceed the default limit — confirm against the TSV files.
csv.field_size_limit(sys.maxsize)

# Column names, in file order, handed to csv.DictReader by load_obj_tsv
# (the reader is given explicit field names rather than reading a header row).
FIELDNAMES = [
    "img_id", "img_h", "img_w",
    "objects_id", "objects_conf",
    "attrs_id", "attrs_conf",
    "num_boxes", "boxes", "features"
]

# --- 2. Core helpers (unchanged) ---------------------------------------------
def load_object_vocab(vocab_file: str | Path) -> Dict[int, str]:
    """
    Read a vocabulary file into an ``index -> name`` mapping.

    Lines are stripped of surrounding whitespace and blank lines are ignored;
    the remaining names are numbered from 0 in file order. The number of names
    loaded is printed.

    Args:
        vocab_file: Path of a UTF-8 text file with one name per line.

    Returns:
        Mapping from zero-based position (among non-blank lines) to name.
    """
    with open(vocab_file, encoding="utf-8") as handle:
        names = [raw.strip() for raw in handle if raw.strip()]

    object_names: Dict[int, str] = dict(enumerate(names))
    print(f"Loaded {len(object_names)} object names from {vocab_file}")
    return object_names


def load_obj_tsv(
    fname: str | Path,
    topk: Optional[int] = None,
    verbose: bool = True
) -> List[Dict[str, Union[str, int, np.ndarray]]]:
    """
    Load detection records from a tab-separated file.

    Each row is read with the column names in FIELDNAMES. 'img_h', 'img_w'
    and 'num_boxes' are converted to int. The six array columns are
    base64-decoded, interpreted as a flat buffer of the listed dtype, reshaped
    so that the first axis has 'num_boxes' entries, and marked read-only.

    Args:
        fname: Path of the TSV file.
        topk: When truthy, stop after this many rows; otherwise read all rows.
        verbose: Print a start message, a progress line every 1000 rows and a
            summary with the elapsed time.

    Returns:
        One dict per row, with the converted values stored under the original
        column names.
    """
    # (column, dimensions after the box axis, element dtype)
    array_columns = (
        ("objects_id",   (),    np.int64),
        ("objects_conf", (),    np.float32),
        ("attrs_id",     (),    np.int64),
        ("attrs_conf",   (),    np.float32),
        ("boxes",        (4,),  np.float32),
        ("features",     (-1,), np.float32),
    )
    scalar_columns = ("img_h", "img_w", "num_boxes")

    started = time.time()
    records = []
    if verbose:
        print(f"Loading Faster R-CNN objects from {fname}")

    with open(fname) as tsv:
        rows = csv.DictReader(tsv, FIELDNAMES, delimiter="\t")
        for row_number, record in enumerate(rows):
            if verbose and row_number and row_number % 1000 == 0:
                print(f"  {row_number} images... ({time.time() - started:.1f}s)")

            for column in scalar_columns:
                record[column] = int(record[column])

            box_count = record["num_boxes"]
            for column, trailing_dims, dtype in array_columns:
                raw_bytes = base64.b64decode(record[column])
                decoded = np.frombuffer(raw_bytes, dtype=dtype).reshape((box_count,) + trailing_dims)
                decoded.setflags(write=False)
                record[column] = decoded

            records.append(record)
            if topk and len(records) >= topk:
                break

    if verbose:
        print(f"Loaded {len(records)} images in {time.time() - started:.1f}s")
    return records


def save_top_objects_to_json(
    data: List[Dict[str, Union[str, int, np.ndarray]]],
    output_file: str | Path,
    num_objects: int = 10,
    object_mapping: Dict[int, str] | None = None,
    verbose: bool = True,
) -> Path:
    """
    Write the highest-confidence objects of every image to a JSON file.

    For each record, up to ``num_objects`` boxes are selected by descending
    'objects_conf'. Each selected box becomes a dict with the keys
    'object_<rank>' (the mapped name, or 'object_<id>' when the id is not in
    ``object_mapping``), 'confidence' and 'bbox'. Records with zero boxes are
    left out. The parent directory of ``output_file`` is created if needed.

    Args:
        data: Records providing 'img_id', 'num_boxes', 'objects_id',
            'objects_conf' and 'boxes'.
        output_file: Destination JSON path.
        num_objects: Maximum number of objects kept per image.
        object_mapping: Optional object id -> name mapping.
        verbose: Print a progress line every 1000 records and a final summary.

    Returns:
        The output path as a ``Path``.

    Raises:
        ValueError: If ``data`` is empty.
    """
    if not data:
        raise ValueError("`data` is empty—did loading fail?")

    names = object_mapping or {}
    started = time.time()
    results = []

    for position, record in enumerate(data):
        if verbose and position and position % 1000 == 0:
            print(f"  {position} images... ({time.time() - started:.1f}s)")

        keep = min(num_objects, record["num_boxes"])
        if keep == 0:
            continue

        # argsort is ascending: take the last `keep` indices, best first.
        best_first = np.argsort(record["objects_conf"])[-keep:][::-1]

        objects = []
        for rank, box_index in enumerate(best_first, 1):
            class_id = int(record["objects_id"][box_index])
            label = names.get(class_id, f"object_{class_id}")
            objects.append({
                f"object_{rank}": label,
                "confidence": float(record["objects_conf"][box_index]),
                "bbox": record["boxes"][box_index].tolist(),
            })

        results.append({"img_id": record["img_id"], "objects": objects})

    output_file = Path(output_file)
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with output_file.open("w", encoding="utf-8") as sink:
        json.dump(results, sink, ensure_ascii=False, indent=2)

    if verbose:
        print(f"Saved {len(results)} images ⇒ {output_file}")
    return output_file


def run_pipeline(
    tsv_file: str | Path,
    vocab_file: str | Path,
    output_json: str | Path,
    *,
    num_objects: int = 10,
    topk: Optional[int] = None,
    verbose: bool = True,
) -> Path:
    """
    Run the whole extraction in one call: vocabulary, detections, JSON export.

    Args:
        tsv_file: Detection TSV file passed to load_obj_tsv.
        vocab_file: Vocabulary file passed to load_object_vocab.
        output_json: Destination passed to save_top_objects_to_json.
        num_objects: Maximum number of objects kept per image.
        topk: Optional limit on the number of rows loaded.
        verbose: Forwarded to the loading and saving steps.

    Returns:
        The path returned by save_top_objects_to_json.

    Raises:
        FileNotFoundError: If ``tsv_file`` or ``vocab_file`` does not exist
            (checked in that order, before anything is loaded).
    """
    for required_input in (tsv_file, vocab_file):
        if not Path(required_input).exists():
            raise FileNotFoundError(required_input)

    # Vocabulary first, then detections.
    id_to_name = load_object_vocab(vocab_file)
    detections = load_obj_tsv(tsv_file, topk=topk, verbose=verbose)

    return save_top_objects_to_json(
        detections,
        output_json,
        num_objects=num_objects,
        object_mapping=id_to_name,
        verbose=verbose,
    )

In [4]:
# Inputs and output for the train split.
# NOTE(review): these are absolute, machine-specific paths; the cell only runs
# where these exact locations exist — consider deriving them from a configurable base directory.
tsv_path   = "/mnt/VLAI_data/detection_features/openvivqa_train_obj36.tsv"
vocab_path = "/home/hanhpm/task_detection_result_frcnn/VQA_Template/data/1600-400-20/objects_vocab.txt"
out_path   = "/home/duyth/vqa_co_training/vivqa-co-training/data_gen_qwenvl/data/extracted_objects/objects_train.json"

# Keep the 10 highest-confidence objects per image; topk=None loads every row of the TSV.
# The call is the cell's last expression, so the returned output path is displayed.
run_pipeline(
    tsv_file   = tsv_path,
    vocab_file = vocab_path,
    output_json= out_path,
    num_objects= 10,   
    topk       = None, 
    verbose    = True,
)


Loaded 1600 object names from /home/hanhpm/task_detection_result_frcnn/VQA_Template/data/1600-400-20/objects_vocab.txt
Loading Faster R-CNN objects from /mnt/VLAI_data/detection_features/openvivqa_train_obj36.tsv
  1000 images... (3.0s)
  2000 images... (6.0s)
  3000 images... (9.1s)
  4000 images... (12.2s)
  5000 images... (15.4s)
  6000 images... (18.5s)
  7000 images... (21.7s)
  8000 images... (24.9s)
  9000 images... (28.0s)
Loaded 9129 images in 28.4s
  1000 images... (0.0s)
  2000 images... (0.1s)
  3000 images... (0.1s)
  4000 images... (0.2s)
  5000 images... (0.2s)
  6000 images... (0.2s)
  7000 images... (0.3s)
  8000 images... (0.3s)
  9000 images... (0.4s)
Saved 9129 images ⇒ /home/duyth/vqa_co_training/vivqa-co-training/data_gen_qwenvl/data/extracted_objects/objects_train.json


PosixPath('/home/duyth/vqa_co_training/vivqa-co-training/data_gen_qwenvl/data/extracted_objects/objects_train.json')