In [4]:
import pandas as pd
import json
import os
from pathlib import Path
from sklearn.model_selection import train_test_split

def _build_records(split_df, image_base_path, split_name):
    """Turn each row of one split into a record dict ready for JSON output."""
    records = []
    # NOTE: iterrows() is kept on purpose so that row values reach json.dumps
    # in the same form as before. Switching to itertuples() or direct column
    # access may hand NumPy scalars to json.dumps -- verify before changing.
    for _, row in split_df.iterrows():
        subset = row['type']
        # The TSV 'index' column is shifted by one to form the record id,
        # which is also used as the image file name.
        record_id = int(row['index']) + 1
        image_dir = os.path.join(image_base_path, subset)
        records.append({
            "id": record_id,
            "combined_image_path": f"{image_dir}/{record_id}.jpg",
            "correct_answer": row['answer'],
            "subset_split": f"{subset}",  # forward compatibility
            "split": split_name,
        })
    return records


def _write_jsonl(path, records):
    """Write records to path as JSON Lines (one JSON object per line)."""
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(record) + '\n' for record in records)


def process_tsv_files(tsv_directory, output_directory, image_base_path):
    """
    Load TSV files, split into train/test, and create separate JSONL outputs.

    Every ``*.tsv`` file directly inside ``tsv_directory`` is read, split
    80/20 into train/test with a fixed random seed (stratified on the
    ``answer`` column), and written to ``<stem>_train.jsonl`` and
    ``<stem>_test.jsonl`` in ``output_directory``. Each TSV must provide the
    columns ``index``, ``type`` and ``answer``.

    Files are processed in sorted name order so runs are reproducible.
    A TSV with no data rows is skipped with a message instead of failing
    inside the splitter.

    Args:
        tsv_directory: Directory containing TSV files
        output_directory: Directory for output JSONL files (created if missing)
        image_base_path: Base path for images; the row's ``type`` value is
            appended as a sub-directory and ``<id>.jpg`` as the file name

    Raises:
        KeyError: If a TSV lacks one of the required columns.
        ValueError: Propagated from ``train_test_split`` when the requested
            stratified split cannot be produced for a file.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)

    # Sorted so that processing (and log) order does not depend on the
    # filesystem's directory listing order.
    for tsv_file in sorted(Path(tsv_directory).glob("*.tsv")):
        print(f"Processing {tsv_file.name}")

        df = pd.read_csv(tsv_file, sep='\t')
        if df.empty:
            print(f"Skipping {tsv_file.name}: no rows to split")
            continue

        # Output files are named after the TSV file (without its extension)
        subset_name = tsv_file.stem
        train_jsonl_path = os.path.join(output_directory, f"{subset_name}_train.jsonl")
        test_jsonl_path = os.path.join(output_directory, f"{subset_name}_test.jsonl")

        # Seeded random 80/20 split, stratified on the answer column
        train_df, test_df = train_test_split(
            df,
            test_size=0.2,
            random_state=42,
            stratify=df['answer'],
        )

        # Build both splits before writing anything, so a bad row cannot
        # leave a train file on disk without its matching test file.
        train_records = _build_records(train_df, image_base_path, "train")
        test_records = _build_records(test_df, image_base_path, "test")

        _write_jsonl(train_jsonl_path, train_records)
        _write_jsonl(test_jsonl_path, test_records)

        print(f"Created {len(train_records)} training records in {train_jsonl_path}")
        print(f"Created {len(test_records)} test records in {test_jsonl_path}")

# Example invocation: convert the "last four" RAVEN TSV subsets to JSONL.
# Input directory holding the raw TSV files:
tsv_dir = "/data/users/brandon/ob1-projects/InternVL/internvl_chat/rollout_generation/preprocessed_prompts/preprocessing_scripts/RAVEN/raven_processed_jsonl/last_four_raw_tsv"
# Destination directory for the generated *_train.jsonl / *_test.jsonl files:
output_dir = "/data/users/brandon/ob1-projects/InternVL/internvl_chat/rollout_generation/preprocessed_prompts/preprocessing_scripts/RAVEN/raven_processed_jsonl/last_four_jsonl"
# Root directory of the images referenced by the records:
image_base = "/data/users/brandon/ob1-projects/InternVL/internvl_chat/rollout_generation/preprocessed_prompts/preprocessing_scripts/RAVEN/processed_raven_images"

process_tsv_files(
    tsv_directory=tsv_dir,
    output_directory=output_dir,
    image_base_path=image_base,
)


Processing in_center_single_out_center_single.tsv
Created 8000 training records in /data/users/brandon/ob1-projects/InternVL/internvl_chat/rollout_generation/preprocessed_prompts/preprocessing_scripts/RAVEN/raven_processed_jsonl/last_four_jsonl/in_center_single_out_center_single_train.jsonl
Created 2000 test records in /data/users/brandon/ob1-projects/InternVL/internvl_chat/rollout_generation/preprocessed_prompts/preprocessing_scripts/RAVEN/raven_processed_jsonl/last_four_jsonl/in_center_single_out_center_single_test.jsonl
Processing left_center_single_right_center_single.tsv
Created 8000 training records in /data/users/brandon/ob1-projects/InternVL/internvl_chat/rollout_generation/preprocessed_prompts/preprocessing_scripts/RAVEN/raven_processed_jsonl/last_four_jsonl/left_center_single_right_center_single_train.jsonl
Created 2000 test records in /data/users/brandon/ob1-projects/InternVL/internvl_chat/rollout_generation/preprocessed_prompts/preprocessing_scripts/RAVEN/raven_processed_js