In [1]:
import pandas as pd
import random
import os

# -------------------------
# CONFIGURATION
# -------------------------
# Seed the module-level RNG once so template selection in generate_prompts()
# is repeatable from run to run.
random.seed(42)

# Working directory for the annotation files; the trailing slash is kept so
# file names can be appended directly.
parent_path = "C:/Users/yehte/Downloads/Ye Htet/Projects/TikTok/Annotation/"

# Source annotations. main() requires these columns:
# image_filename, name, age_group, gender, expression
INPUT_CSV = f"{parent_path}Annotation_Template_2025-06-23.csv"

# Destination for the generated (image_filename, text_prompt) rows.
OUTPUT_CSV = f"{parent_path}Annotation_Template_Fine_Tune_Consistent_2025-06-30.csv"

# Default number of prompts produced per input row.
NUM_PROMPTS_PER_IMAGE = 1

# Prompt templates with placeholders ({name}, {age_group}, {gender},
# {expression}). The varied phrasings below are disabled; only the single
# fixed-order template in TEMPLATES is active.
# TEMPLATES = [
#     "{age_group} {gender} named {name} with {expression} expression.",
#     "{name} is {age_group} {gender} showing {expression} face.",
#     "portrait of {name}, {age_group} {gender} who looks {expression}.",
#     "face of {name}, {age_group} {gender}, expressing {expression}.",
#     "{name}, {age_group} {gender}, with {expression} look.",
#     "the {expression} face of {name}, {age_group} {gender}.",
#     "{name} looks {expression}, is {age_group} {gender}.",
# ]

TEMPLATES = ["{name}, {gender}, {age_group}, {expression}."]

def generate_prompts(metadata, num_prompts=NUM_PROMPTS_PER_IMAGE):
    """Return ``num_prompts`` text prompts rendered from ``metadata``.

    Each prompt is produced by picking a template from ``TEMPLATES`` with
    ``random.choice`` and filling its placeholders with the lower-cased
    metadata values.

    Args:
        metadata: Mapping with the keys ``name``, ``age_group``, ``gender``
            and ``expression``; every value must support ``.lower()``.
        num_prompts: How many prompts to generate.

    Returns:
        A list of ``num_prompts`` formatted prompt strings.
    """
    placeholder_keys = ("name", "age_group", "gender", "expression")

    def render(template):
        # Lower-case at render time so nothing is read from ``metadata``
        # unless a prompt is actually produced.
        lowered = {key: metadata[key].lower() for key in placeholder_keys}
        return template.format(**lowered)

    return [render(random.choice(TEMPLATES)) for _ in range(num_prompts)]

def main():
    """Expand the annotation CSV into an (image_filename, text_prompt) CSV.

    Reads ``INPUT_CSV``, validates it, renders the prompts for every row with
    ``generate_prompts`` and writes the result to ``OUTPUT_CSV``.

    Raises:
        ValueError: If a required column is absent, or if any required cell
            is missing in the input.
    """
    required_cols = ["image_filename", "name", "age_group", "gender", "expression"]
    metadata_cols = ["name", "age_group", "gender", "expression"]
    output_cols = ["image_filename", "text_prompt"]

    df = pd.read_csv(INPUT_CSV)

    # Check required columns; report every absent one in a single error.
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(
            f"Input CSV missing required column: {', '.join(missing_cols)}"
        )

    # Empty cells are read as NaN, and str(NaN) would put the literal word
    # "nan" into the prompt text, so reject incomplete rows up front.
    incomplete = df[required_cols].isna().any(axis=1)
    if incomplete.any():
        bad_rows = df.index[incomplete].tolist()
        raise ValueError(
            "Input CSV has missing values in required columns at row "
            f"index(es) {bad_rows[:10]} ({len(bad_rows)} row(s) in total)"
        )

    expanded_rows = []
    for _, row in df.iterrows():
        # Stringify so numeric-looking cells still support .lower().
        metadata = {col: str(row[col]) for col in metadata_cols}
        for prompt in generate_prompts(metadata):
            expanded_rows.append({
                "image_filename": row["image_filename"],
                "text_prompt": prompt,
                # You can also add columns like identity, age, gender, expression if needed
            })

    # Explicit columns keep the header present even when there are no rows.
    expanded_df = pd.DataFrame(expanded_rows, columns=output_cols)
    expanded_df.to_csv(OUTPUT_CSV, index=False)
    print(f"✅ Expanded dataset saved to {OUTPUT_CSV}")
    print(f"Total rows: {len(expanded_df)}")

# Run the CSV expansion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


✅ Expanded dataset saved to C:/Users/yehte/Downloads/Ye Htet/Projects/TikTok/Annotation/Annotation_Template_Fine_Tune_Consistent_2025-06-30.csv
Total rows: 1000
