In [8]:
import os
import cv2
import torch
import open_clip
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from ultralytics import YOLO
from collections import Counter
import time
import csv
import pandas as pd

# Setup device for computations (CUDA if available, otherwise CPU)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the OpenCLIP model and its image preprocessor.
# 'ViT-B-16' is the model architecture, 'laion400m_e32' is the pretrained-weights tag.
model, _, preprocess = open_clip.create_model_and_transforms(
    'ViT-B-16', pretrained='laion400m_e32', device=device)

# Replace the pretrained weights with the checkpoint stored next to the notebook.
# map_location=device remaps the stored tensors onto the device chosen above, so a
# checkpoint that was saved on a GPU still loads when this runs on a CPU-only machine.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
finetuned_state = torch.load("finetuned_identity_only_best_B_16.pt", map_location=device)
model.load_state_dict(finetuned_state)
model.to(device=device).eval()  # inference only: switch off dropout / training behaviour

# Get the tokenizer specific to the loaded model
tokenizer = open_clip.get_tokenizer('ViT-B-16')

# Display names of the known identities, in the same order as face_prompts below.
person_list = ["Soekarno", "Suharto", "Baharuddin Jusuf Habibie", "Abdurrahman Wahid", "Megawati Sukarnoputri", "Susilo Bambang Yudhoyono", "Joko Widodo",
               "Prabowo Subianto", "Anies Rasyid Baswedan", "Ganjar Pranowo", "Gibran Rakabuming Raka", "Maruf Amin", "Airlangga Hartarto",
               "Sri Mulyani Indrawati", "Erick Thohir", "Agus Harimurti Yudhoyono", "Muhaimin Iskandar", "Mahfud MD", "Boediono", "Jusuf Kalla"]

# One text prompt per identity, followed by a catch-all "Unknown Person" prompt.
# NOTE(review): the prompt wording is kept exactly as-is (including spellings such as
# "Eight", "Entreprises", "Empowrement") because these strings are fed to the text
# encoder; presumably they match the prompts used for fine-tuning — confirm before editing.
face_prompts = [
    "A face of Soekarno, a male First President (1945-1967) from Indonesia.",
    "A face of Suharto, a male Second President (1967-1998) from Indonesia.",
    "A face of Baharuddin Jusuf Habibie, a male Third President (1998-1999) from Indonesia.",
    "A face of Abdurrahman Wahid, a male Fourth President (1999-2001) from Indonesia.",
    "A face of Megawati Sukarnoputri, a female Fifth President (2001-2004) from Indonesia.",
    "A face of Susilo Bambang Yudhoyono, a male Sixth President (2004-2014) from Indonesia.",
    "A face of Joko Widodo, a male Seventh President (2014-2024) from Indonesia.",
    "A face of Prabowo Subianto, a male Eight President (2024-Present) from Indonesia.",
    "A face of Anies Rasyid Baswedan, a male Governor of Jakarta (2017-2022) and Presidential Candidate Election (2024) from Indonesia.",
    "A face of Ganjar Pranowo, a male Governor of Central Java (2013-2023) and Presidential Candidate Election (2024) from Indonesia.",
    "A face of Gibran Rakabuming Raka, a male Vice President (2024-2029) from Indonesia.",
    "A face of Maruf Amin, a male Vice President (2019-2024) from Indonesia.",
    "A face of Airlangga Hartarto, a male Coordinating Minister of Economic Affairs (2024-2029) from Indonesia.",
    "A face of Sri Mulyani Indrawati, a female Minister of Finance (2024-2029) from Indonesia.",
    "A face of Erick Thohir, a male Minister of State Owned Entreprises (2024-2029) from Indonesia.",
    "A face of Agus Harimurti Yudhoyono, a male Coordinating Minister of Agrarian Affairs and Spatial Planning (2024-2029) and Chairman of Democratic Party from Indonesia.",
    "A face of Muhaimin Iskandar, a male Coordinating Minister of Social Empowrement (2024-2029) and Chairman of National Awakening Party from Indonesia.",
    "A face of Mahfud MD, a male Coordinating Minister of Political, Legal, and Security Affairs (2019-2024) from Indonesia.",
    "A face of Boediono, a male Vice President (2009-2014) from Indonesia.",
    "A face of Jusuf Kalla, a male Vice President (2004-2009) and Vice President (2014-2019) from Indonesia.",
    # Catch-all prompt for faces that match none of the people above
    "A face of Unknown Person, who is not a politician.",
]
print(face_prompts)

# Extract simplified labels from face prompts (e.g., "Soekarno" from "A face of Soekarno, ...")
labels = [p.replace("A face of ", "").split(",")[0] for p in face_prompts]

# Canonical label keys: every known person plus the catch-all class. Derived from
# person_list so the names are maintained in one place only.
label_list = person_list + ["Unknown Person"]

# Fail fast if the prompts and the name list drift apart: everything downstream
# (label_map, per-identity summary columns) relies on them being index-aligned.
if labels != label_list:
    raise ValueError(
        "face_prompts and person_list are out of sync: "
        f"labels parsed from prompts = {labels}, expected = {label_list}"
    )

# Create a mapping from extracted labels to their consistent keys
label_map = dict(zip(labels, label_list))

# Attribute vocabularies for age, gender and expression.
# Each *_keys list holds the short names used as summary columns; the matching
# *_prompts list holds the text prompt for each key, in the same order.
gender_keys = ["male", "female"]
gender_prompts = [f"a {g}" for g in gender_keys]

# Age prompts and keys do not map onto each other mechanically
# ("an elderly person" vs. "elderly"), so both are spelled out.
age_keys = ["teenager", "young adult", "middle-aged person", "late adult", "elderly"]
age_prompts = [
    "a teenager",
    "a young adult",
    "a middle-aged person",
    "a late adult",
    "an elderly person",
]

expression_keys = ["anger", "contempt", "disgust", "happiness", "fear", "sadness", "surprise", "neutral"]
# Every emotion uses the "showing <emotion>" phrasing except the final neutral class.
expression_prompts = [f"a person showing {emotion}" for emotion in expression_keys[:-1]]
expression_prompts.append("a person with a neutral expression")

# Rows of the cross-video summary table; one dict is appended per processed video folder.
final_summary = []

# Load YOLOv8 face detector and move it to the selected device
face_model = YOLO("yolov8l-face-lindevs.pt")
face_model.to(device)

# Only the identity prompts are active; the attribute prompts are left out for now.
all_prompts = face_prompts ## + age_prompts + gender_prompts + expression_prompts
all_labels = labels ## + age_prompts + gender_prompts + expression_prompts # This variable is not used after this point

# Encode the prompts once up front so each face only needs an image forward pass.
with torch.no_grad():
    text_tokens = tokenizer(all_prompts).to(device)
    text_features = model.encode_text(text_tokens)
    # Scale every prompt embedding to unit length
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)

# Helper function
def classify_with_clip(image_pil, min_confidence=0.6):
    """Classify a cropped face against the identity text prompts.

    The image is encoded with the CLIP image tower, compared with the
    pre-computed ``text_features`` and turned into a probability per prompt
    via a softmax over the (x100 scaled) similarities.

    Args:
        image_pil: PIL image of a single face crop.
        min_confidence: The best-scoring identity is reported only when its
            probability is strictly greater than this value; otherwise the
            result is "Unknown Person". Defaults to 0.6.

    Returns:
        A ``(predicted_identity, confidence)`` tuple. ``predicted_identity`` is
        an entry of ``labels`` or "Unknown Person"; ``confidence`` is the
        probability of the best-scoring prompt as a Python float. If anything
        fails while encoding the image, ``("Unknown Person", 0.0)`` is returned
        so that one bad crop does not abort the whole run.
    """
    try:
        image_pil = image_pil.convert("RGB")
        image_input = preprocess(image_pil).unsqueeze(0).to(device)
        with torch.no_grad():
            image_features = model.encode_image(image_input)
            image_features /= image_features.norm(dim=-1, keepdim=True)
            similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)[0].cpu().numpy()
    except Exception as e:
        print(f"[ERROR] classify_with_clip failed: {e}")
        # Same 2-tuple shape as the success path: callers unpack exactly two values.
        return "Unknown Person", 0.0

    # Only identity prompts are encoded at the moment, so this slice covers the whole
    # vector; it is kept so attribute prompts can be appended after the identities later.
    identity_sim = similarity[:len(labels)]

    identity_idx = int(np.argmax(identity_sim))
    confidence = float(identity_sim[identity_idx])

    predicted_identity = labels[identity_idx] if confidence > min_confidence else "Unknown Person"
    return predicted_identity, confidence

# Define parent folders
parent_input_dir = "C:/Users/yehte/Downloads/Ye Htet/Projects/TikTok/zero_shot_face_recognition/third_discussion/extracted_frames"
parent_output_dir = "C:/Users/yehte/Downloads/Ye Htet/Projects/TikTok/zero_shot_face_recognition/fourth_discussion/finetune_model_B_16_laion400m_e32_results"

# Process each video folder.
# For every folder of extracted frames this detects faces, classifies each face,
# writes a per-face CSV log, per-frame visualisations, the cropped faces and a text
# summary, and appends one row to final_summary.
# NOTE: only the first 10 directory entries are processed, in os.listdir order.
for video_folder_name in os.listdir(parent_input_dir)[:10]:
    input_folder = os.path.join(parent_input_dir, video_folder_name)
    if not os.path.isdir(input_folder):
        continue

    result_folder = os.path.join(parent_output_dir, video_folder_name)
    os.makedirs(result_folder, exist_ok=True)

    recognized_faces_dir = os.path.join(result_folder, "recognized_faces")
    os.makedirs(recognized_faces_dir, exist_ok=True)

    print(f"\n🚀 Processing video folder: {video_folder_name}")

    # Number of classified faces per predicted identity for this video.
    # (Age / gender / expression classification is currently disabled, so no
    # counters are kept for them; their summary columns below stay at 0.)
    identity_counter = Counter()

    csv_path = os.path.join(result_folder, "inference_log.csv")

    # perf_counter is monotonic, so the measured durations cannot be skewed by clock changes.
    start_time = time.perf_counter()

    # Collect all image files in the current video folder to determine its length
    image_files = sorted([f for f in os.listdir(input_folder) if f.lower().endswith(('.jpg', '.png'))])
    original_video_length_sec = len(image_files) # Assuming 1 frame per second

    # The context manager guarantees the log is flushed and closed even if a frame fails.
    with open(csv_path, mode='w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        # Header matches the three values written per face below.
        csv_writer.writerow(["frame_name", "face_identity", "inference_time_sec"])

        for filename in image_files:
            image_path = os.path.join(input_folder, filename)
            bgr_img = cv2.imread(image_path)
            if bgr_img is None:
                print(f"Warning: Could not read image {image_path}, skipping.")
                continue
            rgb_img = cv2.cvtColor(bgr_img, cv2.COLOR_BGR2RGB)
            draw_img = rgb_img.copy()

            results = face_model.predict(image_path, conf=0.8, verbose=False)
            boxes = results[0].boxes.xyxy.cpu().numpy().astype(int)

            if len(boxes) == 0:
                print(f"No faces in {filename}, skipping.")
                continue

            # Last "_"-separated token of the file name, used to name every output of this frame.
            frame_name = filename.split("_")[-1]
            frame_stem = frame_name.split(".")[0]

            # One panel for the annotated frame plus one per detected face.
            fig, axs = plt.subplots(1, len(boxes) + 1, figsize=(5 * (len(boxes) + 1), 5))

            for i, box in enumerate(boxes):
                face_ax = axs[i + 1]
                face_ax.axis("off")

                # Plain ints for OpenCV; clamp the top-left corner at 0 because a negative
                # index would make the slice wrap around to the other side of the image.
                x1, y1, x2, y2 = (int(v) for v in box)
                x1, y1 = max(x1, 0), max(y1, 0)

                face = rgb_img[y1:y2, x1:x2]
                if face.size == 0:
                    # Degenerate (zero-area) box after integer truncation: nothing to classify.
                    print(f"Warning: empty face crop in {filename} (box {i + 1}), skipping.")
                    continue
                face_pil = Image.fromarray(face).convert("RGB")

                face_start = time.perf_counter()
                # Only identity and confidence are used; slicing keeps this unpack valid
                # even if the helper returns additional attribute fields.
                predicted, confidence = classify_with_clip(face_pil)[:2]
                face_inference_time = time.perf_counter() - face_start

                identity_counter[predicted] += 1

                csv_writer.writerow([frame_name, predicted, f"{face_inference_time:.4f}"])

                face_ax.imshow(face)
                face_ax.set_title(f"{predicted} ({confidence:.2f})", fontsize=10)

                save_filename = f"{frame_stem}_face{i+1}_{label_map.get(predicted, 'Unknown Person').replace(' ', '_')}.jpg"
                save_path = os.path.join(recognized_faces_dir, save_filename)
                cv2.imwrite(save_path, cv2.cvtColor(np.array(face_pil), cv2.COLOR_RGB2BGR))

                cv2.rectangle(draw_img, (x1, y1), (x2, y2), (0, 255, 0), 3)

            axs[0].imshow(draw_img)
            axs[0].set_title("Original (bbox)")
            axs[0].axis("off")

            plt.tight_layout()
            save_vis_path = os.path.join(result_folder, frame_name)
            plt.savefig(save_vis_path, dpi=100)  # reduce dpi
            plt.close('all')  # ensure figure is removed from memory

    total_time_sec = time.perf_counter() - start_time

    # Save summary (TODO need to refine)
    summary_path = os.path.join(result_folder, "summary.txt")
    with open(summary_path, "w", encoding="utf-8") as f:
        f.write("Face Appearance Summary (Most Frequent Identity in Video Sequence)\n")
        f.write("=" * 65 + "\n\n")

        if identity_counter:
            sorted_identities = identity_counter.most_common()
            total_count = sum(identity_counter.values())

            for name, count in sorted_identities:
                percentage = (count / total_count) * 100
                f.write(f"{name:<25} : {count:>3} times ({percentage:5.1f}%)\n")

            f.write("\n" + "-" * 65 + "\n")
            f.write(f"🟩 Most Frequently Appearing: {sorted_identities[0][0]} ({sorted_identities[0][1]} times)\n")
        else:
            f.write("❗ No identifiable person detected in the video sequence.\n")

        f.write("\nTotal Inference Time: {:.2f} minutes\n".format(total_time_sec / 60))

    summary_row = {"video_folder": video_folder_name}
    summary_row["main_character"] = identity_counter.most_common(1)[0][0] if identity_counter else "None"

    # Initialize every summary column to 0 so all rows share the same schema; the
    # gender / age / expression columns stay 0 while those classifiers are disabled.
    all_summary_keys = label_list + gender_keys + age_keys + expression_keys
    summary_row.update({k: 0 for k in all_summary_keys})

    for name, count in identity_counter.items():
        k = label_map.get(name, "Unknown Person")
        summary_row[k] = count

    summary_row["original_video_length_sec"] = original_video_length_sec
    summary_row["inference_time_sec"] = f"{total_time_sec:.2f}"

    final_summary.append(summary_row)
    print(f"\n✅ Done with: {video_folder_name}")

# Save final summary: one row per processed video folder, written as a single CSV.
# Create the output root explicitly — it is otherwise only created as a side effect of
# processing a video folder, so with zero processed folders to_csv would fail on a
# missing directory.
os.makedirs(parent_output_dir, exist_ok=True)
final_summary_csv = os.path.join(parent_output_dir, "final_summary.csv")
df = pd.DataFrame(final_summary)
df.to_csv(final_summary_csv, index=False, encoding="utf-8")
print(f"\n📝 Final summary saved to: {final_summary_csv}")

Using device: cuda
['A face of Soekarno, a male First President (1945-1967) from Indonesia.', 'A face of Suharto, a male Second President (1967-1998) from Indonesia.', 'A face of Baharuddin Jusuf Habibie, a male Third President (1998-1999) from Indonesia.', 'A face of Abdurrahman Wahid, a male Fourth President (1999-2001) from Indonesia.', 'A face of Megawati Sukarnoputri, a female Fifth President (2001-2004) from Indonesia.', 'A face of Susilo Bambang Yudhoyono, a male Sixth President (2004-2014) from Indonesia.', 'A face of Joko Widodo, a male Seventh President (2014-2024) from Indonesia.', 'A face of Prabowo Subianto, a male Eight President (2024-Present) from Indonesia.', 'A face of Anies Rasyid Baswedan, a male Governor of Jakarta (2017-2022) and Presidential Candidate Election (2024) from Indonesia.', 'A face of Ganjar Pranowo, a male Governor of Central Java (2013-2023) and Presidential Candidate Election (2024) from Indonesia.', 'A face of Gibran Rakabuming Raka, a male Vice Pr