## Generate images for evaluation

In [1]:
# !pip install opencv-python transformers accelerate insightface
import diffusers
from diffusers.utils import load_image
from diffusers.models import ControlNetModel

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import json
import cv2
import torch
import numpy as np
from PIL import Image

from insightface.app import FaceAnalysis
from pipeline_stable_diffusion_xl_instantid import StableDiffusionXLInstantIDPipeline, draw_kps

# Face detector / embedder. The 'antelopev2' model pack must already be present under
# ./models/antelopev2 (FaceAnalysis is given root='./' and looks the pack up by name).
app = FaceAnalysis(name='antelopev2', root='./', providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
app.prepare(ctx_id=0, det_size=(640, 640))

# InstantID weights are expected under ./checkpoints
face_adapter = './checkpoints/ip-adapter.bin'
controlnet_path = './checkpoints/ControlNetModel'

# load IdentityNet (the ControlNet that is conditioned on the drawn face keypoints)
controlnet = ControlNetModel.from_pretrained(controlnet_path, torch_dtype=torch.float16)

base_model = 'wangqixun/YamerMIX_v8'  # from https://civitai.com/models/84040?modelVersionId=196039
pipe = StableDiffusionXLInstantIDPipeline.from_pretrained(
    base_model,
    controlnet=controlnet,
    torch_dtype=torch.float16
)
# Move the pipeline to the GPU before the adapter is attached (same order as the original cell).
pipe.cuda()

# load adapter (consumes the face embedding passed to the pipeline as `image_embeds`)
pipe.load_ip_adapter_instantid(face_adapter)

  from .autonotebook import tqdm as notebook_tqdm


Applied providers: ['CUDAExecutionProvider', 'CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}, 'CUDAExecutionProvider': {'use_tf32': '1', 'prefer_nhwc': '0', 'tunable_op_max_tuning_duration_ms': '0', 'enable_skip_layer_norm_strict_mode': '0', 'tunable_op_tuning_enable': '0', 'tunable_op_enable': '0', 'use_ep_level_unified_stream': '0', 'device_id': '0', 'has_user_compute_stream': '0', 'gpu_external_empty_cache': '0', 'cudnn_conv_algo_search': 'EXHAUSTIVE', 'cudnn_conv1d_pad_to_nc1d': '0', 'gpu_mem_limit': '18446744073709551615', 'gpu_external_alloc': '0', 'gpu_external_free': '0', 'arena_extend_strategy': 'kNextPowerOfTwo', 'do_copy_in_default_stream': '1', 'enable_cuda_graph': '0', 'user_compute_stream': '0', 'cudnn_conv_use_max_workspace': '1'}}
find model: ./models/antelopev2/1k3d68.onnx landmark_3d_68 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CUDAExecutionProvider', 'CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}, 'CUDAExecutionProvider

The config attributes {'controlnet_list': ['controlnet', 'RPMultiControlNetModel'], 'requires_aesthetics_score': False} were passed to StableDiffusionXLInstantIDPipeline, but are not expected and will be ignored. Please verify your model_index.json configuration file.
Keyword arguments {'controlnet_list': ['controlnet', 'RPMultiControlNetModel'], 'requires_aesthetics_score': False} are not expected by StableDiffusionXLInstantIDPipeline and will be ignored.
Loading pipeline components...: 100%|██████████| 7/7 [00:00<00:00,  7.24it/s]


In [2]:
def resize_img(input_image, max_side=768, min_side=512, size=None,
               pad_to_max_side=False, mode=Image.BILINEAR, base_pixel_number=64):
    """Resize a PIL image for the pipeline, optionally padding it onto a white square.

    Args:
        input_image: PIL image to resize.
        max_side: Target length of the longer side when ``size`` is not given; also the
            edge length of the square canvas used when ``pad_to_max_side`` is true.
        min_side: Target length of the shorter side for the intermediate rescale
            (only used when ``size`` is not given).
        size: Optional explicit ``(width, height)``; when given, no aspect-preserving
            computation is performed.
        pad_to_max_side: When true, centre the resized image on a white
            ``max_side x max_side`` 3-channel canvas.
        mode: Resampling filter forwarded to ``Image.resize``.
        base_pixel_number: Final width and height are floored to a multiple of this value
            (only when ``size`` is not given).

    Returns:
        The resized (and optionally padded) PIL image.
    """
    width, height = input_image.size

    if size is None:
        # Bring the short side to `min_side`, then rescale so the long side is `max_side`.
        # The second scale decides the final size; the first only affects rounding.
        short_scale = min_side / min(height, width)
        width = round(short_scale * width)
        height = round(short_scale * height)

        long_scale = max_side / max(height, width)
        scaled_w = round(long_scale * width)
        scaled_h = round(long_scale * height)
        input_image = input_image.resize([scaled_w, scaled_h], mode)

        # Snap both dimensions down to a multiple of `base_pixel_number`.
        target_w = (scaled_w // base_pixel_number) * base_pixel_number
        target_h = (scaled_h // base_pixel_number) * base_pixel_number
    else:
        target_w, target_h = size

    input_image = input_image.resize([target_w, target_h], mode)

    if not pad_to_max_side:
        return input_image

    # Paste the resized image into the centre of a white square canvas.
    canvas = np.full((max_side, max_side, 3), 255, dtype=np.uint8)
    left = (max_side - target_w) // 2
    top = (max_side - target_h) // 2
    canvas[top:top + target_h, left:left + target_w] = np.array(input_image)
    return Image.fromarray(canvas)

In [None]:
# Optional filters: leave a list empty to run on the whole dataset.
# Single Subject Generation
single_subject = []                            # e.g. ["<subject folder name>"]
# Single Prompt Generation
single_prompt = []                            # e.g. ["a {0} {1} near the pool"]

num_generation = 4
negative_prompt = "ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, vibrant, colorful"

output_path = "../../outputs/face/instantid"
dataset_path = "../../pcs_dataset/face"
dataset_info_path = "../../pcs_dataset/info.json"

# The dataset description is read once; it does not change between subjects.
with open(dataset_info_path, 'r') as file:
    data_info = json.load(file)["face"]

id_with_gender = data_info["id_with_gender"]

if len(single_subject):
    subjects = single_subject
else:
    # Only real subject folders, in a stable order (os.listdir order is arbitrary and
    # may contain stray files).
    subjects = sorted(
        name for name in os.listdir(dataset_path)
        if os.path.isdir(os.path.join(dataset_path, name))
    )

if len(single_prompt):
    prompts = single_prompt
else:
    prompts = data_info["prompt_accessory"] + data_info["prompt_context"] + data_info["prompt_action"] + data_info["prompt_style"]

for subject in subjects:

    # define the input ID image
    input_image_name = os.path.join(dataset_path, subject, "face.jpg")

    print(f"***** Subject: {subject} *****")

    # The reference face is the same for every prompt of a subject, so load it and
    # extract its embedding / keypoints once instead of once per prompt.
    face_image = load_image(input_image_name)
    face_image = resize_img(face_image)

    # prepare face emb
    detected_faces = app.get(cv2.cvtColor(np.array(face_image), cv2.COLOR_RGB2BGR))
    if not detected_faces:
        # Without a detected face there is no embedding to condition on.
        print(f"No face detected in {input_image_name}; skipping subject.")
        continue
    # only use the maximum face (largest bounding-box area)
    face_info = max(detected_faces, key=lambda x: (x['bbox'][2]-x['bbox'][0])*(x['bbox'][3]-x['bbox'][1]))
    face_emb = face_info['embedding']
    face_kps = draw_kps(face_image, face_info['kps'])

    os.makedirs(os.path.join(output_path, subject), exist_ok=True)

    for prompt in prompts:

        prompt = prompt.replace("{0} {1}", f"{id_with_gender[subject]} img") + ", high quality"
        print(f"**Prompt**: {prompt}")

        # generate image
        images = pipe(
            prompt,
            negative_prompt=negative_prompt,
            image_embeds=face_emb,
            image=face_kps,
            controlnet_conditioning_scale=0.8,
            ip_adapter_scale=0.8,
            num_images_per_prompt=num_generation,
            guidance_scale=7.5,
        ).images

        # The full prompt is used as the folder name; the evaluation cell reads the
        # prompt back from this folder name.
        save_path = os.path.join(output_path, subject, f"{prompt}")
        os.makedirs(save_path, exist_ok=True)

        for img_idx, img in enumerate(images):
            img.save(os.path.join(save_path, f"{img_idx:04d}.jpg"))
    

## Evaluation

### Calculate the similarity for each sample
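For every generated image, compute the identity similarity to the subject's reference face (`utils.face_eval`) and the text-to-image similarity to its prompt with the CLIP evaluator (`utils.clip_eval`).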

In [1]:
import os, sys, json

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

utils_path = os.path.abspath(os.path.join('../..'))
sys.path.append(utils_path)

import torch
import numpy as np
from PIL import Image
from utils.clip_eval import evaluate_t2i
from utils.face_eval import calculate_identity_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def convert_to_native(data):
    """Recursively replace NumPy values with plain Python equivalents.

    Conversions:
        * ``np.ndarray`` -> nested ``list`` (via ``tolist``)
        * NumPy scalar (``np.generic``) -> Python scalar (via ``item``)
        * ``dict`` -> new ``dict`` with converted values; NumPy-scalar keys are
          converted too, since ``json`` rejects keys such as ``np.int64``
        * ``list`` / ``tuple`` -> same container type with converted elements

    Anything else is returned unchanged.

    Args:
        data: Arbitrary (possibly nested) value.

    Returns:
        A structure with the same shape as ``data`` containing no NumPy values inside
        dicts, lists or tuples.
    """
    if isinstance(data, np.ndarray):
        return data.tolist()
    if isinstance(data, np.generic):
        return data.item()
    if isinstance(data, dict):
        return {
            (key.item() if isinstance(key, np.generic) else key): convert_to_native(value)
            for key, value in data.items()
        }
    if isinstance(data, list):
        return [convert_to_native(item) for item in data]
    if isinstance(data, tuple):
        # Tuples are serialised by json as arrays, but their elements still need converting.
        return tuple(convert_to_native(item) for item in data)
    return data
    
def add_evaluation(file_path, new_data):
    """Merge ``new_data`` into the JSON document stored at ``file_path``.

    If the file does not exist yet (or is zero bytes, e.g. left behind by an
    interrupted run), it is created with ``new_data`` as its content. Otherwise the
    stored object is loaded and updated with ``new_data`` (top-level keys in
    ``new_data`` overwrite existing ones).

    The result is written to a sibling ``<file_path>.tmp`` file first and then moved
    over ``file_path``, so an interruption during serialisation does not truncate
    results that were already saved.

    Args:
        file_path: Path of the JSON results file; its directory must already exist.
        new_data: Results to add; passed through ``convert_to_native`` before dumping.
            Must be a dict when the file already holds results.

    Raises:
        json.JSONDecodeError: If an existing, non-empty file is not valid JSON.
    """
    converted = convert_to_native(new_data)

    if os.path.exists(file_path) and os.path.getsize(file_path) > 0:
        with open(file_path, 'r') as file:
            merged = json.load(file)
        merged.update(converted)
    else:
        merged = converted

    tmp_path = f"{file_path}.tmp"
    with open(tmp_path, 'w') as file:
        json.dump(merged, file, indent=4)
    os.replace(tmp_path, file_path)

In [None]:
outputs_path = "../../outputs/face/instantid"
eval_res_path = "../../eval_results/face/instantid"
dataset_path = "../../pcs_dataset/face"
dataset_info_path = "../../pcs_dataset/info.json"

with open(dataset_info_path, 'r') as file:
    data_info = json.load(file)["face"]

id_with_gender = data_info["id_with_gender"]

os.makedirs(eval_res_path, exist_ok=True)
eval_res_file = os.path.join(eval_res_path, "evaluation_results.json")

# Built once instead of once per generated image.
device = torch.device("cuda")

# Suffix appended to every prompt at generation time; it is stripped again before the
# text-to-image comparison.
prompt_suffix = ", high quality"

# Only real sub-folders, in a stable order (os.listdir order is arbitrary and may
# contain stray files).
id_list = sorted(
    name for name in os.listdir(outputs_path)
    if os.path.isdir(os.path.join(outputs_path, name))
)

# `face_id` rather than `id`: a top-level loop variable named `id` would shadow the
# builtin for the rest of the kernel session.
for face_id in id_list:
    evaluation_res = dict()
    print(f"***** Face ID: {face_id} *****")

    res_for_each_subject = dict()

    ref_image = Image.open(os.path.join(dataset_path, face_id, "face.jpg")).convert("RGB")

    subject_dir = os.path.join(outputs_path, face_id)
    for prompt in sorted(os.listdir(subject_dir)):
        prompt_dir = os.path.join(subject_dir, prompt)
        if not os.path.isdir(prompt_dir):
            continue

        # Folder name == full generation prompt; drop only the trailing suffix.
        prompt_eval = prompt[:-len(prompt_suffix)] if prompt.endswith(prompt_suffix) else prompt
        print(f"**Prompt**: {prompt_eval}")

        res_for_each_prompt = dict()

        for generate_img_name in sorted(os.listdir(prompt_dir)):
            generate_img_path = os.path.join(prompt_dir, generate_img_name)

            text_similarity = evaluate_t2i(generate_img_path, prompt_eval)

            generated_image = Image.open(generate_img_path).convert("RGB")
            identity_similarity = float(calculate_identity_similarity(device=device, generated_image=generated_image, ref_image=ref_image))

            # Stored as [identity_similarity, text_similarity]; the averaging cell relies on this order.
            res_for_each_prompt[generate_img_name] = [identity_similarity, text_similarity]

        res_for_each_subject[prompt] = res_for_each_prompt
        print(res_for_each_prompt)

    evaluation_res[face_id] = res_for_each_subject

    # Persist after every subject so finished subjects are already saved if a later one fails.
    add_evaluation(eval_res_file, evaluation_res)

### Calculate the average similarity

In [4]:
import json

eval_res_path = "../../eval_results/face/instantid/evaluation_results.json"

with open(eval_res_path, "r") as f:
    data = json.load(f)

img_sim = 0.0
text_sim = 0.0
cnt = 0

# Layout: {subject: {prompt: {image_name: [identity_similarity, text_similarity]}}}
for prompts_of_subject in data.values():
    for samples_of_prompt in prompts_of_subject.values():
        for scores in samples_of_prompt.values():
            img_sim += scores[0]
            text_sim += scores[1]
            cnt += 1

if cnt == 0:
    # Fail with an explicit message instead of a bare ZeroDivisionError.
    raise ValueError(f"No evaluated samples found in {eval_res_path}")

print("Identity Similarity: ", img_sim/cnt, "\nText Similarity:", text_sim/cnt)

Identity Similarity:  0.7072043699429681 
Text Similarity: 0.2778454081217448
