In [1]:
# %pip install torchvision scikit-image lpips pytorch-fid

Collecting scikit-image
  Downloading scikit_image-0.25.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Collecting lpips
  Downloading lpips-0.1.4-py3-none-any.whl.metadata (10 kB)
Collecting pytorch-fid
  Downloading pytorch_fid-0.3.0-py3-none-any.whl.metadata (5.3 kB)
Collecting scipy>=1.11.4 (from scikit-image)
  Downloading scipy-1.15.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting imageio!=2.35.0,>=2.33 (from scikit-image)
  Downloading imageio-2.37.0-py3-none-any.whl.metadata (5.2 kB)
Collecting tifffile>=2022.8.12 (from scikit-image)
  Downloading tifffile-2025.5.10-py3-none-any.whl.metadata (31 kB)
Collecting lazy-loader>=0.4 (from scikit-image)
  Downloading lazy_loader-0.4-py3-none-any.whl.metadata (7.6 kB)
Downloading scikit_image-0.25.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.8/14.8 MB[0m [31m99.8 MB/s[0m eta

In [1]:
import torch
# import clip
from PIL import Image
import torchvision.transforms as T
from transformers import CLIPProcessor, CLIPModel
import numpy as np

import os
import torch
import torchvision.transforms as T
from torchvision.io import read_image
from torchvision.utils import save_image
from torchvision.models import inception_v3
from torchvision import transforms as TF
from pytorch_fid import fid_score
from skimage.metrics import peak_signal_noise_ratio as compare_psnr
from skimage.metrics import structural_similarity as compare_ssim
import lpips
import numpy as np
from PIL import Image
from tqdm import tqdm
from pytorch_fid.fid_score import calculate_fid_given_paths


In [2]:
path = %pwd
path

'/home/jenny/MasaCtrl'

In [3]:
source_image_path = path + "/dataset/test_output/final_test_original.png"
edit_image_path = path + "/dataset/test_output/final_test_output.png"

In [19]:
source_dir_path = path + "/input"
edit_dir_path = path + "/output"
edit_dir_path

'/home/jenny/MasaCtrl/output'

## CLIP
The result is in [−1,1], but often normalized or scaled to [0,1] or even percentages in practical use.

In [5]:
# Load model and processor
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


### Indivudual pairs

In [6]:
def compute_image_prompt_clip_score(image: Image.Image, prompt: str):
    inputs = clip_processor(text=[prompt], images=image, return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        outputs = clip_model(**inputs)
        image_embeds = outputs.image_embeds  # (1, D)
        text_embeds = outputs.text_embeds    # (1, D)
        similarity = torch.nn.functional.cosine_similarity(image_embeds, text_embeds)
    return similarity.item()


In [7]:
def compute_image_image_clip_similarity(image1: Image.Image, image2: Image.Image):
    inputs1 = clip_processor(images=image1, return_tensors="pt").to(device)
    inputs2 = clip_processor(images=image2, return_tensors="pt").to(device)
    
    with torch.no_grad():
        image1_embed = clip_model.get_image_features(**inputs1)
        image2_embed = clip_model.get_image_features(**inputs2)
        similarity = torch.nn.functional.cosine_similarity(image1_embed, image2_embed)
    
    return similarity.item()


In [8]:
# Load images
image1 = Image.open(source_image_path).convert("RGB")
image2 = Image.open(edit_image_path).convert("RGB")

# Text prompt for the edited image
target_prompt = "a boy dancing outdoors"

# Image–Prompt CLIP Score
clip_score = compute_image_prompt_clip_score(image2, target_prompt)
print("CLIP (image-prompt):", clip_score)

# Image–Image CLIP Similarity
clip_image_sim = compute_image_image_clip_similarity(image1, image2)
print("CLIP (image-image):", clip_image_sim)


CLIP (image-prompt): 0.314338356256485
CLIP (image-image): 0.8859783411026001


### Directory Level

In [9]:
def compute_clip_image_text_dir(image_dir, prompt_dict):
    scores = []

    for fname in tqdm(sorted(os.listdir(image_dir)), desc="CLIP image-text"):
        if fname not in prompt_dict:
            continue

        img_path = os.path.join(image_dir, fname)
        image = Image.open(img_path).convert("RGB")
        text = prompt_dict[fname]

        inputs = clip_processor(images=image, text=text, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = clip_model(**inputs)
            sim = torch.nn.functional.cosine_similarity(outputs.image_embeds, outputs.text_embeds).item()
            scores.append(sim)

    return sum(scores) / len(scores) if scores else 0.0


In [10]:
def compute_clip_image_image_dir(source_dir, edited_dir):
    scores = []

    valid_exts = ('.png', '.jpg', '.jpeg')

    source_files = filenames = sorted([
        f for f in os.listdir(source_dir)
        if f.lower().endswith(valid_exts) and os.path.isfile(os.path.join(source_dir, f))
    ])
    
    edited_files = sorted([
        f for f in os.listdir(source_dir)
        if f.lower().endswith(valid_exts) and os.path.isfile(os.path.join(edited_dir, f))
    ])

    for fname_src, fname_edit in zip(source_files, edited_files):
        path1 = os.path.join(source_dir, fname_src)
        path2 = os.path.join(edited_dir, fname_edit)

        if not os.path.exists(path1) or not os.path.exists(path2):
            continue

        image1 = Image.open(path1).convert("RGB")
        image2 = Image.open(path2).convert("RGB")

        inputs1 = clip_processor(images=image1, return_tensors="pt").to(device)
        inputs2 = clip_processor(images=image2, return_tensors="pt").to(device)

        with torch.no_grad():
            feat1 = clip_model.get_image_features(**inputs1)
            feat2 = clip_model.get_image_features(**inputs2)
            sim = torch.nn.functional.cosine_similarity(feat1, feat2).item()
            scores.append(sim)

    return sum(scores) / len(scores) if scores else 0.0


In [11]:
# Path to folders
source_dir = source_dir_path
edit_dir = edit_dir_path 

# Image-Prompt dict (only needed for image-text CLIP score)
prompt_dict = {
    "img1.png": "a boy standing",
    "img2.png": "a boy dancing",
    # ... (one entry per image in edit_dir)
}

# Compute scores
clip_text_score = compute_clip_image_text_dir(edit_dir, prompt_dict)
clip_image_score = compute_clip_image_image_dir(source_dir, edit_dir)

print(f"Average CLIP (image-text): {clip_text_score:.4f}")
print(f"Average CLIP (image-image): {clip_image_score:.4f}")


CLIP image-text: 100%|█████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 220752.84it/s]

Average CLIP (image-text): 0.0000
Average CLIP (image-image): 1.0000





## PSNR, LPIPS, SSIM

In [12]:
def load_image(path, size=(256, 256), as_tensor=True):
    img = Image.open(path).convert("RGB").resize(size, Image.BICUBIC)
    if as_tensor:
        return T.ToTensor()(img)
    return img

### Individual pairs

In [13]:
def compute_metrics_image(image1_path, image2_path, image_size=(256, 256)):
    lpips_model = lpips.LPIPS(net='alex').cuda().eval()

    img1 = load_image(image1_path, size=image_size).unsqueeze(0).cuda()
    img2 = load_image(image2_path, size=image_size).unsqueeze(0).cuda()

    # PSNR / SSIM
    np1 = img1.squeeze().permute(1, 2, 0).cpu().numpy()
    np2 = img2.squeeze().permute(1, 2, 0).cpu().numpy()

    psnr = compare_psnr(np1, np2, data_range=1.0)
    ssim = compare_ssim(np1, np2, multichannel=True, data_range=1.0, win_size=3) # default: win_size=7
    lpips_val = lpips_model(img1, img2).item()

    return {
        "PSNR": psnr,
        "SSIM": ssim,
        "LPIPS": lpips_val
    }


In [14]:
metrics = compute_metrics_image(source_image_path, edit_image_path)
print("Image-level PSNR:", metrics["PSNR"])
print("Image-level SSIM:", metrics["SSIM"])
print("Image-level LPIPS:", metrics["LPIPS"])


Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off]




Loading model from: /home/jenny/anaconda3/envs/clean310torch/lib/python3.10/site-packages/lpips/weights/v0.1/alex.pth
Image-level PSNR: 13.168018447839662
Image-level SSIM: 0.5579255978456787
Image-level LPIPS: 0.505652666091919


### Directory level

In [15]:
def compute_metrics_folder(source_dir, edit_dir, image_size=(256, 256)):
    lpips_model = lpips.LPIPS(net='alex').cuda().eval()

    psnr_list, ssim_list, lpips_list = [], [], []

    # filenames = sorted(os.listdir(source_dir))
    valid_exts = ('.png', '.jpg', '.jpeg')
    filenames = sorted([
        f for f in os.listdir(source_dir)
        if f.lower().endswith(valid_exts) and os.path.isfile(os.path.join(source_dir, f))
    ])
    for fname in tqdm(filenames, desc="Computing PSNR/SSIM/LPIPS"):
        path1 = os.path.join(source_dir, fname)
        path2 = os.path.join(edit_dir, fname)
        if not os.path.exists(path2):
            continue

        img1 = load_image(path1, size=image_size).unsqueeze(0).cuda()
        img2 = load_image(path2, size=image_size).unsqueeze(0).cuda()

        np1 = img1.squeeze().permute(1, 2, 0).cpu().numpy()
        np2 = img2.squeeze().permute(1, 2, 0).cpu().numpy()

        psnr = compare_psnr(np1, np2, data_range=1.0)
        ssim = compare_ssim(np1, np2, multichannel=True, data_range=1.0, win_size=3) # default: win_size=7
        lpips_val = lpips_model(img1, img2).item()

        psnr_list.append(psnr)
        ssim_list.append(ssim)
        lpips_list.append(lpips_val)

    return {
        "PSNR": np.mean(psnr_list),
        "SSIM": np.mean(ssim_list),
        "LPIPS": np.mean(lpips_list)
    }


In [16]:
metrics = compute_metrics_folder(source_dir_path, edit_dir_path)
print("Folder-level PSNR:", metrics["PSNR"])
print("Folder-level SSIM:", metrics["SSIM"])
print("Folder-level LPIPS:", metrics["LPIPS"])


Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off]
Loading model from: /home/jenny/anaconda3/envs/clean310torch/lib/python3.10/site-packages/lpips/weights/v0.1/alex.pth


  return 10 * np.log10((data_range**2) / err)
Computing PSNR/SSIM/LPIPS: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 33.35it/s]

Folder-level PSNR: inf
Folder-level SSIM: 1.0
Folder-level LPIPS: 0.0





## FID (Directory level only)

In [17]:
def compute_fid(source_dir, edit_dir, batch_size=4, device="cuda"):
    import os
    import numpy as np
    from torchvision import transforms as TF
    from pytorch_fid import fid_score

    # Monkey-patch ImagePathDataset correctly
    def patched_init(self, files, transforms=None):  # Accept `transforms`
        self.files = files
        self.transforms = TF.Compose([
            TF.Resize((299, 299)),
            TF.CenterCrop(299),
            TF.ToTensor()
        ])

    fid_score.ImagePathDataset.__init__ = patched_init

    # Validate image folder contents
    def has_valid_images(folder):
        return any(f.lower().endswith((".png", ".jpg", ".jpeg")) for f in os.listdir(folder))

    if not has_valid_images(source_dir) or not has_valid_images(edit_dir):
        raise ValueError("One or both directories are empty or lack valid image files.")

    # Compute FID
    return fid_score.calculate_fid_given_paths(
        paths=[source_dir, edit_dir],
        batch_size=batch_size,
        device=device,
        dims=2048,
        num_workers=0  # safer for Jupyter
    )


In [18]:
fid = compute_fid(source_dir_path, edit_dir_path)
print(f"FID: {fid:.4f}")


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 14.29it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 17.19it/s]


FID: -0.0002
