# Auditing Vision Models using Language

AI is increasingly used in a wide range of social contexts. As a result, it is important to understand the decisions that these models make and how they affect people. In this notebook, we will explore how to audit the decisions of a vision model using language. We will use the [DRML](https://arxiv.org/abs/1905.13677) method to audit the decisions of a vision model. We will use the [CIFAR-10](https://www.cs.toronto.edu/~kriz/cifar.html) dataset to train a vision model and the [IMDB](https://www.imdb.com/interfaces/) dataset to train a language model. We will then use the language model to audit the decisions of the vision model. (Note: the code cells below actually operate on a sampled CelebA dataset, train a linear classifier on CLIP `ViT-B/32` image features, and probe that classifier with CLIP text features.)

## Download software and sampled CelebA dataset

In [None]:
# Download our software and install dependencies
!git clone https://github.com/yuhui-zh15/model_audit.git
!cd model_audit
!pip install -r requirements.txt

# Download the CelebA dataset and unzip it
!wget http://cs.stanford.edu/~yuhuiz/assets/manuscripts/celeba_sampled.zip > /dev/null
!unzip celeba_sampled.zip > /dev/null

In [1]:
from typing import Optional
import json
import random
import os
from collections import defaultdict

from tqdm import tqdm, trange
import clip
from clip.model import CLIP
import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

from datasets import ImageDataset, TextDataset, create_dataloader
from trainer import extract_features
from models import Linear

# Root directory of the sampled CelebA data; attributes.jsonl and the
# per-record image paths are resolved relative to it.
base_path = "./celeba_sampled"

## Load CelebA dataset and extract features

In [2]:
# Load one JSON record per line. The context manager closes the file promptly
# instead of leaving the handle open until garbage collection; blank lines
# (e.g. a trailing newline) are skipped rather than crashing json.loads.
with open(f"{base_path}/attributes.jsonl") as attributes_file:
    data = [json.loads(line) for line in attributes_file if line.strip()]

for item in data:
    # Image paths stored in the file are relative to the dataset root.
    item["image"] = f"{base_path}/{item['image']}"
    # Binary target: 1 when Wearing_Lipstick == 1, otherwise 0.
    item["label"] = 1 if item["attributes"]["Wearing_Lipstick"] == 1 else 0

# Labels as a tensor, plus the positions in `data` belonging to each split.
image_labels = torch.tensor([item["label"] for item in data])
train_idxs = [
    i for i, item in enumerate(data) if item["attributes"]["split"] == "train"
]
val_idxs = [i for i, item in enumerate(data) if item["attributes"]["split"] == "val"]

In [3]:
# Show the last record explicitly instead of relying on a loop variable
# that happens to be left over from another cell.
data[-1]

{'image': './celeba_sampled/images/003489.jpg',
 'attributes': {'5_o_Clock_Shadow': -1,
  'Arched_Eyebrows': 1,
  'Attractive': 1,
  'Bags_Under_Eyes': -1,
  'Bald': -1,
  'Bangs': -1,
  'Big_Lips': 1,
  'Big_Nose': -1,
  'Black_Hair': -1,
  'Blond_Hair': 1,
  'Blurry': -1,
  'Brown_Hair': -1,
  'Bushy_Eyebrows': -1,
  'Chubby': -1,
  'Double_Chin': -1,
  'Eyeglasses': -1,
  'Goatee': -1,
  'Gray_Hair': -1,
  'Heavy_Makeup': 1,
  'High_Cheekbones': 1,
  'Male': -1,
  'Mouth_Slightly_Open': 1,
  'Mustache': -1,
  'Narrow_Eyes': -1,
  'No_Beard': 1,
  'Oval_Face': 1,
  'Pale_Skin': -1,
  'Pointy_Nose': 1,
  'Receding_Hairline': -1,
  'Rosy_Cheeks': -1,
  'Sideburns': -1,
  'Smiling': 1,
  'Straight_Hair': -1,
  'Wavy_Hair': 1,
  'Wearing_Earrings': -1,
  'Wearing_Hat': -1,
  'Wearing_Lipstick': 1,
  'Wearing_Necklace': 1,
  'Wearing_Necktie': -1,
  'Young': 1,
  'split': 'val'},
 'label': 1}

In [4]:
# Tally how many images fall into each (Male, Wearing_Lipstick) combination.
group_keys = [
    (record["attributes"]["Male"], record["attributes"]["Wearing_Lipstick"])
    for record in data
]
counts = defaultdict(int)
for group_key in group_keys:
    counts[group_key] += 1

counts

defaultdict(int, {(1, -1): 2059, (-1, 1): 2370, (-1, -1): 560, (1, 1): 11})

## Extract image features from the dataset

In [5]:
# Load CLIP together with its image preprocessing transform, then cast the
# model to float via nn.Module.float().
model_name = "ViT-B/32"
clip_model, transform = clip.load(name=model_name, device="cuda")
clip_model = clip_model.float()

# Batch the records without shuffling -- presumably so that the rows of the
# extracted features line up with positions in `data` (the split index lists
# are later used to index the features).
image_dataset = ImageDataset(data)
image_loader_options = dict(
    modality="image",
    transform=transform,
    shuffle=False,
    batch_size=1024,
    num_workers=16,
)
image_dataloader = create_dataloader(dataset=image_dataset, **image_loader_options)

# Run the CLIP model over every batch to obtain the image features.
image_features = extract_features(
    dataloader=image_dataloader,
    clip_model=clip_model,
    modality="image",
    verbose=True,
)

Extracting features for image: 100%|██████████| 5/5 [00:12<00:00,  2.41s/it]


## Train visual classifier on the extracted features

In [6]:
def train_one_epoch(
    dataloader: torch.utils.data.DataLoader,
    model: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    device: str = "cuda",
):
    """Run one optimisation pass over every batch of ``dataloader``.

    Args:
        dataloader: Yields ``(x, y)`` batches of inputs and targets.
        model: Module mapping ``x`` to logits; it is put into train mode.
        optimizer: Optimizer stepped once per batch.
        device: Device each batch is moved to before the forward pass.

    Returns:
        None. ``model`` is updated in place through ``optimizer``.
    """
    model.train()
    for inputs, targets in dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        # Clear gradients left over from the previous step before backprop.
        optimizer.zero_grad()
        batch_loss = F.cross_entropy(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()


def evaluate(
    dataloader: torch.utils.data.DataLoader,
    model: torch.nn.Module,
    device: str = "cuda",
) -> dict:
    """Compute accuracy and mean cross-entropy loss of ``model`` on ``dataloader``.

    Args:
        dataloader: Yields ``(x, y)`` batches; ``x`` is fed to ``model`` and
            ``y`` holds the targets passed to ``F.cross_entropy``.
        model: Module producing per-class logits; it is put into eval mode.
        device: Device each batch is moved to before the forward pass.

    Returns:
        Dict with ``"acc"`` (fraction of argmax predictions equal to the
        labels), ``"loss"`` (cross-entropy averaged over samples), and
        ``"preds"`` / ``"labels"`` (NumPy arrays in dataloader order).
        ``"acc"`` and ``"loss"`` are NaN when the dataloader yields no samples.
    """
    model.eval()
    total_loss = 0.0
    preds, labels = [], []
    with torch.no_grad():
        for x, y in dataloader:
            x, y = x.to(device), y.to(device)
            logits = model(x)
            # Accumulate the summed loss so that a smaller final batch is not
            # over-weighted, as it would be when averaging per-batch means.
            total_loss += F.cross_entropy(logits, y, reduction="sum").item()

            preds.extend(logits.argmax(-1).cpu().tolist())
            labels.extend(y.cpu().tolist())

    preds_np, labels_np = np.array(preds), np.array(labels)
    n_samples = len(labels_np)
    if n_samples == 0:
        # Nothing to average over; report NaN explicitly rather than relying
        # on NumPy's empty-mean warning.
        acc, mean_loss = float("nan"), float("nan")
    else:
        acc = np.mean(preds_np == labels_np)
        mean_loss = total_loss / n_samples
    return {
        "acc": acc,
        "loss": mean_loss,
        "preds": preds_np,
        "labels": labels_np,
    }


def train_image_model(
    features: torch.Tensor,
    labels: torch.Tensor,
    train_idxs: list,
    val_idxs: list,
    n_epochs: int = 25,
    batch_size: int = 32,
    lr: float = 1e-3,
    device: str = "cuda",
) -> torch.nn.Module:
    """Train a ``Linear`` classifier on precomputed features.

    Args:
        features: Feature tensor with one row per example; its second
            dimension is used as the classifier's input size.
        labels: Class labels aligned with ``features``; the number of classes
            is taken to be ``labels.max() + 1``.
        train_idxs: Row indices used for training.
        val_idxs: Row indices used for validation.
        n_epochs: Number of passes over the training rows.
        batch_size: Batch size for both dataloaders.
        lr: Adam learning rate.
        device: Device the classifier is placed on and batches are moved to.

    Returns:
        The trained model. Train/validation loss and accuracy are printed
        after every epoch.

    NOTE(review): features are normalized here with ``F.normalize`` before
    training, while other cells of this notebook pass raw image/text features
    to the returned model -- confirm that ``Linear`` normalizes its input
    internally, or normalize at those call sites too.
    """
    assert len(features) == len(
        labels
    ), "Features and labels should have the same length."
    features = F.normalize(features)

    train_dataset = TensorDataset(features[train_idxs], labels[train_idxs])
    val_dataset = TensorDataset(features[val_idxs], labels[val_idxs])
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    d_model = features.shape[1]
    n_classes = int(labels.max().item() + 1)
    # Place the model on the requested device instead of a hard-coded .cuda().
    model = Linear(d_model, n_classes).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch_idx in range(n_epochs):
        train_one_epoch(train_dataloader, model, opt, device=device)
        metrics_train = evaluate(train_dataloader, model, device=device)
        metrics_val = evaluate(val_dataloader, model, device=device)
        print(
            f"Epoch {epoch_idx}: train_loss = {metrics_train['loss']:.4f}, train_acc = {metrics_train['acc']:.4f}, val_loss = {metrics_val['loss']:.4f}, val_acc = {metrics_val['acc']:.4f}"
        )

    return model


# Fit the classifier on the extracted image features, then persist its weights.
model = train_image_model(
    features=image_features,
    labels=image_labels,
    train_idxs=train_idxs,
    val_idxs=val_idxs,
)
torch.save(model.state_dict(), "model.pt")

Epoch 0: train_loss = 0.5273, train_acc = 0.9155, val_loss = 0.5287, val_acc = 0.9220
Epoch 1: train_loss = 0.4253, train_acc = 0.9160, val_loss = 0.4268, val_acc = 0.9220
Epoch 2: train_loss = 0.3608, train_acc = 0.9180, val_loss = 0.3624, val_acc = 0.9230
Epoch 3: train_loss = 0.3189, train_acc = 0.9185, val_loss = 0.3202, val_acc = 0.9240
Epoch 4: train_loss = 0.2898, train_acc = 0.9203, val_loss = 0.2909, val_acc = 0.9260
Epoch 5: train_loss = 0.2689, train_acc = 0.9207, val_loss = 0.2698, val_acc = 0.9270
Epoch 6: train_loss = 0.2533, train_acc = 0.9210, val_loss = 0.2541, val_acc = 0.9280
Epoch 7: train_loss = 0.2413, train_acc = 0.9225, val_loss = 0.2421, val_acc = 0.9270
Epoch 8: train_loss = 0.2317, train_acc = 0.9230, val_loss = 0.2325, val_acc = 0.9260
Epoch 9: train_loss = 0.2240, train_acc = 0.9233, val_loss = 0.2250, val_acc = 0.9290
Epoch 10: train_loss = 0.2176, train_acc = 0.9245, val_loss = 0.2187, val_acc = 0.9280
Epoch 11: train_loss = 0.2121, train_acc = 0.9250, va

## Diagnose the visual classifier using language

### Discovering error slices

In [7]:
def evaluate_language(
    text_input: str,
    clip_model: CLIP,
    model: torch.nn.Module,
    device: str = "cuda",
) -> list:
    """Feed a text prompt through CLIP's text encoder into the classifier.

    Args:
        text_input: The prompt to classify.
        clip_model: CLIP model whose ``encode_text`` produces the features.
        model: Classifier applied to the text features; put into eval mode.
        device: Device the tokenized prompt is moved to.

    Returns:
        A list of class probabilities (softmax over the classifier's logits)
        for the single prompt.
    """
    model.eval()

    tokenized_texts = clip.tokenize([text_input]).to(device)
    with torch.no_grad():
        text_features = clip_model.encode_text(tokenized_texts)
        text_logits = model(text_features)
        # squeeze(0) drops the batch dimension of the one-prompt batch.
        text_preds = text_logits.softmax(-1).cpu().squeeze(0).tolist()

    return text_preds


genders = ["man", "woman"]
lipsticks = ["with lipstick", "without lipstick"]

# Probe every gender/lipstick combination and keep the probability the
# classifier assigns to class 1 for each prompt.
probs = []
prompt_parts = ((who, what) for who in genders for what in lipsticks)
for gender, lipstick in prompt_parts:
    text_input = f"a {gender} {lipstick}."
    text_pred = evaluate_language(text_input, clip_model, model)
    print(text_input, text_pred)
    probs.append(text_pred[1])

a man with lipstick. [1.0, 2.5855320107126545e-09]
a man without lipstick. [1.0, 4.570902337186489e-11]
a woman with lipstick. [1.5590202337989467e-06, 0.9999984502792358]
a woman without lipstick. [0.0017276920843869448, 0.9982722997665405]


In [8]:
def get_group_acc(test_idxs):
    """Return the classifier's accuracy on the rows selected by ``test_idxs``.

    Reads the notebook-level ``image_features``, ``image_labels`` and
    ``model`` and delegates the scoring to ``evaluate``.
    """
    group_dataset = TensorDataset(image_features[test_idxs], image_labels[test_idxs])
    group_loader = DataLoader(group_dataset, batch_size=32, shuffle=False)
    return evaluate(group_loader, model)["acc"]


# Rows with Male == -1 and Wearing_Lipstick == -1 ("woman without lipstick").
test_idxs = [
    idx
    for idx, record in enumerate(data)
    if record["attributes"]["Male"] == -1
    and record["attributes"]["Wearing_Lipstick"] == -1
]
# Rows with Male == 1 and Wearing_Lipstick == 1 ("man with lipstick").
man_with_lipstick_idxs = [
    idx
    for idx, record in enumerate(data)
    if record["attributes"]["Male"] == 1
    and record["attributes"]["Wearing_Lipstick"] == 1
]
all_idxs = list(range(len(data)))

print("Overall accuracy:", get_group_acc(all_idxs))
print("Accuracy of man with lipstick:", get_group_acc(man_with_lipstick_idxs))
print("Accuracy of woman without lipstick:", get_group_acc(test_idxs))

Overall accuracy: 0.922
Accuracy of man with lipstick: 0.0
Accuracy of woman without lipstick: 0.3678571428571429


### Identifying influential attributes

In [9]:
genders = ["woman", "person"]
lipsticks = ["with lipstick", "without lipstick"]

# Class-1 probability for each prompt, in the order:
# woman/with, woman/without, person/with, person/without.
probs = []
for gender, lipstick in ((who, what) for who in genders for what in lipsticks):
    text_input = f"a {gender} {lipstick}."
    text_pred = evaluate_language(text_input, clip_model, model)
    print(text_input, text_pred)
    probs.append(text_pred[1])

# Average shift in class-1 probability when "person" is replaced by "woman",
# taken over the two lipstick phrasings.
woman_with, woman_without, person_with, person_without = probs
influence = ((woman_with - person_with) + (woman_without - person_without)) / 2
print(f'The influence of "woman" to "lipstick" = {influence}')

a woman with lipstick. [1.5590202337989467e-06, 0.9999984502792358]
a woman without lipstick. [0.0017276920843869448, 0.9982722997665405]
a person with lipstick. [0.09957513213157654, 0.9004248380661011]
a person without lipstick. [0.8847416639328003, 0.1152583435177803]
The influence of "woman" to "lipstick" = 0.4912937842309475


## Rectify the visual classifier using language

In [10]:
def train_one_epoch(
    dataloader: torch.utils.data.DataLoader,
    model: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    device: str = "cuda",
):
    """Perform a single training sweep over ``dataloader``.

    This cell re-declares a function of the same name that the
    classifier-training cell also defines.

    Args:
        dataloader: Yields ``(x, y)`` batches of inputs and targets.
        model: Module mapping ``x`` to logits; it is put into train mode.
        optimizer: Optimizer stepped once per batch.
        device: Device each batch is moved to before the forward pass.

    Returns:
        None. ``model`` is updated in place through ``optimizer``.
    """
    model.train()
    for features_batch, labels_batch in dataloader:
        features_batch = features_batch.to(device)
        labels_batch = labels_batch.to(device)
        # Reset accumulated gradients before computing this batch's gradients.
        optimizer.zero_grad()
        step_loss = F.cross_entropy(model(features_batch), labels_batch)
        step_loss.backward()
        optimizer.step()


# Each prompt is paired with its target: 1 for "with lipstick", 0 for
# "without lipstick", regardless of the gender word.
prompt_label_pairs = [
    ("a man with lipstick.", 1),
    ("a man without lipstick.", 0),
    ("a woman with lipstick.", 1),
    ("a woman without lipstick.", 0),
]
text_inputs = [prompt for prompt, _ in prompt_label_pairs]
text_labels = torch.tensor([label for _, label in prompt_label_pairs])

# Encode the prompts with CLIP's text encoder (no gradients needed).
tokenized_texts = clip.tokenize(text_inputs).to("cuda")
with torch.no_grad():
    text_features = clip_model.encode_text(tokenized_texts)
text_dataset = TensorDataset(text_features, text_labels)
text_dataloader = DataLoader(text_dataset, batch_size=32, shuffle=True)

# Keep training the existing classifier in place on the text features only.
# Re-running this cell trains `model` further rather than starting over.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
for epoch in trange(1000):
    train_one_epoch(text_dataloader, model, optimizer)

100%|██████████| 1000/1000 [00:00<00:00, 1238.84it/s]


In [11]:
def get_group_acc(test_idxs):
    """Score the current ``model`` on the rows picked out by ``test_idxs``.

    Uses the notebook-level ``image_features`` and ``image_labels`` and
    returns the ``"acc"`` entry reported by ``evaluate``.
    """
    subset = TensorDataset(image_features[test_idxs], image_labels[test_idxs])
    subset_loader = DataLoader(subset, batch_size=32, shuffle=False)
    return evaluate(subset_loader, model)["acc"]


# Rows with Male == -1 and Wearing_Lipstick == -1 ("woman without lipstick").
test_idxs = [
    idx
    for idx, record in enumerate(data)
    if record["attributes"]["Male"] == -1
    and record["attributes"]["Wearing_Lipstick"] == -1
]
# Rows with Male == 1 and Wearing_Lipstick == 1 ("man with lipstick").
man_with_lipstick_idxs = [
    idx
    for idx, record in enumerate(data)
    if record["attributes"]["Male"] == 1
    and record["attributes"]["Wearing_Lipstick"] == 1
]
all_idxs = list(range(len(data)))

# Same three groups as in the diagnosis step, scored with the current model.
print("Overall accuracy:", get_group_acc(all_idxs))
print("Accuracy of man with lipstick:", get_group_acc(man_with_lipstick_idxs))
print("Accuracy of woman without lipstick:", get_group_acc(test_idxs))

Overall accuracy: 0.9274
Accuracy of man with lipstick: 0.18181818181818182
Accuracy of woman without lipstick: 0.4589285714285714
