## Importing necessary libraries

In [1]:
%pip install ftfy regex tqdm git+https://github.com/openai/CLIP.git torchvision

import torch
import os
import clip
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
from tqdm.notebook import tqdm
from PIL import Image
import numpy as np
from glob import glob
from sklearn.metrics.pairwise import cosine_similarity
import itertools
import urllib.request
import pandas as pd
from IPython.display import display, HTML

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /private/var/folders/_9/51_3yw1x3db244x7mjkk66h80000gn/T/pip-req-build-wakqic1m
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /private/var/folders/_9/51_3yw1x3db244x7mjkk66h80000gn/T/pip-req-build-wakqic1m
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Installing build dependencies ... [?done
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Users/vishwasparekh/Desktop/University of Southern California/CSCI-544/Assignments/HW2/myenv/bin/python -m pip install --upgrade pip[0m
Note: you may need to restart the 

## Creating the model

In [2]:
class BiasAwareAdversary(nn.Module):
    def __init__(self, input_size=512, hidden_size=256, num_classes=10):
        super().__init__()
        self.debias_head = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.GELU(),
            nn.Dropout(0.4),
            nn.Linear(hidden_size, num_classes)
        )
        
    def forward(self, x):
        return self.debias_head(x)

## Defining helper functions for training

In [3]:
clip_model, preprocess = clip.load("ViT-B/32", device=device)

def create_debias_dataset(data_path):
    return ImageFolder(
        root=data_path,
        transform=preprocess
    )

def entropy_loss(predictions):
    softmax = torch.nn.functional.softmax(predictions, dim=-1)
    log_softmax = torch.nn.functional.log_softmax(predictions, dim=-1)
    return torch.sum(softmax * log_softmax, dim=-1).mean()

def gradient_penalty(image_features, adversary_preds):
    gradients = torch.autograd.grad(
        outputs=adversary_preds.sum(),
        inputs=image_features,
        create_graph=True,
        retain_graph=True
    )[0]
    return torch.mean(gradients.pow(2).sum(dim=1))


## Training the model

In [4]:
def train_joint_debias(clip_model, adversary_model, train_loader, num_epochs=10):
    # Optimizers with differential learning rates
    clip_optim = torch.optim.AdamW(
        clip_model.parameters(), 
        lr=1e-6,  # Small LR for CLIP
        weight_decay=0.05
    )
    adv_optim = torch.optim.AdamW(
        adversary_model.parameters(),
        lr=1e-4,
        weight_decay=0.01
    )
    
    classification_loss = nn.CrossEntropyLoss()
    
    for epoch in range(num_epochs):
        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}")
        
        for images, labels in progress_bar:
            images, labels = images.to(device), labels.to(device)
            
            # Forward through CLIP (trainable)
            image_features = clip_model.encode_image(images)
            
            # Adversarial predictions
            adv_preds = adversary_model(image_features)
            
            # Loss components
            task_loss = classification_loss(adv_preds, labels)
            gp_loss = gradient_penalty(image_features, adv_preds)
            
            # Static feature debiasing
            with torch.no_grad():
                static_features = image_features.mean(dim=0, keepdim=True).repeat(images.size(0), 1)
            debias_loss = -entropy_loss(adversary_model(static_features))  # Maximize entropy
            
            # Combined loss (λ weights from paper)
            loss = task_loss + 0.7*debias_loss + 0.2*gp_loss
            
            # Backpropagation
            clip_optim.zero_grad()
            adv_optim.zero_grad()
            loss.backward()
            
            # Gradient clipping for CLIP stability
            torch.nn.utils.clip_grad_norm_(clip_model.parameters(), 1.0)
            
            clip_optim.step()
            adv_optim.step()
            
            # Update progress
            total_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())
        
        print(f"Epoch {epoch+1} | Avg Loss: {total_loss/len(train_loader):.4f}")

## Training setup

In [5]:
DATA_PATH = "Dataset"
batch_size = 64

# Create datasets and loader
train_dataset = create_debias_dataset(DATA_PATH)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
    pin_memory=True
)

# Initialize models
adversary_model = BiasAwareAdversary(
    num_classes=len(train_dataset.classes)
).to(device)

# Ensure CLIP is trainable
for param in clip_model.parameters():
    param.requires_grad = True

# Start joint training
train_joint_debias(
    clip_model=clip_model,
    adversary_model=adversary_model,
    train_loader=train_loader,
    num_epochs=10
)

Epoch 1:   0%|          | 0/56 [00:00<?, ?it/s]

Epoch 1 | Avg Loss: 2.4710


Epoch 2:   0%|          | 0/56 [00:00<?, ?it/s]

Epoch 2 | Avg Loss: 1.9116


Epoch 3:   0%|          | 0/56 [00:00<?, ?it/s]

Epoch 3 | Avg Loss: 1.3759


Epoch 4:   0%|          | 0/56 [00:00<?, ?it/s]

Epoch 4 | Avg Loss: 0.9172


Epoch 5:   0%|          | 0/56 [00:00<?, ?it/s]

Epoch 5 | Avg Loss: 0.6743


Epoch 6:   0%|          | 0/56 [00:00<?, ?it/s]

Epoch 6 | Avg Loss: 0.5172


Epoch 7:   0%|          | 0/56 [00:00<?, ?it/s]

Epoch 7 | Avg Loss: 0.4014


Epoch 8:   0%|          | 0/56 [00:00<?, ?it/s]

Epoch 8 | Avg Loss: 0.3251


Epoch 9:   0%|          | 0/56 [00:00<?, ?it/s]

Epoch 9 | Avg Loss: 0.2613


Epoch 10:   0%|          | 0/56 [00:00<?, ?it/s]

Epoch 10 | Avg Loss: 0.2292


## Loading necessary data and helper functions

In [6]:
os.makedirs("wordlists", exist_ok=True)
urllib.request.urlretrieve(
    "http://ptrckprry.com/course/ssd/data/positive-words.txt",
    "wordlists/positive-words.txt"
)
urllib.request.urlretrieve(
    "http://ptrckprry.com/course/ssd/data/negative-words.txt",
    "wordlists/negative-words.txt"
)

def load_images(folder, limit=None):
    paths = sorted(glob(os.path.join(folder, "*.jpg")))[:limit]
    tensors = []
    for p in paths:
        try:
            image = preprocess(Image.open(p).convert("RGB")).unsqueeze(0).to(device)
            tensors.append(image)
        except:
            continue
    if tensors:
        return torch.cat(tensors)
    return None

def embed_images(folder, limit=None):
    imgs = load_images(folder, limit)
    if imgs is None: return None
    with torch.no_grad():
        emb = clip_model.encode_image(imgs).float()
    return emb

def collect_all_embeddings(root_path, limit=None):
    target_groups = [
        "Religion", 
        "Nationality", 
        "Disability", 
        "Sexual Orientation", 
        "Valence Images"
    ]
    
    embeddings = {}
    for category in target_groups:
        cat_path = os.path.join(root_path, category)
        if not os.path.isdir(cat_path):
            continue
        embeddings[category] = {}
        for subgroup in os.listdir(cat_path):
            sub_path = os.path.join(cat_path, subgroup)
            if not os.path.isdir(sub_path):
                continue
            print(f"Embedding {category}/{subgroup}")
            emb = embed_images(sub_path, limit)
            if emb is not None:
                embeddings[category][subgroup] = emb
    return embeddings

## Loading words, embedding them and creating sentiment lookup

In [7]:
def load_words(filepath, prefix="This is"):
    with open(filepath, encoding='latin1') as f:
        lines = [w.strip() for w in f if w.strip() and not w.startswith(";")]
    clean = [line for line in lines if line.isascii() and line.isalpha()]
    return [f"{prefix} {w}." for w in clean]

def batch_tokenize_and_embed(text_list, batch_size=64):
    all_embeddings = []
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i:i+batch_size]
        tokens = clip.tokenize(batch).to(device)
        with torch.no_grad():
            emb = clip_model.encode_text(tokens).float()
        all_embeddings.append(emb.cpu())
    return torch.cat(all_embeddings)

# Load positive and negative words
pos_words = load_words("wordlists/positive-words.txt")
neg_words = load_words("wordlists/negative-words.txt")
all_words = pos_words + neg_words

sentiment_lookup = {}
for word in pos_words:
    sentiment_lookup[word] = "positive"
for word in neg_words:
    sentiment_lookup[word] = "negative"

print(f"Embedding {len(pos_words)} positive and {len(neg_words)} negative words...")
pos_emb = batch_tokenize_and_embed(pos_words)
neg_emb = batch_tokenize_and_embed(neg_words)
all_word_emb = torch.cat([pos_emb, neg_emb])
print("✅ Text embedding complete.")

Embedding 1904 positive and 4658 negative words...
✅ Text embedding complete.


## Functions for loading top attributes and computing pairwise bias

In [8]:
def top_attributes(emb, all_words, all_word_emb, top_k=15):
    avg_emb = emb.mean(dim=0, keepdim=True)
    sims = cosine_similarity(avg_emb.cpu().numpy(), all_word_emb.cpu().numpy())[0]
    indices = np.argsort(sims)[::-1][:top_k]
    return [(all_words[i], sims[i]) for i in indices]

def caliskan_score(X, Y, A, B):
    def s(w): return cosine_similarity(w.cpu(), A.cpu()).mean() - cosine_similarity(w.cpu(), B.cpu()).mean()
    s_X = torch.tensor([s(x.unsqueeze(0)) for x in X])
    s_Y = torch.tensor([s(y.unsqueeze(0)) for y in Y])
    return ((s_X.mean() - s_Y.mean()) / torch.std(torch.cat([s_X, s_Y]))).item()

def compute_pairwise_bias(image_embeddings, A, B, group_label):
    rows = []
    subgroups = list(image_embeddings.keys())

    for g1, g2 in itertools.combinations(subgroups, 2):
        emb1 = image_embeddings[g1]
        emb2 = image_embeddings[g2]
        score = caliskan_score(emb1, emb2, A, B)
        favored = g1 if score > 0 else g2 if score < 0 else "neutral"
        rows.append({
            "Group": group_label,
            "Subgroup_A": g1,
            "Subgroup_B": g2,
            "Bias_Score": score,
            "Favored_Group": favored
        })
    return rows


## Evaluating bias and storing results

In [9]:
word_attributes_bias_rows = []

embeddings = collect_all_embeddings("Dataset")

for group_name in ["Religion", "Nationality", "Disability", "Sexual Orientation"]:
    subgroup_embeddings = embeddings[group_name]
    
    rows_word_attributes = compute_pairwise_bias(subgroup_embeddings, pos_emb, neg_emb, group_name)
    word_attributes_bias_rows.extend(rows_word_attributes)

df_word_attributes = pd.DataFrame(word_attributes_bias_rows)
display(df_word_attributes.sort_values(by="Group"))

Embedding Religion/Christians
Embedding Religion/Muslims
Embedding Religion/Hindu
Embedding Religion/Jewish
Embedding Religion/Sikhs
Embedding Religion/Buddists
Embedding Nationality/Indian
Embedding Nationality/European
Embedding Nationality/Americans
Embedding Nationality/Chinese
Embedding Nationality/Arab
Embedding Nationality/Mexican
Embedding Disability/Non-Disabled
Embedding Disability/Mentally Disabled
Embedding Disability/Physically Disabled
Embedding Sexual Orientation/Lesbian
Embedding Sexual Orientation/Gay
Embedding Sexual Orientation/Heterosexual
Embedding Sexual Orientation/Transgender


Unnamed: 0,Group,Subgroup_A,Subgroup_B,Bias_Score,Favored_Group
32,Disability,Mentally Disabled,Physically Disabled,-1.377831,Physically Disabled
31,Disability,Non-Disabled,Physically Disabled,-0.319904,Physically Disabled
30,Disability,Non-Disabled,Mentally Disabled,1.166855,Non-Disabled
19,Nationality,Indian,Mexican,0.429248,Indian
15,Nationality,Indian,European,0.847663,Indian
18,Nationality,Indian,Arab,-0.057382,Arab
20,Nationality,European,Americans,-0.299404,Americans
21,Nationality,European,Chinese,-0.295654,Chinese
22,Nationality,European,Arab,-0.927816,Arab
23,Nationality,European,Mexican,-0.492094,Mexican
