In [1]:
import json
import clip
import torch
import numpy as np
from torch import nn
import torch.nn.functional as F

# CLIPSeg Experiments

In [2]:
from clipseg_model import CLIPSeg
from clipseg_train import load_concepts

# Read the part prompts: one entry per line of the text file.
with open('concepts/CUB/parts.txt') as fp:
    part_texts = fp.read().splitlines()

# Uses the `load_concepts` imported from clipseg_train at the top of this cell.
concept_dict = load_concepts()

# NOTE(review): no map_location is given, so the checkpoint tensors are
# presumably restored onto the device they were saved from — confirm this
# works on machines without that device.
state_dict = torch.load('checkpoints/clipseg_pascub_ft.pt')
# Build the model from the part prompts, the concept vocabulary and the
# checkpoint weights loaded above.
model = CLIPSeg(
    part_texts=part_texts,
    concepts_dict=concept_dict,
    meta_category_text='bird',
    # presumably selects which sub-modules are fine-tuned — TODO confirm in clipseg_model
    ft_layers=['d', 'va'],
    state_dict=state_dict
)

<class 'clipseg.configuration_clipseg.CLIPSegConfig'> 512


Some weights of CLIPSegForImageSegmentation were not initialized from the model checkpoint at CIDAS/clipseg-rd64-refined and are newly initialized: ['non_object_embedding', 'text_adapter.fc.0.weight', 'text_adapter.fc.2.weight', 'tunable_linear.weight', 'visual_adapter.fc.0.weight', 'visual_adapter.fc.2.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Display the part prompts held by the model.
model.part_texts

['bird head',
 'bird beak',
 'bird tail',
 'bird wing',
 'bird leg',
 'bird eye',
 'bird torso']

In [4]:
# Display the mapping from part prompt to its concept-embedding tensor.
# NOTE(review): this prints every tensor in full; inspecting only the shapes
# would keep the cell output short.
model.concept_embedding_dict

{'bird head': tensor([[ 0.1576,  0.1256, -0.0866,  ..., -0.3160, -0.1718, -0.0576],
         [-0.1047,  0.0813, -0.2178,  ...,  0.0657, -0.1565,  0.0173],
         [ 0.1395, -0.2119, -0.0750,  ...,  0.1409,  0.0794,  0.0063],
         ...,
         [-0.3731, -0.0802, -0.3408,  ...,  0.0267, -0.3798,  0.4257],
         [ 0.2522,  0.2424, -0.2402,  ...,  0.0371, -0.0869,  0.2086],
         [-0.0781,  0.0294,  0.0346,  ...,  0.0630, -0.0071,  0.0199]],
        device='cuda:0'),
 'bird beak': tensor([[ 0.2434, -0.0680,  0.1827,  ..., -0.2545, -0.0431,  0.0671],
         [-0.0032, -0.3645,  0.0895,  ...,  0.2297, -0.1688, -0.2071],
         [ 0.4853, -0.2493,  0.2674,  ..., -0.0249,  0.0837,  0.2307],
         ...,
         [ 0.1037,  0.2100,  0.1047,  ..., -0.3292,  0.2128,  0.5063],
         [ 0.2971, -0.3803, -0.0804,  ...,  0.0131,  0.0424,  0.3985],
         [-0.0781,  0.0294,  0.0346,  ...,  0.0630, -0.0071,  0.0199]],
        device='cuda:0'),
 'bird tail': tensor([[ 0.0362, -0.1581,

In [None]:
import os
from torch.utils.data import DataLoader
from data.cub_dataset_v2 import CUBDatasetSimple

def collate_fn(batch):
    """Collate (image, label) samples into a tuple of images and one label tensor.

    Images are kept as a tuple (not stacked); labels are stacked along a new
    leading dimension with ``torch.stack``.
    """
    images, labels = zip(*batch)
    stacked_labels = torch.stack(labels)
    return images, stacked_labels

# Training split of the CUB dataset, read from datasets/CUB.
dataset_train = CUBDatasetSimple(
    os.path.join('datasets', 'CUB'),
    split='train',
)
# Shuffled loader yielding batches of two samples via the custom collate_fn.
dataloader_train = DataLoader(
    dataset=dataset_train,
    batch_size=2,
    shuffle=True,
    collate_fn=collate_fn,
)
# Keep a single iterator so later cells can pull batches with next().
dataloader_train_iter = iter(dataloader_train)

In [None]:
# Run one training batch through the model without tracking gradients.
with torch.no_grad():
    # Each call consumes the next batch from the shared iterator.
    image_list, targets = next(dataloader_train_iter)
    # Called with images and targets, the model returns a (loss, logits) pair.
    loss, logits = model(image_list, targets)
    print(loss)

In [None]:
import spacy
from collections import defaultdict

def load_concepts(path: str = 'concepts/CUB/concepts_processed.json') -> dict[str, list[str]]:
    """Build the per-part vocabulary of concept phrases.

    The JSON file maps class name -> part name -> list of concept phrases.
    Any phrase in which the spaCy tagger finds no NOUN token gets the part
    name appended. Phrases are then pooled per part across all classes,
    deduplicated and sorted.

    Args:
        path: Location of the processed-concepts JSON file. Defaults to the
            CUB file this notebook has always read.

    Returns:
        Mapping from part name to a sorted list of unique concept phrases.
    """
    nlp = spacy.load("en_core_web_sm")

    def attach_part_name(concepts: list[str], part_name: str) -> list[str]:
        """Append `part_name` to every concept that contains no noun."""
        with_nouns = []
        for cpt in concepts:
            if not any(word.pos_ == 'NOUN' for word in nlp(cpt)):
                cpt = cpt + ' ' + part_name
            # Concepts containing 'than' or 'male' are deliberately NOT
            # dropped: filtering them would leave Purple Finch with 0 concepts
            # for torso and American GoldFinch with 0 concepts for head.
            with_nouns.append(cpt)
        return with_nouns

    with open(path, 'rb') as fp:
        concepts_by_class = json.load(fp=fp)

    # Pool the (noun-completed) concepts of each part over all classes.
    concept_sets = defaultdict(set)
    for parts in concepts_by_class.values():
        for part_name, concepts in parts.items():
            concept_sets[part_name].update(attach_part_name(concepts, part_name))

    # Sort so the vocabulary order is the same on every run.
    return {part_name: sorted(phrases) for part_name, phrases in concept_sets.items()}

# NOTE(review): this calls the `load_concepts` defined in this cell, which
# shadows the function of the same name imported from clipseg_train earlier.
# The result is bound to `concepts_dict`, a different name from the
# `concept_dict` used when the model was built.
concepts_dict = load_concepts()

In [None]:
# Pool the concept phrases of every part into one deduplicated list.
# The list is sorted because the iteration order of a set of strings is not
# stable across interpreter sessions; without sorting, anything indexed by
# position in `all_concepts` (e.g. embedding rows) would change between runs.
all_concepts = sorted(set().union(*concepts_dict.values()))

In [None]:
from transformers.tokenization_utils_base import BatchEncoding

# Tokenise every concept phrase, padding each one to the tokenizer's maximum length.
tokenizer = model.clipseg_processor.tokenizer
concepts_token = tokenizer(all_concepts, return_tensors='pt', padding='max_length')

# Encode the token batch into conditional embeddings without tracking gradients.
with torch.no_grad():
    embeddings = model.clipseg_model.get_conditional_embeddings(**concepts_token.to('cuda'))

In [None]:
# Shape of the conditional embeddings computed for `all_concepts`.
embeddings.shape

In [None]:
# Names of the fields in the tokenizer output.
list(concepts_token.keys())

In [None]:
# Display the full tokenizer output.
# NOTE(review): this prints the token tensors for every concept; inspecting
# shapes only would keep the cell output short.
concepts_token

In [None]:
from scipy.optimize import linear_sum_assignment
# Imported here so this cell runs on a fresh kernel: the only other import of
# this helper sits in a later cell, which made this cell fail with a NameError
# under Restart & Run All.
from torchmetrics.functional.pairwise import pairwise_cosine_similarity

# Synthetic inputs: random prototypes of shape (7, 512, 50) and a random
# database of shape (900, 512).
prototypes = torch.rand(7, 512, 50)
database = torch.randn(900, 512)
# (7, 512, 50) -> (7, 50, 512) -> (350, 512): one 512-d row per prototype.
prototypes = prototypes.permute(0, 2, 1).reshape(7 * 50, 512)
# Cosine similarity between every prototype row and every database row.
affinities = pairwise_cosine_similarity(prototypes, database).numpy()

# One-to-one assignment between prototype rows and database rows.
# NOTE(review): linear_sum_assignment minimises the total by default; since
# `affinities` holds similarities, `maximize=True` may be what is intended —
# confirm before relying on the result.
rows, cols = linear_sum_assignment(affinities)

In [None]:
# Shapes of the two index arrays returned by linear_sum_assignment.
rows.shape, cols.shape

In [None]:
# Second index array returned by linear_sum_assignment.
cols

In [None]:
from torchmetrics.functional.pairwise import pairwise_cosine_similarity
# Constant weights of shape (7, 512, 100) and random encodings of shape (1200, 512).
weights = torch.ones(7, 512, 100)
encodings = torch.randn(1200, 512)

# Move the 512-axis last, then merge the first two axes: (7, 100, 512) -> (700, 512).
weights_flatten = weights.transpose(1, 2).reshape(-1, 512)

sims = pairwise_cosine_similarity(weights_flatten, encodings)
# One minus the mean pairwise cosine similarity; shown as the cell output.
1 - sims.mean()

In [None]:
# Display the model's part prompts again.
model.part_texts

# Mahalanobis Distance Experiments

In [None]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): no device is passed to clip.load — presumably its default
# matches `device`; confirm.
clip_model, clip_preprocess = clip.load('RN50')

with open('concepts/CUB/concepts_processed.json', 'r') as fp:
    concept_dict = json.load(fp=fp)

# Every distinct concept phrase across all classes and parts, in sorted order.
unique_concepts = sorted({
    phrase
    for parts in concept_dict.values()
    for phrases in parts.values()
    for phrase in phrases
})

# Encode the phrases with the CLIP text encoder and cast the result to float32.
concepts_tokenized = clip.tokenize(unique_concepts).to(device)
with torch.no_grad():
    concepts_encoded = clip_model.encode_text(concepts_tokenized).to(torch.float32)

# Mean of the L2-normalised text embeddings.
concepts_encoded_norm = F.normalize(concepts_encoded, dim=-1)
concepts_mean = concepts_encoded_norm.mean(dim=0)

In [None]:
from sklearn.covariance import EmpiricalCovariance

# Fit an empirical covariance on the concept embeddings and take its
# precision (inverse covariance) matrix.
# NOTE(review): the covariance is fit on the un-normalised `concepts_encoded`,
# whereas `concepts_mean` is the mean of the L2-normalised embeddings —
# confirm that mixing the two is intended.
cov = EmpiricalCovariance().fit(concepts_encoded.cpu().numpy())
_sigma_inv = cov.get_precision()
# Use the notebook-wide `device` rather than a hard-coded 'cuda' so this cell
# also runs when the CPU fallback was selected.
sigma_inv = torch.from_numpy(_sigma_inv).to(device)

In [None]:
def mahalanobis_v1(samples, mu, sigma_inv):
    """Vectorised sum of (x - mu)^T sigma_inv (x - mu) over all rows of `samples`.

    Returns a single scalar tensor: the per-row quadratic forms are summed,
    and no square root is taken.
    """
    centered = samples - mu
    weighted = centered @ sigma_inv
    return torch.sum(weighted * centered)

def mahalanobis_v2(samples, mu, sigma_inv):
    """Loop-based sum of (x - mu)^T sigma_inv (x - mu) over all rows of `samples`.

    Reference implementation that evaluates the quadratic form one row at a
    time and adds the results with Python's built-in ``sum`` (so an empty
    `samples` yields the integer 0).
    """
    return sum((row - mu) @ sigma_inv @ (row - mu) for row in samples)

In [None]:
# Random out-of-distribution vectors with the same width as the concept
# embeddings, placed on the notebook-wide `device` (previously the width 1024
# and the device 'cuda' were hard-coded).
sample1 = torch.randn(10, concepts_encoded.shape[-1]).to(device)
# Ten rows taken from the (un-normalised) concept embeddings themselves.
sample2 = concepts_encoded[100:110]

In [None]:
# Evaluate both implementations on the real concept embeddings, with the
# samples and the mean cast to float64.
mah1 = mahalanobis_v1(sample2.double(), concepts_mean.double(), sigma_inv)
mah2 = mahalanobis_v2(sample2.double(), concepts_mean.double(), sigma_inv)

In [None]:
# Compare the vectorised and loop-based results side by side.
mah1, mah2

In [None]:
# Evaluate both implementations on the random vectors, with the samples and
# the mean cast to float64.
mah1 = mahalanobis_v1(sample1.double(), concepts_mean.double(), sigma_inv)
mah2 = mahalanobis_v2(sample1.double(), concepts_mean.double(), sigma_inv)

In [None]:
# Compare the vectorised and loop-based results side by side.
mah1, mah2

In [None]:
class MHBLoss(nn.Module):
    """Mahalanobis-distance penalty relative to a fixed set of reference vectors.

    The reference matrix `T` (one vector per row) is L2-normalised; its mean,
    covariance and (pseudo-)inverse covariance are computed once and stored as
    buffers so they follow the module across devices.

    Args:
        T: Reference vectors of shape (num_vectors, dim).
        coef: Scalar multiplier applied to the loss.
    """

    def __init__(self, T: torch.Tensor, coef=1e-4) -> None:
        super().__init__()
        self.coef = coef

        T_norm = F.normalize(T, dim=-1)
        # torch.cov treats rows as variables, hence the transpose.
        T_sigma = torch.cov(T_norm.T)

        self.register_buffer('T_norm', T_norm)
        self.register_buffer('T_mu', torch.mean(T_norm, dim=0))
        self.register_buffer('T_sigma', T_sigma)
        # Invert the covariance (not the data matrix). The pseudo-inverse is
        # used because the covariance is singular whenever there are fewer
        # reference vectors than dimensions.
        self.register_buffer('T_sigma_inv', torch.linalg.pinv(T_sigma, hermitian=True))

    def forward(self, samples):
        """Return coef * sum over samples of (x - mu)^T Sigma^-1 (x - mu).

        Args:
            samples: Tensor of shape (n, dim) or (n, m, dim); a 3-D input is
                flattened to (n * m, dim).
                NOTE(review): samples are not normalised here although the
                reference statistics come from normalised vectors — confirm
                callers pass normalised inputs.

        Returns:
            Scalar tensor holding the scaled sum of squared Mahalanobis
            distances.
        """
        assert samples.dim() in [2, 3], 'samples must be 2-D or 3-D'
        # Collapse any leading axes so every row is one sample vector.
        samples_flat = samples.reshape(-1, samples.shape[-1])

        centered = samples_flat - self.T_mu
        # Row-wise quadratic form, summed over all samples.
        return self.coef * torch.sum(centered @ self.T_sigma_inv * centered)