In [1]:
import json
import clip
import torch
import numpy as np
from torch import nn
import torch.nn.functional as F

# CLIPSeg Experiments

In [30]:
from clipseg_model import CLIPSeg

# Build one text prompt per line of the parts file, each prefixed with "bird ".
with open('concepts/CUB/parts.txt') as fp:
    part_texts = [f'bird {word}' for word in fp.read().splitlines()]

# Load the checkpoint and hand it to the project's CLIPSeg wrapper.
# `ft_layers` presumably names the sub-modules that are fine-tuned — confirm in clipseg_model.
state_dict = torch.load('checkpoints/clipseg_pascub_ft.pt')
model = CLIPSeg(
    part_texts=part_texts,
    ft_layers=['visual_adapter', 'film', 'decoder'],
    state_dict=state_dict,
)

<class 'clipseg.configuration_clipseg.CLIPSegConfig'> 512


Some weights of CLIPSegForImageSegmentation were not initialized from the model checkpoint at CIDAS/clipseg-rd64-refined and are newly initialized: ['non_object_embedding', 'text_adapter.fc.0.weight', 'text_adapter.fc.2.weight', 'tunable_linear.weight', 'visual_adapter.fc.0.weight', 'visual_adapter.fc.2.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import os
from torch.utils.data import DataLoader
from data.cub_dataset_v2 import CUBDatasetSimple

def collate_fn(batch):
    """Collate ``(image, label)`` pairs into a batch.

    Args:
        batch: sequence of 2-tuples ``(image, label)`` where every label is a
            tensor that ``torch.stack`` can combine with the others.

    Returns:
        A pair ``(images, labels)``: ``images`` is a tuple holding the images
        unchanged (they are not stacked — presumably because they may differ
        in size; confirm against the dataset), ``labels`` is the label tensors
        stacked along a new leading dimension.
    """
    images, labels = zip(*batch)
    return images, torch.stack(labels)

# Training split of the CUB dataset, served in shuffled mini-batches of two
# samples that are assembled by collate_fn.
dataset_train = CUBDatasetSimple(os.path.join('datasets', 'CUB'), split='train')
dataloader_train = DataLoader(
    dataset_train,
    batch_size=2,
    shuffle=True,
    collate_fn=collate_fn,
)
# Keep one iterator around so later cells can pull successive batches with next().
dataloader_train_iter = iter(dataloader_train)

In [None]:
# Smoke test: push one training batch through the model without tracking
# gradients and print the loss the model returns for it.
with torch.no_grad():
    # next() advances the shared iterator, so re-running this cell uses a new batch
    # (and raises StopIteration once the loader is exhausted).
    image_list, targets = next(dataloader_train_iter)
    loss, logits = model(image_list, targets)
    print(loss)

In [24]:
import spacy
from collections import defaultdict

def load_concepts(concepts_path: str = 'concepts/CUB/concepts_processed.json',
                  spacy_model: str = 'en_core_web_sm') -> dict[str, list[str]]:
    """Collect the unique concepts of every part across all classes.

    The JSON file is read as a nested mapping
    ``class name -> part name -> list of concept strings``. Every concept in
    which spaCy tags no token as a noun gets the part name appended (so a
    purely adjectival concept for the part "head" becomes "<concept> head").
    The concepts are then pooled per part over all classes and de-duplicated.

    Args:
        concepts_path: path of the JSON concept file. Defaults to the CUB file
            this notebook uses.
        spacy_model: name of the spaCy pipeline used for part-of-speech tagging.

    Returns:
        A dict mapping each part name to the sorted list of its unique concepts.
    """
    nlp = spacy.load(spacy_model)

    def attach_part_name(concepts: list[str], part_name: str) -> list[str]:
        """Return ``concepts`` with ``part_name`` appended to every noun-less entry."""
        with_part_name = []
        for cpt in concepts:
            has_noun = any(token.pos_ == 'NOUN' for token in nlp(cpt))
            # NOTE: dropping concepts that contain 'than' or 'male' was tried and
            # rejected: it would leave Purple Finch with 0 concepts for torso and
            # American GoldFinch with 0 concepts for head.
            with_part_name.append(cpt if has_noun else cpt + ' ' + part_name)
        return with_part_name

    # json.load accepts a binary file and detects the UTF-8/16/32 encoding itself.
    with open(concepts_path, 'rb') as fp:
        concepts_by_class = json.load(fp)

    # Pool the (noun-completed) concepts of each part over all classes.
    concept_sets = defaultdict(set)
    for part_concepts in concepts_by_class.values():
        for part_name, concepts in part_concepts.items():
            concept_sets[part_name].update(attach_part_name(concepts, part_name))

    return {part_name: sorted(cpts) for part_name, cpts in concept_sets.items()}

# Maps each part name to the sorted list of its unique concepts (see load_concepts).
concepts_dict = load_concepts()

In [31]:
# Flatten the per-part concept lists into one de-duplicated list.
# The result is sorted on purpose: the iteration order of a set of strings can
# change between interpreter runs (string hash randomization), which would
# silently reorder the rows of every embedding matrix computed from this list.
all_concepts = sorted(set().union(*concepts_dict.values()))

In [47]:
from transformers.tokenization_utils_base import BatchEncoding

# Tokenize every concept string and compute its conditional embedding with the
# wrapped CLIPSeg model; no gradients are needed for this lookup.
with torch.no_grad():
    # return_tensors='pt' yields PyTorch tensors; padding='max_length' pads every
    # sequence to the tokenizer's maximum length so they stack into one batch.
    concepts_token = model.clipseg_processor.tokenizer(all_concepts, return_tensors='pt', padding='max_length')
    # NOTE(review): 'cuda' is hard-coded, so this cell fails on a CPU-only machine
    # and assumes the model itself already lives on the GPU — confirm.
    embeddings = model.clipseg_model.get_conditional_embeddings(**concepts_token.to('cuda'))

In [49]:
# Sanity check: presumably one row per entry of all_concepts (the recorded run shows 980 x 512).
embeddings.shape

torch.Size([980, 512])

In [37]:
# List the fields the tokenizer returned.
list(concepts_token.keys())

['input_ids', 'attention_mask']

In [39]:
# NOTE(review): this displays the whole tokenizer output for every concept, which
# makes for a very large cell output — consider inspecting a small slice instead.
concepts_token

{'input_ids': [[49406, 33356, 541, 4398, 1061, 1642, 18977, 1008, 25652, 531, 518, 9475, 7578, 49407], [49406, 17257, 1901, 525, 518, 22465, 5149, 49407], [49406, 44570, 912, 7048, 537, 1449, 49407], [49406, 2442, 1579, 22104, 525, 5178, 49407], [49406, 1167, 893, 525, 1774, 49407], [49406, 1449, 530, 2541, 537, 12059, 1061, 530, 1673, 49407], [49406, 8157, 4623, 49407], [49406, 14426, 720, 320, 3144, 267, 645, 1243, 4132, 49407], [49406, 3005, 36498, 49407], [49406, 12623, 4132, 49407], [49406, 783, 1070, 8626, 4132, 49407], [49406, 20703, 36498, 49407], [49406, 43383, 3272, 2540, 49407], [49406, 3005, 531, 8675, 4132, 49407], [49406, 3638, 1340, 49407], [49406, 10709, 1340, 49407], [49406, 1449, 1710, 537, 6906, 49407], [49406, 1113, 26690, 1449, 268, 537, 268, 1579, 7447, 525, 893, 530, 1673, 49407], [49406, 3339, 9475, 3049, 49407], [49406, 25789, 36498, 49407], [49406, 1579, 31259, 530, 1340, 49407], [49406, 1449, 8979, 537, 16371, 530, 14426, 49407], [49406, 40331, 7072, 49407], 

In [23]:
from torchmetrics.functional.pairwise import pairwise_cosine_similarity
# Toy check of "1 - mean pairwise cosine similarity" between a flattened weight
# bank (all ones) and random Gaussian encodings.
weights = torch.ones(7, 512, 100)
encodings = torch.randn(1200, 512)

# (7, 512, 100) -> (7, 100, 512) -> (700, 512): one 512-d row per (first-axis, last-axis) pair.
weights_flatten = weights.transpose(1, 2).reshape(-1, 512)

# sims[i, j] is the cosine similarity between weight row i and encoding j.
sims = pairwise_cosine_similarity(weights_flatten, encodings)
1 - sims.mean()

tensor(0.9994)

# Mahalanobis Distance Experiments

In [None]:
# Prefer the GPU when one is available.
# clip.load() is called without a device argument — presumably it picks the same device by default; confirm.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
clip_model, clip_preprocess = clip.load('RN50')

with open('concepts/CUB/concepts_processed.json', 'r') as fp:
    concept_dict = json.load(fp)

# Pool the concepts of every class and every part into one sorted, de-duplicated list.
unique_concepts = sorted({
    concept
    for class_concepts in concept_dict.values()
    for part_concepts in class_concepts.values()
    for concept in part_concepts
})

# Encode all concept strings with the CLIP text encoder (no gradients needed).
concepts_tokenized = clip.tokenize(unique_concepts).to(device)
with torch.no_grad():
    concepts_encoded = clip_model.encode_text(concepts_tokenized)

# Work in float32 from here on, then L2-normalize each embedding and take the
# mean of the normalized embeddings.
concepts_encoded = concepts_encoded.float()
concepts_encoded_norm = F.normalize(concepts_encoded, dim=-1)
concepts_mean = concepts_encoded_norm.mean(dim=0)

In [None]:
from sklearn.covariance import EmpiricalCovariance

# Fit an empirical covariance to the concept embeddings and take its precision
# (inverse covariance) matrix for the Mahalanobis distance.
# NOTE(review): the covariance is fit on the un-normalized `concepts_encoded`,
# whereas `concepts_mean` is the mean of the L2-normalized embeddings — confirm
# which of the two spaces the distance is meant to live in.
cov = EmpiricalCovariance().fit(concepts_encoded.cpu().numpy())
_sigma_inv = cov.get_precision()
# Follow the notebook-wide `device` instead of a hard-coded 'cuda' so the cell
# also runs on machines without a GPU.
sigma_inv = torch.from_numpy(_sigma_inv).to(device)

In [None]:
def mahalanobis_v1(samples, mu, sigma_inv):
    """Vectorized sum of squared Mahalanobis distances of ``samples`` to ``mu``.

    Intended for ``samples`` of shape ``(n, d)``, ``mu`` of shape ``(d,)`` and
    ``sigma_inv`` of shape ``(d, d)``, as in the calls made in this notebook.

    Returns:
        A scalar tensor: the sum over all samples of
        ``(x - mu)^T @ sigma_inv @ (x - mu)``.
    """
    centered = samples - mu
    # Row-wise quadratic form: (centered @ sigma_inv) * centered, summed over every entry.
    return (centered @ sigma_inv * centered).sum()

def mahalanobis_v2(samples, mu, sigma_inv):
    """Reference (per-sample loop) sum of squared Mahalanobis distances.

    Iterates over the first dimension of ``samples`` and evaluates
    ``(s - mu) @ sigma_inv @ (s - mu)`` for each row ``s``; the per-row values
    are added up with the built-in ``sum`` (so an empty ``samples`` gives ``0``).
    """
    return sum((s - mu) @ sigma_inv @ (s - mu) for s in samples)

In [None]:
# Random Gaussian probe vectors with 1024 features — presumably the width of the
# CLIP RN50 text embeddings; confirm. Moved to the notebook-wide `device` rather
# than a hard-coded 'cuda' so the cell also runs without a GPU.
sample1 = torch.randn(10, 1024).to(device)
# Ten of the actual (un-normalized) concept embeddings.
sample2 = concepts_encoded[100:110]

In [None]:
# Run both implementations on the real concept embeddings (sample2).
# Inputs are cast to float64, presumably to match the dtype of sigma_inv — confirm.
mah1 = mahalanobis_v1(sample2.to(torch.float64), concepts_mean.to(torch.float64), sigma_inv)
mah2 = mahalanobis_v2(sample2.to(torch.float64), concepts_mean.to(torch.float64), sigma_inv)

In [None]:
# The vectorized and the loop version compute the same sum, so the two values should agree.
mah1, mah2

In [None]:
# Same comparison on the random Gaussian vectors (sample1); this overwrites mah1/mah2.
# Inputs are cast to float64, presumably to match the dtype of sigma_inv — confirm.
mah1 = mahalanobis_v1(sample1.to(torch.float64), concepts_mean.to(torch.float64), sigma_inv)
mah2 = mahalanobis_v2(sample1.to(torch.float64), concepts_mean.to(torch.float64), sigma_inv)

In [None]:
# The vectorized and the loop version compute the same sum, so the two values should agree.
mah1, mah2

In [None]:
class MHBLoss(nn.Module):
    """Mahalanobis-distance penalty of samples with respect to a reference set.

    The rows of ``T`` are L2-normalized; their mean and covariance define the
    reference distribution. ``forward`` returns ``coef`` times the sum over all
    samples of ``(x - mu)^T @ sigma_inv @ (x - mu)`` — the same quantity that
    ``mahalanobis_v1`` computes in this notebook.

    Args:
        T: reference embeddings of shape ``(num_refs, dim)``.
        coef: scale applied to the summed squared distance.

    Attributes (non-persistent buffers, so they follow ``.to(device)`` without
    being written to the ``state_dict``):
        T_norm: L2-normalized reference embeddings, ``(num_refs, dim)``.
        T_mu: mean of ``T_norm`` over the reference axis, ``(dim,)``.
        T_sigma: covariance of ``T_norm``, ``(dim, dim)``.
        T_sigma_inv: (pseudo-)inverse of ``T_sigma``, ``(dim, dim)``.
    """

    def __init__(self, T: torch.Tensor, coef=1e-4) -> None:
        super().__init__()
        self.coef = coef

        T_norm = F.normalize(T, dim=-1)
        # torch.cov treats rows as variables, hence the transpose to (dim, num_refs).
        T_sigma = torch.cov(T_norm.T)
        # Invert the covariance (not the data matrix). The Hermitian pseudo-inverse
        # equals the inverse when the covariance has full rank and stays defined
        # when it does not (e.g. fewer reference rows than dimensions).
        T_sigma_inv = torch.linalg.pinv(T_sigma, hermitian=True)

        self.register_buffer('T_norm', T_norm, persistent=False)
        self.register_buffer('T_mu', T_norm.mean(dim=0), persistent=False)
        self.register_buffer('T_sigma', T_sigma, persistent=False)
        self.register_buffer('T_sigma_inv', T_sigma_inv, persistent=False)

    def forward(self, samples):
        """Return ``coef`` times the summed squared Mahalanobis distance of ``samples``.

        Args:
            samples: tensor of shape ``(n, dim)`` or ``(n, m, dim)``; a 3-D input
                is flattened to ``(n * m, dim)``. The samples are used as given
                (not normalized here) — presumably the caller passes embeddings
                in the same normalized space as ``T_norm``; confirm.

        Returns:
            A scalar tensor.

        Raises:
            AssertionError: if ``samples`` is neither 2-D nor 3-D.
        """
        assert samples.dim() in (2, 3), f'expected a 2-D or 3-D tensor, got {samples.dim()}-D'
        # Collapse any leading dimensions so every row is one sample.
        samples_flat = samples.reshape(-1, samples.shape[-1])
        centered = samples_flat - self.T_mu
        # Row-wise quadratic form, summed over all samples.
        return self.coef * torch.sum(centered @ self.T_sigma_inv * centered)