In [1]:
import json
import clip
import torch
import numpy as np
from torch import nn
import torch.nn.functional as F

# CLIPSeg Experiments

In [2]:
from clipseg_model import CLIPSeg

# Build one text prompt per line of parts.txt by prefixing it with 'bird '.
with open('concepts/CUB/parts.txt') as fp:
    part_texts = ['bird ' + word for word in fp.read().splitlines()]

# SECURITY NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
# map_location='cpu' lets the checkpoint load on machines that lack the device it was saved
# from; the tensors are handed to CLIPSeg, which is responsible for placing its own weights.
state_dict = torch.load('checkpoints/clipseg_pascub_ft.pt', map_location='cpu')
model = CLIPSeg(part_texts=part_texts, ft_layers=['visual_adapter', 'film', 'decoder'], state_dict=state_dict)

Some weights of CLIPSegForImageSegmentation were not initialized from the model checkpoint at CIDAS/clipseg-rd64-refined and are newly initialized: ['non_object_embedding', 'text_adapter.fc.0.weight', 'text_adapter.fc.2.weight', 'tunable_linear.weight', 'visual_adapter.fc.0.weight', 'visual_adapter.fc.2.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<class 'clipseg.configuration_clipseg.CLIPSegConfig'> 512
decoder.film_mul.weight
decoder.film_mul.bias
decoder.film_add.weight
decoder.film_add.bias
decoder.transposed_convolution.0.weight
decoder.transposed_convolution.0.bias
decoder.transposed_convolution.2.weight
decoder.transposed_convolution.2.bias
decoder.transposed_convolution.4.weight
decoder.transposed_convolution.4.bias
decoder.reduces.0.weight
decoder.reduces.0.bias
decoder.reduces.1.weight
decoder.reduces.1.bias
decoder.reduces.2.weight
decoder.reduces.2.bias
decoder.layers.0.self_attn.k_proj.weight
decoder.layers.0.self_attn.k_proj.bias
decoder.layers.0.self_attn.v_proj.weight
decoder.layers.0.self_attn.v_proj.bias
decoder.layers.0.self_attn.q_proj.weight
decoder.layers.0.self_attn.q_proj.bias
decoder.layers.0.self_attn.out_proj.weight
decoder.layers.0.self_attn.out_proj.bias
decoder.layers.0.layer_norm1.weight
decoder.layers.0.layer_norm1.bias
decoder.layers.0.mlp.fc1.weight
decoder.layers.0.mlp.fc1.bias
decoder.layers.0

In [None]:
import os
from torch.utils.data import DataLoader
from data.cub_dataset_v2 import CUBDatasetSimple

def collate_fn(batch):
    """Collate a list of (image, label) pairs into a batch.

    Images are returned as a tuple (left un-stacked), while the labels are
    stacked into a single tensor along a new leading dimension.
    """
    # Transpose the list of pairs into (all images, all labels).
    images, labels = zip(*batch)
    stacked_labels = torch.stack(labels)
    return images, stacked_labels

# Training split of the dataset rooted at datasets/CUB.
dataset_train = CUBDatasetSimple(os.path.join('datasets', 'CUB'), split='train')

# Small shuffled batches; collate_fn keeps the images as a tuple and stacks the labels.
dataloader_train = DataLoader(
    dataset_train,
    batch_size=2,
    shuffle=True,
    collate_fn=collate_fn,
)

# Keep a live iterator so later cells can pull one batch at a time with next().
dataloader_train_iter = iter(dataloader_train)

In [None]:
# Sanity check: pull one batch and run it through the model without tracking gradients.
with torch.no_grad():
    image_list, targets = next(dataloader_train_iter)
    loss, logits = model(image_list, targets)

# Show the loss returned for this batch.
print(loss)

# Mahalanobis Distance Experiments

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Load CLIP onto `device` explicitly so the model always lives where the tokens are sent below.
clip_model, clip_preprocess = clip.load('RN50', device=device)

with open('concepts/CUB/concepts_processed.json', 'r') as fp:
    concept_dict = json.load(fp)

# Flatten the two-level mapping (outer key -> inner key -> iterable of concept strings)
# into a sorted list of unique concepts; sorting makes the row order deterministic.
unique_concepts = sorted({
    concept
    for class_concepts in concept_dict.values()
    for concepts in class_concepts.values()
    for concept in concepts
})

# Encode every concept with the CLIP text encoder; no gradients are needed.
concepts_tokenized = clip.tokenize(unique_concepts).to(device)
with torch.no_grad():
    concepts_encoded = clip_model.encode_text(concepts_tokenized)

# Cast to float32 (presumably because CLIP can emit half-precision features on GPU — verify).
concepts_encoded = concepts_encoded.to(torch.float32)
# L2-normalise each embedding, then average over concepts to get the mean embedding.
concepts_encoded_norm = F.normalize(concepts_encoded, dim=-1)
concepts_mean = torch.mean(concepts_encoded_norm, dim=0)

In [None]:
from sklearn.covariance import EmpiricalCovariance

# Fit the covariance on the L2-normalised embeddings so that it describes the same
# distribution `concepts_mean` was computed from; mixing a normalised mean with a
# raw-embedding covariance would make the Mahalanobis distance inconsistent.
cov = EmpiricalCovariance().fit(concepts_encoded_norm.cpu().numpy())
_sigma_inv = cov.get_precision()
# Place the precision matrix on `device` rather than a hard-coded 'cuda',
# so the notebook also runs on CPU-only machines.
sigma_inv = torch.from_numpy(_sigma_inv).to(device)

In [None]:
def mahalanobis_v1(samples, mu, sigma_inv):
    """Vectorised total of the quadratic forms (s - mu)^T sigma_inv (s - mu).

    The per-row quadratic forms are computed in one matrix product and summed
    over every row of `samples`, so a single scalar tensor is returned.
    No square root is taken.
    """
    centered = samples - mu
    # Row i of (centered @ sigma_inv) * centered sums to the quadratic form of sample i.
    weighted = (centered @ sigma_inv) * centered
    return weighted.sum()

def mahalanobis_v2(samples, mu, sigma_inv):
    """Loop-based reference: total of (s - mu)^T sigma_inv (s - mu) over all samples.

    Iterates over the rows of `samples` one at a time and accumulates the
    quadratic form of each. Returns the plain integer 0 when `samples` has no rows.
    """
    total = 0
    for row in samples:
        diff = row - mu
        total = total + diff @ sigma_inv @ diff
    return total

In [None]:
# Random Gaussian vectors with the same dimensionality as the concept embeddings,
# placed on `device` (not a hard-coded 'cuda') so the cell also runs without a GPU.
sample1 = torch.randn(10, concepts_encoded.shape[-1]).to(device)
# Ten of the actual concept embeddings.
# NOTE(review): these are the raw (un-normalised) embeddings, whereas `concepts_mean`
# is computed from the normalised ones — confirm this is intended.
sample2 = concepts_encoded[100:110]

In [None]:
# Score the real concept embeddings with both implementations.
# Inputs are cast to float64 (presumably to match sigma_inv's dtype — verify).
mah1 = mahalanobis_v1(sample2.double(), concepts_mean.double(), sigma_inv)
mah2 = mahalanobis_v2(sample2.double(), concepts_mean.double(), sigma_inv)

In [None]:
# Display both totals side by side; v1 and v2 compute the same sum of quadratic forms,
# so they are expected to match up to floating-point error.
mah1, mah2

In [None]:
# Score the random Gaussian vectors with both implementations.
# Inputs are cast to float64 (presumably to match sigma_inv's dtype — verify).
mah1 = mahalanobis_v1(sample1.double(), concepts_mean.double(), sigma_inv)
mah2 = mahalanobis_v2(sample1.double(), concepts_mean.double(), sigma_inv)

In [None]:
# Display both totals for the random samples; v1 and v2 compute the same sum of
# quadratic forms, so they are expected to match up to floating-point error.
mah1, mah2

In [None]:
class MHBLoss(nn.Module):
    """Mahalanobis-distance penalty of samples against a reference embedding set.

    The reference embeddings ``T`` are L2-normalised along the last dimension,
    and their mean and covariance are computed once at construction time.
    ``forward`` returns ``coef`` times the sum, over all sample vectors, of the
    quadratic form ``(s - mu)^T Sigma^+ (s - mu)`` (no square root is taken),
    matching ``mahalanobis_v1`` in this notebook.

    Args:
        T: reference embeddings of shape ``(num_refs, dim)``.
        coef: scalar weight applied to the summed distance.

    NOTE(review): ``forward`` does not normalise ``samples``; since the reference
    statistics come from normalised vectors, callers presumably need to pass
    normalised samples — confirm.
    """

    def __init__(self, T: torch.Tensor, coef=1e-4) -> None:
        super().__init__()
        self.coef = coef

        T_norm = F.normalize(T, dim=-1)
        T_mu = torch.mean(T_norm, dim=0)
        # torch.cov expects variables in rows, hence the transpose -> (dim, dim).
        T_sigma = torch.cov(T_norm.T)
        # Invert the covariance (not the data matrix). The pseudo-inverse is used
        # because the covariance is singular whenever num_refs <= dim.
        T_sigma_inv = torch.linalg.pinv(T_sigma, hermitian=True)

        # Register as non-persistent buffers: the statistics follow .to(device)
        # but are kept out of state_dict, and stay reachable as self.T_*.
        self.register_buffer('T_norm', T_norm, persistent=False)
        self.register_buffer('T_mu', T_mu, persistent=False)
        self.register_buffer('T_sigma', T_sigma, persistent=False)
        self.register_buffer('T_sigma_inv', T_sigma_inv, persistent=False)

    def forward(self, samples: torch.Tensor) -> torch.Tensor:
        """Return the weighted total squared Mahalanobis distance as a scalar tensor.

        Args:
            samples: tensor of shape ``(n, dim)`` or ``(n, m, dim)``.

        Raises:
            ValueError: if ``samples`` is not 2- or 3-dimensional.
        """
        if samples.dim() not in (2, 3):
            raise ValueError(
                f'samples must be 2- or 3-dimensional, got {samples.dim()} dimensions'
            )

        # Collapse any leading dimensions so every row is one sample vector.
        samples_flat = samples.reshape(-1, samples.shape[-1])
        centered = samples_flat - self.T_mu
        # Row-wise quadratic form, summed over all samples.
        return self.coef * torch.sum((centered @ self.T_sigma_inv) * centered)