<a href="https://colab.research.google.com/github/uowol/ML-projects/blob/main/CLIP_ALL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data collection

In [None]:
# !pip install pytorch pytorch==1.7.1 torchvision cudatoolkit==11.0
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git
!pip install deeplake

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-mlnyk_fc
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-mlnyk_fc
  Resolved https://github.com/openai/CLIP.git to commit a9b1bf5920416aaeaec965c25dd9e8f98c864f16
  Preparing metadata (setup.py) ... [?25l[?25hdone
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import deeplake
from torchvision import transforms, models

# Load the PACS train and test splits from the Activeloop hub.
ds_train = deeplake.load('hub://activeloop/pacs-train')
ds_test = deeplake.load('hub://activeloop/pacs-test')

hub://activeloop/pacs-train loaded successfully.

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/activeloop/pacs-train

hub://activeloop/pacs-test loaded successfully.

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/activeloop/pacs-test



In [None]:
import torch
from torch import nn 
from torch.utils.data import DataLoader

import clip

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the pre-trained CLIP ViT-B/32 model together with its preprocessing transform.
clip_model, preprocess = clip.load("ViT-B/32", device=device)

In [None]:
def get_image_features(clip_model, images):
    """Return the class-token activation after every visual transformer block.

    The image batch is pushed through the stem of CLIP's vision transformer
    (patch convolution, class embedding, positional embedding, ``ln_pre``)
    and then through the residual blocks one at a time, keeping the first
    (class) token after each block. ``ln_post`` and the projection are NOT
    applied here.

    Args:
        clip_model: CLIP model whose ``visual`` tower is a vision transformer.
        images: image batch accepted by ``clip_model.visual.conv1``.

    Returns:
        Detached tensor of shape ``(num_layers, batch, width)``.
    """
    visual = clip_model.visual
    # Follow the model's own device rather than a notebook-level global, so the
    # function works wherever the model lives.
    images = images.to(visual.conv1.weight.device)

    per_layer = []
    # Pure feature extraction: no_grad stops autograd from recording the whole
    # forward pass only for the results to be detached afterwards.
    with torch.no_grad():
        x = visual.conv1(images.type(clip_model.dtype))
        x = x.reshape(x.shape[0], x.shape[1], -1)   # shape = [*, width, grid ** 2]
        x = x.permute(0, 2, 1)                      # shape = [*, grid ** 2, width]
        # Prepend the class embedding to every sequence in the batch.
        class_token = visual.class_embedding.to(x.dtype) + torch.zeros(
            x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device)
        x = torch.cat([class_token, x], dim=1)      # shape = [*, grid ** 2 + 1, width]
        x = x + visual.positional_embedding.to(x.dtype)
        x = visual.ln_pre(x)
        x = x.permute(1, 0, 2)                      # NLD -> LND

        for i in range(visual.transformer.layers):
            x = visual.transformer.resblocks[i](x)
            # LND -> NLD, then keep only the class token (position 0).
            per_layer.append(x.permute(1, 0, 2)[:, 0, :].detach())

    return torch.stack(per_layer)

def get_text_features(clip_model, texts):
    """Return the full token sequence after every text transformer block.

    Tokens are embedded, the positional embedding is added, and the sequence
    is run through the residual blocks one at a time, keeping the output of
    each block. ``ln_final`` and the text projection are NOT applied here.

    Args:
        clip_model: CLIP model providing ``token_embedding``,
            ``positional_embedding`` and ``transformer``.
        texts: integer token tensor of shape ``(batch, n_ctx)``.

    Returns:
        Detached tensor of shape ``(num_layers, batch, n_ctx, width)``.
    """
    # Follow the model's own device rather than a notebook-level global.
    texts = texts.to(clip_model.token_embedding.weight.device)

    per_layer = []
    # Pure feature extraction: skip building an autograd graph.
    with torch.no_grad():
        x = clip_model.token_embedding(texts).type(clip_model.dtype)  # [batch_size, n_ctx, d_clip_model]
        x = x + clip_model.positional_embedding.type(clip_model.dtype)
        x = x.permute(1, 0, 2)                  # NLD -> LND

        for i in range(clip_model.transformer.layers):
            x = clip_model.transformer.resblocks[i](x)
            per_layer.append(x.permute(1, 0, 2).detach())   # LND -> NLD

    return torch.stack(per_layer)

In [None]:
def get_image_features_and_labels(data_loader):
    """Extract per-layer image features and labels for every batch of a loader.

    Args:
        data_loader: iterable of dict batches with ``'images'`` and
            ``'labels'`` tensors.

    Returns:
        ``(image_features_list, labels_list)``: two lists with one entry per
        batch. Each feature entry is the output of ``get_image_features``;
        each label entry is a 1-D tensor on ``device``.
    """
    image_features_list = []
    labels_list = []

    for data in data_loader:
        images = data['images'].to(device)

        # Drop only the trailing singleton label axis. A bare torch.squeeze()
        # would also drop the batch axis of a final batch holding one sample,
        # giving a 0-d tensor that breaks labels.size(0) downstream.
        labels = data['labels']
        labels = labels.reshape(labels.shape[0], -1).squeeze(-1).to(device)

        image_features_list.append(get_image_features(clip_model, images))
        labels_list.append(labels)

    return image_features_list, labels_list

In [None]:
# Number of samples per batch for both loaders.
batch_size = 32
# Image transform applied to every decoded PIL image.
# NOTE(review): only ToTensor is applied; the `preprocess` transform returned by
# clip.load is not used here, so images reach the model without CLIP's own
# resizing/normalisation — confirm this is intentional.
tform = transforms.Compose([
    # transforms.RandomRotation(20), # Image augmentation
    transforms.ToTensor(), # Must convert to pytorch tensor for subsequent operations to run
    # transforms.Normalize([0.5], [0.5]),
])

# Training batches are shuffled; images are decoded as PIL and passed through
# `tform`, labels are passed through untouched.
train_loader = ds_train.pytorch(num_workers = 0, shuffle = True, 
                                transform = {'images': tform, 'labels': None}, 
                                batch_size = batch_size, decode_method = {'images': 'pil'})
# Test loader: same transform and batch size, no shuffle argument.
test_loader = ds_test.pytorch(num_workers = 0, transform = {'images': tform, 'labels': None}, 
                                batch_size = batch_size, decode_method = {'images': 'pil'})

In [None]:
# Run every batch of both splits through CLIP once and keep the per-layer
# class-token features together with the matching labels.
train_image_features_list, train_labels_list = get_image_features_and_labels(train_loader)
test_image_features_list, test_labels_list = get_image_features_and_labels(test_loader)

In [None]:
import pickle

## save pickle
# Cache the extracted per-layer features and labels, one pickle file per list,
# so later sections can reload them without re-running CLIP.
_pickle_targets = (
    ('train_image_features_list.pickle', train_image_features_list),
    ('train_labels_list.pickle', train_labels_list),
    ('test_image_features_list.pickle', test_image_features_list),
    ('test_labels_list.pickle', test_labels_list),
)
for _pickle_path, _pickle_payload in _pickle_targets:
    with open(_pickle_path, 'wb') as fw:
        pickle.dump(_pickle_payload, fw)

In [None]:
def save_images(data_loader, split=1, type='train'):
    """Pickle the image batches of a loader in shards and return the labels.

    Batches are collected and written to ``{type}_images_list{k}.pickle``
    every ``len(data_loader) / split`` batches (rounded down, at least 1).
    Whatever is left after the loop is written to one more shard; that
    trailing shard is always created, even when it is empty.

    Args:
        data_loader: sized iterable of dict batches with ``'images'`` and
            ``'labels'`` tensors.
        split: number of shards the loader is divided into; must be positive.
        type: file-name prefix for the shards.

    Returns:
        List with one 1-D label tensor per batch (on ``device``).

    Raises:
        ValueError: if ``split`` is not positive.
    """
    if split <= 0:
        raise ValueError(f"split must be positive, got {split!r}")

    images_list = []
    labels_list = []
    save_idx = 0
    # Batches per shard. Clamp to >= 1: with split > len(data_loader) the
    # division rounds down to 0 and the modulo below would divide by zero.
    size = max(1, int(len(data_loader) / split))

    for i, data in enumerate(data_loader):
        images_list.append(data['images'].to(device))

        # Drop only the trailing singleton label axis so that a final batch of
        # one sample stays 1-D (torch.squeeze() would make it 0-d).
        labels = data['labels']
        labels_list.append(labels.reshape(labels.shape[0], -1).squeeze(-1).to(device))

        if (i + 1) % size == 0:
            with open(f'{type}_images_list{save_idx}.pickle', 'wb') as fw:
                pickle.dump(images_list, fw)
            images_list = []
            save_idx += 1

    # Trailing shard with the remaining batches (possibly none).
    with open(f'{type}_images_list{save_idx}.pickle', 'wb') as fw:
        pickle.dump(images_list, fw)

    return labels_list

def load_images_list(type='train', i=0):
    """Read back shard ``i`` of the pickled image batches for ``type``.

    The shard is expected at ``{type}_images_list{i}.pickle`` in the current
    working directory. Unpickling executes code embedded in the file, so only
    load shards this notebook wrote itself.
    """
    shard_path = f'{type}_images_list{i}.pickle'
    with open(shard_path, 'rb') as shard_file:
        return pickle.load(shard_file)

In [None]:
# Dump the raw training images in shards (split=3) and keep the labels.
train_labels_list = save_images(train_loader, type="train", split=3)

# Store the labels separately. Note that 'train_labels_list0.pickle' is written
# again later in this notebook by the final-layer feature cell.
with open('train_labels_list0.pickle', 'wb') as fw:
    pickle.dump(train_labels_list, fw)

# Same for the test split.
test_labels_list = save_images(test_loader, type="test", split=3)

with open('test_labels_list0.pickle', 'wb') as fw:
    pickle.dump(test_labels_list, fw)

In [None]:
# Class names with their article, indexed by label id.
pacs_class = [
    'a dog', 'an elephant', 'a giraffe', 'a guitar', 'a horse', 'a house', 'a person'
]

def prompt(idx):
    """Return the text prompt for the class with label id ``idx``."""
    return f"An image of {pacs_class[idx]}"

# One prompt per class; sized from the class list rather than a literal 7 so
# the two cannot drift apart.
prompts = [prompt(x) for x in range(len(pacs_class))]
print(prompts)

['An image of a dog', 'An image of an elephant', 'An image of a giraffe', 'An image of a guitar', 'An image of a horse', 'An image of a house', 'An image of a person']


In [None]:
# Tokenize one prompt per class (sized from pacs_class rather than a literal 7)
# and cache the token tensor on disk.
class_tokens = clip.tokenize([prompt(x) for x in range(len(pacs_class))]).to(device)
with open('class_tokens.pickle', 'wb') as fw:
    pickle.dump(class_tokens, fw)

In [None]:
# Per-layer text features of the class prompts, cached on disk.
class_features = get_text_features(clip_model, class_tokens)
with open('class_features.pickle', 'wb') as fw:
    pickle.dump(class_features, fw)

In [None]:
def get_images_feature(clip_model, images):
    """Return the class-token activation after the whole visual transformer.

    Same stem as CLIP's vision transformer (patch convolution, class
    embedding, positional embedding, ``ln_pre``) followed by the full
    transformer stack; ``ln_post`` and the projection are NOT applied.

    Args:
        clip_model: CLIP model whose ``visual`` tower is a vision transformer.
        images: image batch accepted by ``clip_model.visual.conv1``.

    Returns:
        Detached tensor of shape ``(batch, width)``.
    """
    visual = clip_model.visual
    # Follow the model's own device rather than a notebook-level global.
    images = images.to(visual.conv1.weight.device)

    # Pure feature extraction: skip building an autograd graph.
    with torch.no_grad():
        x = visual.conv1(images.type(clip_model.dtype))
        x = x.reshape(x.shape[0], x.shape[1], -1)   # shape = [*, width, grid ** 2]
        x = x.permute(0, 2, 1)                      # shape = [*, grid ** 2, width]
        # Prepend the class embedding to every sequence in the batch.
        class_token = visual.class_embedding.to(x.dtype) + torch.zeros(
            x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device)
        x = torch.cat([class_token, x], dim=1)      # shape = [*, grid ** 2 + 1, width]
        x = x + visual.positional_embedding.to(x.dtype)
        x = visual.ln_pre(x)
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = visual.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD
        x = x[:, 0, :].detach()  # keep only the class token

    return x

In [None]:
def get_texts_feature(clip_model, texts):
    """Return the token sequence after the whole text transformer.

    Tokens are embedded, the positional embedding is added and the sequence is
    run through the full transformer stack; ``ln_final`` and the text
    projection are NOT applied.

    Args:
        clip_model: CLIP model providing ``token_embedding``,
            ``positional_embedding`` and ``transformer``.
        texts: integer token tensor of shape ``(batch, n_ctx)``.

    Returns:
        Detached tensor of shape ``(batch, n_ctx, width)``.
    """
    # Follow the model's own device rather than a notebook-level global.
    texts = texts.to(clip_model.token_embedding.weight.device)

    # Pure feature extraction: skip building an autograd graph.
    with torch.no_grad():
        x = clip_model.token_embedding(texts).type(clip_model.dtype)  # [batch_size, n_ctx, d_clip_model]
        x = x + clip_model.positional_embedding.type(clip_model.dtype)
        x = x.permute(1, 0, 2)            # NLD -> LND
        x = clip_model.transformer(x)
        x = x.permute(1, 0, 2).detach()   # LND -> NLD

    return x

In [None]:
def get_images_feature_and_labels(data_loader):
    """Extract final-layer image features and labels for every batch of a loader.

    Args:
        data_loader: iterable of dict batches with ``'images'`` and
            ``'labels'`` tensors.

    Returns:
        ``(images_feature_list, labels_list)``: two lists with one entry per
        batch. Each feature entry is the output of ``get_images_feature``;
        each label entry is a 1-D tensor on ``device``.
    """
    images_feature_list = []
    labels_list = []

    for data in data_loader:
        images = data['images'].to(device)

        # Drop only the trailing singleton label axis. A bare torch.squeeze()
        # would also drop the batch axis of a final batch holding one sample.
        labels = data['labels']
        labels = labels.reshape(labels.shape[0], -1).squeeze(-1).to(device)

        images_feature_list.append(get_images_feature(clip_model, images))
        labels_list.append(labels)

    return images_feature_list, labels_list

In [None]:
# Final-layer class-token features (one tensor per batch) and labels for both splits.
train_images_feature_list, train_labels_list0 = get_images_feature_and_labels(train_loader)
test_images_feature_list, test_labels_list0 = get_images_feature_and_labels(test_loader)

In [None]:
## save pickle
# Cache the final-layer features and their labels, one pickle file per list.
_final_pickle_targets = (
    ('train_images_feature_list.pickle', train_images_feature_list),
    ('train_labels_list0.pickle', train_labels_list0),
    ('test_images_feature_list.pickle', test_images_feature_list),
    ('test_labels_list0.pickle', test_labels_list0),
)
for _final_pickle_path, _final_pickle_payload in _final_pickle_targets:
    with open(_final_pickle_path, 'wb') as fw:
        pickle.dump(_final_pickle_payload, fw)

In [None]:
# Final-layer text features of the class prompts, cached on disk.
class_features0 = get_texts_feature(clip_model, class_tokens)
with open('class_features0.pickle', 'wb') as fw:
    pickle.dump(class_features0, fw)

# Data loading

In [None]:
## load pickle
# Imported here so this section also runs on a fresh kernel, without first
# executing the data-collection cells that import pickle.
import pickle

# Reload the cached tokens, per-layer text features and per-layer image
# features/labels. Unpickling executes code stored in the file, so only load
# files produced by this notebook.
with open('class_tokens.pickle', 'rb') as fr:
    class_tokens = pickle.load(fr)
with open('class_features.pickle', 'rb') as fr:
    class_features = pickle.load(fr)
with open('train_image_features_list.pickle', 'rb') as fr:
    train_image_features_list = pickle.load(fr)
with open('train_labels_list.pickle', 'rb') as fr:
    train_labels_list = pickle.load(fr)
with open('test_image_features_list.pickle', 'rb') as fr:
    test_image_features_list = pickle.load(fr)
with open('test_labels_list.pickle', 'rb') as fr:
    test_labels_list = pickle.load(fr)

# Model construction

In [None]:
import pandas as pd
import numpy as np

import torch
from torch import nn 
from torch.utils.data import DataLoader

from torchvision import datasets, transforms, models
from torchvision.transforms import ToTensor

import clip

In [None]:
class OurCLIP(nn.Module):
    """Scores images against class prompts from per-layer CLIP features.

    The module reuses CLIP's final LayerNorms (``visual.ln_post`` and
    ``ln_final``) and projection matrices. Each projection is either a single
    shared matrix or a stack with one matrix per transformer layer. ``forward``
    projects every layer's image and text features into the joint space, forms
    all (image layer, text layer) dot products and reduces them to one score
    per (image, class) pair according to ``score_type``.

    Args:
        clip_model: pre-trained CLIP model. Its LayerNorm modules are reused
            directly (not copied), so their ``requires_grad`` flags are changed
            on ``clip_model`` as well.
        use_one_ln1, use_one_ln2: share one LayerNorm across layers for the
            image / text side. Only ``True`` is implemented.
        use_one_projection1, use_one_projection2: use one shared projection
            (``True``) or one projection per layer (``False``).
        projection_random_init1, projection_random_init2: with per-layer
            projections, draw all but the last from N(0, 0.03^2); the last is
            always the pre-trained projection. Otherwise every layer starts
            from the pre-trained projection.
        trainable_ln1, trainable_ln2: whether the LayerNorms receive gradients.
        trainable_projection1, trainable_projection2: whether the projections
            receive gradients.
        threshold: scale of the soft-count threshold used by ``score_type=2``.
        score_type: reduction over the layer-pair scores. ``1`` = mean over
            all pairs, ``2`` = soft count of pairs above
            ``threshold * threshold_weight``, ``3`` = max over text layers then
            mean over image layers, anything else = max over all pairs.

    Raises:
        NotImplementedError: if ``use_one_ln1`` or ``use_one_ln2`` is False.
    """

    # Stack depth used when the CLIP model does not report a layer count.
    _DEFAULT_NUM_LAYERS = 12

    def __init__(self, clip_model, # use pre-trained clip model
                 use_one_ln1=True, use_one_ln2=True, 
                 use_one_projection1=True, use_one_projection2=True,
                 projection_random_init1=False, 
                 projection_random_init2=False, 
                 trainable_ln1=False, trainable_ln2=False, 
                 trainable_projection1=False, trainable_projection2=False,
                 threshold = 100, score_type=0): 
        super().__init__()

        self.dtype = clip_model.dtype
        self.threshold = threshold
        self.score_type = score_type

        self.threshold_weight = nn.Parameter(torch.rand(1)).requires_grad_(True)

        # Per-layer LayerNorms are not implemented. Fail here instead of
        # leaving ln_post / ln_final undefined and crashing later in forward().
        if not (use_one_ln1 and use_one_ln2):
            raise NotImplementedError(
                "per-layer LayerNorm (use_one_ln1=False / use_one_ln2=False) is not implemented")
        self.ln_post = clip_model.visual.ln_post.requires_grad_(trainable_ln1)
        self.ln_final = clip_model.ln_final.requires_grad_(trainable_ln2)

        # One projection per transformer layer: take the depth from the model
        # so the stacks match the per-layer features, falling back to 12.
        visual_layers = getattr(getattr(clip_model.visual, "transformer", None),
                                "layers", self._DEFAULT_NUM_LAYERS)
        text_layers = getattr(getattr(clip_model, "transformer", None),
                              "layers", self._DEFAULT_NUM_LAYERS)

        self.visual_projection = self._build_projection(
            clip_model.visual.proj, visual_layers,
            use_one_projection1, projection_random_init1, trainable_projection1)
        self.textual_projection = self._build_projection(
            clip_model.text_projection, text_layers,
            use_one_projection2, projection_random_init2, trainable_projection2)

    @staticmethod
    def _build_projection(pretrained, num_layers, shared, random_init, trainable):
        """Build a shared projection or a per-layer stack ending in ``pretrained``."""
        last = pretrained.clone().detach()
        if shared:
            weight = last
        else:
            if random_init:
                # Same dtype/device as the pre-trained matrix so the stack is
                # homogeneous whatever precision the model was loaded in.
                earlier = [torch.zeros_like(pretrained).normal_(mean=0, std=0.03)
                           for _ in range(num_layers - 1)]
            else:
                earlier = [pretrained.clone().detach() for _ in range(num_layers - 1)]
            weight = torch.stack(earlier + [last])
        return nn.Parameter(weight).requires_grad_(trainable)

    def forward(self, image_features, class_features, class_tokens):
        """Return one score per (image, class) pair.

        Args:
            image_features: per-layer class-token features,
                ``(layers, batch_size, width)``.
            class_features: per-layer token features of the class prompts,
                ``(layers, num_classes, n_ctx, width)``.
            class_tokens: token ids of the class prompts,
                ``(num_classes, n_ctx)``.

        Returns:
            Tensor of shape ``(batch_size, num_classes)``.
        """
        image_features2 = self.ln_post(image_features)                      # (layers, batch_size, width)
        image_embeddings = image_features2 @ self.visual_projection         # (layers, batch_size, embed_dim)

        class_features2 = self.ln_final(class_features).type(self.dtype)    # (layers, num_classes, n_ctx, width)
        # For every prompt keep the feature at the position of its largest
        # token id (presumably the end-of-text token, as in CLIP's encode_text).
        class_index = torch.arange(class_features2.shape[1], device=class_tokens.device)
        token_index = class_tokens.argmax(dim=-1)
        class_embeddings = class_features2[:, class_index, token_index] @ self.textual_projection
                                                                            # (layers, num_classes, embed_dim)

        # Dot products for every (image layer i, text layer m) pair:
        # result is (batch_size, num_classes, image_layers, text_layers).
        score_tensor = torch.einsum("ijk,mnk->jnim", image_embeddings, class_embeddings)

        if self.score_type == 3:
            # Best text layer for each image layer, averaged over image layers.
            return score_tensor.max(dim=-1)[0].mean(dim=-1)

        # Every other score type reduces over all layer pairs at once.
        score_tensor = score_tensor.reshape(*score_tensor.shape[:2], -1)

        if self.score_type == 1:
            return torch.mean(score_tensor, dim=-1)
        if self.score_type == 2:
            # Soft count of layer pairs whose score exceeds the learned threshold.
            return torch.sigmoid(100 * (score_tensor-(self.threshold*self.threshold_weight))).sum(dim=-1)
        return torch.max(score_tensor, dim=-1)[0]

# Train

In [None]:
import time
import torch.nn.functional as f

def train_one_epoch(model, optimizer, criterion, image_features_list, labels_list, max_iter_num=3000, print_log=True):
    """Train ``model`` for one pass over the pre-computed feature batches.

    Args:
        model: module called as ``model(image_features, class_features, class_tokens)``;
            ``class_features`` and ``class_tokens`` are read from the notebook globals.
        optimizer: optimizer stepping the model's parameters.
        criterion: loss called as ``criterion(output, labels)``.
        image_features_list: list of per-batch image feature tensors.
        labels_list: list of per-batch label tensors, aligned with the features.
        max_iter_num: maximum number of batches processed in this epoch.
        print_log: print loss/speed/accuracy of the current window every 10 batches.

    Returns:
        Training accuracy in percent over all batches processed in this epoch
        (0.0 when no batch was processed). The value does not depend on
        ``print_log``.
    """
    model.train()

    start_time = time.time()
    running_loss = 0.0      # loss summed over the current logging window
    window_total = 0
    window_correct = 0
    epoch_total = 0         # never reset: backs the returned accuracy
    epoch_correct = 0

    # Cap the epoch at exactly max_iter_num batches.
    num_batches = min(len(image_features_list), max_iter_num)

    for i in range(num_batches):
        image_features = image_features_list[i]
        labels = labels_list[i]

        optimizer.zero_grad()

        output = model(image_features, class_features, class_tokens)

        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

        predicted = torch.argmax(output, dim=-1)
        batch_total = labels.size(0)
        batch_correct = (predicted == labels).sum().item()
        window_total += batch_total
        window_correct += batch_correct
        epoch_total += batch_total
        epoch_correct += batch_correct

        running_loss += loss.item()

        if print_log and i != 0 and i % 10 == 0:    # print every 10 batches
            speed = (i+1)/(time.time()-start_time)
            window_accuracy = 100 * window_correct / window_total
            print('[%5d] loss: %.3f, speed: %.2f, accuracy: %.2f %%' % (i, running_loss, speed, window_accuracy))

            # Start a new logging window; the epoch counters are left untouched.
            running_loss = 0.0
            window_total = 0
            window_correct = 0

    if epoch_total == 0:
        return 0.0
    return 100 * epoch_correct / epoch_total

    
def test_model(model, image_features_list, labels_list, max_iter_num=1000, print_log=True):

    model.eval()

    start_time = time.time()
    total = 0
    correct = 0
    with torch.no_grad():
        for i in range(len(image_features_list)):
            image_features = image_features_list[i]
            labels = labels_list[i]

            # forward + backward + optimize
            output = model(image_features, class_features, class_tokens)

            predicted = torch.argmax(output, dim=-1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        
            if i != 0 and i % max_iter_num == 0:
                break
            
        accuracy = 100 * correct / total

        if print_log:
            print('Finished Testing')
            print('Testing accuracy: %.1f %%' %(accuracy))
            
    return accuracy

In [None]:
class EarlyStopping:
    """Counts consecutive evaluations that fail to beat the best accuracy so far.

    Attributes:
        accuracy: best accuracy seen so far (starts at 0).
        patience: number of consecutive non-improving steps.
        patience_limit: non-improving steps after which ``is_stop`` is true.
    """

    def __init__(self, patience=5):
        self.patience_limit = patience
        self.accuracy = 0
        self.patience = 0

    def step(self, accuracy):
        """Record a new accuracy; only a strict improvement resets the counter."""
        improved = accuracy > self.accuracy
        if not improved:
            self.patience += 1
            return
        self.accuracy = accuracy
        self.patience = 0

    def is_stop(self):
        """Return True once the counter has reached the patience limit."""
        return not (self.patience < self.patience_limit)

In [None]:
def start_train(model, criterion, optimizer, num_epochs=200, patience=5, 
                save_path='output0316.txt', save_content='', print_log=True):
    """Train with early stopping on test accuracy and append a summary to a log file.

    Each epoch trains on the global ``train_image_features_list`` /
    ``train_labels_list`` and evaluates on ``test_image_features_list`` /
    ``test_labels_list``. Training stops once the test accuracy has failed to
    improve for ``patience`` consecutive epochs.

    Args:
        model: model to train.
        criterion: loss function passed to ``train_one_epoch``.
        optimizer: optimizer passed to ``train_one_epoch``.
        num_epochs: maximum number of epochs.
        patience: early-stopping patience, in epochs.
        save_path: text file the summary is appended to.
        save_content: free-text tag written next to the epoch count.
        print_log: forwarded to ``train_one_epoch`` and ``test_model``.

    The summary holds the number of epochs run and the test accuracy of the
    last epoch (``nan`` if no epoch ran).
    """
    early_stop = EarlyStopping(patience)

    # Defaults for the summary so that num_epochs=0 still writes a valid entry.
    epochs_run = 0
    test_accuracy = float('nan')

    for epoch in range(num_epochs):  # loop over the dataset multiple times
        epochs_run = epoch + 1
        print("------------------ Training Epoch {} ------------------".format(epoch+1))
        train_accuracy = train_one_epoch(model, optimizer, criterion, 
                                        train_image_features_list, train_labels_list, max_iter_num=1000, print_log=print_log)
        test_accuracy = test_model(model, test_image_features_list, test_labels_list, print_log=print_log)
        print(f"train accuracy: {train_accuracy:.2f}%\ttest accuracy: {test_accuracy:.2f}%")

        early_stop.step(test_accuracy)
        if early_stop.is_stop():
            print(f"-\ntrain epoch: {epoch}")
            break

    # `log_file` rather than `f`, which is the alias of torch.nn.functional in this notebook.
    with open(save_path, 'a') as log_file:
        log_file.write(f"{epochs_run}\t"+"="*10+f"{save_content}\n")
        log_file.write(f"accuracy: {test_accuracy:.4f}\n")


    print('Finished Training')

# inference

In [None]:
# Fresh copy of the pre-trained CLIP model for the experiments below.
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, preprocess = clip.load("ViT-B/32", device=device)

# Shared training budget: upper bound on epochs and early-stopping patience.
num_epochs = 400
patience = 5

In [None]:
# train all by one tensor / max_score
import os

# One shared, trainable projection per modality; score = max over all layer pairs.
model = OurCLIP(clip_model, 
                trainable_projection1=True, trainable_projection2=True,
                score_type=0).to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.1)

# Create the checkpoint directory before training: torch.save fails when the
# parent directory is missing, which would throw away the finished run.
os.makedirs('saved', exist_ok=True)

start_train(model, criterion, optimizer, num_epochs=num_epochs, patience=patience,
            save_path='output0316.txt', save_content='train all by one tensor / max_score', print_log=False)

torch.save(model.state_dict(), 'saved/oneMaxModel.pt')

------------------ Training Epoch 1 ------------------
train accuracy: 96.52%	test accuracy: 95.71%
------------------ Training Epoch 2 ------------------
train accuracy: 97.67%	test accuracy: 95.71%
------------------ Training Epoch 3 ------------------
train accuracy: 98.06%	test accuracy: 95.43%
------------------ Training Epoch 4 ------------------
train accuracy: 98.24%	test accuracy: 95.18%
------------------ Training Epoch 5 ------------------
train accuracy: 98.34%	test accuracy: 94.82%
------------------ Training Epoch 6 ------------------
train accuracy: 98.50%	test accuracy: 94.66%
-
train epoch: 5
Finished Training


In [None]:
# train all by one tensor / new_score
# reference: https://discuss.pytorch.org/t/is-there-any-soft-way-of-counting-positive-values-with-grad-reserved/158975/2
import os

# Reload CLIP so this run starts from the pre-trained weights.
clip_model, preprocess = clip.load("ViT-B/32", device=device)
# One shared, trainable projection per modality; score_type=3 takes the max
# over text layers and then the mean over image layers.
model = OurCLIP(clip_model, 
                trainable_projection1=True, trainable_projection2=True,
                score_type=3).to(device)

# for name, param in model.named_parameters():
#     if param.requires_grad:
#         print(name, param.shape)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.1)

# Create the checkpoint directory before training: torch.save fails when the
# parent directory is missing, which would throw away the finished run.
os.makedirs('saved', exist_ok=True)

start_train(model, criterion, optimizer, num_epochs=num_epochs, patience=patience,
            save_path='output0316.txt', save_content='train all by one tensor / new_score', print_log=False)

torch.save(model.state_dict(), 'saved/oneNewModel.pt')

------------------ Training Epoch 1 ------------------
train accuracy: 82.79%	test accuracy: 47.95%
------------------ Training Epoch 2 ------------------
train accuracy: 92.71%	test accuracy: 54.07%
------------------ Training Epoch 3 ------------------
train accuracy: 94.58%	test accuracy: 78.14%
------------------ Training Epoch 4 ------------------
train accuracy: 95.51%	test accuracy: 85.20%
------------------ Training Epoch 5 ------------------
train accuracy: 95.98%	test accuracy: 89.18%
------------------ Training Epoch 6 ------------------
train accuracy: 96.27%	test accuracy: 90.60%
------------------ Training Epoch 7 ------------------
train accuracy: 96.49%	test accuracy: 91.45%
------------------ Training Epoch 8 ------------------
train accuracy: 96.64%	test accuracy: 91.64%
------------------ Training Epoch 9 ------------------
train accuracy: 96.79%	test accuracy: 91.70%
------------------ Training Epoch 10 ------------------
train accuracy: 96.93%	test accuracy: 91.73%

In [None]:
# train all by each tensor(init with last weight) / max_score
import os

# Reload CLIP so this run starts from the pre-trained weights.
clip_model, preprocess = clip.load("ViT-B/32", device=device)
# One trainable projection per layer, each initialised from the pre-trained
# projection; score = max over all layer pairs.
model = OurCLIP(clip_model, 
                use_one_projection1=False, use_one_projection2=False,
                trainable_projection1=True, trainable_projection2=True,
                score_type=0).to(device)

# for name, param in model.named_parameters():
#     if param.requires_grad:
#         print(name, param.shape)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.1)

# Create the checkpoint directory before training: torch.save fails when the
# parent directory is missing, which would throw away the finished run.
os.makedirs('saved', exist_ok=True)

start_train(model, criterion, optimizer, num_epochs=num_epochs, patience=patience,
            save_path='output0316.txt', save_content='train all by each tensor(init with last weight) / max_score',
            print_log=False)

torch.save(model.state_dict(), 'saved/allMaxModel.pt')

------------------ Training Epoch 1 ------------------
train accuracy: 96.45%	test accuracy: 95.67%
------------------ Training Epoch 2 ------------------
train accuracy: 97.67%	test accuracy: 95.75%
------------------ Training Epoch 3 ------------------
train accuracy: 98.05%	test accuracy: 95.49%
------------------ Training Epoch 4 ------------------
train accuracy: 98.26%	test accuracy: 95.23%
------------------ Training Epoch 5 ------------------
train accuracy: 98.36%	test accuracy: 94.87%
------------------ Training Epoch 6 ------------------
train accuracy: 98.51%	test accuracy: 94.72%
------------------ Training Epoch 7 ------------------
train accuracy: 98.65%	test accuracy: 94.49%
-
train epoch: 6
Finished Training


In [None]:
# train all by each tensor(init with last weight) / new_score
import os

# Reload CLIP so this run starts from the pre-trained weights.
clip_model, preprocess = clip.load("ViT-B/32", device=device)
# One trainable projection per layer, each initialised from the pre-trained
# projection; score_type=3 takes the max over text layers and then the mean
# over image layers.
model = OurCLIP(clip_model, 
                use_one_projection1=False, use_one_projection2=False,
                trainable_projection1=True, trainable_projection2=True,
                score_type=3).to(device)

# for name, param in model.named_parameters():
#     if param.requires_grad:
#         print(name, param.shape)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.1)

# Create the checkpoint directory before training: torch.save fails when the
# parent directory is missing, which would throw away the finished run.
os.makedirs('saved', exist_ok=True)

start_train(model, criterion, optimizer, num_epochs=num_epochs, patience=patience,
            save_path='output0316.txt', save_content='train all by each tensor(init with last weight) / new_score',
            print_log=False)

torch.save(model.state_dict(), 'saved/allNewModel.pt')

------------------ Training Epoch 1 ------------------
train accuracy: 58.84%	test accuracy: 43.89%
------------------ Training Epoch 2 ------------------
train accuracy: 73.93%	test accuracy: 46.34%
------------------ Training Epoch 3 ------------------
train accuracy: 79.88%	test accuracy: 45.86%
------------------ Training Epoch 4 ------------------
train accuracy: 83.76%	test accuracy: 48.31%
------------------ Training Epoch 5 ------------------
train accuracy: 86.60%	test accuracy: 50.30%
------------------ Training Epoch 6 ------------------
train accuracy: 88.39%	test accuracy: 51.17%
------------------ Training Epoch 7 ------------------
train accuracy: 89.66%	test accuracy: 53.47%
------------------ Training Epoch 8 ------------------
train accuracy: 90.46%	test accuracy: 58.91%
------------------ Training Epoch 9 ------------------
train accuracy: 91.33%	test accuracy: 64.95%
------------------ Training Epoch 10 ------------------
train accuracy: 91.89%	test accuracy: 69.98%

In [None]:
# train all by each tensor(init with randn weight) / max_score
import os

# Reload CLIP so this run starts from the pre-trained weights.
clip_model, preprocess = clip.load("ViT-B/32", device=device)
# One trainable projection per layer; all but the last are randomly
# initialised. Score = max over all layer pairs.
model = OurCLIP(clip_model, 
                use_one_projection1=False, use_one_projection2=False,
                trainable_projection1=True, trainable_projection2=True,
                projection_random_init1=True,
                projection_random_init2=True,
                score_type=0).to(device)

# for name, param in model.named_parameters():
#     if param.requires_grad:
#         print(name, param.shape)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.1)

# Create the checkpoint directory before training: torch.save fails when the
# parent directory is missing, which would throw away the finished run.
os.makedirs('saved', exist_ok=True)

start_train(model, criterion, optimizer, num_epochs=num_epochs, patience=patience,
            save_path='output0316.txt', save_content='train all by each tensor(init with randn weight) / max_score',
            print_log=False)

torch.save(model.state_dict(), 'saved/allMaxRandInitModel.pt')

------------------ Training Epoch 1 ------------------
train accuracy: 96.52%	test accuracy: 95.75%
------------------ Training Epoch 2 ------------------
train accuracy: 97.68%	test accuracy: 95.80%
------------------ Training Epoch 3 ------------------
train accuracy: 98.03%	test accuracy: 95.53%
------------------ Training Epoch 4 ------------------
train accuracy: 98.28%	test accuracy: 95.25%
------------------ Training Epoch 5 ------------------
train accuracy: 98.37%	test accuracy: 94.91%
------------------ Training Epoch 6 ------------------
train accuracy: 98.52%	test accuracy: 94.77%
------------------ Training Epoch 7 ------------------
train accuracy: 98.65%	test accuracy: 94.52%
-
train epoch: 6
Finished Training


In [None]:
# train all by each tensor(init with randn weight) / new_score
import os

# Reload CLIP so this run starts from the pre-trained weights.
clip_model, preprocess = clip.load("ViT-B/32", device=device)
# One trainable projection per layer; all but the last are randomly
# initialised. score_type=3 takes the max over text layers and then the mean
# over image layers.
model = OurCLIP(clip_model, 
                use_one_projection1=False, use_one_projection2=False,
                trainable_projection1=True, trainable_projection2=True,
                projection_random_init1=True,
                projection_random_init2=True,
                score_type=3).to(device)

# for name, param in model.named_parameters():
#     if param.requires_grad:
#         print(name, param.shape)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.1)

# Create the checkpoint directory before training: torch.save fails when the
# parent directory is missing, which would throw away the finished run.
os.makedirs('saved', exist_ok=True)

start_train(model, criterion, optimizer, num_epochs=num_epochs, patience=patience,
            save_path='output0316.txt', save_content='train all by each tensor(init with randn weight) / new_score',
            print_log=False)

torch.save(model.state_dict(), 'saved/allNewRandInitModel.pt')

------------------ Training Epoch 1 ------------------
train accuracy: 84.14%	test accuracy: 64.72%
------------------ Training Epoch 2 ------------------
train accuracy: 90.91%	test accuracy: 85.47%
------------------ Training Epoch 3 ------------------
train accuracy: 93.71%	test accuracy: 90.05%
------------------ Training Epoch 4 ------------------
train accuracy: 94.90%	test accuracy: 90.95%
------------------ Training Epoch 5 ------------------
train accuracy: 95.64%	test accuracy: 91.22%
------------------ Training Epoch 6 ------------------
train accuracy: 96.13%	test accuracy: 91.07%
------------------ Training Epoch 7 ------------------
train accuracy: 96.22%	test accuracy: 90.34%
------------------ Training Epoch 8 ------------------
train accuracy: 96.26%	test accuracy: 90.22%
------------------ Training Epoch 9 ------------------
train accuracy: 96.46%	test accuracy: 91.08%
------------------ Training Epoch 10 ------------------
train accuracy: 96.58%	test accuracy: 90.99%

# Fine-tuning the saved models with LR = 0.00001

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, preprocess = clip.load("ViT-B/32", device=device)

# Rebuild every OurCLIP variant and restore its checkpoint from 'saved/'.
# The constructor flags must match the ones used when the checkpoint was
# written, otherwise load_state_dict() reports missing/unexpected keys.
# map_location=device lets checkpoints written on a GPU machine load on a
# CPU-only runtime as well (torch.load otherwise tries to restore tensors to
# the device they were saved from).

# Shared projection, score_type=0.
oneMaxModel = OurCLIP(clip_model, trainable_projection1=True, trainable_projection2=True, score_type=0).to(device)
oneMaxModel.load_state_dict(torch.load('saved/oneMaxModel.pt', map_location=device))

# Shared projection, score_type=3.
oneNewModel = OurCLIP(clip_model, trainable_projection1=True, trainable_projection2=True, score_type=3).to(device)
oneNewModel.load_state_dict(torch.load('saved/oneNewModel.pt', map_location=device))

# Separate projections, score_type=0.
allMaxModel = OurCLIP(clip_model, 
                use_one_projection1=False, use_one_projection2=False,
                trainable_projection1=True, trainable_projection2=True,
                score_type=0).to(device)
allMaxModel.load_state_dict(torch.load('saved/allMaxModel.pt', map_location=device))

# Separate projections, score_type=3.
allNewModel = OurCLIP(clip_model, 
                use_one_projection1=False, use_one_projection2=False,
                trainable_projection1=True, trainable_projection2=True,
                score_type=3).to(device)
allNewModel.load_state_dict(torch.load('saved/allNewModel.pt', map_location=device))

# Separate, randomly initialised projections, score_type=0.
allMaxRandInitModel = OurCLIP(clip_model, 
                use_one_projection1=False, use_one_projection2=False,
                trainable_projection1=True, trainable_projection2=True,
                projection_random_init1=True,
                projection_random_init2=True,
                score_type=0).to(device)
allMaxRandInitModel.load_state_dict(torch.load('saved/allMaxRandInitModel.pt', map_location=device))

# Separate, randomly initialised projections, score_type=3.
allNewRandInitModel = OurCLIP(clip_model, 
                use_one_projection1=False, use_one_projection2=False,
                trainable_projection1=True, trainable_projection2=True,
                projection_random_init1=True,
                projection_random_init2=True,
                score_type=3).to(device)
allNewRandInitModel.load_state_dict(torch.load('saved/allNewRandInitModel.pt', map_location=device))

<All keys matched successfully>

In [None]:
# Fine-tune the reloaded oneMaxModel checkpoint with a smaller learning rate (1e-5).
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(oneMaxModel.parameters(), lr=0.00001, momentum=0.1)

start_train(oneMaxModel, criterion, optimizer, num_epochs=num_epochs, patience=patience,
            save_path='output0316.txt', save_content='oneMaxModel / lr=0.00001',
            print_log=False)

# Save the network that was just fine-tuned. The global `model` refers to a
# different network left over from an earlier cell, so it must not be used here.
torch.save(oneMaxModel.state_dict(), 'saved/oneMaxModelFT.pt')

------------------ Training Epoch 1 ------------------
train accuracy: 95.57%	test accuracy: 96.13%
------------------ Training Epoch 2 ------------------
train accuracy: 96.45%	test accuracy: 96.54%
------------------ Training Epoch 3 ------------------
train accuracy: 96.68%	test accuracy: 96.68%
------------------ Training Epoch 4 ------------------
train accuracy: 96.89%	test accuracy: 96.76%
------------------ Training Epoch 5 ------------------
train accuracy: 97.00%	test accuracy: 96.85%
------------------ Training Epoch 6 ------------------
train accuracy: 96.99%	test accuracy: 96.85%
------------------ Training Epoch 7 ------------------
train accuracy: 97.03%	test accuracy: 96.85%
------------------ Training Epoch 8 ------------------
train accuracy: 97.04%	test accuracy: 96.86%
------------------ Training Epoch 9 ------------------
train accuracy: 97.06%	test accuracy: 96.84%
------------------ Training Epoch 10 ------------------
train accuracy: 97.07%	test accuracy: 96.83%

In [None]:
# Fine-tune the reloaded oneNewModel checkpoint with a smaller learning rate (1e-5).
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(oneNewModel.parameters(), lr=0.00001, momentum=0.1)

start_train(oneNewModel, criterion, optimizer, num_epochs=num_epochs, patience=patience,
            save_path='output0316.txt', save_content='oneNewModel / lr=0.00001',
            print_log=False)

# Save the network that was just fine-tuned. The global `model` refers to a
# different network left over from an earlier cell, so it must not be used here.
torch.save(oneNewModel.state_dict(), 'saved/oneNewModelFT.pt')

------------------ Training Epoch 1 ------------------
train accuracy: 92.56%	test accuracy: 93.39%
------------------ Training Epoch 2 ------------------
train accuracy: 93.86%	test accuracy: 94.34%
------------------ Training Epoch 3 ------------------
train accuracy: 94.43%	test accuracy: 94.56%
------------------ Training Epoch 4 ------------------
train accuracy: 94.59%	test accuracy: 94.67%
------------------ Training Epoch 5 ------------------
train accuracy: 94.60%	test accuracy: 94.72%
------------------ Training Epoch 6 ------------------
train accuracy: 94.68%	test accuracy: 94.73%
------------------ Training Epoch 7 ------------------
train accuracy: 94.76%	test accuracy: 94.78%
------------------ Training Epoch 8 ------------------
train accuracy: 94.79%	test accuracy: 94.85%
------------------ Training Epoch 9 ------------------
train accuracy: 94.81%	test accuracy: 94.84%
------------------ Training Epoch 10 ------------------
train accuracy: 94.86%	test accuracy: 94.85%

In [None]:
# Fine-tune the reloaded allMaxModel checkpoint with a smaller learning rate (1e-5).
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(allMaxModel.parameters(), lr=0.00001, momentum=0.1)

start_train(allMaxModel, criterion, optimizer, num_epochs=num_epochs, patience=patience,
            save_path='output0316.txt', save_content='allMaxModel / lr=0.00001',
            print_log=False)

# Save the network that was just fine-tuned. The global `model` refers to a
# different network left over from an earlier cell, so it must not be used here.
torch.save(allMaxModel.state_dict(), 'saved/allMaxModelFT.pt')

------------------ Training Epoch 1 ------------------
train accuracy: 95.52%	test accuracy: 96.16%
------------------ Training Epoch 2 ------------------
train accuracy: 96.49%	test accuracy: 96.62%
------------------ Training Epoch 3 ------------------
train accuracy: 96.70%	test accuracy: 96.75%
------------------ Training Epoch 4 ------------------
train accuracy: 96.94%	test accuracy: 96.86%
------------------ Training Epoch 5 ------------------
train accuracy: 97.01%	test accuracy: 96.92%
------------------ Training Epoch 6 ------------------
train accuracy: 97.00%	test accuracy: 96.91%
------------------ Training Epoch 7 ------------------
train accuracy: 97.06%	test accuracy: 96.92%
------------------ Training Epoch 8 ------------------
train accuracy: 97.06%	test accuracy: 96.91%
------------------ Training Epoch 9 ------------------
train accuracy: 97.08%	test accuracy: 96.94%
------------------ Training Epoch 10 ------------------
train accuracy: 97.11%	test accuracy: 96.95%

In [None]:
# Fine-tune the reloaded allNewModel checkpoint with a smaller learning rate (1e-5).
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(allNewModel.parameters(), lr=0.00001, momentum=0.1)

start_train(allNewModel, criterion, optimizer, num_epochs=num_epochs, patience=patience,
            save_path='output0316.txt', save_content='allNewModel / lr=0.00001',
            print_log=False)

# Save the network that was just fine-tuned. The global `model` refers to a
# different network left over from an earlier cell, so it must not be used here.
torch.save(allNewModel.state_dict(), 'saved/allNewModelFT.pt')

------------------ Training Epoch 1 ------------------
train accuracy: 91.10%	test accuracy: 91.92%
------------------ Training Epoch 2 ------------------
train accuracy: 92.69%	test accuracy: 93.19%
------------------ Training Epoch 3 ------------------
train accuracy: 93.58%	test accuracy: 93.72%
------------------ Training Epoch 4 ------------------
train accuracy: 94.00%	test accuracy: 94.10%
------------------ Training Epoch 5 ------------------
train accuracy: 94.27%	test accuracy: 94.41%
------------------ Training Epoch 6 ------------------
train accuracy: 94.54%	test accuracy: 94.59%
------------------ Training Epoch 7 ------------------
train accuracy: 94.71%	test accuracy: 94.80%
------------------ Training Epoch 8 ------------------
train accuracy: 94.75%	test accuracy: 94.85%
------------------ Training Epoch 9 ------------------
train accuracy: 94.85%	test accuracy: 94.87%
------------------ Training Epoch 10 ------------------
train accuracy: 94.85%	test accuracy: 94.94%

In [None]:
# Fine-tune the reloaded allMaxRandInitModel checkpoint with a smaller learning rate (1e-5).
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(allMaxRandInitModel.parameters(), lr=0.00001, momentum=0.1)

start_train(allMaxRandInitModel, criterion, optimizer, num_epochs=num_epochs, patience=patience,
            save_path='output0316.txt', save_content='allMaxRandInitModel / lr=0.00001',
            print_log=False)

# Save the network that was just fine-tuned. The global `model` refers to a
# different network left over from an earlier cell, so it must not be used here.
torch.save(allMaxRandInitModel.state_dict(), 'saved/allMaxRandInitModelFT.pt')

------------------ Training Epoch 1 ------------------
train accuracy: 95.50%	test accuracy: 96.17%
------------------ Training Epoch 2 ------------------
train accuracy: 96.50%	test accuracy: 96.61%
------------------ Training Epoch 3 ------------------
train accuracy: 96.71%	test accuracy: 96.78%
------------------ Training Epoch 4 ------------------
train accuracy: 96.94%	test accuracy: 96.86%
------------------ Training Epoch 5 ------------------
train accuracy: 97.03%	test accuracy: 96.93%
------------------ Training Epoch 6 ------------------
train accuracy: 97.04%	test accuracy: 96.94%
------------------ Training Epoch 7 ------------------
train accuracy: 97.04%	test accuracy: 96.98%
------------------ Training Epoch 8 ------------------
train accuracy: 97.09%	test accuracy: 96.97%
------------------ Training Epoch 9 ------------------
train accuracy: 97.09%	test accuracy: 96.96%
------------------ Training Epoch 10 ------------------
train accuracy: 97.11%	test accuracy: 96.98%

In [None]:
# Fine-tune the reloaded allNewRandInitModel checkpoint with a smaller learning rate (1e-5).
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(allNewRandInitModel.parameters(), lr=0.00001, momentum=0.1)

start_train(allNewRandInitModel, criterion, optimizer, num_epochs=num_epochs, patience=patience,
            save_path='output0316.txt', save_content='allNewRandInitModel / lr=0.00001',
            print_log=False)

# Save the network that was just fine-tuned. The global `model` refers to a
# different network left over from an earlier cell, so it must not be used here.
torch.save(allNewRandInitModel.state_dict(), 'saved/allNewRandInitModelFT.pt')


------------------ Training Epoch 1 ------------------
train accuracy: 91.08%	test accuracy: 91.29%
------------------ Training Epoch 2 ------------------
train accuracy: 91.30%	test accuracy: 91.44%
------------------ Training Epoch 3 ------------------
train accuracy: 91.48%	test accuracy: 91.66%
------------------ Training Epoch 4 ------------------
train accuracy: 91.63%	test accuracy: 91.82%
------------------ Training Epoch 5 ------------------
train accuracy: 91.80%	test accuracy: 91.94%
------------------ Training Epoch 6 ------------------
train accuracy: 91.95%	test accuracy: 92.11%
------------------ Training Epoch 7 ------------------
train accuracy: 92.05%	test accuracy: 92.21%
------------------ Training Epoch 8 ------------------
train accuracy: 92.17%	test accuracy: 92.31%
------------------ Training Epoch 9 ------------------
train accuracy: 92.21%	test accuracy: 92.42%
------------------ Training Epoch 10 ------------------
train accuracy: 92.36%	test accuracy: 92.51%

# Zero-shot CLIP

In [None]:
# Run on the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load a fresh ViT-B/32 CLIP model together with its preprocessing transform.
clip_model, preprocess = clip.load("ViT-B/32", device=device)

In [None]:
# Restore cached labels and features from pickle files in the working directory.
# NOTE(review): presumably these files were written by an earlier part of this
# notebook — confirm. pickle.load can execute arbitrary code, so only load
# files you produced yourself.

# Training labels and training image features, indexed per batch by the
# train/test helpers in this notebook.
with open('train_labels_list0.pickle', 'rb') as fr:
    train_labels_list0 = pickle.load(fr)
with open('train_images_feature_list.pickle', 'rb') as fr:
    train_images_feature_list = pickle.load(fr)
# Test labels and test image features, same layout as the training lists.
with open('test_labels_list0.pickle', 'rb') as fr:
    test_labels_list0 = pickle.load(fr)
with open('test_images_feature_list.pickle', 'rb') as fr:
    test_images_feature_list = pickle.load(fr)
# Text-side features of the class prompts, passed to the model as `class_features`.
with open('class_features0.pickle', 'rb') as fr:
    class_features0 = pickle.load(fr)

In [None]:
class _CLIP(nn.Module):
    """CLIP scoring head that operates on pre-extracted features.

    Holds trainable copies of CLIP's visual and text projection matrices,
    the two final LayerNorms (frozen) and CLIP's logit scale, and turns
    un-projected image/text features into scaled cosine-similarity logits.
    """

    def __init__(self, clip_model):
        """Build the head from a loaded CLIP model.

        Args:
            clip_model: object exposing `dtype`, `visual.proj`,
                `visual.ln_post`, `text_projection`, `ln_final` and
                `logit_scale` (as returned by `clip.load`).
        """
        super().__init__()

        self.dtype = clip_model.dtype

        # Detached copies, so training them leaves clip_model's own
        # projection matrices untouched.
        visual_proj = clip_model.visual.proj.detach().clone()
        self.visual_projection = nn.Parameter(visual_proj, requires_grad=True)
        text_proj = clip_model.text_projection.detach().clone()
        self.textual_projection = nn.Parameter(text_proj, requires_grad=True)

        # Original author's note (translated): whether these LayerNorms should
        # be trained has not been decided yet; they are frozen for now.
        # The modules are shared with clip_model, so freezing happens in place.
        post_norm = clip_model.visual.ln_post
        post_norm.requires_grad_(False)
        self.ln_post = post_norm
        final_norm = clip_model.ln_final
        final_norm.requires_grad_(False)
        self.ln_final = final_norm

        # Shared with clip_model (not copied).
        self.logit_scale = clip_model.logit_scale

    def forward(self, image_features, class_features, class_tokens):
        """Score image features against class text features.

        Args:
            image_features: un-projected image features; per the original
                notes (batch_size, 768) for ViT-B/32.
            class_features: un-projected per-token text features; per the
                original notes (seq_len, 77, 512).
            class_tokens: token ids of the class prompts; the position of the
                largest id in each row selects which token feature is used
                (presumably the end-of-text token, as in CLIP — confirm).

        Returns:
            Tuple `(logits_per_image, logits_per_text)`; the second is the
            transpose of the first.
        """
        # Image branch: LayerNorm, then project into the joint space.
        image_embed = self.ln_post(image_features) @ self.visual_projection

        # Text branch: LayerNorm, pick one token per prompt, then project.
        text_hidden = self.ln_final(class_features).type(self.dtype)
        prompt_rows = torch.arange(text_hidden.shape[0])
        picked_positions = class_tokens.argmax(dim=-1)
        text_embed = text_hidden[prompt_rows, picked_positions] @ self.textual_projection

        # L2-normalise both sides so the dot product is a cosine similarity.
        image_embed = image_embed / image_embed.norm(dim=1, keepdim=True)
        text_embed = text_embed / text_embed.norm(dim=1, keepdim=True)

        # Scaled cosine similarity used as logits.
        scale = self.logit_scale.exp()
        logits_per_image = scale * image_embed @ text_embed.t()

        return logits_per_image, logits_per_image.t()

In [None]:
# zeroshot
def test_model_for_clip_model(model, images_feature_list, labels_list, max_iter_num=1000):

    model.eval()

    total = 0
    correct = 0

    with torch.no_grad():
        for i in range(len(labels_list)):
            image_features = images_feature_list[i]
            labels = labels_list[i]

            # forward + backward + optimize
            outputs = model(image_features, class_features0, class_tokens)[0].softmax(dim=-1)
    
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            if i != 0 and i % max_iter_num == 0:
                break

        accuracy = 100 * correct / total
            
        print('Finished Testing')
        print('Testing accuracy: %.2f %%' %(accuracy))

    return accuracy

def train_model_for_clip_model(model, optimizer, criterion, images_feature_list, labels_list, max_iter_num=3000, print_log=True):
    """Run one training pass of `model` over pre-extracted image features.

    Reads the notebook globals `class_features0` and `class_tokens` for the
    text side of the model, and requires `time` to be imported.

    Args:
        model: module called as `model(image_features, class_features0,
            class_tokens)`; element 0 of its result is used as the
            per-image logits.
        optimizer: optimizer over the parameters to update.
        criterion: loss called as `criterion(logits, labels)`. It receives
            raw logits, which is what `torch.nn.CrossEntropyLoss` expects.
        images_feature_list: indexable collection of image-feature batches.
        labels_list: indexable collection of label tensors, aligned with
            `images_feature_list`.
        max_iter_num: the pass stops after the first batch index `i > 0`
            with `i % max_iter_num == 0` (i.e. after `max_iter_num + 1`
            batches).
        print_log: when True, print loss/speed/accuracy every 10 batches and
            reset the running counters afterwards.

    Returns:
        Running accuracy in percent as of the last processed batch. With
        `print_log=True` the counters are reset every 10 batches, so the
        value only covers the batches since the last reset. Returns 0.0 when
        there is nothing to train on.
    """
    model.train()

    running_loss = 0.0
    start_time = time.time()
    total = 0
    correct = 0
    accuracy = 0.0  # defined up front so an empty input does not raise

    for i in range(len(labels_list)):
        image_features = images_feature_list[i]
        labels = labels_list[i]

        # Clear gradients from the previous batch; without this every step
        # would apply the sum of all gradients seen so far.
        optimizer.zero_grad()

        # forward + backward + optimize
        # The criterion gets raw logits: CrossEntropyLoss applies log-softmax
        # itself, so an extra softmax here would flatten the loss.
        logits_per_image = model(image_features, class_features0, class_tokens)[0]

        loss = criterion(logits_per_image, labels)
        loss.backward()
        optimizer.step()

        predicted = torch.argmax(logits_per_image, dim=-1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        accuracy = 100 * correct / total

        # Print performance statistics
        running_loss += loss.item()

        if print_log:
            if i != 0 and i % 10 == 0:    # print every 10 batches
                batch_time = time.time()
                speed = (i+1)/(batch_time-start_time)   # batches per second
                print('[%5d] loss: %.3f, speed: %.2f, accuracy: %.2f %%' %
                    (i, running_loss, speed, accuracy))

                running_loss = 0.0
                total = 0
                correct = 0

        if i != 0 and i % max_iter_num == 0:
            break

    return accuracy

In [None]:
# Zero-shot baseline: wrap the pretrained CLIP weights without any training
# and score the cached test features.
model = _CLIP(clip_model)

accuracy = test_model_for_clip_model(
    model,
    images_feature_list=test_images_feature_list,
    labels_list=test_labels_list0,
)

# Append the result to the shared experiment log.
with open("output0316.txt", 'a') as f:
    f.writelines([
        "="*10+"clip / zeroshot\n",
        f"accuracy: {accuracy:.4f}\n",
    ])

Finished Testing
Testing accuracy: 91.63 %


In [None]:
# Run on the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
clip_model, preprocess = clip.load("ViT-B/32", device=device)

# Wrap CLIP and freeze the logit scale.
model = _CLIP(clip_model)
model.logit_scale.requires_grad_(False)

# Show which parameters will still receive gradients.
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.shape, param.requires_grad)

visual_projection torch.Size([768, 512]) True
textual_projection torch.Size([512, 512]) True


In [None]:
# Early-stopping patience handed to EarlyStopping, and the upper bound on
# the number of training epochs.
patience, num_epochs = 5, 200

In [None]:
import time

# Start from a fresh wrapper around the pretrained CLIP weights and keep the
# logit scale frozen.
model = _CLIP(clip_model)
model.logit_scale.requires_grad_(False)

criterion = torch.nn.CrossEntropyLoss()
# Original author's note (translated): training breaks when lr is smaller than .00001
optimizer = torch.optim.SGD(model.parameters(), lr=0.00001, momentum=0.1)

early_stop = EarlyStopping(patience)

# Train for at most num_epochs epochs, evaluating on the test features after
# each one and feeding the test accuracy to the early-stopping tracker.
for epoch in range(num_epochs):
    print("------------------ Training Epoch {} ------------------".format(epoch+1))
    train_accuracy = train_model_for_clip_model(
        model,
        optimizer,
        criterion,
        images_feature_list=train_images_feature_list,
        labels_list=train_labels_list0,
        max_iter_num=1000,
        print_log=False,
    )
    test_accuracy = test_model_for_clip_model(
        model,
        images_feature_list=test_images_feature_list,
        labels_list=test_labels_list0,
    )
    print(f"train accuracy: {train_accuracy:.2f}%\ttest accuracy: {test_accuracy:.2f}%")

    early_stop.step(test_accuracy)

    if not early_stop.is_stop():
        continue

    print(f"-\ntrain epoch: {epoch}")
    break

# Append the number of epochs run and the last test accuracy to the shared
# experiment log.
with open("output0316.txt", 'a') as f:
    f.write(f"{epoch+1}\t"+"="*10+f"clip linear probing\n")
    f.write(f"accuracy: {test_accuracy:.4f}\n")


print('Finished Training')

------------------ Training Epoch 1 ------------------
Finished Testing
Testing accuracy: 58.69 %
train accuracy: 85.26%	test accuracy: 58.69%
------------------ Training Epoch 2 ------------------
Finished Testing
Testing accuracy: 66.09 %
train accuracy: 64.42%	test accuracy: 66.09%
------------------ Training Epoch 3 ------------------
Finished Testing
Testing accuracy: 84.30 %
train accuracy: 73.89%	test accuracy: 84.30%
------------------ Training Epoch 4 ------------------
Finished Testing
Testing accuracy: 70.91 %
train accuracy: 67.76%	test accuracy: 70.91%
------------------ Training Epoch 5 ------------------
Finished Testing
Testing accuracy: 79.30 %
train accuracy: 74.27%	test accuracy: 79.30%
------------------ Training Epoch 6 ------------------
Finished Testing
Testing accuracy: 73.39 %
train accuracy: 72.28%	test accuracy: 73.39%
------------------ Training Epoch 7 ------------------
Finished Testing
Testing accuracy: 86.71 %
train accuracy: 80.58%	test accuracy: 86.71%