# Set up enviroment

__Define paths & import libraries__

In [4]:
import os, sys
print(os.getcwd())
from pathlib import Path
ROOT_PATH = Path('../../paper_clones/TBPS-CLIP').resolve()
sys.path.append(str(ROOT_PATH))
IMAGE_PATH = Path('../../DATASET').resolve()
sys.path.append(str(IMAGE_PATH))

/dscilab_dungvo/workspace/BA-PRE_THESIS/report/training


In [5]:
import os, json
from torchinfo import summary
import random
import wandb
import time, datetime
from pathlib import Path
import torch
from misc.build import load_checkpoint, cosine_scheduler, build_optimizer
from misc.data import build_pedes_data
from misc.eval import test
from misc.utils import parse_config, init_distributed_mode, set_seed, is_master, is_using_distributed, AverageMeter
from model.tbps_model import clip_vitb, CLIP
from options import get_args
import nltk; nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Prepare vietnamese annotations

__Google translate API 's results__

In [6]:
google_translates = open(IMAGE_PATH/'CUHK-PEDES/google_translate.txt', 'r')
google_translates = google_translates.readlines()
google_translates = [x.replace('\n', '').replace('vi: ', '') for x in google_translates]
len(google_translates)

FileNotFoundError: [Errno 2] No such file or directory: '/dscilab_dungvo/workspace/BA-PRE_THESIS/DATASET/CUHK-PEDES/google_translate.txt'

__Annotations from a model on Hugging Face__

In [None]:
anno_path = IMAGE_PATH/'CUHK-PEDES/reid_translate.json'
objs = json.load(open(anno_path))

__Split into train, test__

In [45]:
for i, x in enumerate(objs):
    x['captions_bt'] = [google_translates[i]]
# random.shuffle(objs)

ratio = 0.8
train_objs = objs[:int(len(objs)*ratio)]
test_objs = objs[int(len(objs)*ratio):]

for obj in train_objs: obj['split'] = 'train'
for obj in test_objs: obj['split'] = 'test'

In [46]:

json.dump(train_objs, open(anno_path.parent/'vietnamese'/'train_reid.json', 'w'))
json.dump(test_objs, open(anno_path.parent/'vietnamese'/'test_reid.json', 'w'))

In [47]:
train_objs = json.load(open(anno_path.parent/'vietnamese'/'train_reid.json'))
test_objs = json.load(open(anno_path.parent/'vietnamese'/'test_reid.json'))

# Set training config

In [49]:
config_path = ROOT_PATH/'config/config.yaml'
config = parse_config(config_path)

In [50]:
set_seed(config)
config['log']['print_period'] = 1
config['model']['checkpoint'] = ROOT_PATH/'checkpoint/ViT-B-16.pt'
config['anno_dir'] = anno_path.parent/'vietnamese'
# config['anno_dir'] = ROOT_PATH/'annotation/CUHK-PEDES'

config['image_dir'] = IMAGE_PATH/'CUHK-PEDES/imgs'
config['device'] = 'cuda'
config['model']['use_gather'] = False
config['data']['batch_size'] = 120
config['model']['saved_path'] = ROOT_PATH/"checkpoint"
config['experiment']['text_length'] = 132
config['model']['embed_dim'] = 512
config['schedule']['epoch_warmup'] =2
config['schedule']['epoch'] = 50

In [51]:
meters = {
    "loss": AverageMeter(),
    "nitc_loss": AverageMeter(),
    "ss_loss": AverageMeter(),
    "citc_loss": AverageMeter(),
    "ritc_loss": AverageMeter(),
    "mlm_loss": AverageMeter(),
    "id_loss": AverageMeter(),
}
best_rank_1 = 0.0
best_epoch = 0

__Build dataloader__

In [11]:
dataloader = build_pedes_data(config)
train_loader = dataloader['train_loader']
test_loader = dataloader['test_loader']
num_classes = len(train_loader.dataset.person2text)

In [10]:
from multilingual_clip import pt_multilingual_clip
import transformers

texts = [
    'Three blind horses listening to Mozart.',
    'Älgen är skogens konung!',
    'Wie leben Eisbären in der Antarktis?',
    'Вы знали, что все белые медведи левши?'
]
model_name = 'M-CLIP/XLM-Roberta-Large-Vit-B-32'

# Load Model & Tokenizer
model = pt_multilingual_clip.MultilingualCLIP.from_pretrained(model_name)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

embeddings = model.forward(texts, tokenizer)
print(embeddings.shape)

config.json:   0%|          | 0.00/221 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

torch.Size([4, 512])


__Build model__

In [12]:
from sentence_transformers import SentenceTransformer, util
import transformers
from transformers import CLIPProcessor, CLIPModel
clip_processor = transformers.CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')
tokenizer = transformers.AutoTokenizer.from_pretrained('sentence-transformers/clip-ViT-B-32-multilingual-v1')
clip_b32_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
multilingual_text_model = SentenceTransformer('sentence-transformers/clip-ViT-B-32-multilingual-v1')
multilingual_image_model = SentenceTransformer('sentence-transformers/clip-ViT-B-32')

In [14]:
class MultilingualCLIP(CLIP):
    def __init__(self, *args):
        super().__init__(*args)
        self.device = 'cuda'
        self.initilize_multilingual_encoder()
        self.train(True)
        self.to(self.device)
    
    
    def initilize_multilingual_encoder(self):
        self.vision_encoder  = clip_b32_model.vision_model.to(self.device)
        self.vision_proj = clip_b32_model.visual_projection.to(self.device)
        self.text_encoder = multilingual_text_model.to(self.device)
        for param in self.vision_encoder.embeddings.parameters():
            param.requires_grad = False
        for param in self.text_encoder[0].auto_model.embeddings.parameters():
            param.requires_grad = False
        # for i in range(0, 2):
        #     for param in self.text_encoder[0].auto_model.transformer.layer[i].parameters():
        #         param.requires_grad = False
        class TrickTokenize:
            def __call__(self, text, context_length=None):
                if type(text) == str:
                    text = [text]
                self.x = text
                return self
            def to(self, device=None):
                return self.x
        def encode_text(texts, return_dense=None):
            if return_dense:
                return self.text_encoder.encode(texts, convert_to_tensor=True, device=self.device), None
            else:
                return self.text_encoder.encode(texts, convert_to_tensor=True, device=self.device)
        self.encode_text = encode_text
        self.tokenize = TrickTokenize() # do nothing because tokenize is done in encode_text
        
    def encode_image(self, image, return_dense=False):
        if return_dense:
            return self.vision_proj(self.vision_encoder(image.to(self.device)).pooler_output), None
        else:
            return self.vision_proj(self.vision_encoder(image.to(self.device)).pooler_output)

In [15]:
model = MultilingualCLIP(config, None, None, num_classes, config.experiment.ritc_eps)

In [15]:
# k = random.randint(0, len(train_loader.dataset))
# img = train_loader.dataset[k]['image']
# import torchvision
# normalize = torchvision.transforms.Normalize(
#     mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
# un_normalize = torchvision.transforms.Normalize(mean=[-0.485/0.229, -0.456/0.224, -0.406/0.225], std=[1/0.229, 1/0.224, 1/0.225])
# def convert_pytorch_img(img):
#     import torchvision.transforms.functional as TF
#     import matplotlib.pyplot as plt
#     return TF.to_pil_image(un_normalize(img))
#     # plt.imshow(pil_img)
#     # plt.show()
# # 
# image_trasnform = torchvision.transforms.Compose([
#     torchvision.transforms.ToTensor(),
#     normalize
# ])

__Build opimizer, learning scchduler__

In [16]:
config.schedule.niter_per_ep = len(train_loader)
lr_schedule = cosine_scheduler(config)
optimizer = build_optimizer(config, model)
scaler = torch.cuda.amp.GradScaler()
type(lr_schedule), type(optimizer), type(scaler)

(numpy.ndarray, torch.optim.adamw.AdamW, torch.cuda.amp.grad_scaler.GradScaler)

# Set up logger wandb

In [17]:
os.environ['WANDB_NOTEBOOK_NAME'] = 'TBPS-CLIP_training.ipynb'
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mdung-vo20csehcmut[0m ([33mtuandung[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [18]:
wandb.finish()
run = wandb.init(
    project="TBPS-CLIP_experiment_9_11",
    config=config,
    name="training_" + '14',
)

# Evaluate function

In [22]:
import torch
import torch.nn.functional as F
# import clip
# from text_utils.tokenizer import tokenize

TEMP = None
@torch.no_grad()
def test(model, data_loader, max_length, device):
    tokenize = model.tokenize
    # switch to evaluate mode
    model.eval()

    dataset = data_loader.dataset
    texts = dataset.text
    num_text = len(texts)
    text_bs = 256

    text_feats = []
    for i in range(0, num_text, text_bs):
        text = texts[i: min(num_text, i + text_bs)]
        text = tokenize(text, context_length=max_length).to(device)
        text_feat = F.normalize(model.encode_text(text), dim=-1)
        
        
        text_feats.append(text_feat)
    text_feats = torch.cat(text_feats, dim=0)

    image_feats = []
    for image in data_loader:
        image = image.to(device)
        image_feat = F.normalize(model.encode_image(image), dim=-1)
        image_feats.append(image_feat)
    image_feats = torch.cat(image_feats, dim=0)

    sims_matrix = text_feats @ image_feats.t()
    eval_result = metric_eval(sims_matrix, dataset.img2person, dataset.txt2person)

    return eval_result


@torch.no_grad()
def metric_eval(scores_t2i, img2person, txt2person):
    device = scores_t2i.device
    img2person = img2person.to(device)
    txt2person = txt2person.to(device)

    index = torch.argsort(scores_t2i, dim=-1, descending=True)
    pred_person = img2person[index]
    matches = (txt2person.view(-1, 1).eq(pred_person)).long()

    def acc_k(matches, k=1):
        matches_k = matches[:, :k].sum(dim=-1)
        matches_k = torch.sum((matches_k > 0))
        return 100.0 * matches_k / matches.size(0)

    # Compute metrics
    ir1 = acc_k(matches, k=1).item()
    ir5 = acc_k(matches, k=5).item()
    ir10 = acc_k(matches, k=10).item()
    ir_mean = (ir1 + ir5 + ir10) / 3

    real_num = matches.sum(dim=-1)
    tmp_cmc = matches.cumsum(dim=-1).float()
    order = torch.arange(start=1, end=matches.size(1) + 1, dtype=torch.long).to(device)
    tmp_cmc /= order
    tmp_cmc *= matches
    AP = tmp_cmc.sum(dim=-1) / real_num
    mAP = AP.mean() * 100.0

    eval_result = {'r1': ir1,
                   'r5': ir5,
                   'r10': ir10,
                   'r_mean': ir_mean,
                   'mAP': mAP.item()
                   }

    return eval_result


# Training loop

In [19]:
it = 0
logger = run
for epoch in range(config.schedule.epoch):
    start_time = time.time()
    for meter in meters.values():
        meter.reset()
    model.train()
    for i, batch in enumerate(train_loader):
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr_schedule[it] * param_group['ratio']
        if epoch == 0:
            alpha = config.model.softlabel_ratio * \
                min(1.0, i / len(train_loader))
        else:
            alpha = config.model.softlabel_ratio

        with torch.autocast(device_type='cuda'):
            ret = model(batch, alpha)
            loss = sum([v for k, v in ret.items() if "loss" in k])
        batch_size = batch['image'].shape[0]
        meters['loss'].update(loss.item(), batch_size)
        meters['nitc_loss'].update(ret.get('nitc_loss', 0), batch_size)
        meters['ss_loss'].update(ret.get('ss_loss', 0), batch_size)
        meters['citc_loss'].update(ret.get('citc_loss', 0), batch_size)
        meters['ritc_loss'].update(ret.get('ritc_loss', 0), batch_size)
        meters['mlm_loss'].update(ret.get('mlm_loss', 0), batch_size)
        meters['id_loss'].update(ret.get('id_loss', 0), batch_size)
        batch_size = batch['image'].shape[0]
        logger.log({
            'epoch': epoch, 
            'step': i,
            'lr': lr_schedule[it],
            **{k: v.avg for k, v in meters.items()}
        })       

        # scaler.scale(loss).backward()
        # scaler.step(optimizer)
        # scaler.update()
        # model.zero_grad()

        # if (i % 5 == 0) or (i == len(train_loader) - 1):
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        it += 1
        if (i + 1) % config.log.print_period == 0:
            info_str = f"Epoch[{epoch + 1}] Iteration[{i + 1}/{len(train_loader)}]"
            # log loss
            for k, v in meters.items():
                if v.val != 0:
                    info_str += f", {k}: {v.val:.4f}"
            info_str += f", Base Lr: {param_group['lr']:.2e}"
            print(info_str)

    end_time = time.time()
    time_per_batch = (end_time - start_time) / (i + 1)
    time_epoch = end_time - start_time
    print("Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
          .format(epoch + 1, time_per_batch, train_loader.batch_size / time_per_batch))

    eval_result = test(
        model, dataloader['test_loader'], config['experiment']['text_length'], config.device)
    rank_1, rank_5, rank_10, map = eval_result['r1'], eval_result['r5'], eval_result['r10'], eval_result['mAP']
    logger.log({
        'epoch': epoch, 
        'rank_1': rank_1,
        'rank_5': rank_5,
        'rank_10': rank_10,
        'mAP': map,
        'epoch_time': time_epoch,
    })

    print('Acc@1 {top1:.5f} Acc@5 {top5:.5f} Acc@10 {top10:.5f} mAP {mAP:.5f}'.format(top1=rank_1, top5=rank_5,
                                                                                      top10=rank_10, mAP=map))
    torch.cuda.empty_cache()
    if best_rank_1 < rank_1:
        best_rank_1 = rank_1
        best_epoch = epoch

        save_obj = {
            'model': model.module.state_dict(),
            'optimizer': optimizer.state_dict(),
            'config': config,
        }
        torch.save(save_obj, os.path.join(
            config.model.saved_path, 'checkpoint_best_18_11.pth'))

print(f"best Acc@1: {best_rank_1} at epoch {best_epoch + 1}")

Epoch[1] Iteration[1/268], loss: 6.4066, nitc_loss: 4.5779, ss_loss: 2.0207, citc_loss: 0.0004, ritc_loss: -0.1924, Base Lr: 1.00e-06
Epoch[1] Iteration[2/268], loss: 6.4046, nitc_loss: 4.5606, ss_loss: 2.0388, citc_loss: 0.0005, ritc_loss: -0.1953, Base Lr: 1.19e-06
Epoch[1] Iteration[3/268], loss: 6.3839, nitc_loss: 4.5439, ss_loss: 2.0328, citc_loss: 0.0006, ritc_loss: -0.1934, Base Lr: 1.37e-06
Epoch[1] Iteration[4/268], loss: 6.4099, nitc_loss: 4.5462, ss_loss: 2.0597, citc_loss: 0.0005, ritc_loss: -0.1965, Base Lr: 1.56e-06
Epoch[1] Iteration[5/268], loss: 6.4025, nitc_loss: 4.5867, ss_loss: 2.0041, citc_loss: 0.0006, ritc_loss: -0.1889, Base Lr: 1.74e-06
Epoch[1] Iteration[6/268], loss: 6.3180, nitc_loss: 4.5351, ss_loss: 1.9704, citc_loss: 0.0005, ritc_loss: -0.1880, Base Lr: 1.93e-06
Epoch[1] Iteration[7/268], loss: 6.2750, nitc_loss: 4.5237, ss_loss: 1.9398, citc_loss: 0.0007, ritc_loss: -0.1892, Base Lr: 2.11e-06
Epoch[1] Iteration[8/268], loss: 6.2599, nitc_loss: 4.4974, ss

KeyboardInterrupt: 

In [24]:
res = test(model, dataloader['test_loader'], config['experiment']['text_length'], config.device)
rank_1, rank_5, rank_10, map = eval_result['r1'], eval_result['r5'], eval_result['r10'], eval_result['mAP']

NameError: name 'eval_result' is not defined

In [26]:
res

{'r1': 0.5098234415054321,
 'r5': 1.8776423931121826,
 'r10': 3.2081573009490967,
 'r_mean': 1.8652077118555705,
 'mAP': 1.310916543006897}