In [1]:
!nvidia-smi

Wed Nov  9 18:10:49 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.31       Driver Version: 465.31       CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:1C:00.0 Off |                  N/A |
| 34%   28C    P8    24W / 350W |      0MiB / 24268MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [2]:
import torch
import numpy as np
import os
import random
from modules.tokenization_clip import SimpleTokenizer as ClipTokenizer
from modules.modeling import CLIP2Video
from evaluation.eval import eval_epoch

from utils.config import get_args
from utils.utils import get_logger
from utils.dataloader import dataloader_msrvtt_train
from utils.dataloader import dataloader_msrvtt_test
from utils.dataloader import dataloader_msrvttfull_test
from utils.dataloader import dataloader_msvd_train
from utils.dataloader import dataloader_msvd_test
from utils.dataloader import dataloader_vatexEnglish_train
from utils.dataloader import dataloader_vatexEnglish_test


In [3]:
DATALOADER_DICT = {}
DATALOADER_DICT["msrvtt"] = {"train":dataloader_msrvtt_train, "test":dataloader_msrvtt_test}
DATALOADER_DICT["msrvttfull"] = {"train":dataloader_msrvtt_train, "val":dataloader_msrvttfull_test, "test":dataloader_msrvttfull_test}
DATALOADER_DICT["msvd"] = {"train":dataloader_msvd_train, "val":dataloader_msvd_test, "test":dataloader_msvd_test}
DATALOADER_DICT["vatexEnglish"] = {"train":dataloader_vatexEnglish_train, "test":dataloader_vatexEnglish_test}


In [4]:

def set_seed_logger(args):
    """Initialize the seed and environment variable

    Args:
        args: the hyper-parameters.

    Returns:
        args: the hyper-parameters modified by the random seed.

    """

    global logger

    # predefining random initial seeds
    random.seed(args.seed)
    os.environ['PYTHONHASHSEED'] = str(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)  # if you are using multi-GPU.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

    # get logger
    logger = get_logger(os.path.join(args.output_dir))

    return args

In [5]:

def init_device(args, local_rank):
    """Initialize device to determine CPU or GPU

     Args:
         args: the hyper-parameters
         local_rank: GPU id

     Returns:
         devices: cuda
         n_gpu: number of gpu

     """
    global logger
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", local_rank)
    n_gpu = torch.cuda.device_count()
    logger.info("device: {} n_gpu: {}".format(device, n_gpu))
    args.n_gpu = n_gpu

    if args.batch_size_val % args.n_gpu != 0:
        raise ValueError("Invalid batch_size/batch_size_val and n_gpu parameter: {}%{} and {}%{}, should be == 0".format(
            args.batch_size, args.n_gpu, args.batch_size_val, args.n_gpu))

    return device, n_gpu

In [6]:

def init_model(args, device):
    """Initialize model.

    if location of args.init_model exists, model will be initialized from the pretrained model.
    if no model exists, the training will be initialized from CLIP's parameters.

    Args:
        args: the hyper-parameters
        devices: cuda

    Returns:
        model: the initialized model

    """

    # resume model if pre-trained model exist.
    model_file = os.path.join(args.checkpoint, "pytorch_model.bin.{}".format(args.model_num))
    if os.path.exists(model_file):
        model_state_dict = torch.load(model_file, map_location='cpu')
        if args.local_rank == 0:
            logger.info("Model loaded from %s", model_file)
    else:
        model_state_dict = None
        if args.local_rank == 0:
            logger.info("Model loaded fail %s", model_file)

    # Prepare model
    model = CLIP2Video.from_pretrained(args.cross_model, cache_dir=None, state_dict=model_state_dict,
                                       task_config=args)
    model.to(device)

    return model


In [7]:
global logger

In [8]:
import argparse
import configparser

In [14]:
jupyter = True
parser = argparse.ArgumentParser(description='CLIP2Video on Dideo-Text Retrieval Task')

# arugment based on CLIP4clip:
# https://github.com/ArrowLuo/CLIP4Clip/blob/668334707c493a4eaee7b4a03b2dae04915ce170/main_task_retrieval.py#L457
parser.add_argument("--do_eval", action='store_true', default=True, help="Whether to run eval on the dev set.")
parser.add_argument('--val_csv', type=str, default='data/.val.csv', help='')
parser.add_argument('--data_path', type=str, default='/share/home/lyq/Pretrain_attack/CLIP2Video/data/msvd_data/', help='data pickle file path')
parser.add_argument('--features_path', type=str, default='/share/test/lyq/video/test_MSVD', help='feature path')
parser.add_argument('--num_thread_reader', type=int, default=4, help='')
parser.add_argument('--batch_size_val', type=int, default=64, help='batch size eval')
parser.add_argument('--seed', type=int, default=42, help='random seed')
parser.add_argument('--max_words', type=int, default=32, help='')
parser.add_argument('--max_frames', type=int, default=12, help='')
parser.add_argument('--feature_framerate', type=int, default=1, help='frame rate for uniformly sampling the video')
parser.add_argument("--output_dir", default='/share/home/lyq/Pretrain_attack/CLIP2Video/CLIP2Video_MSVD/try.txt', type=str,
                    help="The output directory where the model predictions and checkpoints will be written.")
parser.add_argument("--cross_model", default="cross-base", type=str, required=False, help="Cross module")
parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.")
parser.add_argument('--n_gpu', type=int, default=1, help="Changed in the execute process.")
parser.add_argument("--cache_dir", default="", type=str,
                    help="Where do you want to store the pre-trained models downloaded from s3")
parser.add_argument('--fp16', action='store_true',
                    help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
parser.add_argument('--fp16_opt_level', type=str, default='O1',
                    help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                         "See details at https://nvidia.github.io/apex/amp.html")
parser.add_argument('--cross_num_hidden_layers', type=int, default=4, help="Layer NO. of cross.")


# important extra argument for training and testing CLIP2Video
parser.add_argument('--sim_type', type=str, default="seqTransf", choices=["meanP", "seqTransf"],
                    help="choice a similarity header.")

# argument for testing
parser.add_argument('--checkpoint', type=str, default='/share/home/lyq/CLIP2Video/CLIP2Video_MSVD', help="checkpoint dir")
parser.add_argument('--model_num', type=str, default='2', help="model id")
parser.add_argument('--local_rank', default=0, type=int, help='shard_id: node rank for distributed training')
parser.add_argument("--datatype", default="msvd", type=str, help="msvd | msrvtt | vatexEnglish | msrvttfull")

# for different vocab size
parser.add_argument('--vocab_size', type=int, default=49408, help="the number of vocab size")

# for TDB block
parser.add_argument('--temporal_type', type=str, default='TDB', help="TDB type")
parser.add_argument('--temporal_proj', type=str, default='sigmoid_selfA', help="sigmoid_mlp | sigmoid_selfA")

# for TAB block
parser.add_argument('--center_type', type=str, default='TAB', help="TAB")
parser.add_argument('--centerK', type=int, default=5, help='center number for clustering.')
parser.add_argument('--center_weight', type=float, default=0.5, help='the weight to adopt the main simiarility')
parser.add_argument('--center_proj', type=str, default='TAB_TDB', help='TAB | TAB_TDB')
parser.add_argument('--myreplace', type=str, default=None)
# model path of clip
parser.add_argument('--clip_path', type=str,
                    default='/share/home/lyq/.cache/clip/ViT-B-32.pt',
                    help="model path of CLIP")
arg_list = None
args = parser.parse_known_args()[0]
print('args:', args)

args: Namespace(batch_size_val=64, cache_dir='', centerK=5, center_proj='TAB_TDB', center_type='TAB', center_weight=0.5, checkpoint='/share/home/lyq/CLIP2Video/CLIP2Video_MSVD', clip_path='/share/home/lyq/.cache/clip/ViT-B-32.pt', cross_model='cross-base', cross_num_hidden_layers=4, data_path='/share/home/lyq/Pretrain_attack/CLIP2Video/data/msvd_data/', datatype='msvd', do_eval=True, do_lower_case=False, feature_framerate=1, features_path='/share/test/lyq/video/test_MSVD', fp16=False, fp16_opt_level='O1', local_rank=0, max_frames=12, max_words=32, model_num='2', myreplace=None, n_gpu=1, num_thread_reader=4, output_dir='/share/home/lyq/Pretrain_attack/CLIP2Video/CLIP2Video_MSVD/try.txt', seed=42, sim_type='seqTransf', temporal_proj='sigmoid_selfA', temporal_type='TDB', val_csv='data/.val.csv', vocab_size=49408)


In [15]:
args = set_seed_logger(args)

In [16]:
device, n_gpu = init_device(args, args.local_rank)

11/09/2022 18:38:50 - INFO -   device: cuda:0 n_gpu: 1


In [17]:
tokenizer = ClipTokenizer()

In [18]:
model = init_model(args, device)

11/09/2022 18:39:04 - INFO -   Model loaded from /share/home/lyq/CLIP2Video/CLIP2Video_MSVD/pytorch_model.bin.2
11/09/2022 18:39:05 - INFO -   loading archive file /share/home/lyq/Pretrain_attack/CLIP2Video/modules/cross-base
11/09/2022 18:39:05 - INFO -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "max_position_embeddings": 77,
  "num_attention_heads": 8,
  "num_hidden_layers": 4,
  "type_vocab_size": 2,
  "vocab_size": 512
}

11/09/2022 18:39:05 - INFO -   Weight doesn't exsits. /share/home/lyq/Pretrain_attack/CLIP2Video/modules/cross-base/cross_pytorch_model.bin
11/09/2022 18:39:14 - INFO -   --------------------


In [19]:
assert args.datatype in DATALOADER_DICT
test_dataloader, test_length = DATALOADER_DICT[args.datatype]["test"](args, tokenizer)


For test, sentence number: 27763
For test, video number: 670
Video number: 670
Total Pair: 27763


  "Argument interpolation should be of type InterpolationMode instead of int. "


In [20]:
if args.local_rank == 0:
        logger.info("***** Running test *****")
        logger.info("  Num examples = %d", test_length)
        logger.info("  Batch size = %d", args.batch_size_val)
        logger.info("  Num steps = %d", len(test_dataloader))

11/09/2022 18:40:08 - INFO -   ***** Running test *****
11/09/2022 18:40:08 - INFO -     Num examples = 27763
11/09/2022 18:40:08 - INFO -     Batch size = 64
11/09/2022 18:40:08 - INFO -     Num steps = 434


In [26]:
import os
import sys
import numpy as np
from evaluation.metrics import tensor_text_to_video_metrics
from evaluation.metrics import tensor_video_to_text_sim
from utils.utils import parallel_apply
import torch



def _run_on_single_gpu(model, batch_list_t, batch_list_v, batch_sequence_output_list, batch_visual_output_list):
    """run similarity in one single gpu
    Args:
        model: CLIP2Video
        batch_list_t: id of text embedding
        batch_list_v: id of visual embedding
        batch_sequence_output_list: batch text embedding
        batch_visual_output_list: batch visual embedding
    Returns:
        sim_matrix: similarity

    """
    sim_matrix = []
    for idx1, b1 in enumerate(batch_list_t):
        input_mask, segment_ids, *_tmp = b1
        sequence_output = batch_sequence_output_list[idx1]
        each_row = []
        for idx2, b2 in enumerate(batch_list_v):
            video_mask, *_tmp = b2
            visual_output = batch_visual_output_list[idx2]
            # calculate the similarity
            b1b2_logits, *_tmp = model.get_inference_logits(sequence_output, visual_output, input_mask, video_mask)
            b1b2_logits = b1b2_logits.cpu().detach().numpy()
            each_row.append(b1b2_logits)
        each_row = np.concatenate(tuple(each_row), axis=-1)
        sim_matrix.append(each_row)
    return sim_matrix



def eval_epoch_h(model, test_dataloader, device, n_gpu, logger):
    """run similarity in one single gpu
    Args:
        model: CLIP2Video
        test_dataloader: data loader for test
        device: device to run model
        n_gpu: GPU number
        batch_sequence_output_list: batch text embedding
        batch_visual_output_list: batch visual embedding
    Returns:
        R1: rank 1 of text-to-video retrieval

    """

    if hasattr(model, 'module'):
        model = model.module.to(device)
    else:
        model = model.to(device)

    # if multi_sentence_ == True: compute the similarity with multi-sentences retrieval
    multi_sentence_ = False

    cut_off_points_, sentence_num_, video_num_ = [], -1, -1
    if hasattr(test_dataloader.dataset, 'multi_sentence_per_video') \
            and test_dataloader.dataset.multi_sentence_per_video:
        multi_sentence_ = True
        cut_off_points_ = test_dataloader.dataset.cut_off_points # used to tag the label when calculate the metric
        sentence_num_ = test_dataloader.dataset.sentence_num # used to cut the sentence representation
        video_num_ = test_dataloader.dataset.video_num # used to cut the video representation
        cut_off_points_ = [itm - 1 for itm in cut_off_points_]

    if multi_sentence_:
        logger.warning("Eval under the multi-sentence per video clip setting.")
        logger.warning("sentence num: {}, video num: {}".format(sentence_num_, video_num_))

    model.eval()

    with torch.no_grad():
        batch_list_t = []
        batch_list_v = []
        batch_sequence_output_list, batch_visual_output_list = [], []
        total_video_num = 0


        for bid, batch in enumerate(test_dataloader):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, video, video_mask = batch
            # print(input_ids.shape)   #batch(64) * 32
            # print(input_mask.shape)  #batch * 32
            # print(segment_ids.shape)  #batch * 32
            # print(video.shape)       #batch * 1 * frame * 1 * 3 * 224 * 224
            # print(video_mask.shape)   #batch * 1 * frame
            if multi_sentence_:
                # multi-sentences retrieval means: one frame clip has two or more descriptions.
                b, *_t = video.shape
                sequence_output = model.get_sequence_output(input_ids, segment_ids, input_mask)
                batch_sequence_output_list.append(sequence_output)
                batch_list_t.append((input_mask, segment_ids,))

                s_, e_ = total_video_num, total_video_num + b
                # print(cut_off_points_)
                filter_inds = [itm - s_ for itm in cut_off_points_ if itm >= s_ and itm < e_]
                # print(len(batch_sequence_output_list))

                if len(filter_inds) > 0:
                    video, video_mask = video[filter_inds, ...], video_mask[filter_inds, ...]
                    visual_output = model.get_visual_output(video, video_mask)
                    batch_visual_output_list.append(visual_output)
                    batch_list_v.append((video_mask,))
                # print(len(batch_visual_output_list))
                total_video_num += b
            else:
                sequence_output, visual_output = model.get_sequence_visual_output(input_ids, segment_ids, input_mask, video, video_mask)

                batch_sequence_output_list.append(sequence_output)
                batch_list_t.append((input_mask, segment_ids,))

                batch_visual_output_list.append(visual_output)
                batch_list_v.append((video_mask,))

            #print("{}/{}\r".format(bid, len(test_dataloader)), end="")
            if bid % 50 == 0:
                print("now: ", bid)
        return model, batch_list_t, batch_list_v, batch_sequence_output_list, batch_visual_output_list
       
        # calculate the similarity  in one GPU
#         sim_matrix = _run_on_single_gpu(model, batch_list_t, batch_list_v, batch_sequence_output_list, batch_visual_output_list)
#         sim_matrix = np.concatenate(tuple(sim_matrix), axis=0)

#         R1 = logging_rank(sim_matrix, multi_sentence_, cut_off_points_, logger)
#         return R1

def logging_rank(sim_matrix, multi_sentence_, cut_off_points_, logger):
    """run similarity in one single gpu
    Args:
        sim_matrix: similarity matrix
        multi_sentence_: indicate whether the multi sentence retrieval
        cut_off_points_:  tag the label when calculate the metric
        logger: logger for metric
    Returns:
        R1: rank 1 of text-to-video retrieval

    """

    if multi_sentence_:
        # if adopting multi-sequence retrieval, the similarity matrix should be reshaped
        logger.info("before reshape, sim matrix size: {} x {}".format(sim_matrix.shape[0], sim_matrix.shape[1]))
        cut_off_points2len_ = [itm + 1 for itm in cut_off_points_]
        max_length = max([e_-s_ for s_, e_ in zip([0]+cut_off_points2len_[:-1], cut_off_points2len_)])
        sim_matrix_new = []
        for s_, e_ in zip([0] + cut_off_points2len_[:-1], cut_off_points2len_):
            sim_matrix_new.append(np.concatenate((sim_matrix[s_:e_],
                                                  np.full((max_length-e_+s_, sim_matrix.shape[1]), -np.inf)), axis=0))
        sim_matrix = np.stack(tuple(sim_matrix_new), axis=0)   # 670 x 81(max_length) x 670
        logger.info("after reshape, sim matrix size: {} x {} x {}".
                    format(sim_matrix.shape[0], sim_matrix.shape[1], sim_matrix.shape[2]))

        # compute text-to-video retrieval
        tv_metrics = tensor_text_to_video_metrics(sim_matrix)


        # compute video-to-text retrieval
        tmp = tensor_video_to_text_sim(sim_matrix)

        print(tmp.shape)  ## debug
        vt_metrics = compute_metrics(tmp)
    else:
        logger.info("sim matrix size: {}, {}".format(sim_matrix.shape[0], sim_matrix.shape[1]))

        # compute text-to-video retrieval
        tv_metrics = compute_metrics(sim_matrix)

        # compute video-to-text retrieval
        vt_metrics = compute_metrics(sim_matrix.T)
        logger.info('\t Length-T: {}, Length-V:{}'.format(len(sim_matrix), len(sim_matrix[0])))


    # logging the result of text-to-video retrieval
    logger.info("Text-to-Video:")
    logger.info('\t>>>  R@1: {:.1f} - R@5: {:.1f} - R@10: {:.1f} - Median R: {:.1f} - Mean R: {:.1f}'.
                format(tv_metrics['R1'], tv_metrics['R5'], tv_metrics['R10'], tv_metrics['MR'], tv_metrics['MeanR']))

    # logging the result of video-to-text retrieval
    logger.info("Video-to-Text:")
    logger.info(
        '\t>>>  V2T$R@1: {:.1f} - V2T$R@5: {:.1f} - V2T$R@10: {:.1f} - V2T$Median R: {:.1f} - V2T$Mean R: {:.1f}'.format(
            vt_metrics['R1'], vt_metrics['R5'], vt_metrics['R10'], vt_metrics['MR'], vt_metrics['MeanR']))

    R1 = tv_metrics['R1']
    return R1


In [27]:
model, batch_list_t, batch_list_v, batch_sequence_output_list, batch_visual_output_list = eval_epoch_h(model, test_dataloader, device, n_gpu, logger)



now:  0
now:  50
now:  100
now:  150
now:  200
now:  250
now:  300
now:  350
now:  400


In [29]:
print(len(batch_list_v))
print(len(batch_list_t))

433
434


In [30]:
sim_matrix = _run_on_single_gpu(model, batch_list_t, batch_list_v, batch_sequence_output_list, batch_visual_output_list)

In [31]:
print("ok")

ok


In [33]:
print(len(sim_matrix))

434


In [34]:
sim_matrix = np.concatenate(tuple(sim_matrix), axis=0)

In [36]:
print(sim_matrix.shape)

(27763, 670)


In [39]:
cut_off_points_, sentence_num_, video_num_ = [], -1, -1
if hasattr(test_dataloader.dataset, 'multi_sentence_per_video') \
        and test_dataloader.dataset.multi_sentence_per_video:
    multi_sentence_ = True
    cut_off_points_ = test_dataloader.dataset.cut_off_points # used to tag the label when calculate the metric
    sentence_num_ = test_dataloader.dataset.sentence_num # used to cut the sentence representation
    video_num_ = test_dataloader.dataset.video_num # used to cut the video representation
    cut_off_points_ = [itm - 1 for itm in cut_off_points_]

In [40]:
print(len(cut_off_points_))

670


In [41]:
logger.info("before reshape, sim matrix size: {} x {}".format(sim_matrix.shape[0], sim_matrix.shape[1]))
cut_off_points2len_ = [itm + 1 for itm in cut_off_points_]
max_length = max([e_-s_ for s_, e_ in zip([0]+cut_off_points2len_[:-1], cut_off_points2len_)])
sim_matrix_new = []
for s_, e_ in zip([0] + cut_off_points2len_[:-1], cut_off_points2len_):
    sim_matrix_new.append(np.concatenate((sim_matrix[s_:e_],
                                          np.full((max_length-e_+s_, sim_matrix.shape[1]), -np.inf)), axis=0))
sim_matrix = np.stack(tuple(sim_matrix_new), axis=0)   # 670 x 81(max_length) x 670
logger.info("after reshape, sim matrix size: {} x {} x {}".
            format(sim_matrix.shape[0], sim_matrix.shape[1], sim_matrix.shape[2]))



11/09/2022 19:55:02 - INFO -   before reshape, sim matrix size: 27763 x 670
11/09/2022 19:55:03 - INFO -   after reshape, sim matrix size: 670 x 81 x 670


In [42]:
tv_metrics = tensor_text_to_video_metrics(sim_matrix)

In [43]:
print(sim_matrix.shape)

(670, 81, 670)


In [44]:
tmp = tensor_video_to_text_sim(sim_matrix)

print(tmp.shape)  ## debug


torch.Size([670, 670])


In [53]:
def compute_metrics(x):

    print("x", x.shape) #debug
    sx = np.sort(-x, axis=1)
    print("sx", sx.shape) #debug
    d = np.diag(-x)
    d = d[:, np.newaxis]
    print("d", d.shape) #debug
    ind = sx - d
    print("ind", ind.shape) #debug
    print("sum", np.sum(ind==0))
    ind = np.where(ind == 0)
    print(ind)
    ind = ind[1]
    
#     with open("correct.txt", "a+") as fp:
#         np.savetxt(fp, ind, fmt='%d', delimiter=',')
    # b = np.loadtxt(filename, dtype=np.int32, delimiter=',')
    metrics = {}
    metrics['R1'] = float(np.sum(ind == 0)) * 100 / len(ind)
    metrics['R5'] = float(np.sum(ind < 5)) * 100 / len(ind)
    metrics['R10'] = float(np.sum(ind < 10)) * 100 / len(ind)
    metrics['MR'] = np.median(ind) + 1
    metrics["MedianR"] = metrics['MR']
    metrics["MeanR"] = np.mean(ind) + 1
    metrics["cols"] = [int(i) for i in list(ind)]
    return metrics

In [52]:
vt_metrics = compute_metrics(tmp)

x torch.Size([670, 670])
sx (670, 670)
d (670, 1)
ind (670, 670)
sum 714
714
