## Install library

In [None]:
# Install Recognize-Anything (RAM): first straight from GitHub, then again as an
# editable install from a fresh local clone under /kaggle/working.
!pip install git+https://github.com/xinyu1205/recognize-anything.git
# Drop any clone left over from a previous run so `git clone` does not fail.
%rm -rf recognize-anything
!git clone https://github.com/xinyu1205/recognize-anything.git
%cd /kaggle/working/recognize-anything
!pip install -r requirements.txt
!pip install -e .


%cd /kaggle/working/
# Set up Grounded-SAM: remove any stale checkout, then clone the repository.
# It is only cloned here; its packages are installed in a later cell.
%rm -rf Grounded-Segment-Anything
!git clone https://github.com/IDEA-Research/Grounded-Segment-Anything.git

## Fix the `_C` error (overwrite GroundingDINO's `ms_deform_attn.py` with a pure-PyTorch version)

In [None]:
# Overwrite GroundingDINO's ms_deform_attn.py inside the cloned
# Grounded-Segment-Anything checkout with the module text below.
#
# The replacement module defines:
#   * multi_scale_deformable_attn_pytorch - deformable attention built on
#     F.grid_sample,
#   * MultiScaleDeformableAttention - whose forward() always calls the function
#     above,
#   * create_dummy_class / create_dummy_func - placeholders that raise ImportError.
# The written module never imports a `_C` extension; presumably that is what
# avoids the `_C` error named in the notebook heading (NOTE(review): confirm).
#
# This cell runs before the cell that does `pip install ./GroundingDINO`, so the
# patched file is the one that gets installed.
#
# Everything between the triple single quotes is data written verbatim to disk.
# Edit it only if the generated module itself should change.
with open("/kaggle/working/Grounded-Segment-Anything/GroundingDINO/groundingdino/models/GroundingDINO/ms_deform_attn.py","w") as f:
    # Mode "w" truncates the existing file, so the original implementation is discarded.
    f.write('''

import math
import warnings
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.init import constant_, xavier_uniform_


# helpers
def _is_power_of_2(n):
    if (not isinstance(n, int)) or (n < 0):
        raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
    return (n & (n - 1) == 0) and n != 0


def multi_scale_deformable_attn_pytorch(
    value: torch.Tensor,
    value_spatial_shapes: torch.Tensor,
    sampling_locations: torch.Tensor,
    attention_weights: torch.Tensor,
) -> torch.Tensor:

    bs, _, num_heads, embed_dims = value.shape
    _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
    value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
    sampling_grids = 2 * sampling_locations - 1
    sampling_value_list = []
    for level, (H_, W_) in enumerate(value_spatial_shapes):
        # bs, H_*W_, num_heads, embed_dims ->
        # bs, H_*W_, num_heads*embed_dims ->
        # bs, num_heads*embed_dims, H_*W_ ->
        # bs*num_heads, embed_dims, H_, W_
        value_l_ = (
            value_list[level].flatten(2).transpose(1, 2).reshape(bs * num_heads, embed_dims, H_, W_)
        )
        # bs, num_queries, num_heads, num_points, 2 ->
        # bs, num_heads, num_queries, num_points, 2 ->
        # bs*num_heads, num_queries, num_points, 2
        sampling_grid_l_ = sampling_grids[:, :, :, level].transpose(1, 2).flatten(0, 1)
        # bs*num_heads, embed_dims, num_queries, num_points
        sampling_value_l_ = F.grid_sample(
            value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False
        )
        sampling_value_list.append(sampling_value_l_)
    # (bs, num_queries, num_heads, num_levels, num_points) ->
    # (bs, num_heads, num_queries, num_levels, num_points) ->
    # (bs, num_heads, 1, num_queries, num_levels*num_points)
    attention_weights = attention_weights.transpose(1, 2).reshape(
        bs * num_heads, 1, num_queries, num_levels * num_points
    )
    output = (
        (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights)
        .sum(-1)
        .view(bs, num_heads * embed_dims, num_queries)
    )
    return output.transpose(1, 2).contiguous()


class MultiScaleDeformableAttention(nn.Module):
    """Multi-Scale Deformable Attention Module used in Deformable-DETR

    `Deformable DETR: Deformable Transformers for End-to-End Object Detection.
    <https://arxiv.org/pdf/2010.04159.pdf>`_.

    Args:
        embed_dim (int): The embedding dimension of Attention. Default: 256.
        num_heads (int): The number of attention heads. Default: 8.
        num_levels (int): The number of feature map used in Attention. Default: 4.
        num_points (int): The number of sampling points for each query
            in each head. Default: 4.
        img2col_steps (int): The step used in image_to_column. Defualt: 64.
            dropout (float): Dropout layer used in output. Default: 0.1.
        batch_first (bool): if ``True``, then the input and output tensor will be
            provided as `(bs, n, embed_dim)`. Default: False. `(n, bs, embed_dim)`
    """

    def __init__(
        self,
        embed_dim: int = 256,
        num_heads: int = 8,
        num_levels: int = 4,
        num_points: int = 4,
        img2col_step: int = 64,
        batch_first: bool = False,
    ):
        super().__init__()
        if embed_dim % num_heads != 0:
            raise ValueError(
                "embed_dim must be divisible by num_heads, but got {} and {}".format(
                    embed_dim, num_heads
                )
            )
        head_dim = embed_dim // num_heads

        self.batch_first = batch_first

        if not _is_power_of_2(head_dim):
            warnings.warn(
                """
                You'd better set d_model in MSDeformAttn to make sure that
                each dim of the attention head a power of 2, which is more efficient.
                """
            )

        self.im2col_step = img2col_step
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.num_levels = num_levels
        self.num_points = num_points
        self.sampling_offsets = nn.Linear(embed_dim, num_heads * num_levels * num_points * 2)
        self.attention_weights = nn.Linear(embed_dim, num_heads * num_levels * num_points)
        self.value_proj = nn.Linear(embed_dim, embed_dim)
        self.output_proj = nn.Linear(embed_dim, embed_dim)

        self.init_weights()

    def _reset_parameters(self):
        return self.init_weights()

    def init_weights(self):
        """
        Default initialization for Parameters of Module.
        """
        constant_(self.sampling_offsets.weight.data, 0.0)
        thetas = torch.arange(self.num_heads, dtype=torch.float32) * (
            2.0 * math.pi / self.num_heads
        )
        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
        grid_init = (
            (grid_init / grid_init.abs().max(-1, keepdim=True)[0])
            .view(self.num_heads, 1, 1, 2)
            .repeat(1, self.num_levels, self.num_points, 1)
        )
        for i in range(self.num_points):
            grid_init[:, :, i, :] *= i + 1
        with torch.no_grad():
            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
        constant_(self.attention_weights.weight.data, 0.0)
        constant_(self.attention_weights.bias.data, 0.0)
        xavier_uniform_(self.value_proj.weight.data)
        constant_(self.value_proj.bias.data, 0.0)
        xavier_uniform_(self.output_proj.weight.data)
        constant_(self.output_proj.bias.data, 0.0)

    def freeze_sampling_offsets(self):
        print("Freeze sampling offsets")
        self.sampling_offsets.weight.requires_grad = False
        self.sampling_offsets.bias.requires_grad = False

    def freeze_attention_weights(self):
        print("Freeze attention weights")
        self.attention_weights.weight.requires_grad = False
        self.attention_weights.bias.requires_grad = False

    def forward(
        self,
        query: torch.Tensor,
        key: Optional[torch.Tensor] = None,
        value: Optional[torch.Tensor] = None,
        query_pos: Optional[torch.Tensor] = None,
        key_padding_mask: Optional[torch.Tensor] = None,
        reference_points: Optional[torch.Tensor] = None,
        spatial_shapes: Optional[torch.Tensor] = None,
        level_start_index: Optional[torch.Tensor] = None,
        **kwargs
    ) -> torch.Tensor:

        """Forward Function of MultiScaleDeformableAttention

        Args:
            query (torch.Tensor): Query embeddings with shape
                `(num_query, bs, embed_dim)`
            key (torch.Tensor): Key embeddings with shape
                `(num_key, bs, embed_dim)`
            value (torch.Tensor): Value embeddings with shape
                `(num_key, bs, embed_dim)`
            query_pos (torch.Tensor): The position embedding for `query`. Default: None.
            key_padding_mask (torch.Tensor): ByteTensor for `query`, with shape `(bs, num_key)`,
                indicating which elements within `key` to be ignored in attention.
            reference_points (torch.Tensor): The normalized reference points
                with shape `(bs, num_query, num_levels, 2)`,
                all elements is range in [0, 1], top-left (0, 0),
                bottom-right (1, 1), including padding are.
                or `(N, Length_{query}, num_levels, 4)`, add additional
                two dimensions `(h, w)` to form reference boxes.
            spatial_shapes (torch.Tensor): Spatial shape of features in different levels.
                With shape `(num_levels, 2)`, last dimension represents `(h, w)`.
            level_start_index (torch.Tensor): The start index of each level. A tensor with
                shape `(num_levels, )` which can be represented as
                `[0, h_0 * w_0, h_0 * w_0 + h_1 * w_1, ...]`.

        Returns:
            torch.Tensor: forward results with shape `(num_query, bs, embed_dim)`
        """

        if value is None:
            value = query

        if query_pos is not None:
            query = query + query_pos

        if not self.batch_first:
            # change to (bs, num_query ,embed_dims)
            query = query.permute(1, 0, 2)
            value = value.permute(1, 0, 2)

        bs, num_query, _ = query.shape
        bs, num_value, _ = value.shape

        assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value

        value = self.value_proj(value)
        if key_padding_mask is not None:
            value = value.masked_fill(key_padding_mask[..., None], float(0))
        value = value.view(bs, num_value, self.num_heads, -1)
        sampling_offsets = self.sampling_offsets(query).view(
            bs, num_query, self.num_heads, self.num_levels, self.num_points, 2
        )
        attention_weights = self.attention_weights(query).view(
            bs, num_query, self.num_heads, self.num_levels * self.num_points
        )
        attention_weights = attention_weights.softmax(-1)
        attention_weights = attention_weights.view(
            bs,
            num_query,
            self.num_heads,
            self.num_levels,
            self.num_points,
        )

        # bs, num_query, num_heads, num_levels, num_points, 2
        if reference_points.shape[-1] == 2:
            offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
            sampling_locations = (
                reference_points[:, :, None, :, None, :]
                + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
            )
        elif reference_points.shape[-1] == 4:
            sampling_locations = (
                reference_points[:, :, None, :, None, :2]
                + sampling_offsets
                / self.num_points
                * reference_points[:, :, None, :, None, 2:]
                * 0.5
            )
        else:
            raise ValueError(
                "Last dim of reference_points must be 2 or 4, but get {} instead.".format(
                    reference_points.shape[-1]
                )
            )
    
        
        output = multi_scale_deformable_attn_pytorch(
            value, spatial_shapes, sampling_locations, attention_weights
        )

        output = self.output_proj(output)

        if not self.batch_first:
            output = output.permute(1, 0, 2)

        return output


def create_dummy_class(klass, dependency, message=""):
    """
    When a dependency of a class is not available, create a dummy class which throws ImportError
    when used.

    Args:
        klass (str): name of the class.
        dependency (str): name of the dependency.
        message: extra message to print
    Returns:
        class: a class object
    """
    err = "Cannot import '{}', therefore '{}' is not available.".format(dependency, klass)
    if message:
        err = err + " " + message

    class _DummyMetaClass(type):
        # throw error on class attribute access
        def __getattr__(_, __):  # noqa: B902
            raise ImportError(err)

    class _Dummy(object, metaclass=_DummyMetaClass):
        # throw error on constructor
        def __init__(self, *args, **kwargs):
            raise ImportError(err)

    return _Dummy


def create_dummy_func(func, dependency, message=""):
    """
    When a dependency of a function is not available, create a dummy function which throws
    ImportError when used.

    Args:
        func (str): name of the function.
        dependency (str or list[str]): name(s) of the dependency.
        message: extra message to print
    Returns:
        function: a function object
    """
    err = "Cannot import '{}', therefore '{}' is not available.".format(dependency, func)
    if message:
        err = err + " " + message

    if isinstance(dependency, (list, tuple)):
        dependency = ",".join(dependency)

    def _dummy(*args, **kwargs):
        raise ImportError(err)

    return _dummy
    ''')

In [None]:
# Install Grounded-SAM's requirements and its two bundled packages from the
# local checkout. GroundingDINO is installed from ./GroundingDINO, i.e. from the
# tree whose ms_deform_attn.py was overwritten in the previous cell.
%cd /kaggle/working/Grounded-Segment-Anything
!pip install -r requirements.txt
%pip install ./segment_anything
%pip install ./GroundingDINO
# Step back up to the parent directory (/kaggle/working) for the following cells.
%cd ..
# Additional packages installed on top of the repositories' own requirements.
%pip install opencv-python pycocotools matplotlib onnxruntime onnx ipykernel

## Download model

In [None]:
# Download the SAM ViT-H and GroundingDINO Swin-T checkpoints into the current
# working directory. The config cell expects to find both files under
# /kaggle/working/, so this cell must run with that as the working directory.
!wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth
!wget https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth

In [None]:
# Overwrite the GroundingDINO Swin-T config inside the cloned checkout with the
# settings below. The same path is later assigned to `config_file` and read via
# SLConfig.fromfile() in load_model().
# The text between the triple single quotes is written verbatim to disk: it is
# data for the model builder, not code executed in this notebook.
# NOTE(review): which of these values differ from the repository's stock config
# is not visible here - diff against upstream before changing anything.
with open("/kaggle/working/Grounded-Segment-Anything/GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py","w")as f:
    f.write('''
batch_size = 1
modelname = "groundingdino"
backbone = "swin_T_224_1k"
position_embedding = "sine"
pe_temperatureH = 20
pe_temperatureW = 20
return_interm_indices = [1, 2, 3]
backbone_freeze_keywords = None
enc_layers = 6
dec_layers = 6
pre_norm = False
dim_feedforward = 2048
hidden_dim = 256
dropout = 0.0
nheads = 8
num_queries = 900
query_dim = 4
num_patterns = 0
num_feature_levels = 4
enc_n_points = 4
dec_n_points = 4
two_stage_type = "standard"
two_stage_bbox_embed_share = False
two_stage_class_embed_share = False
transformer_activation = "relu"
dec_pred_bbox_embed_share = True
dn_box_noise_scale = 1.0
dn_label_noise_ratio = 0.5
dn_label_coef = 1.0
dn_bbox_coef = 1.0
embed_init_tgt = True
dn_labelbook_size = 2000
max_text_len = 256
text_encoder_type = "bert-base-uncased"
use_text_enhancer = True
use_fusion_layer = True
use_checkpoint = True
use_transformer_ckpt = True
use_text_cross_attention = True
text_dropout = 0.0
fusion_dropout = 0.0
fusion_droppath = 0.1
sub_sentence_present = True
bert_base_uncased_path = "bert-base-uncased"
''')

## Config

In [None]:
# Run-time configuration shared by the cells below.
#
# Checkpoints that must exist before the models are loaded:
#   * RAM: read from the path in `ram_checkpoint` below. RAM / Tag2Text
#     checkpoints are listed at
#     https://github.com/majinyu666/recognize-anything/tree/main#toolbox-checkpoints
#   * GroundingDINO and SAM: read from /kaggle/working/ (the "Download model"
#     cell fetches them with wget). They are described in step 1 of
#     https://github.com/IDEA-Research/Grounded-Segment-Anything#running_man-grounded-sam-detect-and-segment-everything-with-text-prompt

# GroundingDINO config file (the one overwritten by the cell above).
config_file = "/kaggle/working/Grounded-Segment-Anything/GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py"
ram_checkpoint = "/kaggle/input/pretrain/ram_swin_large_14m.pth"
grounded_checkpoint = "/kaggle/working/groundingdino_swint_ogc.pth"
sam_checkpoint = "/kaggle/working/sam_vit_h_4b8939.pth"
# A query box is kept when its highest token score exceeds box_threshold.
box_threshold = 0.25
# Tokens scoring above text_threshold form the phrase attached to a box.
text_threshold = 0.2
# IoU threshold passed to torchvision.ops.nms in inference().
iou_threshold = 0.5
import torch
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# model loading, inference and visualization functions
%cd /kaggle/working/recognize-anything
import os
import random

import cv2
import groundingdino.datasets.transforms as T
import numpy as np
import torch
import torchvision
import torchvision.transforms as TS
from groundingdino.models import build_model
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
from PIL import Image, ImageDraw, ImageFont
from ram import inference_ram 
from ram.models import ram
from ram.models import tag2text
from segment_anything import SamPredictor, build_sam
import time


def load_model(model_config_path, model_checkpoint_path, device):
    """Build a GroundingDINO model from a config file and load its weights.

    Args:
        model_config_path: Path of the config file handed to ``SLConfig.fromfile``.
        model_checkpoint_path: Path of a checkpoint whose ``"model"`` entry holds
            the state dict.
        device: Stored on the config as ``device`` before the model is built.
            The model itself is not moved to that device here.

    Returns:
        The model in evaluation mode. The result of ``load_state_dict`` is
        printed so missing or unexpected keys are visible.
    """
    cfg = SLConfig.fromfile(model_config_path)
    cfg.device = device
    net = build_model(cfg)

    # NOTE(review): torch.load deserialises the file with pickle by default;
    # only point this at checkpoints from a trusted source.
    state_dict = torch.load(model_checkpoint_path, map_location="cpu")["model"]

    # strict=False tolerates key mismatches; the printed report lists them.
    report = net.load_state_dict(clean_state_dict(state_dict), strict=False)
    print(report)

    net.eval()
    return net


def get_grounding_output(model, image, caption, box_threshold, text_threshold, device="cpu"):
    """Run GroundingDINO on one image and return the boxes that pass the threshold.

    Args:
        model: Callable detection model exposing ``.to(device)`` and
            ``.tokenizer``. It is called as ``model(batch, captions=[caption])``
            and must return a mapping with ``"pred_logits"`` and ``"pred_boxes"``.
        image: Single image tensor; a batch axis is added before the forward pass.
        caption: Text prompt. It is lower-cased, stripped and terminated with
            ``"."`` before being passed to the model.
        box_threshold: A query is kept when its maximum token score is strictly
            greater than this value.
        text_threshold: Tokens scoring above this value are handed to
            ``get_phrases_from_posmap`` to build the phrase for a kept box.
        device: Device the model and image are moved to. Default: ``"cpu"``.

    Returns:
        tuple: ``(boxes, scores, phrases)`` where ``boxes`` is the tensor of kept
        boxes exactly as predicted by the model (on CPU), ``scores`` is a float
        tensor with the maximum token score of each kept box, and ``phrases`` is
        a list of ``"<phrase>(<score>)"`` strings with the score cut to four
        characters.
    """
    # Normalise the prompt to lower case, trimmed, ending with a period.
    caption = caption.lower().strip()
    if not caption.endswith("."):
        caption = caption + "."

    model = model.to(device)
    image = image.to(device)
    with torch.no_grad():
        outputs = model(image[None], captions=[caption])
    logits = outputs["pred_logits"].cpu().sigmoid()[0]  # (nq, 256)
    boxes = outputs["pred_boxes"].cpu()[0]  # (nq, 4)

    # Boolean-mask indexing already returns fresh tensors, so no clone is needed
    # to keep callers' in-place edits away from the model outputs.
    keep = logits.max(dim=1)[0] > box_threshold
    logits_filt = logits[keep]  # num_filt, 256
    boxes_filt = boxes[keep]  # num_filt, 4

    # Turn the above-threshold tokens of each kept query into a phrase.
    tokenizer = model.tokenizer
    tokenized = tokenizer(caption)
    pred_phrases = []
    scores = []
    for logit in logits_filt:
        score = logit.max().item()
        phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenizer)
        # Score text is truncated (not rounded) to 4 characters, e.g. "0.34".
        pred_phrases.append(phrase + f"({str(score)[:4]})")
        scores.append(score)

    return boxes_filt, torch.Tensor(scores), pred_phrases



@torch.no_grad()
def inference(
    raw_image, tagging_model, grounding_dino_model, sam_model
):
    """Tag an image with RAM, then localise the tags with GroundingDINO.

    Reads the module-level settings ``device``, ``box_threshold``,
    ``text_threshold`` and ``iou_threshold``.

    Args:
        raw_image: PIL image; it is converted to RGB before use.
        tagging_model: RAM model passed to ``inference_ram``.
        grounding_dino_model: GroundingDINO model passed to
            ``get_grounding_output``.
        sam_model: Accepted for interface compatibility; this function never
            uses it (no segmentation is performed).

    Returns:
        tuple: ``(tags, boxes, phrases)`` where ``tags`` is the comma-separated
        tag string, ``boxes`` is a NumPy array of the boxes kept by NMS as
        ``(x1, y1, x2, y2)`` in pixels of the input image, and ``phrases`` is
        the list of ``"<phrase>(<score>)"`` strings for those boxes.
    """
    started = time.time()
    raw_image = raw_image.convert("RGB")

    # ---- RAM tagging on a 384x384 normalised copy of the image ----
    normalize = TS.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    transform = TS.Compose([
        TS.Resize((384, 384)),
        TS.ToTensor(),
        normalize
    ])

    # The PIL resize is kept even though the transform resizes again: dropping
    # it would change the interpolation path and therefore the tags.
    image = raw_image.resize((384, 384))
    image = transform(image).unsqueeze(0).to(device)

    res = inference_ram(image, tagging_model)
    # "a | b | c" -> "a, b, c"
    tags = res[0].strip(' ').replace('  ', ' ').replace(' |', ',')
    tagged = time.time()
    print("ram: ", tagged - started)

    # ---- GroundingDINO, prompted with the tag string ----
    transform = T.Compose([
        T.RandomResize([800], max_size=1333),
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    image, _ = transform(raw_image, None)
    W, H = raw_image.size

    boxes_filt, scores, pred_phrases = get_grounding_output(
        grounding_dino_model, image, tags, box_threshold, text_threshold, device=device
    )

    # Scale the predicted boxes by the image size to get pixel coordinates.
    boxes_filt *= torch.Tensor([W, H, W, H])

    # (cx, cy, w, h) -> (x1, y1, x2, y2), in place.
    boxes_filt[:, :2] -= boxes_filt[:, 2:] / 2
    boxes_filt[:, 2:] += boxes_filt[:, :2]

    # Non-maximum suppression; keep boxes and phrases aligned.
    nms_idx = torchvision.ops.nms(boxes_filt, scores, iou_threshold)
    boxes_filt = boxes_filt[nms_idx]
    pred_phrases = [pred_phrases[idx] for idx in nms_idx.tolist()]

    finished = time.time()
    # This interval covers GroundingDINO and NMS; SAM is never run here, so the
    # previous "sam: " label was misleading.
    print("grounding dino: ", finished - tagged)
    return tags, boxes_filt.detach().cpu().numpy(), pred_phrases

## Load model

In [None]:
# RAM tagger: Swin-L backbone, 384x384 input, moved to `device`, evaluation mode.
ram_model = ram(pretrained=ram_checkpoint, image_size=384, vit='swin_l')
ram_model = ram_model.to(device)
ram_model.eval()

# GroundingDINO detector built from the config and checkpoint chosen above.
grounding_dino_model = load_model(
    config_file,
    grounded_checkpoint,
    device=device,
)

# SAM predictor wrapping the SAM network after it is moved to `device`.
sam_net = build_sam(checkpoint=sam_checkpoint)
sam_model = SamPredictor(sam_net.to(device))

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

class BBoxGrid:
    """Map pixel-space bounding boxes onto a square grid of named cells.

    The image is divided into ``grid_size`` x ``grid_size`` cells. A cell is
    named ``<row letter><column number>``, e.g. ``"a0"`` is the top-left cell.

    Args:
        bboxes: Iterable of ``(x_min, y_min, x_max, y_max)`` boxes in pixels.
        labels: One label per box. Anything from the first ``"("`` onward is
            dropped and inner whitespace is replaced by ``"_"``
            (``"a dog(0.53)"`` becomes ``"a_dog"``).
        grid_size: Number of rows and of columns. Default: 7.
        image_width: Image width in pixels. Default: 1280.
        image_height: Image height in pixels. Default: 720.
    """

    def __init__(self, bboxes, labels, grid_size=7, image_width=1280, image_height=720):
        self.bboxes = bboxes
        self.labels = ["_".join(raw.split('(')[0].strip().split()) for raw in labels]
        self.grid_size = grid_size
        self.grid_columns = [str(i) for i in range(grid_size)]  # ['0', '1', ...]
        # Derived from grid_size so sizes other than 7 work; for the default
        # size this is still ['a', ..., 'g'].
        self.grid_rows = [self._row_name(i) for i in range(grid_size)]
        self.grid_width = image_width / grid_size
        self.grid_height = image_height / grid_size
        self.image_width = image_width
        self.image_height = image_height

    @staticmethod
    def _row_name(index):
        """Return the letter name of row ``index``: a..z, then aa, ab, ..."""
        name = ""
        index += 1
        while index:
            index, rem = divmod(index - 1, 26)
            name = chr(ord("a") + rem) + name
        return name

    def get_grid(self, bbox):
        """Return the names of all grid cells the bounding box overlaps.

        Coordinates outside the image are clamped to the grid. Without the
        lower clamp, a slightly negative coordinate produced index -1, which
        silently selected the last row and printed column "-1".

        Cells are listed column by column, top to bottom within a column.
        """
        x_min, y_min, x_max, y_max = bbox
        last = self.grid_size - 1
        col_start = max(int(x_min // self.grid_width), 0)
        col_end = min(int(x_max // self.grid_width), last)
        row_start = max(int(y_min // self.grid_height), 0)
        row_end = min(int(y_max // self.grid_height), last)

        return [
            f"{self.grid_rows[row]}{col}"
            for col in range(col_start, col_end + 1)
            for row in range(row_start, row_end + 1)
        ]

    def tag_encoded(self):
        """Return a space-separated ``<cell><label>`` string over all boxes."""
        tokens = (
            f"{cell}{label}"
            for bbox, label in zip(self.bboxes, self.labels)
            for cell in self.get_grid(bbox)
        )
        return " ".join(tokens).strip()

    def tag_numbered(self):
        """Return a space-separated ``<label><count>`` string, in first-seen order."""
        return " ".join(f"{item}{count}" for item, count in Counter(self.labels).items())

    def show(self):
        """Plot the grid lines and the labelled boxes with matplotlib."""
        _, ax = plt.subplots()
        ax.set_xlim(0, self.image_width)
        ax.set_ylim(0, self.image_height)
        # Flip the y axis so the origin sits at the top-left, as in image coordinates.
        ax.invert_yaxis()

        # Interior grid lines.
        for i in range(1, self.grid_size):
            ax.axhline(y=i * self.grid_height, color='gray', linestyle='--')
            ax.axvline(x=i * self.grid_width, color='gray', linestyle='--')

        # Bounding boxes, each with its label just above the top-left corner.
        for bbox, label in zip(self.bboxes, self.labels):
            x_min, y_min, x_max, y_max = bbox
            rect = plt.Rectangle((x_min, y_min), x_max - x_min, y_max - y_min, linewidth=1, edgecolor='r', facecolor='none')
            ax.add_patch(rect)
            ax.text(x_min, y_min - 10, label, color='red')

        plt.show()

 

## Run on many images

In [None]:
import os
import glob
from tqdm import tqdm

keyframes_dir = '/kaggle/input/keyframe-extra-aic2024/Keyframes-extra'

# Pass 1: derive one key per directory entry. The entry name is split on "_";
# a three-piece name contributes its last two pieces, any other name its last.
all_keyframe_paths = dict()
for entry in sorted(os.listdir(keyframes_dir)):
    pieces = entry.split('_')
    if len(pieces) == 3:
        data_part = "_".join(pieces[-2:])
    else:
        data_part = pieces[-1]
    all_keyframe_paths[data_part] = dict()

# Pass 2: for every key, map each video id (text after the last "_" of the
# video directory name) to the sorted list of its .jpg keyframes.
for data_part in sorted(all_keyframe_paths):
    data_part_path = f'{keyframes_dir}/Keyframes_{data_part}/keyframes'
    per_video = all_keyframe_paths[data_part]
    for video_dir in sorted(os.listdir(data_part_path)):
        video_id = video_dir.split('_')[-1]
        per_video[video_id] = sorted(glob.glob(f'{data_part_path}/{video_dir}/*.jpg'))

In [None]:
# For every data part and video: run tagging + grounding on each keyframe and
# write three text files per video, one line per keyframe:
#   tag_bboxes_encoded_extra/<part>/<video>.txt - "<cell><label>" tokens
#   number_tag_encoded_extra/<part>/<video>.txt - "<label><count>" tokens
#   tag_encoded_extra/<part>/<video>.txt        - RAM tags
for key, video_keyframe_paths in tqdm(list(all_keyframe_paths.items())):
    # Output directories depend only on the data part, so create them once.
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    tag_bboxes_path = f"/kaggle/working/tag_bboxes_encoded_extra/{key}/"
    number_tag_path = f"/kaggle/working/number_tag_encoded_extra/{key}/"
    tag_path = f"/kaggle/working/tag_encoded_extra/{key}/"
    for out_dir in (tag_bboxes_path, number_tag_path, tag_path):
        os.makedirs(out_dir, exist_ok=True)

    for video_id in tqdm(sorted(video_keyframe_paths)):
        print(video_id)
        tag_bboxes, number_tag, tag = [], [], []
        for image_path in tqdm(video_keyframe_paths[video_id]):
            # The context manager closes the file handle after each frame;
            # previously every opened image stayed open until garbage collection.
            with Image.open(image_path) as in_img:
                width, height = in_img.size
                ram_tags, ram_bboxes, ram_label = inference(in_img, ram_model, grounding_dino_model, sam_model)
            # Boxes are in pixels of this image, so the grid must use its real
            # size rather than BBoxGrid's 1280x720 default.
            encoded = BBoxGrid(ram_bboxes, ram_label, image_width=width, image_height=height)
            tag_bboxes.append(encoded.tag_encoded())
            number_tag.append(encoded.tag_numbered())
            # "a, b, c" -> "a b c"
            tag.append(" ".join(ram_tags.split(", ")))

        # Save the per-video text files, one line per keyframe.
        with open(f"{tag_bboxes_path}{video_id}.txt", "w", encoding="utf-8") as f:
            f.writelines("%s\n" % item for item in tag_bboxes)
        with open(f"{number_tag_path}{video_id}.txt", "w", encoding="utf-8") as f:
            f.writelines("%s\n" % item for item in number_tag)
        with open(f"{tag_path}{video_id}.txt", "w", encoding="utf-8") as f:
            f.writelines("%s\n" % item for item in tag)

# Completion message (Vietnamese for "Done").
print("Hoàn thành")

In [None]:
# Delete the downloaded checkpoints and the cloned repositories from
# /kaggle/working. The generated *_encoded_extra directories are not touched.
# NOTE(review): presumably done to keep the large files out of the saved
# notebook output - confirm.
!rm -rf /kaggle/working/groundingdino_swint_ogc.pth
!rm -rf /kaggle/working/sam_vit_h_4b8939.pth
!rm -rf /kaggle/working/recognize-anything
!rm -rf /kaggle/working/Grounded-Segment-Anything