In [None]:
!pip install git+https://github.com/xinyu1205/recognize-anything.git


In [1]:
import os

def download_checkpoints(model):
    """Download the pretrained checkpoint for ``model`` into ``pretrained/``.

    Args:
        model: One of ``"RAM"``, ``"RAM+"`` or ``"Tag2Text"``. Any other value
            is reported on stdout and otherwise ignored.

    Raises:
        urllib.error.URLError / OSError: if the download or the final rename
            fails. No file is left at the final checkpoint path in that case.
    """
    # Local import keeps this cell self-contained (only `os` is imported above).
    import urllib.request

    # model name -> (local checkpoint path, download URL)
    checkpoints = {
        "RAM": (
            'pretrained/ram_swin_large_14m.pth',
            'https://huggingface.co/spaces/xinyu1205/Recognize_Anything-Tag2Text/resolve/main/ram_swin_large_14m.pth',
        ),
        "RAM+": (
            'pretrained/ram_plus_swin_large_14m.pth',
            'https://huggingface.co/xinyu1205/recognize-anything-plus-model/resolve/main/ram_plus_swin_large_14m.pth',
        ),
        "Tag2Text": (
            'pretrained/tag2text_swin_14m.pth',
            'https://huggingface.co/spaces/xinyu1205/Recognize_Anything-Tag2Text/resolve/main/tag2text_swin_14m.pth',
        ),
    }

    print('You selected', model)
    os.makedirs('pretrained', exist_ok=True)

    if model not in checkpoints:
        print('Unknown model, nothing to download:', model)
        return

    weights_path, url = checkpoints[model]

    # A zero-byte file is what a failed `wget -O` leaves behind; treat it as
    # missing so it is fetched again instead of being reported as downloaded.
    if os.path.exists(weights_path) and os.path.getsize(weights_path) > 0:
        print(f"{model} weights already downloaded!")
        return

    # Download next to the target and rename only on success, so an interrupted
    # transfer can never be mistaken for a complete checkpoint on the next run.
    partial_path = weights_path + '.part'
    try:
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, weights_path)
    finally:
        # Only still present when the download or rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)


# Fetch every checkpoint the notebook may need, one model after another.
for model in ("RAM", "RAM+", "Tag2Text"):
    download_checkpoints(model)
    print(model, 'weights are downloaded!')

You selected RAM
RAM weights already downloaded!
RAM weights are downloaded!
You selected RAM+
RAM+ weights already downloaded!
RAM+ weights are downloaded!
You selected Tag2Text
Tag2Text weights already downloaded!
Tag2Text weights are downloaded!


In [2]:
import os

def find_all_keyframe_files(root_dir, extensions=(".jpg",)):
    """Recursively collect files under ``root_dir`` with a matching extension.

    Args:
        root_dir: Directory to walk with ``os.walk``.
        extensions: Iterable of filename suffixes (or a single suffix string).
            Matching is case-insensitive on both the filename and the suffix.

    Returns:
        A list of paths (``dirpath`` joined with the filename) in the order
        ``os.walk`` yields them.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(extensions, str):
        extensions = (extensions,)
    # Lower-case the suffixes too, so ".JPG" behaves the same as ".jpg".
    suffixes = tuple(ext.lower() for ext in extensions)

    files = []
    for dirpath, _, filenames in os.walk(root_dir):
        for filename in filenames:
            # str.endswith accepts a tuple and returns False for an empty one.
            if filename.lower().endswith(suffixes):
                files.append(os.path.join(dirpath, filename))
    return files

In [3]:
import re

def get_video_id_and_frame_id(file_path):
    """Extract ``(video_id, frame_id)`` from a keyframe path.

    The path must contain ``/<FOLDER>_extra/<VIDEO>/<digits>``, where FOLDER
    and VIDEO consist of upper-case letters and digits. ``video_id`` is
    ``"<FOLDER>_<VIDEO>"`` and ``frame_id`` is the digit run as a string.

    When the path does not match, a message is printed and ``(None, None)``
    is returned.
    """
    # Note: the trailing `\.*` matches zero or more dots, so it never
    # constrains what follows the frame number.
    found = re.search(r"/([A-Z0-9]+)_extra/([A-Z0-9]+)/(\d+)\.*", file_path)
    if found is None:
        print("No match found: " + file_path)
        return None, None

    folder_part, video_part, frame_id = found.groups()
    return "_".join((folder_part, video_part)), frame_id

In [4]:
from ram.models import ram_plus, ram
import torch
from ram import get_transform

# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

def load_ram_model():
    """Build the RAM model from its local checkpoint, ready for inference.

    Calls ``ram(...)`` with the ``pretrained/ram_swin_large_14m.pth``
    checkpoint, ``image_size=384`` and ``vit='swin_l'``, moves the result to
    the module-level ``device`` and switches it to eval mode.
    """
    checkpoint = 'pretrained/ram_swin_large_14m.pth'
    net = ram(pretrained=checkpoint, image_size=384, vit='swin_l')
    net = net.to(device)
    return net.eval()

def load_ram_plus_model():
    """Build the RAM+ model from its local checkpoint, ready for inference.

    Calls ``ram_plus(...)`` with the ``pretrained/ram_plus_swin_large_14m.pth``
    checkpoint, ``image_size=384`` and ``vit='swin_l'``, moves the result to
    the module-level ``device`` and switches it to eval mode.
    """
    checkpoint = 'pretrained/ram_plus_swin_large_14m.pth'
    net = ram_plus(pretrained=checkpoint, image_size=384, vit='swin_l')
    net = net.to(device)
    return net.eval()

# Preprocessing callable from the `ram` package, created with the same
# image_size (384) that both model loaders above pass to their constructors.
transform = get_transform(image_size=384)


In [5]:
from PIL import Image
from ram import inference_ram as inference

# Load both taggers once, at cell execution time; inference_ram below reads
# these module-level objects for every image instead of reloading them.
ram_model = load_ram_model()
ram_plus_model = load_ram_plus_model()
# Matches strings that begin with at least one non-whitespace character; used
# to drop empty tags and tags starting with whitespace. Raw string so that `\S`
# is not parsed as an (invalid) string escape sequence.
space_pattern = re.compile(r'^\S+')

def inference_ram(image_path):
    """Tag one image with both RAM and RAM+ and return the merged tag list.

    Args:
        image_path: Path of the image file to open with PIL.

    Returns:
        A list of unique tags: RAM tags first, then any RAM+ tags not already
        present, each in the order the model produced them. Tags rejected by
        ``space_pattern`` (empty, or starting with whitespace) are dropped.
    """
    # Close the file handle deterministically; this runs once per keyframe.
    # The tensor is fully built inside the block, before the file is closed.
    with Image.open(image_path) as img:
        image = transform(img).unsqueeze(0).to(device)

    # Element [0] of each result is a " | "-separated tag string.
    ram_res = inference(image, ram_model)[0].split(" | ")
    ram_plus_res = inference(image, ram_plus_model)[0].split(" | ")

    # dict.fromkeys de-duplicates while preserving first-seen order, so the
    # output is reproducible (a set's iteration order can vary between runs).
    unique_tags = dict.fromkeys(ram_res + ram_plus_res)
    return [tag for tag in unique_tags if space_pattern.match(tag)]

BertLMHeadModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


/encoder/layer/0/crossattention/self/query is tied
/encoder/layer/0/crossattention/self/key is tied
/encoder/layer/0/crossattention/self/value is tied
/encoder/layer/0/crossattention/output/dense is tied
/encoder/layer/0/crossattention/output/LayerNorm is tied
/encoder/layer/0/intermediate/dense is tied
/encoder/layer/0/output/dense is tied
/encoder/layer/0/output/LayerNorm is tied
/encoder/layer/1/crossattention/self/query is tied
/encoder/layer/1/crossattention/self/key is tied
/encoder/layer/1/crossattention/self/value is tied
/encoder/layer/1/crossattention/output/dense is tied
/encoder/layer/1/crossattention/output/LayerNorm is tied
/encoder/layer/1/intermediate/dense is tied
/encoder/layer/1/output/dense is tied
/encoder/layer/1/output/LayerNorm is tied
--------------
pretrained/ram_swin_large_14m.pth
--------------
load checkpoint from pretrained/ram_swin_large_14m.pth
vit: swin_l
--------------
pretrained/ram_plus_swin_large_14m.pth
--------------
load checkpoint from pretraine

In [6]:
from tqdm import tqdm
from ram import inference_ram as inference

# Input keyframes live under `keyframe_dir`; tag files mirror that layout
# under a sibling path obtained by swapping "Keyframes" for "Tags".
keyframe_dir = "/media/daoan/T7 Shield2/AI_Challenge_2024_DATA/Keyframes"
tag_dir = keyframe_dir.replace("Keyframes", "Tags")
keyframe_paths = find_all_keyframe_files(keyframe_dir)

# Only keyframes whose path contains one of these folder names are processed.
keywords = [
    "L25_extra",
    "L26_extra",
    "L27_extra",
    "L28_extra",
    "L29_extra",
    "L30_extra",
]
filtered_keyframe_paths = list(
    filter(lambda candidate: any(map(candidate.__contains__, keywords)), keyframe_paths)
)

# Tag every selected keyframe and write the tags to a mirrored .txt file.
# Frames whose tag file already exists are skipped, so the loop can be resumed.
for keyframe_path in tqdm(filtered_keyframe_paths, "Inference"):
    video_id, frame_id = get_video_id_and_frame_id(keyframe_path)
    if video_id is None:
        # The path did not match the expected layout (the helper has already
        # printed it); skip it instead of crashing on `None.split`.
        continue
    # video_id is "<FOLDER>_<VIDEO>"; split it back into its two parts.
    folder_id, video_id = video_id.split("_", 1)
    tag_path = os.path.join(tag_dir, folder_id + "_extra", video_id, f"{frame_id}.txt")
    # Create the output folder if it does not exist yet.
    os.makedirs(os.path.dirname(tag_path), exist_ok=True)
    if not os.path.exists(tag_path):
        tags = inference_ram(keyframe_path)
        with open(tag_path, "w", encoding='utf-8') as f:
            f.write(" | ".join(tags))


Inference: 100%|██████████| 305552/305552 [3:34:15<00:00, 23.77it/s]  
