### Installation & Setup

In [5]:
!pip install ipywidgets timm transformers fairscale pycocoevalcap sentence_transformers scikit-learn

Collecting ipywidgets
  Using cached ipywidgets-8.1.5-py3-none-any.whl.metadata (2.3 kB)
Collecting timm
  Using cached timm-1.0.9-py3-none-any.whl.metadata (42 kB)
Collecting transformers
  Using cached transformers-4.45.1-py3-none-any.whl.metadata (44 kB)
Collecting fairscale
  Using cached fairscale-0.4.13-py3-none-any.whl
Collecting pycocoevalcap
  Using cached pycocoevalcap-1.2-py3-none-any.whl.metadata (3.2 kB)
Collecting sentence_transformers
  Using cached sentence_transformers-3.1.1-py3-none-any.whl.metadata (10 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.5.2-cp312-cp312-macosx_12_0_arm64.whl.metadata (13 kB)
Collecting widgetsnbextension~=4.0.12 (from ipywidgets)
  Using cached widgetsnbextension-4.0.13-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab-widgets~=3.0.12 (from ipywidgets)
  Using cached jupyterlab_widgets-3.0.13-py3-none-any.whl.metadata (4.1 kB)
Collecting torch (from timm)
  Using cached torch-2.4.1-cp312-none-macosx_11_0_arm64.whl.metadat

In [7]:
# Installs the wget command-line tool with Homebrew; skip this cell if wget is already on PATH
# or if Homebrew is not available on this machine.
!brew install wget

[34m==>[0m [1mAuto-updating Homebrew...[0m
Adjust how often this is run with HOMEBREW_AUTO_UPDATE_SECS or disable with
HOMEBREW_NO_AUTO_UPDATE. Hide these hints with HOMEBREW_NO_ENV_HINTS (see `man brew`).
[34m==>[0m [1mDownloading https://ghcr.io/v2/homebrew/portable-ruby/portable-ruby/blobs/sha256:e7340e4a1d7cc0f113686e461b93114270848cb14676e9037a1a2ff3b1a0ff32[0m
######################################################################### 100.0%                          39.3%
[34m==>[0m [1mPouring portable-ruby-3.3.5.arm64_big_sur.bottle.tar.gz[0m
[34m==>[0m [1mAuto-updated Homebrew![0m
Updated 2 taps (homebrew/core and homebrew/cask).
[34m==>[0m [1mNew Formulae[0m
binsider        flang           lld             probe-rs-tools  rsgain
facad           inchi           polkit          repopack
[34m==>[0m [1mNew Casks[0m
synology-image-assistant   vienna-assistant           windows-app

To reinstall 1.24.5, run:
  brew reinstall wget


In [3]:
import warnings
warnings.filterwarnings("ignore")

In [7]:
import os
import re
import subprocess
import numpy as np

import ipywidgets as widgets
from IPython.display import clear_output, display, Image

import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
from nltk.corpus import stopwords

In [3]:
# Work from inside the repository checkout. The guard makes this cell safe to re-run:
# a second bare `cd recognize-anything` fails once the kernel is already inside that directory.
if os.path.basename(os.getcwd()) != "recognize-anything":
    os.chdir("recognize-anything")
print(os.getcwd())

/Users/iassn0rma1/Library/CloudStorage/GoogleDrive-amanbarthwal0110@gmail.com/My Drive/Programming/Projects/WG/Ai-tools/recognize-anything


In [4]:
# Tagging model to use. The cells below distinguish "RAM" from "Tag2Text".
model = "Tag2Text"

In [20]:
def download_checkpoints(model):
    """Make sure the pretrained weights for ``model`` exist under ``pretrained/``.

    Args:
        model: ``"RAM"`` selects the RAM checkpoint; any other value selects the
            Tag2Text checkpoint (the original fallback is kept).

    Raises:
        OSError: if the download or the final rename fails (``urllib.error.URLError``
            and ``HTTPError`` are subclasses of ``OSError``).
    """
    # Local import so the block is self-contained; nothing else in the notebook needs urllib.
    import urllib.request

    base_url = 'https://huggingface.co/spaces/xinyu1205/Recognize_Anything-Tag2Text/resolve/main'

    print('You selected', model)
    os.makedirs('pretrained', exist_ok=True)

    if model == "RAM":
        label, filename = "RAM", 'ram_swin_large_14m.pth'
    else:
        label, filename = "Tag2Text", 'tag2text_swin_14m.pth'

    weights_path = f'pretrained/{filename}'
    if os.path.exists(weights_path):
        print(f"{label} weights already downloaded!")
        return

    # Download to a side file and rename only on success. Writing straight to the final
    # path would leave an empty/partial file behind after a failed download, and the
    # exists() check above would then treat it as a valid checkpoint on the next run.
    partial_path = weights_path + '.part'
    print(f"Downloading {label} weights to {weights_path} ...")
    try:
        urllib.request.urlretrieve(f'{base_url}/{filename}', partial_path)
        os.replace(partial_path, weights_path)
    finally:
        # Only present here if the download or rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)

# download_checkpoints returns nothing, so the message below is printed whenever the call
# returns without raising.
download_checkpoints(model)
print(model, 'weights are downloaded!')

You selected Tag2Text
Tag2Text weights already downloaded!
Tag2Text weights are downloaded!


### Import images

In [11]:
# Directory scanned for input images; also passed to batch inference as --image-dir.
images_dir = "images/demo"

In [12]:
# Collect the supported image files in sorted order so the dropdown is stable across runs.
image_files = [
    f"{images_dir}/{file}"
    for file in sorted(os.listdir(images_dir))
    if file.lower().endswith(('.jpg', '.jpeg', '.png'))
]
if not image_files:
    # Fail with an actionable message instead of a bare IndexError on image_files[0].
    raise FileNotFoundError(f"No .jpg/.jpeg/.png images found in '{images_dir}'")

# Default selection; update_preview overwrites this global when the dropdown changes.
image_path = image_files[0]

# Create dropdown widget
image_dropdown = widgets.Dropdown(
    options=image_files,
    description='Select Image:',
)

# Create image preview widget
image_preview = widgets.Output()

# Define function to update image preview
def update_preview(change):
    """Dropdown observer: remember the newly selected path and redraw the preview.

    ``change`` is the notification object passed by ``image_dropdown.observe``;
    only ``change.new`` (the newly selected path) is read.
    """
    global image_path
    image_path = change.new
    with image_preview:
        # wait=True defers the clear until the new image is displayed, avoiding flicker.
        image_preview.clear_output(wait=True)
        display(Image(filename=image_path, width=400))

# Set the initial image preview
with image_preview:
    display(Image(filename=image_path, width=400))

# Attach the update function to the dropdown
image_dropdown.observe(update_preview, names='value')

# Display the widgets
display(image_dropdown, image_preview)

Dropdown(description='Select Image:', options=('images/demo/039-harry-philosopher-potter-wallpaper-abc6cc7d631…

Output()

### Tag Generation

In [13]:
results = {}  # to save the results

def run_inference(model, task):
    """Run the repository's inference script for ``model`` and keep its stdout in ``results``.

    Args:
        model: ``"Tag2Text"`` or ``"RAM"``.
        task: ``"one image"`` (uses the global ``image_path``) or
            ``"multiple images"`` (uses the global ``images_dir``).

    Side effects:
        * Stores the captured stdout under ``results["<model>_<task with underscores>"]``,
          e.g. ``results["Tag2Text_multiple_images"]``.
        * Prints ``'Invalid model or task'`` and stores nothing for any other combination.
        * Prints the exit code and stderr when the script exits with a non-zero status,
          so a failed run is not mistaken for an empty result.
    """
    # model -> (checkpoint path, single-image script, --model-type value for batch_inference.py)
    variants = {
        "Tag2Text": ("pretrained/tag2text_swin_14m.pth", "inference_tag2text.py", "tag2text"),
        "RAM": ("pretrained/ram_swin_large_14m.pth", "inference_ram.py", "ram"),
    }
    if model not in variants or task not in ("one image", "multiple images"):
        print('Invalid model or task')
        return

    weights, single_image_script, model_type = variants[model]
    if task == "one image":
        command = ["python", single_image_script, "--image", image_path,
                   "--pretrained", weights]
    else:
        command = ["python", "batch_inference.py", "--image-dir", images_dir,
                   "--pretrained", weights, "--model-type", model_type]

    result = subprocess.run(command, capture_output=True, text=True)
    if result.returncode != 0:
        # stdout is still stored below (as before); this only surfaces the failure.
        print(f"{command[1]} exited with code {result.returncode}:\n{result.stderr}")

    results[f"{model}_{task.replace(' ', '_')}"] = result.stdout

run_inference(model, "multiple images") # or "one image"

# Displays the full captured stdout of the inference script (model-loading log lines included).
results

{'Tag2Text_multiple_images': '/encoder/layer/0/crossattention/self/query is tied\n/encoder/layer/0/crossattention/self/key is tied\n/encoder/layer/0/crossattention/self/value is tied\n/encoder/layer/0/crossattention/output/dense is tied\n/encoder/layer/0/crossattention/output/LayerNorm is tied\n/encoder/layer/0/intermediate/dense is tied\n/encoder/layer/0/output/dense is tied\n/encoder/layer/0/output/LayerNorm is tied\n/encoder/layer/1/crossattention/self/query is tied\n/encoder/layer/1/crossattention/self/key is tied\n/encoder/layer/1/crossattention/self/value is tied\n/encoder/layer/1/crossattention/output/dense is tied\n/encoder/layer/1/crossattention/output/LayerNorm is tied\n/encoder/layer/1/intermediate/dense is tied\n/encoder/layer/1/output/dense is tied\n/encoder/layer/1/output/LayerNorm is tied\n--------------\npretrained/tag2text_swin_14m.pth\n--------------\nload checkpoint from pretrained/tag2text_swin_14m.pth\nvit: swin_b\n{\'filepath\': \'images/demo/ai-art-impressionism-

### Cleaning Generated Data for Embedding

In [115]:
def process_data(results, key='Tag2Text_multiple_images'):
    """Extract per-image records and tag strings from captured inference stdout.

    Every output line that starts with ``{`` is treated as one record written as a
    dict literal (e.g. ``{'filepath': '...', 'model_identified_tags': 'a | b'}``).
    Only simple ``key: value`` pairs are picked up, where the key is a word and the
    value is a quoted string, an unsigned integer, or ``null``.

    Args:
        results: mapping from run name to captured stdout text.
        key: which entry of ``results`` to parse. Defaults to the Tag2Text batch
            run, so existing ``process_data(results)`` calls behave as before.

    Returns:
        ``(filtered_data, generated_tags)``: a list with one dict per record line,
        and the list of ``model_identified_tags`` strings found in those records.

    Raises:
        KeyError: if ``key`` is not present in ``results``.
    """
    raw_output = results[key]

    # Drop any log text before the first '{', then convert single quotes to double
    # quotes so the pair regex only has to handle one quoting style.
    # NOTE(review): a value that itself contains an apostrophe is truncated by this
    # quote swap; switch to a real parser if such values can occur.
    normalized = re.sub(r"^[^{]*\{", '{', raw_output).replace("'", '"')
    pair_pattern = re.compile(r'\"(\w+)\":\s*(\"[^\"]*\"|null|\d+)')

    generated_tags = []
    filtered_data = []

    for line in normalized.split('\n'):
        if not line.startswith('{'):
            continue

        parsed_entry = {}
        for field, raw_value in pair_pattern.findall(line):
            if raw_value == 'null':
                # Bare null means "no value"; previously it became the string 'null'
                # and ended up as a bogus tag.
                parsed_entry[field] = None
            elif raw_value.isdigit():
                parsed_entry[field] = int(raw_value)
            else:
                parsed_entry[field] = raw_value.strip('"')
        filtered_data.append(parsed_entry)

        # Keep only real tag strings: callers split these on '|'.
        tags = parsed_entry.get('model_identified_tags')
        if isinstance(tags, str):
            generated_tags.append(tags)

    return filtered_data, generated_tags


dict_list, generated_tags = process_data(results)

# clean & flatten the result
split_tags = [tag.strip() for tags in generated_tags for tag in tags.split('|') if tag.strip()]
# Remove duplicates while keeping first-seen order. list(set(...)) orders the tags
# differently from run to run (string hashing is randomized per process), which makes
# the embedding row order and any tie-breaking downstream non-reproducible.
unique_tags = list(dict.fromkeys(split_tags))

print("Generated Tags:", generated_tags)
print("Unique Tags:", unique_tags)

Generated Tags: ['rain | night | city | city street | woman | painting | umbrella | street | person | hold | walk | rainy | blue', 'woman | bear | game | art', 'sunset | woman | field | girl | sun | guitar | grass | leg | sit | play | sit in | sit on | tall | young', 'poster | person | art | stand', 'light', 'angel | wing | art | black', 'soccer game | fan | player | crowd | stadium | people | large', 'glass window | woman | purse | person | leather jacket | wear | stained', 'shield | picture | logo | helmet | emblem | black', 'book | table | desk | woman | actor | film | people | person | scene | man | stand', 'warrior | art | female | large', 'anime | person | art', 'goal | football team | match | fan | player | footballer | stadium | soccer player | celebrate', 'wallpaper | video game | game | screenshot | person', 'drawing | anime | wallpaper | person', 'woman | girl | arrow | bow | hold', 'football team | football player | match | team | player | soccer player | man | look', 'nigh

In [16]:
def recommend_tags(tag, tags, tag_embeddings, model, top_n=3):
    """Return the ``top_n`` entries of ``tags`` most cosine-similar to ``tag``.

    Args:
        tag: query tag.
        tags: list of known tags; ``tags[i]`` is taken to correspond to
            ``tag_embeddings[i]``.
        tag_embeddings: embeddings of ``tags`` (assumed to be a 2-D array with one
            row per tag — TODO confirm against the encoder in use).
        model: encoder with an ``encode(list_of_str)`` method; only used when
            ``tag`` is not in ``tags``.
        top_n: number of recommendations to return.

    Returns:
        List of tags ordered from most to least similar. When ``tag`` is one of
        ``tags`` it is never part of the result.
    """
    known_index = tags.index(tag) if tag in tags else None

    if known_index is None:
        print(f"Tag '{tag}' not found, generating embedding for it.")
        query_embedding = model.encode([tag]).reshape(1, -1)
    else:
        query_embedding = tag_embeddings[known_index].reshape(1, -1)

    similarities = cosine_similarity(query_embedding, tag_embeddings)[0]
    ranked = list(similarities.argsort()[::-1])

    if known_index is not None:
        # Remove the query by index rather than dropping whatever ranks first:
        # with tied similarities (e.g. identical embeddings) the query is not
        # guaranteed to be at rank 0, and a real neighbour would be dropped instead.
        ranked = [i for i in ranked if i != known_index]

    return [tags[i] for i in ranked[:top_n]]

def compute_embeddings(model_name, tags):
    """Load the sentence-embedding model ``model_name`` and encode ``tags`` with it.

    Returns:
        ``(model, tag_embeddings)``: the loaded ``SentenceTransformer`` instance and
        the result of its ``encode`` call on ``tags``.
    """
    encoder = SentenceTransformer(model_name)
    return encoder, encoder.encode(tags)

### Generate Recommendations

In [117]:
input_tag = 'naruto'  # query tag to get recommendations for
top_n = 10  # number of recommendations

In [116]:
def generator(model_name):
    """Print tag recommendations for the global ``input_tag`` using ``model_name``.

    Reads the notebook globals ``unique_tags``, ``input_tag`` and ``top_n``;
    embeds the tags with the named model and prints the recommended tags.
    Returns nothing.
    """
    print(f"\nTesting model: {model_name}")

    encoder, embeddings = compute_embeddings(model_name, unique_tags)
    suggestions = recommend_tags(
        input_tag,
        unique_tags,
        embeddings,
        encoder,
        top_n=top_n,
    )

    print(f"Recommended tags for '{input_tag}' using {model_name}:\n{suggestions}")

In [None]:
# Candidate sentence-embedding models to compare.
# NOTE(review): this list is not referenced by the visible cells below; each
# generator(...) call hard-codes its model name.
modeList = [
    'all-MiniLM-L6-v2',            # lightweight
    'all-MiniLM-L12-v2',           # slightly larger
    'sentence-t5-base',            # for sentence embeddings
    'paraphrase-MiniLM-L6-v2',     # for paraphrase detection
    'paraphrase-mpnet-base-v2',    # high performance on semantic similarity tasks
    'all-mpnet-base-v2',           # high performance on similarity tasks
]

In [None]:
# Larger candidate models.
# NOTE(review): none of these is run in the visible cells below.
large_modeList = [
    'stsb-roberta-large',          # fine-tuned for semantic similarity
    'sentence-t5-large',           # for context understanding
    'gtr-t5-large',                # for deep contextual embeddings
    'multi-qa-mpnet-base-dot-v1',  # for multi-turn question-answering
]

In [118]:
generator('all-MiniLM-L6-v2')


Testing model: all-MiniLM-L6-v2
Tag 'naruto' not found, generating embedding for it.
Recommended tags for 'naruto' using all-MiniLM-L6-v2:
['anime', 'cartoon', 'cartoon character', 'robot', 'sword', 'comic book character', 'zombie', 'warrior', 'helicopter', 'dragon']


In [119]:
generator('all-MiniLM-L12-v2')


Testing model: all-MiniLM-L12-v2
Tag 'naruto' not found, generating embedding for it.
Recommended tags for 'naruto' using all-MiniLM-L12-v2:
['anime', 'sword', 'cartoon', 'cartoon character', 'comic book character', 'warrior', 'zombie', 'lightning', 'axe', 'dragon']


In [120]:
# NOTE(review): modeList above names 'paraphrase-MiniLM-L6-v2', not this L12 variant — confirm which was intended.
generator('paraphrase-MiniLM-L12-v2')


Testing model: paraphrase-MiniLM-L12-v2
Tag 'naruto' not found, generating embedding for it.
Recommended tags for 'naruto' using paraphrase-MiniLM-L12-v2:
['hero', 'scene', 'warrior', 'sword', 'actor', 'movie', 'cartoon', 'ride on', 'animation', 'game']


In [121]:
generator('sentence-t5-base')


Testing model: sentence-t5-base
Tag 'naruto' not found, generating embedding for it.
Recommended tags for 'naruto' using sentence-t5-base:
['anime', 'cartoon', 'animation', 'art', 'dragon', 'fox', 'cartoon character', 'soccer', 'source', 'sword']


In [122]:
generator('paraphrase-mpnet-base-v2')


Testing model: paraphrase-mpnet-base-v2
Tag 'naruto' not found, generating embedding for it.
Recommended tags for 'naruto' using paraphrase-mpnet-base-v2:
['anime', 'jungle', 'sail', 'cartoon character', 'video game', 'mermaid', 'warrior', 'fan', 'hero', 'cartoon']


In [123]:
generator('all-mpnet-base-v2')


Testing model: all-mpnet-base-v2
Tag 'naruto' not found, generating embedding for it.
Recommended tags for 'naruto' using all-mpnet-base-v2:
['anime', 'cartoon', 'cartoon character', 'arrow', 'lion', 'warrior', 'mermaid', 'costume', 'jungle', 'animation']


In [4]:
from transformers import BertTokenizer, BertForMaskedLM

# Load the uncased BERT tokenizer and masked-LM model; score_bigrams below reads these two globals.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# NOTE(review): this rebinds `model`, which earlier held the tagger name string ("Tag2Text").
# Re-running the earlier download_checkpoints(model) / run_inference(model, ...) cells after
# this one would pass the BERT object instead of the name.
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# Switch to evaluation mode; as the last expression this also displays the module repr.
model.eval()

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another archite

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [5]:
def score_bigrams(unigrams):
    """Score every ordered pair of distinct positions in ``unigrams`` as a candidate bigram.

    For a pair ``(word1, word2)`` the score is the sum of
      * the masked-LM logit of ``word2`` at the mask position of ``"{word1} [MASK]"``, and
      * the cosine similarity between the two words' mean-pooled BERT hidden states.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        unigrams: sequence of words (iterated more than once, so not a one-shot iterator).

    Returns:
        List of ``((word1, word2), score)`` tuples sorted by score, highest first.
        Empty when ``unigrams`` is empty.
    """
    if len(unigrams) == 0:
        # np.vstack([]) raises; there are no pairs to score anyway.
        return []

    embeddings = []
    mask_logits = []   # mask_logits[i]: vocabulary logits at the mask of "<unigrams[i]> [MASK]"
    token_ids = []     # token_ids[j]: vocabulary id looked up for unigrams[j]

    # Everything that depends on a single word is computed once per word here, instead
    # of repeating the masked-LM forward pass for every (word1, word2) pair.
    for word in unigrams:
        inputs = tokenizer(word, return_tensors="pt")
        masked_inputs = tokenizer(f"{word} [MASK]", return_tensors="pt")
        with torch.no_grad():
            outputs = model.bert(**inputs)
            token_logits = model(**masked_inputs).logits
        embeddings.append(outputs.last_hidden_state.mean(dim=1).numpy())

        mask_token_index = torch.where(masked_inputs["input_ids"] == tokenizer.mask_token_id)[1]
        mask_logits.append(token_logits[0, mask_token_index, :][0])

        # NOTE(review): this is a whole-token vocabulary lookup; a word that is not a
        # single vocabulary entry presumably maps to the unknown token — confirm.
        token_ids.append(tokenizer.convert_tokens_to_ids(word))

    embeddings = np.vstack(embeddings)
    # Full pairwise similarity matrix in one call; similarity[i][j] compares word i and word j.
    similarity = cosine_similarity(embeddings)

    bigrams_scores = []
    for i, word1 in enumerate(unigrams):
        for j, word2 in enumerate(unigrams):
            if i == j:
                continue
            predicted_score = mask_logits[i][token_ids[j]].item()
            # NOTE(review): adds a raw logit to a cosine similarity; the logit
            # likely dominates the sum — confirm this weighting is intended.
            combined_score = predicted_score + similarity[i][j]
            bigrams_scores.append(((word1, word2), combined_score))

    return sorted(bigrams_scores, key=lambda x: x[1], reverse=True)

In [8]:
# Ad-hoc word list used to try out the bigram scorer.
unigrams = ['brown', 'horse', 'gun', 'football', 'orange', 'anime', 'sword', 'stadium', 'player', 'cat']
top_bigrams = score_bigrams(unigrams)

# Show the six highest-scoring pairs.
print("Top contextual bigrams:")
for bigram, score in top_bigrams[:6]:
    print(f"{bigram}: Score {score:.4f}")


Top contextual bigrams:
('football', 'player'): Score 3.6308
('football', 'brown'): Score 2.3301
('anime', 'player'): Score 2.1899
('orange', 'brown'): Score 2.1360
('stadium', 'player'): Score 2.0297
('football', 'stadium'): Score 1.8900


In [9]:
# Fetch NLTK's stopword corpus and keep the English list as a set for fast membership tests.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/iassn0rma1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
caption = "a painting of a person holding a blue umbrella walking down a street in a rainy city at night"

# Drop stopwords and lower-case what is kept: score_bigrams looks each word up in the
# vocabulary of the uncased BERT checkpoint, so a capitalised word would not match its
# lower-case vocabulary entry.
unigrams = [word.lower() for word in caption.split() if word.lower() not in stop_words]
top_bigrams = score_bigrams(unigrams)

# Show the ten highest-scoring pairs.
print("Top contextual bigrams:")
for bigram, score in top_bigrams[:10]:
    print(f"{bigram}: Score {score:.4f}")


Top contextual bigrams:
('rainy', 'night'): Score 3.9830
('rainy', 'city'): Score 2.9927
('street', 'city'): Score 2.6098
('umbrella', 'blue'): Score 2.5386
('city', 'street'): Score 1.8728
('painting', 'blue'): Score 1.8144
('umbrella', 'painting'): Score 1.7871
('umbrella', 'city'): Score 1.6530
('rainy', 'blue'): Score 1.5750
('umbrella', 'street'): Score 1.5567
