In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
from torch.utils.data import Dataset
import time
import json
import os
from tqdm import tqdm

In [2]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [None]:
# Image Prompt Dataset
class CaptionDataset(Dataset):
    """Dataset over a JSON object that maps image filenames to prompt text.

    The file is parsed once at construction time; keys and values are kept in
    two parallel lists so that an integer index addresses one (key, value) pair.
    """

    def __init__(self, captions_file):
        """Load the mapping from ``captions_file``.

        Args:
            captions_file: Path to a JSON file whose top level is an object
                (the code calls ``.keys()`` / ``.values()`` on the parsed data).
        """
        # Read explicitly as UTF-8 (the encoding used for every other file in
        # this notebook) instead of relying on the platform's default locale.
        with open(captions_file, 'r', encoding='utf-8') as f:
            self.data = json.load(f)

        self.image_filenames = list(self.data.keys())
        self.captions = list(self.data.values())

    def __len__(self):
        """Return the number of entries in the loaded mapping."""
        return len(self.image_filenames)

    def __getitem__(self, idx):
        """Return the ``(image_filename, caption)`` pair stored at ``idx``."""
        image_filename = self.image_filenames[idx]
        caption = self.captions[idx]
        return image_filename, caption

In [None]:
# Dataset
part = 4
captions_file = f"img_prompts_part_{part}.json"
result_file = f"merged_captions_part_{part}.json"

# Create an empty result file on the first run so that later cells can open it
# for reading and appending; an existing file is left untouched.
if not os.path.exists(result_file):
    open(result_file, "w", encoding="utf-8").close()

dataset = CaptionDataset(captions_file)
print(len(dataset))

20000


In [5]:
# Load model and tokenizer
# Alternative source for both names (commented out in the original cell):
# tokenizer_name = "nvidia/Nemotron-Mini-4B-Instruct"
# model_name = "nvidia/Nemotron-Mini-4B-Instruct"
tokenizer_name = "nemotron_tokenizer"
model_name = "nemotron_model"

tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_default_system_prompt=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token_id = tokenizer.eos_token_id

# Request 8-bit loading of the model weights.
q_conf = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=q_conf,
    device_map="auto",
)
# Switch to evaluation mode; eval() returns the module itself.
model = model.eval()

# Save model and tokenizer
# tokenizer.save_pretrained("nemotron_tokenizer")
# model.save_pretrained("nemotron_model")
# torch.save(model, "nemotron_model.pth")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
def generate_caption(id):
    """Generate the merged caption for one dataset entry.

    Args:
        id: Integer index into the notebook-level ``dataset``. The name shadows
            the ``id`` builtin; it is kept so existing calls keep working.

    Returns:
        A ``(image_filename, generated_paragraph)`` tuple. ``generated_paragraph``
        is the text the model produced after the prompt, with surrounding
        whitespace stripped.
    """
    pth, prompt = dataset[id]

    # NOTE(review): the prompt is used verbatim. It presumably already follows
    # the template that used to be assembled here - TODO confirm against the
    # prompt file:
    #     "The following captions describe the same image in different ways. "
    #     "Merge them into a single, clear, and accurate description of the same scene, avoiding repetition:\n\n"
    #     + "\n".join(lst)
    #     + "\n\nMerged Paragraph:"

    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_length = inputs["input_ids"].shape[-1]

    # Generate output (greedy decoding, at most 100 new tokens)
    with torch.inference_mode():
        output_tokens = model.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )

    # The returned sequence starts with the prompt tokens. Decode only what was
    # generated after them, so the prompt can never leak into the caption even
    # when it does not contain the "Merged Paragraph:" marker.
    new_tokens = output_tokens[0][prompt_length:]
    output_text = tokenizer.decode(new_tokens, skip_special_tokens=True)

    # If the model repeats the marker itself, keep only the text after its
    # last occurrence (same rule the original post-processing applied).
    generated_paragraph = output_text.split("Merged Paragraph:")[-1].strip()

    return pth, generated_paragraph

In [None]:
# Resume point: number of lines currently in the result file, minus one.
# An empty file therefore gives -1.
with open(result_file, "r", encoding="utf-8") as f:
    checkpoint = sum(1 for _ in f) - 1

print(checkpoint)

10680


In [None]:
# Resume-aware generation loop.
#
# `checkpoint` is the result file's line count minus one. Because this loop
# writes "{" on its own line, one entry per line, and finally "}" on its own
# line, that means:
#   -1               -> empty file, nothing written yet
#    0               -> only the opening "{" has been written
#    k               -> k entries have been written
#    len(dataset)+1  -> the closing "}" has already been written
if checkpoint > len(dataset):
    # Appending again would add a second "}" and corrupt the finished JSON file.
    print(f"{result_file} is already complete; nothing to generate.")
else:
    start_index = max(checkpoint, 0)

    with open(result_file, "a", encoding="utf-8") as f:
        if checkpoint == -1:
            f.write("{\n")  # Start JSON object

        # No separator before the very first entry. This also covers a run that
        # stopped right after writing "{" (checkpoint == 0), which previously
        # produced a leading comma and therefore invalid JSON.
        first_entry = start_index == 0

        # Start directly at the resume point instead of iterating over and
        # skipping finished items; `initial` keeps the bar's position correct.
        progress = tqdm(
            range(start_index, len(dataset)),
            total=len(dataset),
            initial=start_index,
            desc="Generating Captions",
        )
        for i in progress:
            img, generated_caption = generate_caption(i)

            # Write each entry separately
            if not first_entry:
                f.write(",\n")  # Add a comma before new entries (except the first)
            json.dump(img, f)
            f.write(": ")
            json.dump(generated_caption, f)

            f.flush()  # Ensure data is written immediately
            first_entry = False

        f.write("\n}")  # End JSON object

  with torch.cuda.amp.autocast(enabled=False):
  attn_output = torch.nn.functional.scaled_dot_product_attention(
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
Generating Captions: 100%|██████████| 20000/20000 [8:46:02<00:00,  1.58s/it]   

Time taken: 31562.68



