# Inference

In [None]:
# Mount Google Drive at /content/drive in this Colab runtime. The fine-tuned
# checkpoint loaded further down is read from a path under /content/drive/MyDrive.
# force_remount=True is passed so the mount is redone on every run of this cell
# (presumably to recover from a stale mount; confirm against the Colab docs).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Unit vocabularies shared by several entities. They are kept as tuples and
# copied into a fresh set per entity so that no two entries alias one object.
_length_units = ('centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard')
_weight_units = (
    'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton',
)
_volume_units = (
    'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre',
    'fluid ounce', 'gallon', 'imperial gallon', 'litre', 'microlitre',
    'millilitre', 'pint', 'quart',
)

# Maps each entity name to the set of unit names accepted for that entity.
entity_unit_map = {
    **{dimension: set(_length_units) for dimension in ('width', 'depth', 'height')},
    **{
        weight_entity: set(_weight_units)
        for weight_entity in ('item_weight', 'maximum_weight_recommendation')
    },
    'voltage': set(('kilovolt', 'millivolt', 'volt')),
    'wattage': set(('kilowatt', 'watt')),
    'item_volume': set(_volume_units),
}

In [None]:
# Install the runtime dependencies into the kernel's environment (IPython %pip magic).
# NOTE(review): no versions are pinned, so a fresh runtime may resolve different
# releases; the captured log for this run shows timm 1.0.9, datasets 3.0.0 and
# bitsandbytes 0.43.3 being downloaded.
%pip install torch transformers timm einops datasets bitsandbytes accelerate

Collecting timm
  Downloading timm-1.0.9-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading tim

In [None]:
# Initialize moondream for inference. Change DEVICE to 'mps' if you're on an M1 Mac,
# or 'cpu' if you don't have a GPU. Note that running the model on CPU will be very slow.

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

DEVICE = "cuda"
DTYPE = torch.float32 if DEVICE == "cpu" else torch.float16 # CPU doesn't support float16
# Hub revision used for the tokenizer download (and also passed to the model load below).
MD_REVISION = "2024-07-23"

# The tokenizer comes from the upstream "vikhyatk/moondream2" Hub repository.
tokenizer = AutoTokenizer.from_pretrained("vikhyatk/moondream2", revision=MD_REVISION)
# The model weights come from a checkpoint directory on the mounted Google Drive, so the
# Drive mount cell must have run first. trust_remote_code=True allows the checkpoint's
# custom model code to be executed.
# NOTE(review): `revision` is passed together with a local directory path; confirm
# whether it has any effect here or only matters for Hub downloads.
moondream = AutoModelForCausalLM.from_pretrained(
    "/content/drive/MyDrive/moondream-ft_large_prompt_after_10000_iterations", revision=MD_REVISION, trust_remote_code=True,
    torch_dtype=DTYPE, device_map={"": DEVICE}
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [None]:
import pandas as pd
from tqdm import tqdm
from torch.utils.data import Dataset
import requests
from PIL import Image
from io import BytesIO

class CustomTestImageDataset(Dataset):
    """Test-time dataset pairing each CSV row's image with an extraction prompt.

    The CSV must contain an ``image_link`` column (URL of the image) and an
    ``entity_name`` column whose values are keys of the module-level
    ``entity_unit_map``. Images are downloaded on demand, so every indexing
    operation performs a network request.
    """

    # Seconds to wait on the image server. Without a timeout a stalled
    # connection would block the whole inference run indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, csv_file):
        """Load ``csv_file`` into ``self.data`` as a pandas DataFrame."""
        self.data = pd.read_csv(csv_file)

    def __len__(self):
        """Return the number of rows (samples) in the CSV."""
        return len(self.data)

    @staticmethod
    def _build_prompt(entity_name, allowed_units):
        """Build the extraction prompt for one entity.

        Args:
            entity_name: Raw entity name, e.g. ``'item_weight'``; underscores
                are replaced by spaces in the prompt text.
            allowed_units: Iterable of unit names accepted for the entity.

        Returns:
            The prompt string. Units are listed in sorted order: joining a
            ``set`` directly yields an order that changes between interpreter
            sessions (string hashing is randomized), which would make the
            prompt, and therefore the predictions, non-reproducible.
        """
        entity_name_cleaned = entity_name.replace('_', ' ')
        units = ', '.join(sorted(allowed_units))
        return f"""Extract {entity_name_cleaned} from the image in the format 'x unit', where: 'x' is a float number in standard formatting.'unit' is one of the allowed units for {entity_name_cleaned} from the following list: {units}.Ensure that the output strictly matches the format "x unit" with a space separating the number and the unit. Do not use any abbreviations, special characters, or additional text. If no valid value is found in the image, return a string "blank" """

    def __getitem__(self, idx):
        """Download the image for row ``idx`` and return it with its prompt.

        Returns:
            A dict with ``"index"`` (the requested ``idx``), ``"image"`` (a
            grayscale PIL image) and ``"qa"`` (a one-element list holding the
            prompt as ``"question"`` and an empty ``"answer"``).

        Raises:
            IndexError: if ``idx`` is outside the DataFrame (this is also what
                terminates plain ``for sample in dataset`` iteration).
            KeyError: if the row's entity name is not in ``entity_unit_map``.
            requests.RequestException: on connection failure, timeout, or an
                HTTP error status.
            OSError: if the downloaded bytes cannot be decoded as an image.
        """
        # Get the row corresponding to the given index
        row = self.data.iloc[idx]

        # Extract the image URL and entity name
        image_url = row['image_link']
        entity_name = row['entity_name']

        # Build the prompt first: it is cheap and fails fast on an unknown
        # entity name before any network traffic happens.
        prompt = self._build_prompt(entity_name, entity_unit_map[entity_name])

        # Download the image. raise_for_status() turns a 4xx/5xx response into
        # a clear HTTP error instead of letting PIL choke on an error page.
        response = requests.get(image_url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content)).convert('L')

        # Return the sample with its original index
        return {
            "index": idx,  # Keep track of the original index
            "image": img,  # Grayscale PIL image
            "qa": [
                {
                    "question": prompt,
                    "answer": "",  # No ground truth answer for test data
                }
            ]
        }

# Input and output locations are defined once so that the file that is written
# and the file named in the final status message cannot drift apart.
INPUT_CSV = 'part22.csv'
OUTPUT_CSV = 'updated_merged_part22.csv'

# Value recorded for a sample whose image could not be downloaded or decoded.
# NOTE(review): an empty string is assumed to be an acceptable "no prediction"
# value for downstream processing; confirm.
FAILED_PREDICTION = ""

# Load the CSV file into a pandas DataFrame
test_df = pd.read_csv(INPUT_CSV)

# Initialize the custom test dataset
test_dataset = CustomTestImageDataset(INPUT_CSV)

# Prepare an empty list to store the predictions and their indices
predictions = []

# Perform model inference. Samples are fetched by index inside a try block so
# that one unreachable or corrupt image does not abort the run and throw away
# every prediction collected so far (nothing is saved until the loop finishes).
for idx in tqdm(range(len(test_dataset)), desc="Predicting"):
    try:
        sample = test_dataset[idx]
    except (requests.RequestException, OSError) as exc:
        # Only download/decoding failures are tolerated; model errors and
        # malformed rows still propagate.
        print(f"Sample {idx}: could not load image ({exc}); recording an empty prediction.")
        predictions.append((idx, FAILED_PREDICTION))
        continue

    question = sample['qa'][0]['question']

    # Generate prediction using the model
    md_answer = moondream.answer_question(
        moondream.encode_image(sample['image']),
        question,
        tokenizer=tokenizer,
        num_beams=4,
        no_repeat_ngram_size=5,
        early_stopping=True
    )

    # Append the prediction along with its original index
    predictions.append((sample["index"], md_answer))

    # Print for debugging or confirmation
    print('Question:', question)
    print('Predicted Answer:', md_answer)

# Sort the predictions list by the original index to maintain order
predictions.sort(key=lambda x: x[0])

# Extract the sorted predictions
sorted_predictions = [pred[1] for pred in predictions]

# Every row must have exactly one prediction before the column is attached.
assert len(sorted_predictions) == len(test_df), "prediction count does not match the CSV"

# Add the sorted predictions as a new column in the DataFrame
test_df['entity_value'] = sorted_predictions

# Save the updated DataFrame to the output CSV file
test_df.to_csv(OUTPUT_CSV, index=False)

print(f"Predictions have been saved to '{OUTPUT_CSV}'.")

Output hidden; open in https://colab.research.google.com to view.