# get image description from moondream
save iamge_name, prompt and output text to a csv file

In [1]:
import json
import torch
import argparse
from PIL import Image
from moondream import Moondream, detect_device
from moondream.vision_encoder import VisionEncoder
from queue import Queue
from threading import Thread
from transformers import (
    TextIteratorStreamer,
    CodeGenTokenizerFast as Tokenizer,
)
import re
import os
import pandas as pd
pd.set_option('display.max_colwidth', 1000)

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


## load gpt4 inference and prompt

In [2]:
from config import gpt4_res_json_path, gpt4_res_csv_path, moondream_res_csv_path, both_res_csv_path, default_prompt

with open(gpt4_res_json_path, 'r') as json_file:
    res_gpt4 = json.load(json_file)

df_gpt4 = {'image': [], 'prompt': [], 'gpt4_inference': []}
for image_name, gpt4_inf in res_gpt4.items():
    df_gpt4['image'].append(image_name)
    df_gpt4['prompt'] = default_prompt
    df_gpt4['gpt4_inference'].append(gpt4_inf)

df_gpt4 = pd.DataFrame(df_gpt4)
df_gpt4.to_csv(gpt4_res_csv_path, index=False)
df_gpt4.head(1)

Unnamed: 0,image,prompt,gpt4_inference
0,9701 ARIANA TRIBAL 2.9 X 11.9.jpeg,"\nAssuming you are a store assistant please analyze the item in the picture.\n Only return the data in a json with the keys shape, dominant_colors, style, description, material, suggested_title, theme. \n Please restrict the product description to 100 words. \n For theme please return any special characteristics like nature, geometric, abstract etc. \n","{'shape': 'rectangle', 'dominant_colors': ['red', 'blue', 'beige', 'black'], 'style': 'traditional', 'description': 'This is a traditional hand-woven runner rug, likely of Persian origin. It features intricate patterns and symmetrical geometric shapes, bordered by detailed motifs. The craftsmanship suggests it may be antique, adding to its decorative appeal.', 'material': 'wool', 'suggested_title': 'Traditional Persian Runner Rug', 'theme': 'geometric'}"


## load moondream model

In [3]:
device, dtype = detect_device()
# print(device, dtype)

model_id = "vikhyatk/moondream1"
tokenizer = Tokenizer.from_pretrained(model_id)
moondream = Moondream.from_pretrained(model_id).to(device=device, dtype=dtype)
moondream.eval()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Moondream(
  (vision_encoder): VisionEncoder(
    (encoder): ModelHolder(
      (model): VisualHolder(
        (visual): VisionTransformer(
          (patch_embed): LinearPatchEmbedding(
            (linear): Linear(in_features=588, out_features=1152, bias=True)
          )
          (pos_drop): Dropout(p=0.0, inplace=False)
          (patch_drop): Identity()
          (norm_pre): Identity()
          (blocks): Sequential(
            (0): Block(
              (norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
              (attn): Attention(
                (qkv): Linear(in_features=1152, out_features=3456, bias=True)
                (q_norm): Identity()
                (k_norm): Identity()
                (attn_drop): Dropout(p=0.0, inplace=False)
                (proj): Linear(in_features=1152, out_features=1152, bias=True)
                (proj_drop): Dropout(p=0.0, inplace=False)
              )
              (ls1): Identity()
              (drop_path1): Identity()
  

## get moondream inference

In [4]:
repo = 'rug_jpeg'

prompt = default_prompt

answers, names = [], []
images_with_gpt4_inf = set(res_gpt4.keys())
for name in os.listdir(repo):

    if name not in images_with_gpt4_inf:
        continue
    
    image = Image.open(repo + '/' + name)
    image_embeds = moondream.encode_image(image) # VisionEncoder()(image).to(device, dtype)
    answer = moondream.answer_question( image_embeds,prompt,  tokenizer)

    answers.append(answer)
    names.append(name)

df_moondream = pd.DataFrame({'image': names, 'prompt': [prompt]*len(names), 'moondream_inference': answers})

In [5]:
df_moondream.to_csv(moondream_res_csv_path, index=False)
df_moondream.head(1)

Unnamed: 0,image,prompt,moondream_inference
0,9701 ARIANA TRIBAL 2.9 X 11.9.jpeg,"\nAssuming you are a store assistant please analyze the item in the picture.\n Only return the data in a json with the keys shape, dominant_colors, style, description, material, suggested_title, theme. \n Please restrict the product description to 100 words. \n For theme please return any special characteristics like nature, geometric, abstract etc. \n",A long red rug with a geometric pattern on it.


## combine two models' inferences

In [6]:
df_all = df_moondream.merge(df_gpt4, on = 'image', suffixes = ['_moondream', '_gpt'], copy=False)
df_all.to_csv(both_res_csv_path, index=False)
df_all.head(1)

Unnamed: 0,image,prompt_moondream,moondream_inference,prompt_gpt,gpt4_inference
0,9701 ARIANA TRIBAL 2.9 X 11.9.jpeg,"\nAssuming you are a store assistant please analyze the item in the picture.\n Only return the data in a json with the keys shape, dominant_colors, style, description, material, suggested_title, theme. \n Please restrict the product description to 100 words. \n For theme please return any special characteristics like nature, geometric, abstract etc. \n",A long red rug with a geometric pattern on it.,"\nAssuming you are a store assistant please analyze the item in the picture.\n Only return the data in a json with the keys shape, dominant_colors, style, description, material, suggested_title, theme. \n Please restrict the product description to 100 words. \n For theme please return any special characteristics like nature, geometric, abstract etc. \n","{'shape': 'rectangle', 'dominant_colors': ['red', 'blue', 'beige', 'black'], 'style': 'traditional', 'description': 'This is a traditional hand-woven runner rug, likely of Persian origin. It features intricate patterns and symmetrical geometric shapes, bordered by detailed motifs. The craftsmanship suggests it may be antique, adding to its decorative appeal.', 'material': 'wool', 'suggested_title': 'Traditional Persian Runner Rug', 'theme': 'geometric'}"


In [7]:
df_all.astype(str).describe().T

Unnamed: 0,count,unique,top,freq
image,125,125,9701 ARIANA TRIBAL 2.9 X 11.9.jpeg,1
prompt_moondream,125,1,"\nAssuming you are a store assistant please analyze the item in the picture.\n Only return the data in a json with the keys shape, dominant_colors, style, description, material, suggested_title, theme. \n Please restrict the product description to 100 words. \n For theme please return any special characteristics like nature, geometric, abstract etc. \n",125
moondream_inference,125,89,The rug has a geometric pattern and is made of wool.,16
prompt_gpt,125,1,"\nAssuming you are a store assistant please analyze the item in the picture.\n Only return the data in a json with the keys shape, dominant_colors, style, description, material, suggested_title, theme. \n Please restrict the product description to 100 words. \n For theme please return any special characteristics like nature, geometric, abstract etc. \n",125
gpt4_inference,125,125,"{'shape': 'rectangle', 'dominant_colors': ['red', 'blue', 'beige', 'black'], 'style': 'traditional', 'description': 'This is a traditional hand-woven runner rug, likely of Persian origin. It features intricate patterns and symmetrical geometric shapes, bordered by detailed motifs. The craftsmanship suggests it may be antique, adding to its decorative appeal.', 'material': 'wool', 'suggested_title': 'Traditional Persian Runner Rug', 'theme': 'geometric'}",1
