In [1]:
IDEApath = '/mnt/chenjh/Idea23D/input/case1'
num_img = 1 
num_draft = 3
max_iters = 5
outpath = '/mnt/chenjh/Idea23D/output/case1-0410-llava-34b'

In [2]:
import cv2
import numpy as np
from transformers import SegformerFeatureExtractor, SegformerForSemanticSegmentation
from PIL import Image
import argparse
import os
import json
from http import HTTPStatus
from transformers import AutoProcessor, LlavaNextForConditionalGeneration
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
import torch
import requests
import pygame
from pygame.locals import *
from OpenGL.GL import *
from OpenGL.GLU import *
import pywavefront
import re
import replicate
    
def log(text):
    print(f'\n[IDEA-2-3D]: {text}')



import requests
import base64
import json
from io import BytesIO
from PIL import Image



class lmm_gpt4v:
    def __init__(self, api_key=''):
        self.api_key = api_key

    def encode_image(self, image):
        """Encode PIL image to base64, converting RGBA images to RGB."""
        if image.mode == 'RGBA':
            image = image.convert('RGB')
        buffered = BytesIO()
        image.save(buffered, format="JPEG")
        return base64.b64encode(buffered.getvalue()).decode('utf-8')

    def inference(self, question: str, image):
        """Make an inference request to the GPT-4 Vision API with an image and a question."""
        base64_image = self.encode_image(image)
        
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}"
        }
        
        payload = {
            "model": "gpt-4-vision-preview",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": question
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            }
                        }
                    ]
                }
            ],
            "max_tokens": 300
        }
        
        response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
        return response.json()['choices'][0]['message']['content']


class lmm_llava_34b():
    
    def __init__(self, model_path = "llava-hf/llava-v1.6-34b-hf", gpuid = 0): 
        self.gpuid = gpuid
        from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
        self.processor = LlavaNextProcessor.from_pretrained(model_path)
        self.model = LlavaNextForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 
        self.model.to(f"cuda:{gpuid}")
        
    def inference(self, question: str, images_list):
        if type(images_list) == list:
            # 拼接图像
            image = concatenate_images_with_number_label(images_list)
        else:
            image = images_list
        
        # 对图像进行比较
        prompt = f"<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n<image>\n{question}<|im_end|><|im_start|>assistant\n"
        inputs = self.processor(prompt, image, return_tensors="pt").to(f"cuda:{self.gpuid}")
        output = self.model.generate(**inputs, max_new_tokens=1000)
        res = self.processor.decode(output[0], skip_special_tokens=True)
        
        start_index = res.find("<|im_start|> assistant\n")
        if start_index != -1:
            content = res[start_index + len("<|im_start|> assistant\n"):]
            # log(content)
        return content
    
    def image_caption(self, image):
        image_caption_prompt = 'Describe the details of this image in detail, including the color, pose, lighting, and environment of the target object.'
        return self.inference(image_caption_prompt, image)
    pass


class lmm_llava_7b():
    
    def __init__(self, model_path = "llava-hf/llava-v1.6-mistral-7b-hf", gpuid = 0): 
        self.gpuid = gpuid
        from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
        self.processor = LlavaNextProcessor.from_pretrained(model_path)
        self.model = LlavaNextForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 
        self.model.to(f"cuda:{gpuid}")
        
    def inference(self, question: str, images_list):
        if type(images_list) == list:
            # 拼接图像
            image = concatenate_images_with_number_label(images_list)
        else:
            image = images_list
        
        # 对图像进行比较
        
        prompt = f"[INST] <image>\n{question} [/INST]"
        inputs = self.processor(prompt, image, return_tensors="pt").to(f"cuda:{self.gpuid}")
        output = self.model.generate(**inputs, max_new_tokens=1000)

        res = self.processor.decode(output[0], skip_special_tokens=True)
        
        result = re.search(r'\[/INST\](.*)', res)
        if result:
            res = result.group(1)
            # log(f'lmm res={res}')
    
        return res
    
    def image_caption(self, image):
        image_caption_prompt = 'Describe the details of this image in detail, including the color, pose, lighting, and environment of the target object.'
        return self.inference(image_caption_prompt, image)
    pass

from diffusers import DiffusionPipeline
import torch

class text2img_sdxl():
    def __init__(self, sdxl_base_path='stabilityai/stable-diffusion-xl-base-1.0', sdxl_refiner_path='stabilityai/stable-diffusion-xl-refiner-1.0', gpuid=1,variant="fp16"):
        self.sdxl_base_path=sdxl_base_path
        self.sdxl_refiner_path=sdxl_refiner_path
        self.gpuid=gpuid
        # load both base & refiner
        self.base = DiffusionPipeline.from_pretrained(
            sdxl_base_path, 
            torch_dtype=torch.float32,
            # variant="fp16", 
            use_safetensors=True
        )
        self.base.to(f"cuda:{gpuid}")
        self.refiner = DiffusionPipeline.from_pretrained(
            sdxl_refiner_path,
            text_encoder_2=self.base.text_encoder_2,
            vae=self.base.vae,
            torch_dtype=torch.float32,
            use_safetensors=True,
            # variant="fp16",
        )
        self.refiner.to(f"cuda:{gpuid}")

    def inference(self, prompt):
        # Define how many steps and what % of steps to be run on each experts (80/20) here
        n_steps = 40
        high_noise_frac = 0.8

        # run both experts
        image = self.base(
            prompt=prompt,
            num_inference_steps=n_steps,
            denoising_end=high_noise_frac,
            output_type="latent",
        ).images
        image = self.refiner(
            prompt=prompt,
            num_inference_steps=n_steps,
            denoising_start=high_noise_frac,
            image=image,
        ).images[0]
        
        return image

    
    pass


class text2img_sdxl_replicate():
    def __init__(self, replicate_key='see https://replicate.com/stability-ai/sdxl/api'):
        self.replicate_key=replicate_key
        

    def inference(self, prompt):
        import replicate
        from PIL import Image
        import os

        replicate = replicate.Client(api_token=self.replicate_key)

        input = {
            "width": 1024,
            "height": 1024,
            "prompt": prompt,
            "refine": "expert_ensemble_refiner",
            "apply_watermark": False,
            "num_inference_steps": 25
        }

        output = replicate.run(
            "stability-ai/sdxl:39ed52f2a78e934b3ba6e2a89f5b1c712de7dfea535525255b1aa35c5565e08b",
            input=input
        )

        response = requests.get(output[0])
        image_data = BytesIO(response.content)
        image = Image.open(image_data)
        
        return image

    
    pass

import sys
# 添加 tool 目录到 sys.path，使得 Python 能找到你的模块
module_path = './tool'
if module_path not in sys.path:
    sys.path.append(module_path)

from i23d.TripoSR.run import TripoSRmain

class img23d_TripoSR():
    
    def __init__(self, model_path = 'stabilityai/TripoSR', gpuid=1):
        self.gpuid = gpuid
        self.model_path = model_path
        
        
    def inference(self, png_path, output_path):
        # CUDA_VISIBLE_DEVICES=6 python /mnt/chenjh/Idea23D/tool/i23d/TripoSR/run.py /mnt/chenjh/Idea23D/input/a111.png --output-dir /mnt/chenjh/Idea23D/input --render
        print('png_path,=',png_path)
        res = TripoSRmain(self.gpuid, self.model_path, png_path, output_path)
        # os.system(f'CUDA_VISIBLE_DEVICES={self.gpuid} python {self.model_path}/run.py {png_path} --output-dir {output_path} --render')
        return f'{output_path}/mesh.obj'

    pass

def readimage(path):
    with open(path, 'rb') as file:
        image = Image.open(path).convert("RGB")
        resized_image = image.resize((256, 256))
    return resized_image

def writeimage(image, path):
    # Check if the directory exists, and create it if it doesn't
    directory = os.path.dirname(path)
    if not os.path.exists(directory):
        os.makedirs(directory)
    
    # Save the image to the path
    with open(path, 'wb') as file:
        image.save(file, 'PNG')  # Use 'PNG' to ensure proper saving of PNG files

    
class Memory():
    # 记忆模块
    # 初始idea
    idea_input_imglist = []
    idea_input_img = None # 合并后的img
    idea_input_prompt = ''
    
    best_img = None
    best_prompt = None
    best_3d_path = None
    
    feedback = ''
    pass


class Iter():
    # 每一轮的结果
    def __init__(self, index):
        self.index = index
    idea_input_imglist = []
    prompt = []
    draft_img = []
    draft_3d_path = []
    best_img = None
    best_3d_path = ''
    best_prompt = ''
    
    def clear(self):
        self.idea_input_imglist = []
        self.prompt = []
        self.draft_img = []
        self.draft_3d_path = []
        self.best_img = None
        self.best_3d_path = ''
        self.best_prompt = ''
    


[2024-04-10 16:28:25,203] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
pygame 2.5.2 (SDL 2.28.2, Python 3.10.13)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [3]:
# 初始化LMM,T2I,I23D
log('loading lmm...')

# lmm = lmm_gpt4v('sk-your open ai key')
lmm = lmm_llava_34b(model_path = "/mnt/chenjh/LargeModels/llava-v1.6-34b-hf", gpuid = 4)
# lmm = lmm_llava_7b(model_path = "/mnt/chenjh/LargeModels/llava-v1.6-mistral-7b-hf", gpuid = 2)

log('loading t2i...')
# t2i = text2img_sdxl_replicate(replicate_key='r8_ZCtKyMJqjyqVF76N6mycGNHEgF6cTTF1EZtmG')
t2i = text2img_sdxl(sdxl_base_path='/mnt/chenjh/LargeModels/stable-diffusion-xl-base-1.0', 
                    sdxl_refiner_path='/mnt/chenjh/LargeModels/stable-diffusion-xl-refiner-1.0', 
                    gpuid=6)

log('loading i23d...')
i23d = img23d_TripoSR(model_path = '/mnt/chenjh/LargeModels/TripoSR' ,gpuid=7)
log('loading finish.')

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers



[IDEA-2-3D]: loading lmm...


Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]


[IDEA-2-3D]: loading t2i...


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]


[IDEA-2-3D]: loading i23d...

[IDEA-2-3D]: loading finish.


In [4]:
i23d.inference('/mnt/chenjh/Idea23D/output/case1-0409-006-34b/draft/iter-1-0-0/draft.png', '/mnt/chenjh/Idea23D/output/case1-0409-006-34b/draft/iter-1-0-0')

png_path,= /mnt/chenjh/Idea23D/output/case1-0409-006-34b/draft/iter-1-0-0/draft.png


2024-04-10 16:28:48,456 - INFO - Initializing model ...
2024-04-10 16:28:53,968 - INFO - Initializing model finished in 5511.89ms.
2024-04-10 16:28:53,969 - INFO - Processing image ...
2024-04-10 16:28:53,969 - ERROR - Provided path is not a file: /mnt/chenjh/Idea23D/output/case1-0409-006-34b/draft/iter-1-0-0/draft.png


'/mnt/chenjh/Idea23D/output/case1-0409-006-34b/draft/iter-1-0-0/mesh.obj'

In [5]:

from PIL import Image
import numpy as np
import datetime
import os
import cv2

def concatenate_images_with_number_label(images_list, direction="h", output_folder=f'{outpath}/tmp'):
    # Check if images_list contains PIL images
    if not all(isinstance(image, Image.Image) for image in images_list):
        raise ValueError("All images in images_list must be PIL images.")
    
    # Check direction parameter
    if direction not in ["h", "v"]:
        raise ValueError("Invalid direction parameter. It must be 'h' for horizontal or 'v' for vertical concatenation.")
    
    # Check output folder
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)  # Create the target folder if it doesn't exist
    
    # Convert PIL images to numpy arrays for processing
    np_images = [np.array(image) for image in images_list]
    
    # Check if we're concatenating horizontally or vertically and create a canvas
    if direction == "h":
        total_width = sum(image.size[0] for image in images_list)
        max_height = max(image.size[1] for image in images_list)
        concatenated_image = Image.new('RGB', (total_width, max_height))
    elif direction == "v":
        total_height = sum(image.size[1] for image in images_list)
        max_width = max(image.size[0] for image in images_list)
        concatenated_image = Image.new('RGB', (max_width, total_height))
    
    # Paste images onto the canvas
    x_offset, y_offset = 0, 0
    for image in images_list:
        concatenated_image.paste(image, (x_offset, y_offset))
        if direction == "h":
            x_offset += image.size[0]
        elif direction == "v":
            y_offset += image.size[1]
    
    # Save the image
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f")
    output_path = os.path.join(output_folder, f"concatenated_image-{timestamp}.png")
    
    log(f'concatenated_image output_path={output_path}')
    concatenated_image.save(output_path)
    
    return concatenated_image

In [6]:
# parser = argparse.ArgumentParser()
# parser.add_argument("--IDEApath", type=str, default='/mnt/chenjh/Idea23D/input/case1')
# parser.add_argument("--num_img", type=int, default=1, help="number of images to generate per prompt")
# parser.add_argument("--num_draft", type=int, default=3, help="number of prompts to search each round")
# parser.add_argument("--max_iters", type=int, default=3, help="max number of iter rounds")
# parser.add_argument("--outpath", default='/mnt/chenjh/Idea23D/output/case1')
# args = parser.parse_args()
import datetime



with open(f'{IDEApath}/idea.txt', 'r') as file:
    IdeaContent = file.read()

if len(IdeaContent.strip()) == 0:
    log('Error: empty Idea.txt')
    exit()

 # 初始化记忆模块
memory = Memory()

# 将idea中的图，使用LMM生成描述，替换成文字
# This brown rabbit <IMG>a1.png</IMG> uses two front paws to engage in the action of eating this doughnut <IMG>a111.png</IMG>.
prompt_imagecaption = 'Describe the image in detail.'

img_tags = re.findall(r'<IMG>(.*?)<\/IMG>', IdeaContent)
obj_tags = re.findall(r'<OBJ>(.*?)<\/OBJ>', IdeaContent)

img_list = []
for img_tag in img_tags:
    img_path = f'{IDEApath}/{img_tag}'
    image = readimage(img_path)
    width, height = image.size
    # log('image.size', width, height)
    img_list.append(image)
    log('img_list.append(image)')
    caption = lmm.inference(prompt_imagecaption, image)
    IdeaContent = IdeaContent.replace(f'<IMG>{img_tag}</IMG>', f'[{caption}]')
    # log(img_tag)

    # 将idea中的3d模型，使用blender渲染，生成多视角的图，再使用LMM生成描述，替换成文字
for obj_tag in obj_tags:
    # TODO...
    # img_list.append(image) 将obj渲染的图也融合在一张图里
    log(obj_tag)

# 更新记忆模块中的idea部分，这部分内容是固定值 不会变的
log('img_list size = {len(img_list)}')
memory.idea_input_imglist = img_list
log(f'memory.idea_input_imglist size = {len(memory.idea_input_imglist)}')
log(f'memory.idea_input_imglist = {memory.idea_input_imglist}')
memory.idea_input_img = concatenate_images_with_number_label(img_list) # 合并后的img
memory.idea_input_prompt = IdeaContent # 最原始的user idea input，obj和png未转换

log(f'memory.idea_input_img = {memory.idea_input_img}')

log(f'memory.idea_input_prompt = {memory.idea_input_prompt}')

log(f'init input prompt = {IdeaContent}')


[IDEA-2-3D]: img_list.append(image)

[IDEA-2-3D]: img_list.append(image)

[IDEA-2-3D]: img_list size = {len(img_list)}

[IDEA-2-3D]: memory.idea_input_imglist size = 2

[IDEA-2-3D]: memory.idea_input_imglist = [<PIL.Image.Image image mode=RGB size=256x256 at 0x7F3C681AF160>, <PIL.Image.Image image mode=RGB size=256x256 at 0x7F3C80853730>]

[IDEA-2-3D]: concatenated_image output_path=/mnt/chenjh/Idea23D/output/case1-0410-llava-34b/tmp/concatenated_image-20240410163118558998.png

[IDEA-2-3D]: memory.idea_input_img = <PIL.Image.Image image mode=RGB size=512x256 at 0x7F3C80198AC0>

[IDEA-2-3D]: memory.idea_input_prompt = This brown rabbit [The image shows a close-up of a rabbit with a fluffy, light brown coat. The rabbit has large, pointed ears and expressive eyes that are looking directly at the viewer. Its whiskers are prominent, and it appears to be sitting upright with its front paws slightly apart. The background is a solid black, which contrasts with the rabbit's fur and highlights 

In [7]:
iters = Iter(0)

for i in range(max_iters):
    log(f'iter = {i}')
    iters.clear()

    for k in range( num_draft):
        if i == 0: # initial round
            prompt_gen = f'Optimize text descriptions based on image content and details to better match user input [User Input]{memory.idea_input_prompt}[/User Input] and images. Answers are 75 words or less.'
            log(f'prompt_gen = {prompt_gen}')
            IdeaContent = lmm.inference(prompt_gen, memory.idea_input_img)
        else:
            # The second round starts with memory+idea input, and the image and best prompt of the best model from the previous round.
            prompt_rev = f'Optimize prompt [Prompt]{memory.best_prompt}[/Prompt] based on image content and details to better match user input [User Input]{memory.idea_input_prompt}[/User Input] and images. The first line of the image is the user input. Here\'s the revision [feedback]{memory.feedback}[/feedback]. Answers are 75 words or less.'
            log(f'prompt_rev = {prompt_rev}')
            imagetmp = memory.idea_input_img
            log(f'imagetmp={imagetmp}')
            IdeaContent = lmm.inference(prompt_rev, imagetmp)
        # Generate end of prompt, convert to image
        log(f'new input prompt = {IdeaContent}')
        for j in range( num_img): # Each prompt generates n charts
            iters.prompt.append(IdeaContent)
            imgtmp = t2i.inference(IdeaContent)
            imgpath = f'{ outpath}/draft/iter-{i+1}-{k}-{j}/draft.png'
            out3dpath = f'{ outpath}/draft/iter-{i+1}-{k}-{j}'
            writeimage(imgtmp, imgpath)
            i23d_res = i23d.inference(imgpath, out3dpath)
            iters.draft_3d_path.append(i23d_res)
            log(f'i23d_res = {i23d_res}')
            #  Save 6 rendered images, and then filter, filter out the best prompt into memory.
            img_render_list = [readimage(f'{ outpath}/draft/iter-{i+1}-{k}-{j}/render_00{idx}.png') for idx in range(6)]
            img_render = concatenate_images_with_number_label(img_render_list) # 6 rendered images merged
            iters.draft_img.append(img_render)

    # Stitch all the images into one big picture, each row is a draft model, and the best model is filtered together.
    append_i = -1
    if memory.best_3d_path != None:
      iters.prompt.append(memory.best_prompt)
      iters.draft_img.append(memory.best_img)
      iters.draft_3d_path.append(memory.best_3d_path)
      append_i = 0

    draft_img_comp = concatenate_images_with_number_label(iters.draft_img, 'v')

    # Selection of the best draft model for the current round
    prompt_select = f'Each row of these images shows 6 views of a 3D model. Which row of images best meets the user input? [User Input]{memory.idea_input_prompt}[/User Input]. Only return a number in the list {[kj for kj in range( num_draft * num_img)]}, the number of rows. Such as, \"1\" or \"0\".'
    log(f'prompt_select = {prompt_select}')
    best_row = lmm.inference(prompt_select, draft_img_comp)
    log(f'best_row answer = {best_row}')
    try:
        best_row = int(best_row)
    except ValueError:
        # Handle failure to parse to integers
        # Handle errors based on specific needs, such as giving default values or prompting the user to re-enter
        if i==0:
            best_row = 0
        else:
            best_row = len(iters.draft_3d_path) - 1
        log('Failed to parse best_row as an integer. Using default value.')

    log(f'best_row = {best_row}')

    k = best_row //  num_img
    j = best_row %  num_img
    log(f'k={k}, j={j}')

    memory.best_prompt = iters.prompt[k *  num_draft + j *  num_img + append_i ]
    memory.best_img = iters.draft_img[k *  num_draft + j *  num_img + append_i ]
    memory.best_3d_path = iters.draft_3d_path[k *  num_draft + j *  num_img + append_i ]
    log(f'memory.best_prompt = {memory.best_prompt}')
    log(f'memory.best_img = {memory.best_img}')
    log(f'memory.best_3d_path = {memory.best_3d_path}')

    # Determine if the output condition is met
    # Give feedback
    prompt_feedback = f'Does the diagram satisfy the user input? [User Input]{memory.idea_input_prompt}[/User Input]. Returns "no revision" if it matches the User Input. Give the correct prompt if it does not.'
    log(f'prompt_feedback = {prompt_feedback}')
    feedback = lmm.inference(prompt_feedback, memory.best_img)
    log(f'feedback answer = {feedback}')
    if 'no revision' in feedback:
        log('output no revison , finish.')
        break
    memory.feedback = feedback
    pass


# End of iteration, save memory best model to outputs
log(f'cp {memory.best_3d_path} {outpath}/mesh.obj')
os.system(f'cp {memory.best_3d_path} {outpath}/mesh.obj')
log(f'finished! check the path {outpath}/mesh.obj')


[IDEA-2-3D]: iter = 0

[IDEA-2-3D]: prompt_gen = Optimize text descriptions based on image content and details to better match user input [User Input]This brown rabbit [The image shows a close-up of a rabbit with a fluffy, light brown coat. The rabbit has large, pointed ears and expressive eyes that are looking directly at the viewer. Its whiskers are prominent, and it appears to be sitting upright with its front paws slightly apart. The background is a solid black, which contrasts with the rabbit's fur and highlights the animal. The image is a realistic photograph with a focus on the rabbit, and there are no visible texts or additional elements in the frame.] uses two front paws to engage in the action of eating this doughnut [The image displays a single doughnut with a glossy pink glaze. The glaze appears to be freshly applied, as indicated by the visible drips on the side of the doughnut. It is sprinkled with a variety of colorful sprinkles that are scattered across the glaze, addi

Token indices sequence length is longer than the specified maximum sequence length for this model (116 > 77). Running this sequence through the model will result in indexing errors
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['black, which contrasts with the doughnut and highlights its details. the style of the image is a realistic 3 d rendering, with a focus on the doughnut to emphasize its textures and colors.']
Token indices sequence length is longer than the specified maximum sequence length for this model (116 > 77). Running this sequence through the model will result in indexing errors
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['black, which contrasts with the doughnut and highlights its details. the style of the image is a realistic 3 d rendering, with a focus on the doughnut to emphasize its textures and colors.']



[IDEA-2-3D]: new input prompt = The image features a light brown rabbit with large ears and expressive eyes, sitting upright and looking directly at the viewer. The rabbit's fur is fluffy, and its whiskers are prominent. In the background, there is a single doughnut with a glossy pink glaze and colorful sprinkles. The doughnut has a golden-brown color and a hole in the center. The background is a solid black, which contrasts with the doughnut and highlights its details. The style of the image is a realistic 3D rendering, with a focus on the doughnut to emphasize its textures and colors.


  0%|          | 0/32 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (116 > 77). Running this sequence through the model will result in indexing errors
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['black, which contrasts with the doughnut and highlights its details. the style of the image is a realistic 3 d rendering, with a focus on the doughnut to emphasize its textures and colors.']


  0%|          | 0/8 [00:00<?, ?it/s]

2024-04-10 16:33:40,149 - INFO - Initializing model ...


png_path,= /mnt/chenjh/Idea23D/output/case1-0410-llava-34b/draft/iter-1-0-0/draft.png


2024-04-10 16:33:45,530 - INFO - Initializing model finished in 5381.15ms.
2024-04-10 16:33:45,531 - INFO - Processing image ...
2024-04-10 16:33:46,426 - INFO - Processing image finished in 895.11ms.
2024-04-10 16:33:46,428 - INFO - Running model ...
2024-04-10 16:33:46,428 - INFO - Running model ...
2024-04-10 16:33:46,726 - INFO - Running model finished in 297.98ms.
2024-04-10 16:33:46,727 - INFO - Rendering ...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2024-04-10 16:33:51,090 - INFO - Rendering finished in 4362.77ms.
2024-04-10 16:33:51,091 - INFO - Exporting mesh ...
2024-04-10 16:33:52,820 - INFO - Exporting mesh finished in 1729.44ms.



[IDEA-2-3D]: i23d_res = /mnt/chenjh/Idea23D/output/case1-0410-llava-34b/draft/iter-1-0-0/mesh.obj

[IDEA-2-3D]: concatenated_image output_path=/mnt/chenjh/Idea23D/output/case1-0410-llava-34b/tmp/concatenated_image-20240410163352875597.png

[IDEA-2-3D]: prompt_gen = Optimize text descriptions based on image content and details to better match user input [User Input]This brown rabbit [The image shows a close-up of a rabbit with a fluffy, light brown coat. The rabbit has large, pointed ears and expressive eyes that are looking directly at the viewer. Its whiskers are prominent, and it appears to be sitting upright with its front paws slightly apart. The background is a solid black, which contrasts with the rabbit's fur and highlights the animal. The image is a realistic photograph with a focus on the rabbit, and there are no visible texts or additional elements in the frame.] uses two front paws to engage in the action of eating this doughnut [The image displays a single doughnut with a 

The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['black, which contrasts with the doughnut and highlights its details. the style of the image is a realistic 3 d rendering, with a focus on the doughnut to emphasize its textures and colors.']
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['black, which contrasts with the doughnut and highlights its details. the style of the image is a realistic 3 d rendering, with a focus on the doughnut to emphasize its textures and colors.']



[IDEA-2-3D]: new input prompt = The image features a light brown rabbit with large ears and expressive eyes, sitting upright and looking directly at the viewer. The rabbit's fur is fluffy, and its whiskers are prominent. In the background, there is a single doughnut with a glossy pink glaze and colorful sprinkles. The doughnut has a golden-brown color and a hole in the center. The background is a solid black, which contrasts with the doughnut and highlights its details. The style of the image is a realistic 3D rendering, with a focus on the doughnut to emphasize its textures and colors.


  0%|          | 0/32 [00:00<?, ?it/s]

The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['black, which contrasts with the doughnut and highlights its details. the style of the image is a realistic 3 d rendering, with a focus on the doughnut to emphasize its textures and colors.']


  0%|          | 0/8 [00:00<?, ?it/s]

2024-04-10 16:36:13,764 - INFO - Initializing model ...


png_path,= /mnt/chenjh/Idea23D/output/case1-0410-llava-34b/draft/iter-1-1-0/draft.png


2024-04-10 16:36:18,817 - INFO - Initializing model finished in 5053.27ms.
2024-04-10 16:36:18,818 - INFO - Processing image ...
2024-04-10 16:36:19,708 - INFO - Processing image finished in 889.96ms.
2024-04-10 16:36:19,709 - INFO - Running model ...
2024-04-10 16:36:19,709 - INFO - Running model ...
2024-04-10 16:36:19,943 - INFO - Running model finished in 234.04ms.
2024-04-10 16:36:19,944 - INFO - Rendering ...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2024-04-10 16:36:24,282 - INFO - Rendering finished in 4338.27ms.
2024-04-10 16:36:24,284 - INFO - Exporting mesh ...
2024-04-10 16:36:25,929 - INFO - Exporting mesh finished in 1645.55ms.



[IDEA-2-3D]: i23d_res = /mnt/chenjh/Idea23D/output/case1-0410-llava-34b/draft/iter-1-1-0/mesh.obj

[IDEA-2-3D]: concatenated_image output_path=/mnt/chenjh/Idea23D/output/case1-0410-llava-34b/tmp/concatenated_image-20240410163625985253.png

[IDEA-2-3D]: prompt_gen = Optimize text descriptions based on image content and details to better match user input [User Input]This brown rabbit [The image shows a close-up of a rabbit with a fluffy, light brown coat. The rabbit has large, pointed ears and expressive eyes that are looking directly at the viewer. Its whiskers are prominent, and it appears to be sitting upright with its front paws slightly apart. The background is a solid black, which contrasts with the rabbit's fur and highlights the animal. The image is a realistic photograph with a focus on the rabbit, and there are no visible texts or additional elements in the frame.] uses two front paws to engage in the action of eating this doughnut [The image displays a single doughnut with a 

The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['black, which contrasts with the doughnut and highlights its details. the style of the image is a realistic 3 d rendering, with a focus on the doughnut to emphasize its textures and colors.']
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['black, which contrasts with the doughnut and highlights its details. the style of the image is a realistic 3 d rendering, with a focus on the doughnut to emphasize its textures and colors.']



[IDEA-2-3D]: new input prompt = The image features a light brown rabbit with large ears and expressive eyes, sitting upright and looking directly at the viewer. The rabbit's fur is fluffy, and its whiskers are prominent. In the background, there is a single doughnut with a glossy pink glaze and colorful sprinkles. The doughnut has a golden-brown color and a hole in the center. The background is a solid black, which contrasts with the doughnut and highlights its details. The style of the image is a realistic 3D rendering, with a focus on the doughnut to emphasize its textures and colors.


  0%|          | 0/32 [00:00<?, ?it/s]

The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['black, which contrasts with the doughnut and highlights its details. the style of the image is a realistic 3 d rendering, with a focus on the doughnut to emphasize its textures and colors.']


  0%|          | 0/8 [00:00<?, ?it/s]

2024-04-10 16:38:46,916 - INFO - Initializing model ...


png_path,= /mnt/chenjh/Idea23D/output/case1-0410-llava-34b/draft/iter-1-2-0/draft.png


2024-04-10 16:38:54,832 - INFO - Initializing model finished in 7916.07ms.
2024-04-10 16:38:54,833 - INFO - Processing image ...
2024-04-10 16:38:55,935 - INFO - Processing image finished in 1101.69ms.
2024-04-10 16:38:55,936 - INFO - Running model ...
2024-04-10 16:38:55,936 - INFO - Running model ...
2024-04-10 16:38:56,174 - INFO - Running model finished in 237.07ms.
2024-04-10 16:38:56,175 - INFO - Rendering ...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2024-04-10 16:39:00,541 - INFO - Rendering finished in 4366.04ms.
2024-04-10 16:39:00,543 - INFO - Exporting mesh ...
2024-04-10 16:39:02,285 - INFO - Exporting mesh finished in 1742.27ms.



[IDEA-2-3D]: i23d_res = /mnt/chenjh/Idea23D/output/case1-0410-llava-34b/draft/iter-1-2-0/mesh.obj

[IDEA-2-3D]: concatenated_image output_path=/mnt/chenjh/Idea23D/output/case1-0410-llava-34b/tmp/concatenated_image-20240410163902344048.png

[IDEA-2-3D]: concatenated_image output_path=/mnt/chenjh/Idea23D/output/case1-0410-llava-34b/tmp/concatenated_image-20240410163902409905.png

[IDEA-2-3D]: prompt_select = Each row of these images shows 6 views of a 3D model. Which row of images best meets the user input? [User Input]This brown rabbit [The image shows a close-up of a rabbit with a fluffy, light brown coat. The rabbit has large, pointed ears and expressive eyes that are looking directly at the viewer. Its whiskers are prominent, and it appears to be sitting upright with its front paws slightly apart. The background is a solid black, which contrasts with the rabbit's fur and highlights the animal. The image is a realistic photograph with a focus on the rabbit, and there are no visible t

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

2024-04-10 16:41:41,258 - INFO - Initializing model ...


png_path,= /mnt/chenjh/Idea23D/output/case1-0410-llava-34b/draft/iter-2-0-0/draft.png


2024-04-10 16:41:47,575 - INFO - Initializing model finished in 6317.11ms.
2024-04-10 16:41:47,576 - INFO - Processing image ...
2024-04-10 16:41:48,533 - INFO - Processing image finished in 957.38ms.
2024-04-10 16:41:48,534 - INFO - Running model ...
2024-04-10 16:41:48,534 - INFO - Running model ...
2024-04-10 16:41:48,772 - INFO - Running model finished in 237.28ms.
2024-04-10 16:41:48,773 - INFO - Rendering ...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2024-04-10 16:41:53,077 - INFO - Rendering finished in 4304.55ms.
2024-04-10 16:41:53,078 - INFO - Exporting mesh ...
2024-04-10 16:41:54,682 - INFO - Exporting mesh finished in 1603.46ms.



[IDEA-2-3D]: i23d_res = /mnt/chenjh/Idea23D/output/case1-0410-llava-34b/draft/iter-2-0-0/mesh.obj

[IDEA-2-3D]: concatenated_image output_path=/mnt/chenjh/Idea23D/output/case1-0410-llava-34b/tmp/concatenated_image-20240410164154735805.png

[IDEA-2-3D]: prompt_rev = Optimize prompt [Prompt]The image features a light brown rabbit with large ears and expressive eyes, sitting upright and looking directly at the viewer. The rabbit's fur is fluffy, and its whiskers are prominent. In the background, there is a single doughnut with a glossy pink glaze and colorful sprinkles. The doughnut has a golden-brown color and a hole in the center. The background is a solid black, which contrasts with the doughnut and highlights its details. The style of the image is a realistic 3D rendering, with a focus on the doughnut to emphasize its textures and colors.[/Prompt] based on image content and details to better match user input [User Input]This brown rabbit [The image shows a close-up of a rabbit with a

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

2024-04-10 16:43:35,130 - INFO - Initializing model ...


png_path,= /mnt/chenjh/Idea23D/output/case1-0410-llava-34b/draft/iter-2-1-0/draft.png


2024-04-10 16:43:39,738 - INFO - Initializing model finished in 4608.08ms.
2024-04-10 16:43:39,739 - INFO - Processing image ...
2024-04-10 16:43:40,633 - INFO - Processing image finished in 894.67ms.
2024-04-10 16:43:40,634 - INFO - Running model ...
2024-04-10 16:43:40,634 - INFO - Running model ...
2024-04-10 16:43:40,870 - INFO - Running model finished in 235.77ms.
2024-04-10 16:43:40,871 - INFO - Rendering ...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2024-04-10 16:43:45,059 - INFO - Rendering finished in 4187.86ms.
2024-04-10 16:43:45,060 - INFO - Exporting mesh ...
2024-04-10 16:43:46,532 - INFO - Exporting mesh finished in 1471.74ms.



[IDEA-2-3D]: i23d_res = /mnt/chenjh/Idea23D/output/case1-0410-llava-34b/draft/iter-2-1-0/mesh.obj

[IDEA-2-3D]: concatenated_image output_path=/mnt/chenjh/Idea23D/output/case1-0410-llava-34b/tmp/concatenated_image-20240410164346587163.png

[IDEA-2-3D]: prompt_rev = Optimize prompt [Prompt]The image features a light brown rabbit with large ears and expressive eyes, sitting upright and looking directly at the viewer. The rabbit's fur is fluffy, and its whiskers are prominent. In the background, there is a single doughnut with a glossy pink glaze and colorful sprinkles. The doughnut has a golden-brown color and a hole in the center. The background is a solid black, which contrasts with the doughnut and highlights its details. The style of the image is a realistic 3D rendering, with a focus on the doughnut to emphasize its textures and colors.[/Prompt] based on image content and details to better match user input [User Input]This brown rabbit [The image shows a close-up of a rabbit with a

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

2024-04-10 16:45:26,910 - INFO - Initializing model ...


png_path,= /mnt/chenjh/Idea23D/output/case1-0410-llava-34b/draft/iter-2-2-0/draft.png


2024-04-10 16:45:31,651 - INFO - Initializing model finished in 4741.28ms.
2024-04-10 16:45:31,652 - INFO - Processing image ...
2024-04-10 16:45:32,598 - INFO - Processing image finished in 946.42ms.
2024-04-10 16:45:32,599 - INFO - Running model ...
2024-04-10 16:45:32,600 - INFO - Running model ...
2024-04-10 16:45:32,836 - INFO - Running model finished in 236.11ms.
2024-04-10 16:45:32,837 - INFO - Rendering ...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2024-04-10 16:45:37,098 - INFO - Rendering finished in 4260.96ms.
2024-04-10 16:45:37,099 - INFO - Exporting mesh ...
2024-04-10 16:45:38,844 - INFO - Exporting mesh finished in 1745.20ms.



[IDEA-2-3D]: i23d_res = /mnt/chenjh/Idea23D/output/case1-0410-llava-34b/draft/iter-2-2-0/mesh.obj

[IDEA-2-3D]: concatenated_image output_path=/mnt/chenjh/Idea23D/output/case1-0410-llava-34b/tmp/concatenated_image-20240410164538898872.png

[IDEA-2-3D]: concatenated_image output_path=/mnt/chenjh/Idea23D/output/case1-0410-llava-34b/tmp/concatenated_image-20240410164538960624.png

[IDEA-2-3D]: prompt_select = Each row of these images shows 6 views of a 3D model. Which row of images best meets the user input? [User Input]This brown rabbit [The image shows a close-up of a rabbit with a fluffy, light brown coat. The rabbit has large, pointed ears and expressive eyes that are looking directly at the viewer. Its whiskers are prominent, and it appears to be sitting upright with its front paws slightly apart. The background is a solid black, which contrasts with the rabbit's fur and highlights the animal. The image is a realistic photograph with a focus on the rabbit, and there are no visible t

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

2024-04-10 16:48:18,551 - INFO - Initializing model ...


png_path,= /mnt/chenjh/Idea23D/output/case1-0410-llava-34b/draft/iter-3-0-0/draft.png


2024-04-10 16:48:24,399 - INFO - Initializing model finished in 5847.99ms.
2024-04-10 16:48:24,400 - INFO - Processing image ...
2024-04-10 16:48:25,343 - INFO - Processing image finished in 942.80ms.
2024-04-10 16:48:25,344 - INFO - Running model ...
2024-04-10 16:48:25,344 - INFO - Running model ...
2024-04-10 16:48:25,577 - INFO - Running model finished in 232.51ms.
2024-04-10 16:48:25,577 - INFO - Rendering ...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2024-04-10 16:48:29,993 - INFO - Rendering finished in 4415.51ms.
2024-04-10 16:48:29,994 - INFO - Exporting mesh ...
2024-04-10 16:48:31,561 - INFO - Exporting mesh finished in 1566.91ms.



[IDEA-2-3D]: i23d_res = /mnt/chenjh/Idea23D/output/case1-0410-llava-34b/draft/iter-3-0-0/mesh.obj

[IDEA-2-3D]: concatenated_image output_path=/mnt/chenjh/Idea23D/output/case1-0410-llava-34b/tmp/concatenated_image-20240410164831610753.png

[IDEA-2-3D]: prompt_rev = Optimize prompt [Prompt]The image features a light brown rabbit with large ears and expressive eyes, sitting upright and looking directly at the viewer. The rabbit's fur is fluffy, and its whiskers are prominent. In the background, there is a single doughnut with a glossy pink glaze and colorful sprinkles. The doughnut has a golden-brown color and a hole in the center. The background is a solid black, which contrasts with the doughnut and highlights its details. The style of the image is a realistic 3D rendering, with a focus on the doughnut to emphasize its textures and colors.[/Prompt] based on image content and details to better match user input [User Input]This brown rabbit [The image shows a close-up of a rabbit with a

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

2024-04-10 16:50:11,952 - INFO - Initializing model ...


png_path,= /mnt/chenjh/Idea23D/output/case1-0410-llava-34b/draft/iter-3-1-0/draft.png


2024-04-10 16:50:19,306 - INFO - Initializing model finished in 7354.67ms.
2024-04-10 16:50:19,307 - INFO - Processing image ...
2024-04-10 16:50:20,266 - INFO - Processing image finished in 958.63ms.
2024-04-10 16:50:20,267 - INFO - Running model ...
2024-04-10 16:50:20,267 - INFO - Running model ...
2024-04-10 16:50:20,527 - INFO - Running model finished in 259.06ms.
2024-04-10 16:50:20,527 - INFO - Rendering ...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2024-04-10 16:50:24,746 - INFO - Rendering finished in 4218.54ms.
2024-04-10 16:50:24,747 - INFO - Exporting mesh ...
2024-04-10 16:50:26,716 - INFO - Exporting mesh finished in 1968.79ms.



[IDEA-2-3D]: i23d_res = /mnt/chenjh/Idea23D/output/case1-0410-llava-34b/draft/iter-3-1-0/mesh.obj

[IDEA-2-3D]: concatenated_image output_path=/mnt/chenjh/Idea23D/output/case1-0410-llava-34b/tmp/concatenated_image-20240410165026783502.png

[IDEA-2-3D]: prompt_rev = Optimize prompt [Prompt]The image features a light brown rabbit with large ears and expressive eyes, sitting upright and looking directly at the viewer. The rabbit's fur is fluffy, and its whiskers are prominent. In the background, there is a single doughnut with a glossy pink glaze and colorful sprinkles. The doughnut has a golden-brown color and a hole in the center. The background is a solid black, which contrasts with the doughnut and highlights its details. The style of the image is a realistic 3D rendering, with a focus on the doughnut to emphasize its textures and colors.[/Prompt] based on image content and details to better match user input [User Input]This brown rabbit [The image shows a close-up of a rabbit with a

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

2024-04-10 16:52:07,117 - INFO - Initializing model ...


png_path,= /mnt/chenjh/Idea23D/output/case1-0410-llava-34b/draft/iter-3-2-0/draft.png


2024-04-10 16:52:13,869 - INFO - Initializing model finished in 6751.58ms.
2024-04-10 16:52:13,870 - INFO - Processing image ...
2024-04-10 16:52:14,816 - INFO - Processing image finished in 945.90ms.
2024-04-10 16:52:14,817 - INFO - Running model ...
2024-04-10 16:52:14,817 - INFO - Running model ...
2024-04-10 16:52:15,054 - INFO - Running model finished in 237.06ms.
2024-04-10 16:52:15,055 - INFO - Rendering ...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2024-04-10 16:52:19,446 - INFO - Rendering finished in 4390.52ms.
2024-04-10 16:52:19,447 - INFO - Exporting mesh ...
2024-04-10 16:52:21,077 - INFO - Exporting mesh finished in 1629.93ms.



[IDEA-2-3D]: i23d_res = /mnt/chenjh/Idea23D/output/case1-0410-llava-34b/draft/iter-3-2-0/mesh.obj

[IDEA-2-3D]: concatenated_image output_path=/mnt/chenjh/Idea23D/output/case1-0410-llava-34b/tmp/concatenated_image-20240410165221139862.png

[IDEA-2-3D]: concatenated_image output_path=/mnt/chenjh/Idea23D/output/case1-0410-llava-34b/tmp/concatenated_image-20240410165221201320.png

[IDEA-2-3D]: prompt_select = Each row of these images shows 6 views of a 3D model. Which row of images best meets the user input? [User Input]This brown rabbit [The image shows a close-up of a rabbit with a fluffy, light brown coat. The rabbit has large, pointed ears and expressive eyes that are looking directly at the viewer. Its whiskers are prominent, and it appears to be sitting upright with its front paws slightly apart. The background is a solid black, which contrasts with the rabbit's fur and highlights the animal. The image is a realistic photograph with a focus on the rabbit, and there are no visible t

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

2024-04-10 16:55:00,531 - INFO - Initializing model ...


png_path,= /mnt/chenjh/Idea23D/output/case1-0410-llava-34b/draft/iter-4-0-0/draft.png


2024-04-10 16:55:07,010 - INFO - Initializing model finished in 6478.95ms.
2024-04-10 16:55:07,011 - INFO - Processing image ...
2024-04-10 16:55:07,863 - INFO - Processing image finished in 852.73ms.
2024-04-10 16:55:07,864 - INFO - Running model ...
2024-04-10 16:55:07,865 - INFO - Running model ...
2024-04-10 16:55:08,099 - INFO - Running model finished in 234.78ms.
2024-04-10 16:55:08,100 - INFO - Rendering ...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2024-04-10 16:55:12,350 - INFO - Rendering finished in 4249.63ms.
2024-04-10 16:55:12,351 - INFO - Exporting mesh ...
2024-04-10 16:55:13,994 - INFO - Exporting mesh finished in 1643.41ms.



[IDEA-2-3D]: i23d_res = /mnt/chenjh/Idea23D/output/case1-0410-llava-34b/draft/iter-4-0-0/mesh.obj

[IDEA-2-3D]: concatenated_image output_path=/mnt/chenjh/Idea23D/output/case1-0410-llava-34b/tmp/concatenated_image-20240410165514051699.png

[IDEA-2-3D]: prompt_rev = Optimize prompt [Prompt]The image features a light brown rabbit with large ears and expressive eyes, sitting upright and looking directly at the viewer. The rabbit's fur is fluffy, and its whiskers are prominent. In the background, there is a single doughnut with a glossy pink glaze and colorful sprinkles. The doughnut has a golden-brown color and a hole in the center. The background is a solid black, which contrasts with the doughnut and highlights its details. The style of the image is a realistic 3D rendering, with a focus on the doughnut to emphasize its textures and colors.[/Prompt] based on image content and details to better match user input [User Input]This brown rabbit [The image shows a close-up of a rabbit with a

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

2024-04-10 16:56:54,554 - INFO - Initializing model ...


png_path,= /mnt/chenjh/Idea23D/output/case1-0410-llava-34b/draft/iter-4-1-0/draft.png


2024-04-10 16:57:01,083 - INFO - Initializing model finished in 6529.02ms.
2024-04-10 16:57:01,085 - INFO - Processing image ...
2024-04-10 16:57:02,107 - INFO - Processing image finished in 1022.42ms.
2024-04-10 16:57:02,108 - INFO - Running model ...
2024-04-10 16:57:02,108 - INFO - Running model ...
2024-04-10 16:57:02,350 - INFO - Running model finished in 241.46ms.
2024-04-10 16:57:02,350 - INFO - Rendering ...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2024-04-10 16:57:06,627 - INFO - Rendering finished in 4276.49ms.
2024-04-10 16:57:06,628 - INFO - Exporting mesh ...
2024-04-10 16:57:08,424 - INFO - Exporting mesh finished in 1795.46ms.



[IDEA-2-3D]: i23d_res = /mnt/chenjh/Idea23D/output/case1-0410-llava-34b/draft/iter-4-1-0/mesh.obj

[IDEA-2-3D]: concatenated_image output_path=/mnt/chenjh/Idea23D/output/case1-0410-llava-34b/tmp/concatenated_image-20240410165708475508.png

[IDEA-2-3D]: prompt_rev = Optimize prompt [Prompt]The image features a light brown rabbit with large ears and expressive eyes, sitting upright and looking directly at the viewer. The rabbit's fur is fluffy, and its whiskers are prominent. In the background, there is a single doughnut with a glossy pink glaze and colorful sprinkles. The doughnut has a golden-brown color and a hole in the center. The background is a solid black, which contrasts with the doughnut and highlights its details. The style of the image is a realistic 3D rendering, with a focus on the doughnut to emphasize its textures and colors.[/Prompt] based on image content and details to better match user input [User Input]This brown rabbit [The image shows a close-up of a rabbit with a

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

2024-04-10 16:58:48,969 - INFO - Initializing model ...


png_path,= /mnt/chenjh/Idea23D/output/case1-0410-llava-34b/draft/iter-4-2-0/draft.png


2024-04-10 16:58:54,189 - INFO - Initializing model finished in 5219.43ms.
2024-04-10 16:58:54,189 - INFO - Processing image ...
2024-04-10 16:58:55,194 - INFO - Processing image finished in 1004.51ms.
2024-04-10 16:58:55,195 - INFO - Running model ...
2024-04-10 16:58:55,195 - INFO - Running model ...
2024-04-10 16:58:55,412 - INFO - Running model finished in 217.02ms.
2024-04-10 16:58:55,413 - INFO - Rendering ...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2024-04-10 16:58:59,663 - INFO - Rendering finished in 4250.50ms.
2024-04-10 16:58:59,664 - INFO - Exporting mesh ...
2024-04-10 16:59:01,533 - INFO - Exporting mesh finished in 1868.61ms.



[IDEA-2-3D]: i23d_res = /mnt/chenjh/Idea23D/output/case1-0410-llava-34b/draft/iter-4-2-0/mesh.obj

[IDEA-2-3D]: concatenated_image output_path=/mnt/chenjh/Idea23D/output/case1-0410-llava-34b/tmp/concatenated_image-20240410165901597019.png

[IDEA-2-3D]: concatenated_image output_path=/mnt/chenjh/Idea23D/output/case1-0410-llava-34b/tmp/concatenated_image-20240410165901671076.png

[IDEA-2-3D]: prompt_select = Each row of these images shows 6 views of a 3D model. Which row of images best meets the user input? [User Input]This brown rabbit [The image shows a close-up of a rabbit with a fluffy, light brown coat. The rabbit has large, pointed ears and expressive eyes that are looking directly at the viewer. Its whiskers are prominent, and it appears to be sitting upright with its front paws slightly apart. The background is a solid black, which contrasts with the rabbit's fur and highlights the animal. The image is a realistic photograph with a focus on the rabbit, and there are no visible t

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

2024-04-10 17:01:41,177 - INFO - Initializing model ...


png_path,= /mnt/chenjh/Idea23D/output/case1-0410-llava-34b/draft/iter-5-0-0/draft.png


2024-04-10 17:01:45,956 - INFO - Initializing model finished in 4778.49ms.
2024-04-10 17:01:45,956 - INFO - Processing image ...
2024-04-10 17:01:46,894 - INFO - Processing image finished in 937.84ms.
2024-04-10 17:01:46,895 - INFO - Running model ...
2024-04-10 17:01:46,895 - INFO - Running model ...
2024-04-10 17:01:47,132 - INFO - Running model finished in 236.85ms.
2024-04-10 17:01:47,133 - INFO - Rendering ...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2024-04-10 17:01:51,376 - INFO - Rendering finished in 4243.26ms.
2024-04-10 17:01:51,377 - INFO - Exporting mesh ...
2024-04-10 17:01:52,885 - INFO - Exporting mesh finished in 1507.87ms.



[IDEA-2-3D]: i23d_res = /mnt/chenjh/Idea23D/output/case1-0410-llava-34b/draft/iter-5-0-0/mesh.obj

[IDEA-2-3D]: concatenated_image output_path=/mnt/chenjh/Idea23D/output/case1-0410-llava-34b/tmp/concatenated_image-20240410170152936249.png

[IDEA-2-3D]: prompt_rev = Optimize prompt [Prompt]The image features a light brown rabbit with large ears and expressive eyes, sitting upright and looking directly at the viewer. The rabbit's fur is fluffy, and its whiskers are prominent. In the background, there is a single doughnut with a glossy pink glaze and colorful sprinkles. The doughnut has a golden-brown color and a hole in the center. The background is a solid black, which contrasts with the doughnut and highlights its details. The style of the image is a realistic 3D rendering, with a focus on the doughnut to emphasize its textures and colors.[/Prompt] based on image content and details to better match user input [User Input]This brown rabbit [The image shows a close-up of a rabbit with a

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

2024-04-10 17:03:33,430 - INFO - Initializing model ...


png_path,= /mnt/chenjh/Idea23D/output/case1-0410-llava-34b/draft/iter-5-1-0/draft.png


2024-04-10 17:03:38,371 - INFO - Initializing model finished in 4941.70ms.
2024-04-10 17:03:38,372 - INFO - Processing image ...
2024-04-10 17:03:39,314 - INFO - Processing image finished in 942.31ms.
2024-04-10 17:03:39,315 - INFO - Running model ...
2024-04-10 17:03:39,316 - INFO - Running model ...
2024-04-10 17:03:39,572 - INFO - Running model finished in 256.10ms.
2024-04-10 17:03:39,573 - INFO - Rendering ...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2024-04-10 17:03:43,786 - INFO - Rendering finished in 4213.08ms.
2024-04-10 17:03:43,787 - INFO - Exporting mesh ...
2024-04-10 17:03:45,666 - INFO - Exporting mesh finished in 1879.38ms.



[IDEA-2-3D]: i23d_res = /mnt/chenjh/Idea23D/output/case1-0410-llava-34b/draft/iter-5-1-0/mesh.obj

[IDEA-2-3D]: concatenated_image output_path=/mnt/chenjh/Idea23D/output/case1-0410-llava-34b/tmp/concatenated_image-20240410170345720188.png

[IDEA-2-3D]: prompt_rev = Optimize prompt [Prompt]The image features a light brown rabbit with large ears and expressive eyes, sitting upright and looking directly at the viewer. The rabbit's fur is fluffy, and its whiskers are prominent. In the background, there is a single doughnut with a glossy pink glaze and colorful sprinkles. The doughnut has a golden-brown color and a hole in the center. The background is a solid black, which contrasts with the doughnut and highlights its details. The style of the image is a realistic 3D rendering, with a focus on the doughnut to emphasize its textures and colors.[/Prompt] based on image content and details to better match user input [User Input]This brown rabbit [The image shows a close-up of a rabbit with a

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

2024-04-10 17:05:26,227 - INFO - Initializing model ...


png_path,= /mnt/chenjh/Idea23D/output/case1-0410-llava-34b/draft/iter-5-2-0/draft.png


2024-04-10 17:05:31,537 - INFO - Initializing model finished in 5309.68ms.
2024-04-10 17:05:31,538 - INFO - Processing image ...
2024-04-10 17:05:32,427 - INFO - Processing image finished in 889.47ms.
2024-04-10 17:05:32,428 - INFO - Running model ...
2024-04-10 17:05:32,428 - INFO - Running model ...
2024-04-10 17:05:32,639 - INFO - Running model finished in 210.33ms.
2024-04-10 17:05:32,639 - INFO - Rendering ...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2024-04-10 17:05:36,934 - INFO - Rendering finished in 4294.32ms.
2024-04-10 17:05:36,934 - INFO - Exporting mesh ...
2024-04-10 17:05:38,700 - INFO - Exporting mesh finished in 1765.23ms.



[IDEA-2-3D]: i23d_res = /mnt/chenjh/Idea23D/output/case1-0410-llava-34b/draft/iter-5-2-0/mesh.obj

[IDEA-2-3D]: concatenated_image output_path=/mnt/chenjh/Idea23D/output/case1-0410-llava-34b/tmp/concatenated_image-20240410170538750799.png

[IDEA-2-3D]: concatenated_image output_path=/mnt/chenjh/Idea23D/output/case1-0410-llava-34b/tmp/concatenated_image-20240410170538807882.png

[IDEA-2-3D]: prompt_select = Each row of these images shows 6 views of a 3D model. Which row of images best meets the user input? [User Input]This brown rabbit [The image shows a close-up of a rabbit with a fluffy, light brown coat. The rabbit has large, pointed ears and expressive eyes that are looking directly at the viewer. Its whiskers are prominent, and it appears to be sitting upright with its front paws slightly apart. The background is a solid black, which contrasts with the rabbit's fur and highlights the animal. The image is a realistic photograph with a focus on the rabbit, and there are no visible t