In [7]:
!python -m ensurepip --upgrade
# There're two versions of vicuna deltas v1.1 and v0. For minigpt-4 we should use v0 (only compatible up to 0.1.10) v1.1 would work but would produce useless outputs
!python -m pip install --quiet fschat==0.1.10 sentencepiece gdown gitpython
!python -m pip install --quiet omegaconf==2.3.0 iopath==0.1.10 timm==0.6.13 opencv-python-headless==4.7.0.72 decord==0.6.0 webdataset==0.2.48 accelerate==0.16.0 bitsandbytes==0.37.0

Looking in links: /tmp/tmpei8w5vz3
You should consider upgrading via the '/bin/python -m pip install --upgrade pip' command.[0m[33m
You should consider upgrading via the '/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

In [2]:
import shutil
import pathlib
import os
import gdown
import transformers
import gc

## Setting up Mini-GPT4

In [1]:
# WARNING: destructive. Recursively deletes every entry matching models* under the
# Hugging Face hub cache directory (presumably to force fresh downloads — confirm before running).
!rm -r /.cache/huggingface/hub/models*

In [None]:
#from transformers import LlamaTokenizer

# Root of the Hugging Face hub cache that the download/patch/apply-delta steps work on.
# The patch step reads this as `default_cache_dir`; the original binding was misspelled.
default_cache_dir = pathlib.Path("/.cache/huggingface/hub")
# Backward-compatible alias for the original (misspelled) name.
default_cache_ir = default_cache_dir

# Hub namespace and repository name of the LLaMA base weights.
llama_space = "decapoda-research"
llama_id = "llama-7b-hf"
# Hub namespace and repository name of the Vicuna v0 delta weights.
vicuna_space = "lmsys"
vicuna_id = "vicuna-7b-delta-v0"

def download_models():
    """Load the LLaMA base and Vicuna delta tokenizers and models once each.

    Both tokenizers are requested first, then each model is loaded and
    released straight away. Nothing is returned; the loaded objects are
    discarded.
    """
    repo_ids = (
        f"{llama_space}/{llama_id}",
        f"{vicuna_space}/{vicuna_id}",
    )

    for repo_id in repo_ids:
        transformers.LlamaTokenizer.from_pretrained(repo_id)

    for repo_id in repo_ids:
        # The loaded weights are not kept; collect them before the next load.
        transformers.AutoModelForCausalLM.from_pretrained(repo_id)
        gc.collect()
    

        
        
download_models()

In [9]:
import json

def patch_tokenizer_config(default_cache_dir, repos=None):
    """Force ``tokenizer_class`` to ``"LlamaTokenizer"`` in cached tokenizer configs.

    Magic fix introduced in
    https://github.com/huggingface/transformers/issues/22222#issuecomment-1477171703

    Args:
        default_cache_dir: root of the Hugging Face hub cache (str or Path).
        repos: optional iterable of ``(space, repo)`` pairs to patch. Defaults
            to the Vicuna and LLaMA repositories named by the notebook-level
            ``vicuna_*`` / ``llama_*`` variables.

    Every ``snapshots/*/tokenizer_config.json`` found under
    ``models--{space}--{repo}`` is inspected. A file is rewritten only when
    its ``tokenizer_class`` is missing or differs from ``"LlamaTokenizer"``.
    """
    if repos is None:
        repos = [(vicuna_space, vicuna_id), (llama_space, llama_id)]
    cache_root = pathlib.Path(default_cache_dir)

    for space, repo in repos:
        repo_dir = cache_root / f"models--{space}--{repo}"
        for path in repo_dir.rglob("snapshots/*/tokenizer_config.json"):
            print(f"Loading {path}")
            # Context managers make sure the handle is closed before re-opening for write.
            with open(path, "r") as f:
                config = json.load(f)
            if config.get("tokenizer_class") == "LlamaTokenizer":
                print("No fix needed")
                continue
            config["tokenizer_class"] = "LlamaTokenizer"
            with open(path, "w") as f:
                json.dump(config, f)
            print(f"Patched {path}")

patch_tokenizer_config(default_cache_dir)

Loading /.cache/huggingface/hub/models--lmsys--vicuna-7b-delta-v0/snapshots/f902a2f7e2ca5dfeedf40a0220320e50d2d4fa2a/tokenizer_config.json
No fix needed
Loading /.cache/huggingface/hub/models--decapoda-research--llama-7b-hf/snapshots/5f98eefcc80e437ef68d457ad7bf167c2c6a1348/tokenizer_config.json
No fix needed


In [10]:
# Vicuna weights are deltas that need to be applied on top of the LLaMA base weights
!python -m fastchat.model.apply_delta --base /.cache/huggingface/hub/models--decapoda-research--llama-7b-hf/snapshots/*/  --target ./vicuna-7b-v0  --delta /.cache/huggingface/hub/models--lmsys--vicuna-7b-delta-v0/snapshots/*/

Loading the base model from /.cache/huggingface/hub/models--decapoda-research--llama-7b-hf/snapshots/5f98eefcc80e437ef68d457ad7bf167c2c6a1348/
Loading checkpoint shards: 100%|████████████████| 33/33 [00:06<00:00,  4.99it/s]
Loading the delta from /.cache/huggingface/hub/models--lmsys--vicuna-7b-delta-v0/snapshots/f902a2f7e2ca5dfeedf40a0220320e50d2d4fa2a/
Loading checkpoint shards: 100%|██████████████████| 2/2 [00:04<00:00,  2.25s/it]
Applying the delta
Applying delta: 100%|█████████████████████████| 323/323 [00:03<00:00, 85.23it/s]
Saving the target model to ./vicuna-7b-v0


In [4]:
# Notes
# wangrongsheng/MiniGPT-4-LLaMA-7B seems much better than vicuna-7b-v1.1 (decapoda-research/llama-7b-hf + lmsys/vicuna-7b-delta-v1.1)


Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/543 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

In [38]:
# Download the pretrained MiniGPT-4 checkpoint from Google Drive into the working directory.
# The URL is a "view" share link rather than a direct download link; fuzzy=True is passed
# so gdown can resolve it (NOTE(review): confirm against the gdown documentation).
output_path = 'pretrained_minigpt4.pth'
gdown.download(
    "https://drive.google.com/file/d/1RY9jV0dyqLX-o38LrumkKRh6Jtaop58R/view?usp=sharing", output_path, fuzzy=True
)

Downloading...
From (uriginal): https://drive.google.com/uc?id=1RY9jV0dyqLX-o38LrumkKRh6Jtaop58R
From (redirected): https://drive.google.com/uc?id=1RY9jV0dyqLX-o38LrumkKRh6Jtaop58R&confirm=t&uuid=fab2f9e0-f31a-4e34-81c2-6697438c2530
To: /root/Autmn2023/pretrained_minigpt4.pth
100%|██████████| 37.9M/37.9M [00:00<00:00, 192MB/s]


'pretrained_minigpt4.pth'

In [34]:
!curl -LO https://github.com/Vision-CAIR/MiniGPT-4/archive/refs/heads/main.zip 

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 34.4M    0 34.4M    0     0  20.0M      0 --:--:--  0:00:01 --:--:-- 22.0M


In [36]:
import zipfile

# The rest of the notebook expects the sources at ./MiniGPT-4/MiniGPT-4-main (sys.path entry,
# eval config path, demo.py). Extracting into ./MiniGPT-4 produces that layout, assuming the
# archive's single top-level folder is "MiniGPT-4-main" (the usual GitHub branch-archive layout).
with zipfile.ZipFile("main.zip", 'r') as zip_ref:
    zip_ref.extractall("./MiniGPT-4")

## Running Mini-GPT4

In [1]:
import sys

# Make the extracted MiniGPT-4 sources importable. Membership is checked against the whole
# list (not just the last entry) so re-running the cell never appends a duplicate.
minigpt4_path = './MiniGPT-4/MiniGPT-4-main'
if minigpt4_path not in sys.path:
    sys.path.append(minigpt4_path)

In [2]:
import argparse 
from minigpt4.common.config import Config
from minigpt4.common.registry import registry

from minigpt4.datasets.builders import *
from minigpt4.models import *
from minigpt4.processors import *
from minigpt4.runners import *
from minigpt4.tasks import *

# Build the argument namespace in-notebook (the argument list is a hard-coded string)
# instead of reading the real command line.
parser = argparse.ArgumentParser(description="")
parser.add_argument('--cfg-path', help='')
parser.add_argument('--options', nargs="+",help='')
parser.add_argument('--gpu-id', default=0, help='')
args = parser.parse_args(" --cfg-path ./MiniGPT-4/MiniGPT-4-main/eval_configs/minigpt4_eval.yaml".split())

# Parse the evaluation config referenced by --cfg-path.
cfg = Config(args)

# Look up the model class registered under the configured architecture, build it from the
# config, and move it to the GPU selected by --gpu-id (default 0).
model_config = cfg.model_cfg
model_config.device_8bit = args.gpu_id
model_cls = registry.get_model_class(model_config.arch)
model = model_cls.from_config(model_config).to('cuda:{}'.format(args.gpu_id))

# Build the image preprocessor from the "train" vis_processor entry of the cc_sbu_align dataset config.
vis_processor_cfg = cfg.datasets_cfg.cc_sbu_align.vis_processor.train
vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Loading VIT


  0%|          | 0.00/1.89G [00:00<?, ?B/s]

Loading VIT Done
Loading Q-Former


  0%|          | 0.00/413M [00:00<?, ?B/s]

Loading Q-Former Done
Loading LLAMA

Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading LLAMA Done
Load 4 training prompts
Prompt Example 
###Human: <Img><ImageHere></Img> Could you describe the contents of this image for me? ###Assistant: 
Load BLIP2-LLM Checkpoint: ./pretrained_minigpt4.pth


In [3]:
import argparse
import time
from PIL import Image

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaTokenizer
from transformers import StoppingCriteria, StoppingCriteriaList
from minigpt4.conversation.conversation import *


class MiniGPT4Chat:
    """Minimal chat session around a loaded MiniGPT-4 model.

    Holds one conversation and the embeddings of the images uploaded into it.
    Typical flow: ``upload_img`` -> ``ask`` -> ``answer`` (repeat ask/answer
    for follow-ups) -> ``reset_history`` before starting over.
    """

    def __init__(self, model, vis_processor, device='cuda:0'):
        """Store the model and image processor and start an empty conversation.

        Args:
            model: object exposing ``encode_img``, ``llama_model`` and
                ``llama_tokenizer`` (these are the attributes used below).
            vis_processor: callable applied to a PIL image before encoding.
            device: device that stop-word ids, images and token ids are moved to.
        """
        self.device = device
        self.model = model
        self.vis_processor = vis_processor
        # '###' can be encoded in two different ways; stop on either.
        stop_words_ids = [torch.tensor([835]).to(self.device),
                          torch.tensor([2277, 29937]).to(self.device)]
        self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
        self.conv, self.img_list = None, None
        self.reset_history()

    def ask(self, text):
        """Add a human message to the conversation.

        If the last message is a human message ending in ``</Img>`` (an image
        placeholder), ``text`` is appended to it; otherwise a new human message
        is added.
        """
        if len(self.conv.messages) > 0 and self.conv.messages[-1][0] == self.conv.roles[0] \
                and self.conv.messages[-1][1][-6:] == '</Img>':  # last message is image.
            self.conv.messages[-1][1] = ' '.join([self.conv.messages[-1][1], text])
        else:
            self.conv.append_message(self.conv.roles[0], text)

    def answer(self, max_new_tokens=300, num_beams=1, min_length=1, top_p=0.9,
               repetition_penalty=1.0, length_penalty=1, temperature=1.0, max_length=2000):
        """Generate the assistant's reply to the conversation so far.

        When the context length plus ``max_new_tokens`` exceeds ``max_length``,
        a warning is printed and the oldest context embeddings are dropped.
        Sampling is used when ``num_beams == 1``; otherwise it is disabled.

        Returns:
            tuple: ``(output_text, output_tokens)`` - the cleaned reply (also
            stored as the last conversation message) and the generated token
            ids as a NumPy array.
        """
        self.conv.append_message(self.conv.roles[1], None)
        embs = self.get_context_emb(self.img_list)

        current_max_len = embs.shape[1] + max_new_tokens
        if current_max_len - max_length > 0:
            print('Warning: The number of tokens in current conversation exceeds the max length. '
                  'The model will not see the contexts outside the range.')
        begin_idx = max(0, current_max_len - max_length)

        embs = embs[:, begin_idx:]

        outputs = self.model.llama_model.generate(
            inputs_embeds=embs,
            max_new_tokens=max_new_tokens,
            stopping_criteria=self.stopping_criteria,
            num_beams=num_beams,
            do_sample=(num_beams == 1),
            min_length=min_length,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            length_penalty=length_penalty,
            temperature=temperature,
        )
        output_token = outputs[0]
        # The model might emit an unknown token <unk> (id 0) and/or a start token <s> (id 1)
        # at the beginning; remove them. The length checks keep an empty generation from
        # raising IndexError.
        if len(output_token) > 0 and output_token[0] == 0:
            output_token = output_token[1:]
        if len(output_token) > 0 and output_token[0] == 1:
            output_token = output_token[1:]
        # skip_special_tokens keeps markers such as </s> and <s> out of the decoded text;
        # the previous add_special_tokens=False argument is not a decode option.
        output_text = self.model.llama_tokenizer.decode(output_token, skip_special_tokens=True)
        output_text = output_text.split('###')[0]  # remove the stop sign '###'
        output_text = output_text.split('Assistant:')[-1].strip()
        self.conv.messages[-1][1] = output_text
        return output_text, output_token.cpu().numpy()

    def upload_img(self, image):
        """Encode an image and add its placeholder message to the conversation.

        Args:
            image: a file path (str), a PIL image, or a tensor. A 3-D tensor
                gets a leading batch dimension added.

        Returns:
            str: the fixed acknowledgement ``"Received."``.
        """
        # Imported locally under an alias so this method keeps working even if the
        # notebook-level name `Image` is rebound (e.g. by `from ipywidgets import Image`).
        from PIL import Image as PILImage

        if isinstance(image, str):  # is a image path
            raw_image = PILImage.open(image).convert('RGB')
            image = self.vis_processor(raw_image).unsqueeze(0).to(self.device)
        elif isinstance(image, PILImage.Image):
            raw_image = image
            image = self.vis_processor(raw_image).unsqueeze(0).to(self.device)
        elif isinstance(image, torch.Tensor):
            if len(image.shape) == 3:
                image = image.unsqueeze(0)
            image = image.to(self.device)

        image_emb, _ = self.model.encode_img(image)
        self.img_list.append(image_emb)
        self.conv.append_message(self.conv.roles[0], "<Img><ImageHere></Img>")
        msg = "Received."
        return msg

    def get_context_emb(self, img_list):
        """Build the input embeddings for the current prompt.

        The prompt is split on ``<ImageHere>``; each text segment is tokenized
        and embedded, the segments are interleaved with the image embeddings
        in ``img_list``, and everything is concatenated along dimension 1.

        Raises:
            AssertionError: if the number of placeholders and images differ.
        """
        prompt = self.conv.get_prompt()
        prompt_segs = prompt.split('<ImageHere>')
        assert len(prompt_segs) == len(img_list) + 1, "Unmatched numbers of image placeholders and images."
        seg_tokens = [
            self.model.llama_tokenizer(
                seg, return_tensors="pt", add_special_tokens=i == 0).to(self.device).input_ids
            # only add bos to the first seg
            for i, seg in enumerate(prompt_segs)
        ]
        seg_embs = [self.model.llama_model.model.embed_tokens(seg_t) for seg_t in seg_tokens]
        mixed_embs = [emb for pair in zip(seg_embs[:-1], img_list) for emb in pair] + [seg_embs[-1]]
        mixed_embs = torch.cat(mixed_embs, dim=1)
        return mixed_embs

    def reset_history(self):
        """Start a fresh conversation and forget all uploaded images."""
        self.conv = Conversation(
            system="Give the following image: <Img>ImageContent</Img>. "
                   "You will be able to see the image once I provide it to you. Please answer my questions.",
            roles=("Human", "Assistant"),
            messages=[],
            offset=2,
            sep_style=SeparatorStyle.SINGLE,
            sep="###",
        )
        self.img_list = []

In [8]:
# Sample design thumbnails, one JPEG per template ID.
thumbnail_paths = [
    f"./design_gpt4_data/thumbnails/{template_id}.jpg"
    for template_id in (
        "EAE2rRgKUhE",
        "EAE3o0b5yas",
        "EAE5qZX9tk4",
        "EAE8FTO3j5E",
        "EAE8tVORyhU",
    )
]


In [11]:
# The widget Image is imported under an alias: binding it to the bare name `Image`
# would shadow the PIL `Image` imported earlier in the notebook.
from ipywidgets import Layout, Box, VBox, GridspecLayout, HTML
from ipywidgets import Image as WidgetImage

# read_bytes() opens and closes each file, so no handles are left dangling.
images = [
    WidgetImage(value=pathlib.Path(path).read_bytes(), format="jpg", width=256, height=256)
    for path in thumbnail_paths
]
template_ids = [pathlib.Path(path).stem for path in thumbnail_paths]

# One column per thumbnail: a caption in the first row, the image in the remaining three.
# TODO: can height_ratios be used to control the height of rows?
grid = GridspecLayout(4, len(template_ids), height='300px')

for i, (img, tid) in enumerate(zip(images, template_ids)):
    grid[0, i] = HTML(value=f"Template ID: {tid}")
    grid[1:4, i] = img

display(grid)

GridspecLayout(children=(HTML(value='Template ID: EAE2rRgKUhE', layout=Layout(grid_area='widget001')), Image(v…

In [18]:
# Careful: importing the ipywidgets Image object under the name `Image` shadows this PIL import
from PIL import Image

# With all questions combined into a single prompt and a low temperature, the model did not reply "As an AI model I can't ...".
# Splitting the questions into a start prompt plus follow-ups does not work very well.
prompt_start = "This is an instagram post design. Explain what are the main objects, what is the theme, based on colors what kind of a vibe does this design emit?"
prompts_followup = [       
    "What is the color of the text and the color of the background it's on? Do those colors create a strong contrast?",
    "What is the style of the text, professional, comical or something else? Does the style of text match the them of the design?",
    "Based on your outputs so far, is this a visually striking design. Explain why. Use topics such as text background contrast, overall contrast, color palette.",
    "Pick and audience this design be suitable for (a) an average person, (b) enterprise user or (c) educational?",
    "Analyse if this design pedogogical, suited to a classroom or children? ",
    #"Write 10 diverse instagram hashtags would you used to describe the style of this design. Do not mention specific objects in the design?"
]

thumbnail_paths = [
    "./design_gpt4_data/thumbnails/EAE2rRgKUhE.jpg", 
    "./design_gpt4_data/thumbnails/EAE3o0b5yas.jpg", 
    "./design_gpt4_data/thumbnails/EAE5qZX9tk4.jpg", 
    "./design_gpt4_data/thumbnails/EAE8FTO3j5E.jpg", 
    "./design_gpt4_data/thumbnails/EAE8tVORyhU.jpg"
]

minigpt4 = MiniGPT4Chat(model, vis_processor)
temperature = 0.9
max_new_tokens = 200
top_p = 0.9

# Generation settings shared by every answer() call in this cell.
generation_kwargs = dict(
    num_beams=1,
    temperature=temperature,
    max_new_tokens=max_new_tokens,
    top_p=top_p,
)

gpt4_outputs = []
for path in thumbnail_paths:
    # One conversation per image: the opening question first, then the follow-ups
    # asked in the same conversation, which is reset before the next image.
    minigpt4.upload_img(path)
    minigpt4.ask(prompt_start)
    out_start, _ = minigpt4.answer(**generation_kwargs)
    print(path,":")
    print(out_start)
    full_output = [out_start]

    for prompt in prompts_followup:
        print("-"*50)
        minigpt4.ask(prompt)
        out_follow, _ = minigpt4.answer(**generation_kwargs)
        print(out_follow)
        full_output.append(out_follow)

    print("="*50)
    minigpt4.reset_history()
    gpt4_outputs.append(full_output)
    

./design_gpt4_data/thumbnails/EAE2rRgKUhE.jpg :
The image is a view of a lake with mountains in the background, with the words "find your balance" overlaid on top. The theme of the design is nature and tranquility. The colors are primarily green and blue, which give off a calming and peaceful vibe. The design emits a sense of serenity and encourages the viewer to take a moment to reflect on their own balance in life.
--------------------------------------------------
The text is white, and the background is blue. The contrast between the two colors is high, as the text is white and the background is a darker shade of blue. This high contrast creates a clear and defined visual hierarchy, making the text stand out and draw the viewer's attention to it. The contrast also creates a sense of contrast between the natural, calming image of the lake and the words "find your balance," which emphasizes the importance of finding balance in life.
--------------------------------------------------


In [25]:
import html

# The widget Image is imported under an alias: binding it to the bare name `Image`
# would shadow the PIL `Image` imported earlier in the notebook.
from ipywidgets import Layout, Box, VBox, GridspecLayout, HTML
from ipywidgets import Image as WidgetImage

# read_bytes() opens and closes each file, so no handles are left dangling.
images = [
    WidgetImage(value=pathlib.Path(path).read_bytes(), format="jpg", width=256, height=256)
    for path in thumbnail_paths
]
template_ids = [pathlib.Path(path).stem for path in thumbnail_paths]

# One row per thumbnail: image in the first column, question/answer transcript in the rest.
# TODO: can height_ratios be used to control the height of rows?
grid = GridspecLayout(len(template_ids), 4)
questions = [prompt_start] + prompts_followup

for i, (img, out) in enumerate(zip(images, gpt4_outputs)):
    grid[i, 0] = img
    # Escape prompts and model output so text such as "<s>" or "<Img>" is shown literally
    # instead of being interpreted as HTML markup.
    full_res = "".join(
        f"<b>{html.escape(q)}</b><br/>{html.escape(r)}<br/>"
        for q, r in zip(questions, out)
    )
    grid[i, 1:] = HTML(value=full_res)

display(grid)

GridspecLayout(children=(Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00\xf…

In [12]:
# With all questions combined into a single prompt and a low temperature, the model did not reply "As an AI model I can't ...".
# Splitting the questions into a start prompt plus follow-ups does not work very well.
prompts = [    
    #"Explain what are the main objects, what is the theme, based on colors what kind of a vibe does this design emit?",
    #"What is the color of the text and the color of the background it's on? Is the text well highlighted and easy to read?",
    #"Is the text light or dark? Is the background light or dark? Is the text well highlighted against the background and easy to read?",
    #"Is the text readable in this background?",
    #"Usually it's better to have highly contrastive text with background such as black and white or blue and yellow. Blue and green on the other hand is not contrastive. How does it apply here?"
    # <-- Below one works reasonably well --> 
    # "Text is more readable when it constrasts with background such as black and white or blue and yellow. Blue and green on the other hand is not contrastive or readable. Is the text readable here?",
    #"What is the style of the text, professional, comical or something else? Does the style of text match the them of the design or not?",
    """Having complementary text color and a background color, such as black and white or blue and orange makes text higly contrastive, readable and pleasing. 
    Having both the text and the background in low contrast, such as yellow text with a white background makes text difficult to read. 
    Using shades of the same color for text and background, such as a green background and a green shade text, can make a design appear dull and unreadable. 
    Question 1: What the text color?
    Question 2: What is the background color?
    Question 3: Is the text background combination high-contrast or low-contrast in this image? 
    Answer:""",
    """Text is more readable and considered a good design when there's strong contrast: e.g. white text on a black background, red text on a yellow background, light blue text on a dark purple background, dark green text on a light pink background, Black text on a bright green background
    Text is unreadable and considered dull and poor design when it is low contrast: e.g. Light gray text on a white background, Pale yellow text on a light green background, Dark blue text on a black background, Dark purple text on a dark green background, Dark gray text on a light blue background
    Question 1: What is the text color and background color?
    Question 2: Is the text background combination high-contrast or low-contrast in this image? 
    Answer:
    """,
    #"What is the audience this design be suitable for (a) an average person or (b) enterprise user or (c) educational?",
    #"Analyse if this design pedogogical, suited to a classroom or children or not? ",
    #"""In a classroom setting, an effective educational visual design should be large enough to be visible to all students 
    #and be easily understood without requiring extensive explanation. 
    #It should also use a color scheme that is visually appealing and consistent with the subject matter being taught. 
    #Question 1: Does this design have a school related theme? 
    #Question 2: Is this design pedogogical, suited to a classroom or children or not?
    #Answer: """,
    #"What are 10 diverse instagram hashtags would you used to describe the style of this design. Do not mention specific objects in the design?"
]

thumbnail_paths = [
    "sample_thumbnails/EAE2rRgKUhE.jpg", 
    "sample_thumbnails/EAE3o0b5yas.jpg", 
    "sample_thumbnails/EAE5qZX9tk4.jpg", 
    "sample_thumbnails/EAE8FTO3j5E.jpg", 
    "sample_thumbnails/EAE8tVORyhU.jpg",
    "sample_thumbnails/EAEi-Hy31Ws.jpg",
    "sample_thumbnails/EAEs3-Md18A.jpg",
    "sample_thumbnails/EAEyG9tZVc0.jpg",
    "sample_thumbnails/EAFBw-ZDwJU.jpg",
]

minigpt4 = MiniGPT4Chat(model, vis_processor)
temperature = 0.95
max_new_tokens = 100
top_p = 0.9
length_penalty = 0.5

gpt4_outputs = []
for path in thumbnail_paths:
    print(path,":")

    # Each prompt gets its own conversation: upload the image, ask, answer, reset.
    answers = []
    for prompt in prompts:
        print("-"*50)
        minigpt4.upload_img(path)
        minigpt4.ask(prompt)
        out, _ = minigpt4.answer(
            num_beams=1,
            temperature=temperature,
            max_new_tokens=max_new_tokens,
            top_p=top_p,
            length_penalty=length_penalty,
        )
        print(out)
        answers.append(out)
        minigpt4.reset_history()

    print("="*50)

    # Every answer is prefixed with a single space, as in a running " " + out concatenation.
    full_output = "".join(" " + answer for answer in answers)
    gpt4_outputs.append(full_output)
    

sample_thumbnails/EAE2rRgKUhE.jpg :
--------------------------------------------------
The text color is white and the background color is a light blue.
--------------------------------------------------
The text color is black and the background color is dark green. The text is low contrast in this image.
```
sample_thumbnails/EAE3o0b5yas.jpg :
--------------------------------------------------
The text color is yellow and the background color is orange. The text and background color combination is high contrast.
    Therefore, the text is highly visible and readable in this image.</s><s>
--------------------------------------------------
The text color is white and the background color is pink. The text is high-contrast and well-designed in this image.</s><s>
sample_thumbnails/EAE5qZX9tk4.jpg :
--------------------------------------------------
The text color is white and the background color is beige. The text and background combination is high-contrast, making the text easy to read

In [62]:
import ipywidgets as widgets
# Import the function itself: `import IPython.display as display` would rebind `display`
# to a module and break the plain `display(...)` calls used by other cells.
from IPython.display import display

# Read the images as raw bytes; the context managers close the files right after reading.
with open('image1.jpeg', 'rb') as f:
    img1 = f.read()
with open('image2.jpeg', 'rb') as f:
    img2 = f.read()

# Create image widgets (ipywidgets layouts only work with widgets).
# The declared format matches the JPEG files being loaded.
wi1 = widgets.Image(value=img1, format='jpeg', width=300, height=400)
wi2 = widgets.Image(value=img2, format='jpeg', width=300, height=400)

# Place the two images side by side with an HBox and show the result.
sidebyside = widgets.HBox([wi1, wi2])
display(sidebyside)

Conversation(system='Give the following image: <Img>ImageContent</Img>. You will be able to see the image once I provide it to you. Please answer my questions.', roles=('Human', 'Assistant'), messages=[['Human', '<Img><ImageHere></Img>']], offset=2, sep_style=<SeparatorStyle.SINGLE: 1>, sep='###', sep2=None, skip_next=False, conv_id=None)


In [1]:
# Run the MiniGPT-4 demo script as a shell command, pointing it at the
# evaluation config file and selecting GPU id 0.
!python MiniGPT-4/MiniGPT-4-main/demo.py --cfg-path MiniGPT-4/MiniGPT-4-main/eval_configs/minigpt4_eval.yaml  --gpu-id 0

Initializing Chat
Loading VIT
Loading VIT Done
Loading Q-Former
Loading Q-Former Done
Loading LLAMA

Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
Loading checkpoint shards: 100%|██████████████████| 2/2 [00:18<00:00,  9.21s/it]
Loading LLAMA Done
Load 4 training prompts
Prompt Example 
###Human: <Img><ImageHere></Img> Could you describe the contents of this image for me? ###Assistant: 
Load BLIP2-LLM Checkpoint: ./pretrained_minigpt4.pth
Initialization Finished
Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://04f234d5480077b379.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces
^C
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://04f234d5480077b379.gradio.live


In [None]:
# Output obtained with the directly downloaded Vicuna weights

./design_gpt4_data/thumbnails/EAE2rRgKUhE.jpg :
The image shows a mountain lake with a rocky terrain in the foreground and snow-covered mountains in the background. The theme is nature, with a focus on the serenity and tranquility of the scene. The colors are mostly shades of green, blue, and grey, giving a calm and peaceful vibe to the design.
--------------------------------------------------
The text is white and the background is a dark blue. The contrast between the text and the background is strong, making the text stand out and creating a clear visual hierarchy.
--------------------------------------------------
The style of the text is professional and informative, which matches the theme of the image which is a serene and peaceful mountain lake. The text is not comical.
--------------------------------------------------
Based on the outputs so far, the design is visually striking due to the contrast between the white text and the dark blue background. This creates a clear visual hierarchy and draws the eye to the text. Additionally, the mountain lake in the background adds depth and interest to the design. The color palette consists mainly of shades of green, blue, and grey, which creates a calm and peaceful atmosphere. Overall, the design is visually striking due to the contrast, visual hierarchy, and serene atmosphere created by the colors and image.
--------------------------------------------------
This design would be suitable for an enterprise user, as it has a professional and informative style and the content would be relevant to a business or organization. The visual hierarchy created by the contrast between the text and background, as well as the clean and simple design, would make the information easy to understand and digest for an enterprise audience. It could also be suitable for an educational audience, as the serene atmosphere and informative content would make it suitable for teaching or training materials. However, the design would not be as effective for an average person, as it may be too formal and professional for everyday use.
--------------------------------------------------
The design is not particularly pedagogical in nature and would not be suitable for a classroom or for children. It does not have any specific educational content or features, such as diagrams or interactive elements, that would make it suited for a classroom setting. Additionally, the serene atmosphere and professional style may not be engaging for children. Overall, the design is not pedagogical and would not be suitable for a classroom or children.
==================================================
./design_gpt4_data/thumbnails/EAE3o0b5yas.jpg :
This is a recipe for pancakes with strawberries and whipped cream. The image shows a plate with pancakes stacked on top of each other, with strawberries and whipped cream on top. The colors used are pink, red, and white. The overall vibe of this design is sweet and decadent, perfect for a breakfast or brunch dish.
--------------------------------------------------
The text is white and the background is pink. The contrast between the two colors is strong. The white text stands out against the pink background, making it easy to read and understand.
--------------------------------------------------
The style of the text is professional. The theme of the design is a recipe for pancakes with strawberries and whipped cream. The style of the text, being professional, matches the theme of the design.
--------------------------------------------------
Based on my outputs so far, the design is visually striking due to the contrast between the text and the background. The white text on a pink background creates a high level of contrast, making it easy to read and understand. Additionally, the theme of the design, which is a recipe for pancakes, pairs well with the visual representation of the pancakes. Overall, the design is aesthetically pleasing and effectively conveys the information it is meant to.
--------------------------------------------------
This design would be suitable for an average person, as it is a simple and easy to understand recipe for pancakes. The language used is clear and the visual representation of the pancakes makes it easy to visualize the end result. It could also be suitable for an enterprise user, as the design is professional in appearance and the information it conveys is relevant to the theme of the pancakes. It would not be suitable for an educational audience as it is not specifically designed for learning or teaching purposes.
--------------------------------------------------
Based on the design, it is not specifically designed for a classroom or children. The language used is simple and easy to understand, but the visual representation of the pancakes may not be relatable to children. The theme of the recipe, which is pancakes, may not be particularly engaging for children. It would not be suitable for a classroom or for teaching children.
==================================================
./design_gpt4_data/thumbnails/EAE5qZX9tk4.jpg :
The image is a woman wearing a white towel and smiling. The theme of the image is beauty and skincare. The colors used in the design are white, beige and brown. The design emits a relaxing and spa-like vibe.
--------------------------------------------------
The text in the image is white and the background is beige. The contrast between the colors is not very strong.
--------------------------------------------------
The style of the text is professional. The text "Beauty tips" is written in sans-serif font and appears to be in a clean and organized format, which aligns with the overall professional theme of the design. The text does match the theme of the design, which is beauty and skincare.
--------------------------------------------------
Based on my outputs so far, this design is visually striking. The contrast between the white text and beige background is not very strong, but it is still noticeable. The color palette consists of white, beige, and brown, which creates a warm and inviting feel. The overall contrast is good, which makes the text stand out from the background. The use of a professional font and clean format also adds to the overall professional look of the design. All these elements contribute to the visual appeal of the design.
--------------------------------------------------
The design would be suitable for an average person. The information being presented is easy to understand and the use of white text on a beige background makes it easy to read. The overall style of the design is professional and clean, which makes it accessible to most people. It could also be suitable for enterprise users, but the style may not be as appealing to them as it would be for an average person. The design may not be suitable for educational purposes, as it does not convey any specific information related to education or learning.
--------------------------------------------------
This design is not pedagogical, suited to a classroom, or child-friendly. The style of the design is more geared towards professionalism, and the content being presented is geared towards beauty and skincare for adults. The use of beige background may not be suitable for a classroom setting, as it may not be visually appealing to children. Additionally, the font used in the design is not child-friendly, making it difficult for children to read. Therefore, this design is not suitable for a classroom or children.
==================================================
./design_gpt4_data/thumbnails/EAE8FTO3j5E.jpg :
The image is a promotional image for a restaurant. The main objects are the two people, one is pointing at something on the menu and the other is holding a plate with food on it. The theme is food and dining. The colors are green and white, which gives a fresh and clean look to the image. The vibe the design emits is one of excitement and enjoyment.
--------------------------------------------------
The text is white and the background is green. Yes, the contrast is strong. The use of white text on a green background creates a clear visual separation between the two, making the text easy to read and stand out. This contrast also helps to draw the attention of the viewer to the text.
--------------------------------------------------
The style of the text is professional. The text is written in a clean and clear font, which gives an impression of professionalism. The text matches the theme of the design, which is a restaurant, as it is promoting the restaurant and its menu items. The use of a professional tone in the text helps to convey trust and reliability, which is important in the food industry.
--------------------------------------------------
Based on the outputs so far, this is a visually striking design. The contrast between the green background and the white text creates a clear visual separation, making the text easy to read and stand out. The use of the green background gives the design a fresh and natural look, while the white text gives it a professional and clean look. The color palette of green and white creates a harmonious and balanced design. The overall contrast creates a vivid and attention-grabbing design, which makes the restaurant's menu items more appealing and enticing. The design effectively communicates the theme of the restaurant, which is dining, and creates a sense of excitement and enjoyment.
--------------------------------------------------
This design would be suitable for (c) an educational audience. The design is simple and easy to read, making it suitable for an educational setting where the audience may not have much prior knowledge of the restaurant or its menu items. The use of a professional tone in the text helps to convey trust and reliability, which is important in an educational setting where the audience may be looking for reliable information. The overall design is visually striking and attention-grabbing, which helps to engage the audience and make learning more enjoyable. The color palette is also appropriate for an educational setting, as green is often associated with learning and growth.
--------------------------------------------------
This design is suitable for a classroom or children. The design is simple and easy to read, making it suitable for a classroom setting where children may not have much prior knowledge of the restaurant or its menu items. The use of a professional tone in the text helps to convey trust and reliability, which is important in a classroom setting where children may be looking for reliable information. The overall design is visually striking and attention-grabbing, which helps to engage children and make learning more enjoyable. The color palette is also appropriate for a classroom or children's setting, as green is often associated with learning and growth. The design is not pedagogical in nature, but it can be used to teach children about different types of food and menu items.
==================================================
./design_gpt4_data/thumbnails/EAE8tVORyhU.jpg :
This is an Instagram post design that highlights the benefits of using skincare products. The main objects are the words "Beauty Tips without Surgery" in a bold and bold font. The theme of the design is skincare, and the color scheme is pink and white. The design emits a relaxed and natural vibe.
--------------------------------------------------
The text color is pink and the background color is white. The colors create a strong contrast, which is effective in making the text stand out and draw attention. The use of contrast can help make the message more impactful and readable.
--------------------------------------------------
The style of the text is professional. The text is written in a clear and easy-to-read font, which is appropriate for a professional design. The theme of the design is skincare, which is consistent with the professional tone of the text. The design is aimed at conveying information about skincare products in a straightforward manner, so the professional style of the text is appropriate.
--------------------------------------------------