## load datasets

In [1]:
# Step the kernel's working directory up one level; presumably so the
# `minigpt4` package and the relative data paths below resolve — confirm.
# NOTE: the path is relative, so re-running this cell moves up one more level.
%cd ..

/home/mas-xie.haojie/codes_gzp/MiniGPT-4


In [2]:
from minigpt4.common.config import Config

from minigpt4.datasets.builders import *
from minigpt4.models import *
from minigpt4.processors import *
from minigpt4.runners import *
from minigpt4.tasks import *
from minigpt4.common.registry import registry

  from .autonotebook import tqdm as notebook_tqdm



Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues


In [3]:
# Hand-written, minimal config for a single dataset, used only to exercise the
# dataset builder. Only the train-split processors are declared.
# NOTE(review): "image_size" under text_processor looks copied from the
# vis_processor entry — confirm the caption processor actually reads it.
dataset_config = dict(
    personality_captions=dict(
        vis_processor=dict(
            train=dict(name="blip2_image_train", image_size=224),
        ),
        text_processor=dict(
            train=dict(name="blip_caption", image_size=224),
        ),
    ),
)

# Wrap the entry under the top-level "datasets" key and let Config turn it
# into the dataset config object consumed by the cells below.
ds_config = Config.build_dataset_config(dict(datasets=dataset_config))

In [4]:
def build_dataset(datasets_config):
    """Build every dataset named in ``datasets_config``.

    For each dataset name, the builder class registered under that name is
    fetched from ``registry``, instantiated with that dataset's config, and
    asked to build its splits. The ``"train"`` split is then tagged with the
    dataset name and, when the config provides one, its ``sample_ratio``.

    Args:
        datasets_config: Mapping from dataset name to that dataset's config.
            Only ``in`` and ``[]`` access are used on the configs, so plain
            ``dict`` objects work as well as attribute-style config objects.

    Returns:
        dict: ``{name: splits}``, where ``splits`` is whatever
        ``builder.build_datasets()`` returned for that dataset.

    Raises:
        Whatever ``splits["train"]`` raises when a built dataset has no
        train split (``KeyError`` for a plain ``dict``).
    """
    datasets = {}

    for name in datasets_config:
        dataset_config = datasets_config[name]

        builder = registry.get_builder_class(name)(dataset_config)
        dataset = builder.build_datasets()

        train_split = dataset["train"]
        train_split.name = name
        if "sample_ratio" in dataset_config:
            # Subscript access keeps this consistent with the `in` check above
            # and avoids an AttributeError when the config is a plain dict.
            train_split.sample_ratio = dataset_config["sample_ratio"]

        datasets[name] = dataset

    return datasets

In [5]:
# Display the config object returned by Config.build_dataset_config.
ds_config

{'datasets': {'personality_captions': {'data_type': 'images', 'build_info': {'storage': '/148Dataset/data-gao.zhenpeng/PCap/'}, 'vis_processor': {'train': {'name': 'blip2_image_train', 'image_size': 224}}, 'text_processor': {'train': {'name': 'blip_caption', 'image_size': 224}}}}}

In [6]:
# Build every dataset listed under the "datasets" key; the result maps each
# dataset name to whatever its builder's build_datasets() returned.
ds=build_dataset(ds_config["datasets"])

In [7]:
# Pick the train split of the personality_captions dataset.
pcap=ds["personality_captions"]["train"]

# Display the first sample as a sanity check.
pcap[0]

{'image': tensor([[[0.8355, 0.8501, 0.8501,  ..., 0.8355, 0.8355, 0.8355],
          [0.8355, 0.8501, 0.8501,  ..., 0.8501, 0.8355, 0.8501],
          [0.8355, 0.8501, 0.8501,  ..., 0.8501, 0.8355, 0.8355],
          ...,
          [1.0690, 1.0690, 1.0690,  ..., 0.9522, 0.9668, 0.9960],
          [1.0544, 1.0544, 1.0544,  ..., 1.0544, 1.0398, 1.0398],
          [1.0836, 1.0690, 1.0690,  ..., 1.0544, 1.0544, 1.0398]],
 
         [[1.0243, 1.0393, 1.0393,  ..., 1.0243, 1.0243, 1.0093],
          [1.0243, 1.0393, 1.0393,  ..., 1.0393, 1.0243, 1.0243],
          [1.0243, 1.0393, 1.0393,  ..., 1.0393, 1.0243, 1.0243],
          ...,
          [1.2645, 1.2645, 1.2645,  ..., 1.1744, 1.1894, 1.1894],
          [1.2495, 1.2495, 1.2495,  ..., 1.2495, 1.2344, 1.2344],
          [1.2795, 1.2645, 1.2645,  ..., 1.2495, 1.2495, 1.2344]],
 
         [[1.2074, 1.2216, 1.2216,  ..., 1.2074, 1.1932, 1.2074],
          [1.2074, 1.2216, 1.2216,  ..., 1.2074, 1.1932, 1.2074],
          [1.2074, 1.2216, 1.22

In [8]:
from torch.utils.data import DataLoader

# Wrap the train split in a DataLoader that yields 3 samples per batch;
# every other DataLoader option is left at its default.
dl=DataLoader(pcap, batch_size=3)

In [9]:
# Print only the first batch to check that collation works, then stop.
# If the loader yields nothing, nothing is printed.
for item in dl:
    print(item)
    break

{'image': tensor([[[[ 0.8647,  0.8647,  0.8647,  ...,  0.7333,  0.7479,  0.7333],
          [ 0.8647,  0.8792,  0.8792,  ...,  0.7479,  0.7479,  0.7479],
          [ 0.8647,  0.8647,  0.8792,  ...,  0.7625,  0.7479,  0.7625],
          ...,
          [ 1.0398,  1.0398,  1.0398,  ...,  1.0544,  1.0544,  1.0544],
          [ 1.0252,  1.0106,  1.0106,  ...,  1.0398,  1.0398,  1.0398],
          [ 1.0398,  1.0252,  1.0252,  ...,  1.0544,  1.0544,  1.0544]],

         [[ 1.0544,  1.0544,  1.0544,  ...,  0.9643,  0.9793,  0.9643],
          [ 1.0544,  1.0694,  1.0694,  ...,  0.9793,  0.9793,  0.9793],
          [ 1.0544,  1.0544,  1.0694,  ...,  0.9943,  0.9793,  0.9943],
          ...,
          [ 1.2344,  1.2344,  1.2344,  ...,  1.2495,  1.2495,  1.2495],
          [ 1.2194,  1.2194,  1.2194,  ...,  1.2344,  1.2344,  1.2344],
          [ 1.2344,  1.2194,  1.2194,  ...,  1.2495,  1.2495,  1.2495]],

         [[ 1.2358,  1.2358,  1.2358,  ...,  1.1363,  1.1505,  1.1363],
          [ 1.2358, 

## test eval dataset

In [14]:
from datasets import load_dataset


# Despite its name, `eval_ds` is the path string of the evaluation JSON file,
# not a dataset object.
eval_ds = "../dataset/PCap/personality_captions/test.json"
# NOTE: this rebinds `ds`, which earlier held the dict returned by build_dataset.
# split="train" is requested even though the file is test data — presumably
# because the JSON loader files unnamed data_files under "train"; TODO confirm.
ds = load_dataset("json", data_files=eval_ds, split="train")

In [15]:
import json
# Read the inference records as UTF-8 explicitly, so decoding does not depend
# on the platform's default encoding.
# Assumes the file holds a JSON list of dicts — TODO confirm.
with open("inference_src.json", "r", encoding="utf-8") as f:
    examples = json.load(f)

# Pivot the list of records into a dict of columns: {key: [value per record]}.
# Column names are taken from the first record; an empty list yields an empty
# dict instead of raising IndexError. A later record that lacks one of those
# keys still raises KeyError.
keys = list(examples[0].keys()) if examples else []
examples = {k: [x[k] for x in examples] for k in keys}

In [16]:
# Display the column-wise dict built from the inference records.
examples

{'personality': ['Stupid',
  'Opinionated',
  'Deep',
  'Mystical',
  'Grim',
  'Calm',
  'Lazy',
  'Creative',
  'Absentminded',
  'Nihilistic',
  'Destructive',
  'Practical',
  'Assertive',
  'Practical',
  'Aggressive',
  'Cold',
  'Exciting',
  'Frivolous (Trivial, Silly)',
  'Open'],
 'image_hash': ['15cdd44cf6d73b8f0b64352372a91c1',
  '2ed2094a52eb3579f4ef7c5e501db5',
  'f07589c35e378e3043fd5f642513bb64',
  'ac75942b5d6261354442cf502a2bb7dd',
  '145c579124825a6fb82c5eee26d7a3ab',
  'd9a74f6bc2d3cb0a517f1c6bd6dfdeb',
  '837325523e1bd313536c18e63f3c252',
  'f67bbbd0ffe146e76caad64022548aba',
  'd039ee869ab8618bba1b78eb2e44bf3',
  '13d6c33b85da7749aa62d966cc91d7',
  '17bb5c2fddbd6ffcd4d35d43755cadd',
  '17bb5c2fddbd6ffcd4d35d43755cadd',
  'f08b5d8261d8363bba5ca5b0843037bf',
  '6fc987ebf4fdea174621c361bd827fc6',
  '594e9b352cf52b193dd824c9caa16a7',
  'f2ed11c64f4cda6fd3fa35487f99d412',
  'dfcc3c611e334a37e8b2f3475b4a946',
  '2641f7424531f8bdbf241bc9c9a631f',
  '1ceb484e59e9fa5697178

## test tokenizer

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Load the tokenizer from a local checkpoint directory; the path is relative,
# so it depends on the kernel's current working directory.
tokenizer = AutoTokenizer.from_pretrained("../../minigpt4_misc/vicuna-7b/")

In [4]:
# Encode a single character with special tokens enabled to see which ids the
# tokenizer adds (the leading id in the output is presumably BOS — confirm).
tokenizer.encode("a", add_special_tokens=True)

[1, 263]