In [1]:
import json, os, sys, random, re, math
sys.path.append("../scripts/formalism")
from entropy import *
from formalism_utils import *
from collections import Counter, defaultdict
import numpy as np
import pandas as pd
from tqdm import tqdm, trange
from pprint import pprint
from textblob import Word
import matplotlib.pyplot as plt
from PIL import Image
from plottable import Table, ColumnDefinition


## Rewrite Dataset to Convenient Format

In [2]:
# `version` selects which What'sUp subsets are concatenated into `annotations`:
# an "a" in the string loads controlled_images, a "b" loads controlled_clevr.
version = "b"
annotations = []
for flag, dataset_json in (
    ("a", "../whatsup_vlms/data/controlled_images_dataset.json"),
    ("b", "../whatsup_vlms/data/controlled_clevr_dataset.json"),
):
    if flag in version:
        # Context manager so the file handle is closed deterministically.
        with open(dataset_json, "r") as fin:
            annotations.extend(json.load(fin))

print(len(annotations))

408


In [3]:
# Peek at the first record to see the raw schema (an image path plus caption options).
annotations[0]

{'image_path': 'data/controlled_clevr/mug_right_of_knife.jpeg',
 'caption_options': ['A mug to the right of a knife',
  'A mug in front of a knife',
  'A mug behind a knife',
  'A mug to the left of a knife']}

In [7]:
# Gather every object name that appears as the first or last "_"-separated
# token of an image file stem (text before the first "." of the basename).
s = set()
for a in annotations:
    basename = a["image_path"].split("/")[-1]
    tokens = basename.split(".")[0].split("_")
    s.update((tokens[0], tokens[-1]))
print("#unique objects from full What'sUp: ", len(s))

#unique objects from full What'sUp:  18


In [None]:
### write to json with the same format as "vgr_nocaps_tb_both_complete.json"
# Each output row is [caption, image path under "whatsup_vlms/", (subj, obj, relation)].
J = []
SUBJ, OBJ = [], []
for a in tqdm(annotations):
    image_path = a['image_path']
    # Strip the leading "data/" only when it is actually there, instead of
    # blindly dropping the first five characters.
    filename = image_path[len("data/"):] if image_path.startswith("data/") else image_path
    # File stems look like "<subj>_<relation words>_<obj>"; splitext handles any
    # extension length rather than assuming a five-character ".jpeg".
    tmp = os.path.splitext(image_path.split("/")[-1])[0].split("_")
    subj, obj = tmp[0], tmp[-1]
    SUBJ.append(subj)
    OBJ.append(obj)
    r = " ".join(tmp[1:-1])
    J.append([
        a['caption_options'][0],
        "whatsup_vlms/" + filename,
        (subj, obj, r)
    ])
print(len(J))
print("#unique subj = ", len(set(SUBJ)))
print("#unique obj = ", len(set(OBJ)))
print("#unique concepts = ", len(set(SUBJ).union(set(OBJ))))
print("#unique train_triplets = ", len(set([a[-1] for a in J])))
# Context manager guarantees the JSON is flushed and the handle closed.
with open(f"../data/aggregated/whatsup_vlm_{version}.json", "w") as fout:
    json.dump(J, fout, indent=4)


In [13]:
# Maps each spatial relation to its converse; built from the two converse
# pairs so that both directions are always present.
SYMMETRIC_REL = {
    relation: converse
    for first, second in (("left of", "right of"), ("in-front of", "behind"))
    for relation, converse in ((first, second), (second, first))
}

In [14]:
### write to json with the same format as "vgr_nocaps_tb_both_complete.json"
### toggle autofill --- only works for version b
J = []
SUBJ, OBJ = [], []
assert version == "b"
autofill_symmetric_rel = True
skip_nouns = ["sunglasses", "remote", "phone"] # None #
rel_version = "lr"

# The output file name records which options were active.
suffix = "_autofill" if autofill_symmetric_rel else ""
if skip_nouns is not None: suffix += "_remove_" + "_".join([x[:3] for x in skip_nouns])
for a in tqdm(annotations):
    image_path = a['image_path']
    # Strip the leading "data/" only when present (was a blind [5:] slice).
    filename = image_path[len("data/"):] if image_path.startswith("data/") else image_path
    # Stems look like "<subj>_<relation words>_<obj>"; splitext copes with any
    # extension length (was a hard-coded [:-5] for ".jpeg").
    tmp = os.path.splitext(image_path.split("/")[-1])[0].split("_")
    subj, obj = tmp[0], tmp[-1]
    if skip_nouns is not None and (subj in skip_nouns or obj in skip_nouns): continue
    # NOTE(review): SUBJ/OBJ are recorded before the relation filter below and
    # are not extended by autofill, so the printed subj/obj counts describe the
    # noun-filtered source rows, not exactly the rows written out — confirm intent.
    SUBJ.append(subj)
    OBJ.append(obj)
    r = " ".join(tmp[1:-1])
    if rel_version == "lr" and r in ["in-front of", "behind"]: continue
    if rel_version == "fb" and r in ['left of', "right of"]: continue
    J.append([
        a['caption_options'][0],
        "whatsup_vlms/" + filename,
        (subj, obj, r)
    ])

if autofill_symmetric_rel:
    # For every (subj, obj, r) add the converse (obj, subj, r') for the same image.
    autofill = []
    for caption, image_file, (subj, obj, r) in J:
        converse = SYMMETRIC_REL[r]
        # Relation keys come from file names ("in-front of") while captions are
        # written without the hyphen ("in front of"), so try both spellings;
        # a plain replace on the key alone silently leaves such captions unflipped.
        for surface in (r, r.replace("-", " ")):
            if surface in caption:
                change_r = caption.replace(surface, converse.replace("-", " "))
                break
        else:
            raise ValueError(f"relation {r!r} not found in caption {caption!r}")
        # Captions have the shape "A <subj> ... a <obj>": swap the two nouns.
        tmp = change_r.split()
        tmp[1] = obj
        tmp[-1] = subj
        autofill.append([
            " ".join(tmp),
            image_file,
            (obj, subj, converse)
        ])

    print(f"\nautofill {len(autofill)} tuples\n")
    J.extend(autofill)

print(len(J))
print("#unique subj = ", len(set(SUBJ)))
print("#unique obj = ", len(set(OBJ)))
print("#unique concepts = ", len(set(SUBJ).union(set(OBJ))))
print("#unique train_triplets = ", len(set([a[-1] for a in J])))
# Context manager guarantees the JSON is flushed and the handle closed.
with open(f"../data/aggregated/whatsup_vlm_{version}_{rel_version}{suffix}.json", "w") as fout:
    json.dump(J, fout, indent=4)


100%|██████████| 408/408 [00:00<00:00, 708896.45it/s]


autofill 154 tuples

308
#unique subj =  7
#unique obj =  14
#unique concepts =  15
#unique train_triplets =  308





## Compute Metrics

In [4]:
# Metric call expressions, stored as strings and evaluated with eval() by the
# metrics cell below. Each expression expects `num_nouns`, `df` (and, for some,
# `num_relations`) to exist in the notebook namespace when it is evaluated.
# Commented-out entries are alternative metrics that are currently disabled;
# the functions are presumably provided by `from entropy import *` — verify there.
entropy_funcs = [
    #"concept_centric_entropy3(num_nouns, num_relations, df)",
    #"concept_centric_entropy4(num_nouns, num_relations, df)",
    #"concept_centric_entropy5(num_nouns, num_relations, df)",
    #"relation_centric_entropy3(num_nouns, num_relations, df)",
    #"relation_centric_entropy4(num_nouns, num_relations, df)",
    #"relation_centric_entropy5(num_nouns, num_relations, df)",
    #"divergence(num_nouns, num_relations, df)",
    #"divergence2(num_nouns, num_relations, df)",
    #"divergence3(num_nouns, num_relations, df)",
    #"concept_role_entropy(num_nouns, df)",
    #"concept_role_entropy2(num_nouns, df)",
    #"role_association(num_nouns, df)",
    #"role_association2(num_nouns, df)",
    #"concept_entropy(num_nouns, df)",
    #"concept_entropy2(num_nouns, df)",
    #"concept_entropy0(num_nouns, df)",
    "concept_role_index_entropy(num_nouns, df)"
]

### Original Dataset

In [5]:
version = "b_lr_autofill_remove_sun_rem_pho"
rel = "lr"
autofill_symmetric_rel = False
# Relation family -> [relation 0, relation 1]. Deliberately NOT named `map`:
# that would shadow the builtin for every later cell in the kernel.
relation_groups = {
    "lr": ["left of", "right of"],
    "fb": ["in-front of", "behind"]
}

with open(f"../data/aggregated/whatsup_vlm_{version}.json", "r") as fin:
    annotations = json.load(fin)
SUBJ, OBJ, tuples, relations = [], [], [], relation_groups[rel]
for a in annotations:
    # Each row is [caption, image path, (subj, obj, relation)].
    subj, obj, r = a[-1]
    if r not in relations: continue
    SUBJ.append(subj)
    OBJ.append(obj)
    tuples.append((subj, obj, r))
#print(f"#unique subj = ", len(set(SUBJ)))
#print(f"#unique obj = ", len(set(OBJ)))
nouns = sorted(list(set(SUBJ).union(set(OBJ))))
num_nouns, num_relations = len(nouns), len(relations)
print("#unique concepts = ", num_nouns)
print(f"relations = {set(relations)}\n")

# Index lookups used to turn string triples into integer triples.
n2i = {n:i for i, n in enumerate(nouns)}
r2i = {r:i for i, r in enumerate(relations)}
print(len(r2i), len(n2i))

if autofill_symmetric_rel:
    # Add the converse triple (obj, subj, other relation) wherever it is absent
    # from the loaded tuples. Membership is tested against a set snapshot
    # (same result as scanning the list, without the quadratic cost).
    existing = set(tuples)
    autofill = []
    for subj, obj, r in tuples:
        mirrored = (obj, subj, relations[1-r2i[r]])
        if mirrored not in existing:
            autofill.append(mirrored)
    print(f"\nautofill {len(autofill)} tuples\n")
    tuples.extend(autofill)

# Report each metric twice: on the triples as written, then after Transpose().
for transpose in [False, True]:
    train_triplets = [] # convert tuple elements to indices
    for subj, obj, r in tuples:
        train_triplets.append((n2i[subj], n2i[obj], r2i[r]))
    if transpose: train_triplets = Transpose(train_triplets)

    print("#unique O1 = ", len(set([t[0] for t in train_triplets])))
    print("#unique O2 = ", len(set([t[1] for t in train_triplets])))
    print("#unique train_triplets = ", len(set(train_triplets)))

    df = pd.DataFrame(train_triplets, columns =['O1', 'O2', 'R'])
    print("role intrinsic meanings: {} position {}\n".format("image" if transpose else "linguistic", "(with autofill)" if autofill_symmetric_rel else ""))
    for f in entropy_funcs:
        # `f` is a hard-coded call expression from `entropy_funcs`, evaluated in
        # this namespace; never feed externally supplied strings to eval().
        score = eval(f)
        print(f.split("(")[0], ": ", score)
    print()
        

#unique concepts =  15
relations = {'right of', 'left of'}

2 15
#unique O1 =  15
#unique O2 =  15
#unique train_triplets =  308
role intrinsic meanings: linguistic position 

concept_role_index_entropy :  0.6931471805599453

#unique O1 =  15
#unique O2 =  15
#unique train_triplets =  308
role intrinsic meanings: image position 

concept_role_index_entropy :  0.6931471805599453



In [None]:
### visualize table
# Noun-by-noun grid: the column is the first element of a triplet, the row is
# the second, and each cell shows "R1" on its top line and/or "R2" on its
# bottom line depending on which relation indices occur for that pair.
# Put the most full column on the left
subject_counts = Counter(nouns[t[0]] for t in train_triplets)
column_names = sorted(nouns, key=lambda n: subject_counts[n], reverse=True)

# (column noun, row noun) -> set of relation indices seen for that pair.
cell_relations = defaultdict(set)
for t in train_triplets:
    cell_relations[(nouns[t[0]], nouns[t[1]])].add(t[-1])

# Start from an all-empty grid and fill cells with a single .loc assignment;
# chained `df[col][row] = ...` is not guaranteed to write through to the frame.
df = pd.DataFrame("", columns=column_names, index=column_names)
for (col_noun, row_noun), rels in cell_relations.items():
    top = "R1" if 0 in rels else "  "
    bottom = "R2" if any(rel != 0 for rel in rels) else "  "
    df.loc[row_noun, col_noun] = top + "\n" + bottom

col_defs = [ColumnDefinition(
            name=n,
            title=n.replace("glasses", "\nglasses").replace("phones", "\nphones"),
            border="left",
            textprops={"ha": "center"},
        ) for n in column_names]
fig, ax = plt.subplots(figsize=(15, 12))
tab = Table(df,
            ax=ax,
            column_definitions = col_defs,
            column_border_kw={"linewidth": 1, "color": "black", "linestyle": "-"},
            footer_divider=True,
            )
# NOTE: `transpose` and `train_triplets` are whatever the metrics cell left
# behind from its last loop iteration.
plt.title("{} positions {} {}".format("image" if transpose else "linguistic", 
                                   "(with autofill)" if autofill_symmetric_rel else "",
                                   relations), 
                                   fontsize=20)
plt.show()

In [None]:
### Save example images
# Writes a timestamped contact sheet (<date>.png) of 32 random annotated images
# plus a matching <date>.txt holding their captions, one per line.
from diffusers.utils import make_image_grid
from datetime import datetime
import pytz
timezone = pytz.timezone('America/New_York') 
date = datetime.now(timezone).strftime("%m%d_%H%M%S")
example_dir = "../whatsup_vlms/data/img_examples/"
# Make sure the target folder exists so the save below cannot fail on a fresh checkout.
os.makedirs(example_dir, exist_ok=True)
imsize = 64
images, texts = [], []
for e in random.sample(annotations, 32):
    # e = [caption, image path relative to the data root, triple].
    with Image.open(os.path.join(
        "/data/yingshac/clevr_control/data/",
        e[1]
    )) as source_image:
        # convert() returns an independent in-memory image, so the file can be closed.
        images.append(source_image.convert("RGB").resize((imsize, imsize)))
    texts.append(e[0])

image_grid = make_image_grid(images, rows=16, cols=math.ceil(len(images)/16))
# os.path.join avoids the doubled "/" produced by formatting onto a path that
# already ends with a separator.
image_grid.save(os.path.join(example_dir, "{}.png".format(date)))
with open(os.path.join(example_dir, "{}.txt".format(date)), "w") as f:
    f.write("\n".join(texts))



### Subsampled

In [2]:
sys.path.append("../scripts/")
from diffuser_real.dataset import real_dataset

In [3]:
# Load the left/right, autofilled, noun-filtered annotation file; the context
# manager closes the handle as soon as parsing is done.
with open("../data/aggregated/whatsup_vlm_b_lr_autofill_remove_sun_rem_pho.json", "r") as fin:
    annotations = json.load(fin)
imdir="/data/yingshac/clevr_control/data/" 

In [17]:
split = "splitI"
# Splits are named by a trailing capital letter; those whose letter sits more
# than 13 places after 'A' are flagged as transposed.
split_letter_offset = ord(split[-1]) - ord('A')
transpose = split_letter_offset > 13
D = real_dataset(imdir, annotations, imsize=(32, 64), subsample_method=f"subsample_whatsup_{split}")
print(len(D))
n2i = {}
for class_index, class_name in enumerate(D.classes):
    n2i[class_name] = class_index
r2i = dict(zip(("left of", "right of"), (0, 1))) # hardcoded
# Coverage = dataset size over all ordered noun pairs times the number of relations.
num_possible_triplets = len(n2i) * (len(n2i) - 1) * len(r2i)
coverage = len(D) / num_possible_triplets
print(f"Coverage = {round(coverage, 2)}")


154
Coverage = 0.37


In [18]:
def _report_balance(label, triplets, num_nouns):
    """Print and return concept_role_index_entropy for a list of index triplets.

    Args:
        label: heading printed before the score line.
        triplets: iterable of (O1, O2, R) index triples.
        num_nouns: number of noun classes, forwarded to the metric.

    Returns:
        (frame, score): the DataFrame built from `triplets` and the raw score.
    """
    frame = pd.DataFrame(triplets, columns =['O1', 'O2', 'R'])
    score = concept_role_index_entropy(num_nouns, frame)
    print(label)
    print(f"concept_role_index_entropy : {round(score, 4)}, normalized: {round(score/np.log(2), 2)}")
    return frame, score


# Index triples for every dataset entry; d[-1] is the (subj, obj, relation) triple.
train_triplets = [(n2i[d[-1][0]], n2i[d[-1][1]], r2i[d[-1][-1]]) for d in D.data]

tmp = train_triplets if transpose else Transpose(train_triplets, apply_to_relations=[1])
# Denominator is the actual number of entries instead of a hard-coded 154.
print("#unique images = {}/{}".format(len(set([x[:2] for x in tmp])), len(train_triplets)))

num_nouns = len(D.classes)
df, score = _report_balance("Linguistic Balance", train_triplets, num_nouns)

# From here on `train_triplets` holds the Transpose()d triples; later cells
# that read it see this transposed version.
train_triplets = Transpose(train_triplets, apply_to_relations=[1])
df, score = _report_balance("Image Balance", train_triplets, num_nouns)

#unique images = 154/154
Linguistic Balance
concept_role_index_entropy : 0.5524, normalized: 0.8
Image Balance
concept_role_index_entropy : 0.6931, normalized: 1.0


In [11]:
# Tally how many triplets carry each relation index (last element of a triplet).
Counter(triplet[-1] for triplet in train_triplets)

Counter({1: 77, 0: 77})

In [12]:
# Collect the trailing (subj, obj, relation) entry of every dataset record and
# check whether one specific triple survived the subsampling.
tuples = []
for d in D.data:
    tuples.append(d[-1])
["tape", "book", "right of"] in tuples

True

## Draft

In [None]:
from torch.utils.data import Dataset
from typing import Any, Dict, Optional, Tuple, Union, List
from torchvision import transforms

class whatsup_singleobj_dataset(Dataset):
    """Single-object crops harvested from two-object What'sUp images.

    Every annotation is [caption, image_path, (f1, f2, relation)]. The image is
    centre-cropped to a square, then one half-size region per object is cut out
    according to the relation and stored under that object's name.
    Indexing returns a random stored crop of the i-th class.
    """

    def __init__(self,
                 imdir: str,
                 annotations: List,
                 imsize = (32, 32),
                 ):
        """Load every annotated image and bucket object crops by concept.

        Args:
            imdir: root directory that annotation image paths are relative to.
            annotations: list of [caption, image_path, (f1, f2, relation)] rows.
            imsize: target size handed to the (currently unused) resize transform.

        Raises:
            ValueError: if a relation matches none of the four known relations.
        """
        super().__init__()
        self.imdir = imdir
        self.annotations = annotations
        # Tensor pipeline is built but not applied: __getitem__ returns raw PIL crops.
        self.preprocess = transforms.Compose(
            [   
                transforms.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
                transforms.Resize(imsize),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            ]
        )

        print("Data Preprocessing")
        self.concept2pilimages = defaultdict(list)
        for a in tqdm(self.annotations):
            gth_caption, image_path, T = a
            image = Image.open(os.path.join(self.imdir, image_path))

            width, height = image.size
            pilimage = image.crop(self._center_square_box(width, height))

            W, H = pilimage.size
            crop1 = pilimage.crop((W//4, 0, 3*W//4, H//2)) # behind
            crop2 = pilimage.crop((W//4, H//2, 3*W//4, H)) # front
            crop3 = pilimage.crop((0, H//4, W//2, 3*H//4)) # left
            crop4 = pilimage.crop((W//2, H//4, W, 3*H//4)) # right
            f1, f2, r = T

            # Relation reads "f1 <r> f2": f1 gets the region named by r, f2 the opposite one.
            if "left of" in r: 
                self.concept2pilimages[f1].append(crop3)
                self.concept2pilimages[f2].append(crop4)
            elif "right of" in r: 
                self.concept2pilimages[f2].append(crop3)
                self.concept2pilimages[f1].append(crop4)
            elif "in-front of" in r: 
                self.concept2pilimages[f2].append(crop1)
                self.concept2pilimages[f1].append(crop2)
            elif "behind" in r: 
                self.concept2pilimages[f1].append(crop1)
                self.concept2pilimages[f2].append(crop2)
            else: raise ValueError(f"Invalid relation: {r}")

        # Classes ordered from most to fewest crops.
        self.classes = sorted(self.concept2pilimages.keys(), key=lambda x: len(self.concept2pilimages[x]), reverse=True)

        print("Finish Preprocessing")
        for k in self.classes:
            print(f"concept {k} has {len(self.concept2pilimages[k])} crops")

    @staticmethod
    def _center_square_box(width, height):
        """Return the integer (left, top, right, bottom) box of the centred square.

        Integer arithmetic guarantees right-left == bottom-top == min(width, height);
        float halves such as 0.5 / 99.5 can be rounded to a box that is one pixel
        too wide or too tall.
        """
        side = min(width, height)
        left = (width - side) // 2
        top = (height - side) // 2
        return (left, top, left + side, top + side)
        
    def __len__(self): return len(self.classes)
    
    def __getitem__(self, i): 
        """Return a random crop of the i-th class and a templated caption."""
        f = self.classes[i]
        image = random.choice(self.concept2pilimages[f])
        text = f"an image of a {f}"
        return {
            'image': image, #self.preprocess(image),
            'sentence': text
        }

In [None]:
# Load the left/right annotation file (handle closed by the context manager)
# and display the first row as a schema check.
with open("../data/aggregated/whatsup_vlm_b_lr.json", "r") as fin:
    annotations = json.load(fin)
annotations[0]

In [None]:
# Build the single-object crop dataset; construction eagerly opens and crops
# every annotated image and prints the per-concept crop counts.
D = whatsup_singleobj_dataset(
    imdir="/data/yingshac/clevr_control/data/",
    annotations=annotations,
)

In [None]:
# NOTE(review): the random draw is immediately overridden by the hard-coded
# index on the next line, so this cell always shows class 17 — looks like a
# debugging leftover; confirm which of the two is intended before removing one.
i = random.choice(list(range(len(D))))
i=17
# Each D[i] access picks a fresh random crop for the class, so the printed
# sentence and the displayed image come from two separate draws.
print(D[i]['sentence'])
D[i]['image']

In [None]:
import os, torch, json, random
from torchvision import transforms
from torch.utils.data import Dataset
from PIL import Image
from typing import Any, Dict, Optional, Tuple, Union, List
from tqdm import tqdm, trange
from collections import defaultdict, Counter
class dataset(Dataset):
    """Image/caption pairs cropped to the band that contains both objects.

    Each data row is [text, image_path, (subj, obj, relation)]. Images are
    centre-cropped to a square, then reduced to the middle horizontal band for
    left/right relations or the middle vertical band for front/behind relations.
    """

    def __init__(self,
                 imdir: str,
                 data: List,
                 imsize = (64, 64),
                 ):
        """Store the data rows and build the (currently unused) tensor pipeline.

        Args:
            imdir: root directory that row image paths are relative to.
            data: list of [text, image_path, (subj, obj, relation)] rows.
            imsize: target size handed to the resize transform.
        """
        super().__init__()
        self.imdir = imdir
        self.data = data
        # Built but not applied: __getitem__ returns the raw PIL image.
        self.preprocess = transforms.Compose(
            [   
                transforms.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
                transforms.Resize(imsize),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            ]
        )

    @staticmethod
    def _center_square_box(width, height):
        """Return the integer (left, top, right, bottom) box of the centred square.

        Integer arithmetic guarantees a box of exactly min(width, height) pixels
        per side; float halves can round to a box one pixel off and trip the
        squareness assertion in __getitem__.
        """
        side = min(width, height)
        left = (width - side) // 2
        top = (height - side) // 2
        return (left, top, left + side, top + side)
        
    def __len__(self): return len(self.data)
    
    def __getitem__(self, i): 
        """Return {'image': cropped PIL image, 'sentence': caption} for row i."""
        text, image_path, tuples = self.data[i]
        image = Image.open(os.path.join(self.imdir, image_path))
        width, height = image.size
        image = image.crop(self._center_square_box(width, height))
        #print(image.size)
        W, H = image.size
        assert W==H

        # Keep only the half-height (or half-width) band spanning both objects;
        # any other relation leaves the square crop untouched.
        r = tuples[-1]
        if r in ["left of", "right of"]: image = image.crop((0, H//4, W, 3*H//4))
        elif r in ["in-front of", "behind"]: image = image.crop((W//4, 0, 3*W//4, H))
        
        return {
            'image': image,
            'sentence': text
        }


In [None]:
# Load the autofilled left/right annotations (handle closed by the context
# manager) and wrap them in the band-cropping dataset.
with open("../data/aggregated/whatsup_vlm_b_lr_autofill.json", "r") as fin:
    annotations = json.load(fin)
imdir="/data/yingshac/clevr_control/data/" 
D = dataset(imdir, annotations, imsize=(32, 64))

In [None]:
# Pick a random row, print its caption, and display its cropped image
# (each D[i] access reopens and re-crops the image from disk).
i = random.choice(list(range(len(D))))
print(D[i]['sentence'])
D[i]['image']

### Delete Useless Ckpts

In [None]:
import os

def is_disposable_ckpt(filename):
    """Return True when `filename` is a checkpoint this cell would delete.

    A file qualifies when the text before its first "_" parses as an integer
    that does not end in 9. Names without a numeric prefix are never selected
    (previously they raised ValueError and aborted the whole loop).
    """
    try:
        index = int(filename.split("_")[0])
    except ValueError:
        return False
    return index % 10 != 9

# Named `ckpt_dir` rather than `dir` so the builtin dir() is not shadowed.
ckpt_dir = "/data/yingshac/clevr_control/scripts/diffuser_real/output/0228_115732/ckpts"
if os.path.isdir(ckpt_dir):
    for f in os.listdir(ckpt_dir):
        if is_disposable_ckpt(f):
            x = os.path.join(ckpt_dir, f)
            print(x)
            # Dry run by default: uncomment to actually delete the listed files.
            #os.remove(x)
else:
    print(f"checkpoint directory not found, nothing to prune: {ckpt_dir}")

In [None]:
import os
# List every run directory name under the output root, in sorted order.
# Named `output_dir` rather than `dir` so the builtin dir() is not shadowed.
output_dir = "/data/yingshac/clevr_control/scripts/diffuser_real/output"
for handle in sorted(os.listdir(output_dir)):
    #if int("".join(handle.split("_"))) < 228115732:
    print(handle)
#for f in os.listdir(output_dir):
#    if int(f.split("_")[0]) % 10 != 9:
#        x = os.path.join(output_dir, f)
#        print(x)