In [1]:

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from PIL import Image, ImageDraw, ImageFont
import json
import random

import clip
import os
from tqdm import tqdm
from fine_tune import draw_text_with_new_lines, MyDataset, TestDataset, calculate_corr, load_model, evaluate, all_attributes, preprocess, my_preprocess, _convert_image_to_rgb, set_image_tensors, inclusive_attributes, exclusive_attributes, evaluate_use_dumped_image
from clip.model import CLIP_Dense
from cj_fonts import inclusive_fonts, fifty_fonts
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize, RandomCrop, RandomRotation, RandomResizedCrop
from torchvision.transforms import Lambda
from torchvision.transforms import functional as F
from IPython.display import display
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.font_manager as font_manager
from matplotlib.font_manager import FontProperties
import PIL

def convert_models_to_fp32(model):
    for p in model.parameters():
        p.data = p.data.float()
        if p.grad is not None:
            p.grad.data = p.grad.data.float()

# If using GPU then use mixed precision training.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Must set jit=False for training
model, preprocess = clip.load("ViT-B/32", device=device, jit=False)
from torchvision.transforms.functional import pil_to_tensor, to_pil_image

In [2]:
font_dir = '../gwfonts'
cj_font_dir = '../all-fonts'
font_paths = [os.path.join(font_dir, f) for f in os.listdir(font_dir)]
fifty_font_paths = [os.path.join(cj_font_dir, f) for f in fifty_fonts.split('\n') if f != '']

# add font
for font in font_manager.findSystemFonts(font_dir):
    font_manager.fontManager.addfont(font)

for font in font_manager.findSystemFonts(cj_font_dir):
    font_manager.fontManager.addfont(font)

ttf_list = font_manager.fontManager.ttflist

all_json_path = '../attributeData/all_font_to_attribute_values.json'
train_json_path = '../attributeData/train_font_to_attribute_values.json'
test_json_path = '../attributeData/test_font_to_attribute_values.json'
validation_json_path = '../attributeData/validation_font_to_attribute_values.json'

In [3]:
train_font_num = len(list(json.load(open(train_json_path, 'r')).keys()))
print(train_font_num)

validation_font_num = len(list(json.load(open(validation_json_path, 'r')).keys()))
print(validation_font_num)

test_font_num = len(list(json.load(open(test_json_path, 'r')).keys()))
print(test_font_num)

140
30
30


In [4]:

fox_text = 'The quick\nbrown fox\njumps over\nthe lazy dog'
texts_for_font_image = [fox_text]
target_attributes = ['happy']
image_file_dir = None
image_file_dir = '../attributeData/grayscale_images'
dump_image = True
single_character = False
dataset = MyDataset(font_dir, train_json_path, texts_for_font_image, char_size=150, attribute_threshold=50, use_negative=False, use_weight=False, use_score=True, preprocess=preprocess, use_multiple_attributes=False, use_random_attributes=False, max_sample_num=3, random_prompts_num=100, exclusive_attributes=exclusive_attributes, image_file_dir=image_file_dir, dump_image=dump_image, single_character=single_character)
test_data = TestDataset(font_dir, validation_json_path, texts_for_font_image, char_size=150, attribute_threshold=0, target_attributes=target_attributes, preprocess=my_preprocess, single_character=single_character, image_file_dir=image_file_dir)
#set_image_tensors(dataset, sample_num=10)
data_loader = DataLoader(dataset, batch_size=2, shuffle=True)
test_data_loader = DataLoader(test_data, batch_size=16, shuffle=False)
it = iter(data_loader)
test_it = iter(test_data_loader)

In [5]:
dense_clip = CLIP_Dense(model)

In [6]:
for name, param in dense_clip.named_parameters():
    print(name, param.requires_grad)

clip.positional_embedding True
clip.text_projection True
clip.logit_scale True
clip.visual.class_embedding True
clip.visual.positional_embedding True
clip.visual.proj True
clip.visual.conv1.weight True
clip.visual.ln_pre.weight True
clip.visual.ln_pre.bias True
clip.visual.transformer.resblocks.0.attn.in_proj_weight True
clip.visual.transformer.resblocks.0.attn.in_proj_bias True
clip.visual.transformer.resblocks.0.attn.out_proj.weight True
clip.visual.transformer.resblocks.0.attn.out_proj.bias True
clip.visual.transformer.resblocks.0.ln_1.weight True
clip.visual.transformer.resblocks.0.ln_1.bias True
clip.visual.transformer.resblocks.0.mlp.c_fc.weight True
clip.visual.transformer.resblocks.0.mlp.c_fc.bias True
clip.visual.transformer.resblocks.0.mlp.c_proj.weight True
clip.visual.transformer.resblocks.0.mlp.c_proj.bias True
clip.visual.transformer.resblocks.0.ln_2.weight True
clip.visual.transformer.resblocks.0.ln_2.bias True
clip.visual.transformer.resblocks.1.attn.in_proj_weight True

In [7]:
images, prompts, scores = next(it)

In [8]:
convert_models_to_fp32(dense_clip)
dense_clip = dense_clip.to(device)
images = images.to(device)
prompts = prompts.to(device)
scores = torch.tensor(scores, dtype=torch.float16).to(device)
logits = dense_clip(images, prompts)

  """


In [23]:
def calculate_corr(model, dataset, predict_mode=False, device=device, use_dense_clip=False):
    data_loader = DataLoader(dataset, batch_size=8, shuffle=False, num_workers=0)
    it = iter(data_loader)
    cos_sim = None
    logits = None
    with torch.no_grad():
        for batch in it:
            images, texts = batch
            images = images.to(device)
            texts = texts.to(device)
            if use_dense_clip:
                tmp_logits = model(images, texts)
                # to numpy
                tmp_logits = tmp_logits.cpu().numpy()
                if logits is None:
                    logits = tmp_logits
                else:
                    logits = np.concatenate((logits, tmp_logits), axis=0)
            else:
                embedded_images = model.encode_image(images).cpu()
                embedded_texts = model.encode_text(texts).cpu()

                # assume each row of embedded_texts is the same
                tmp_cos_sim = calculate_cos_sim(embedded_images, embedded_texts)
                if cos_sim is None:
                    cos_sim = tmp_cos_sim
                else:
                    cos_sim = np.concatenate((cos_sim, tmp_cos_sim), axis=0)
    if predict_mode:
        if use_dense_clip:
            return logits
        return cos_sim
    ground_truth_attribute_values = dataset.flatten_ground_truth_attribute_values()
    if use_dense_clip:
        print(logits.shape)
        print(ground_truth_attribute_values.shape)
        corr = np.corrcoef(logits[:, 0], ground_truth_attribute_values)[0, 1]
        return corr, logits, ground_truth_attribute_values
    corr = np.corrcoef(cos_sim, ground_truth_attribute_values)[0, 1]
    return corr, cos_sim, ground_truth_attribute_values

In [26]:
a = calculate_corr(dense_clip, test_data, use_dense_clip=True)
print(a[0])

(30, 1)
torch.Size([30])
0.09425632185997027


In [14]:
a

In [15]:
mse_loss = nn.MSELoss()
loss = mse_loss(logits, scores)
print(loss)


tensor(0.0134, device='cuda:0', grad_fn=<MseLossBackward0>)


In [9]:
image_features = dense_clip.clip.encode_image(images)
text_features = dense_clip.clip.encode_text(prompts)

In [10]:
features = torch.cat((image_features, text_features), dim=1)
print(features.shape)

torch.Size([2, 1024])


In [11]:
dense_clip.linear(features)

tensor([[-0.2553],
        [-0.1399]], device='cuda:0', grad_fn=<AddmmBackward0>)