<a href="https://colab.research.google.com/github/walkerstipe/Surreal_GAN/blob/main/Copy_of_BoneAmputee's_CLIP_featurevis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [BoneAmputee](https://instagram.com/BoneAmputee)'s [CLIP-featurevis](https://github.com/openai/CLIP-featurevis)

### setup
*   uses TensorFlow 1.x and Lucid 0.3.8
*   grabs `bpe_simple_vocab_16e6.txt`, `image32.pb` and `text32.pb`
*   includes modified `tokenizer.py` and `model.py`

### change to `tokenizer.py`
*   Azure credential issue avoided by loading the BPE vocabulary from local storage

### change to `model.py`
*   `graph_def` loading from protobufs in local storage

### change to `example_usage.py`
*   cat+dog image url changed to one that doesn't 404

### changes to `example_facets.py`
*   removed `()` from three instances of `@wrap_objective()`
*   Azure credential issue avoided using `requests` to get .npy files
*   changed `model.name` to `model.model_name`
*   changed `/root/` to `/content/` for Colab, but disabled saving

# setup

In [None]:
# Colab magic: select the TensorFlow 1.x runtime for this notebook.
%tensorflow_version 1.x
# Install the tokenizer/model dependencies; lucid is pinned to 0.3.8.
!pip install --quiet ftfy regex tqdm blobfile lucid==0.3.8
# Fetch the gzipped BPE vocabulary (wget -nc does not re-download an existing file).
!wget -nc https://github.com/openai/CLIP/raw/main/clip/bpe_simple_vocab_16e6.txt.gz
# Unpack the vocabulary only when the uncompressed file is not already present.
![ -f "bpe_simple_vocab_16e6.txt" ] && echo "skipping bpe_simple_vocab_16e6.txt" || gunzip "bpe_simple_vocab_16e6.txt.gz"
# Fetch the RN50_4x image and text graphs that the model cell reads from local storage.
!wget -nc https://openaipublic.blob.core.windows.net/clip/tf/RN50_4x/084ee9c176da32014b0ebe42cd7ca66e/image32.pb
!wget -nc https://openaipublic.blob.core.windows.net/clip/tf/RN50_4x/da21bc82c7bba068aa8163333438354c/text32.pb

TensorFlow 1.x selected.
[K     |████████████████████████████████| 71kB 4.9MB/s 
[K     |████████████████████████████████| 71kB 7.2MB/s 
[K     |████████████████████████████████| 71kB 6.9MB/s 
[K     |████████████████████████████████| 143kB 11.0MB/s 
[K     |████████████████████████████████| 1.9MB 11.7MB/s 
[K     |████████████████████████████████| 81kB 10.6MB/s 
[K     |████████████████████████████████| 1.2MB 35.0MB/s 
[?25h  Building wheel for ftfy (setup.py) ... [?25l[?25hdone
  Building wheel for lucid (setup.py) ... [?25l[?25hdone
  Building wheel for umap-learn (setup.py) ... [?25l[?25hdone
  Building wheel for pynndescent (setup.py) ... [?25l[?25hdone
[31mERROR: requests 2.23.0 has requirement urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1, but you'll have urllib3 1.26.6 which is incompatible.[0m
[31mERROR: kapre 0.3.5 has requirement tensorflow>=2.0.0, but you'll have tensorflow 1.15.2 which is incompatible.[0m
[31mERROR: datascience 0.10.6 has requirement folium=

In [None]:
# By Alec Radford

import html
import ftfy
import json
import regex as re
from functools import lru_cache
import tensorflow as tf
import blobfile

def pad(x, pad_length = 76):
    """Return ``x`` right-padded with zeros as a float array of length ``pad_length``.

    Args:
        x: sequence of numbers, at most ``pad_length`` items long.
        pad_length: length of the returned array.

    Returns:
        A 1-D numpy array whose first ``len(x)`` entries are ``x`` and whose
        remaining entries are zero.

    Raises:
        ValueError: if ``x`` is longer than ``pad_length`` (numpy cannot fit it
            into the shorter slice).
    """
    # numpy is not imported in this cell, so import it here to keep the
    # function usable regardless of which other cells have been run.
    import numpy as np
    z = np.zeros((pad_length))
    z[0:len(x)] = x
    return z

@lru_cache()
def bytes_to_unicode():
    """
    Build the reversible byte -> unicode-character table used by the BPE code.

    The BPE merges operate on unicode strings, so every one of the 256 byte
    values needs a printable stand-in character. Bytes that already are
    printable, non-whitespace characters map to themselves; every other byte
    is mapped to the next unused code point starting at 256, which keeps
    whitespace and control characters out of the vocabulary.

    Returns a dict from byte value (int) to a single-character string.
    """
    printable = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    # Bytes without a printable form, in ascending order.
    remapped = [byte for byte in range(2**8) if byte not in printable]
    byte_values = printable + remapped
    # The k-th remapped byte is assigned code point 256 + k.
    code_points = printable + [2**8 + offset for offset in range(len(remapped))]
    return {byte: chr(point) for byte, point in zip(byte_values, code_points)}

def get_pairs(word):
    """Collect every pair of adjacent symbols in ``word`` as a set.

    ``word`` is a non-empty sequence of symbols (each symbol a string of any
    length); an empty ``word`` raises IndexError.
    """
    leading = word[0]
    followers = word[1:]
    # Each symbol except the last one, lined up with the symbol that follows it.
    predecessors = (leading, *followers[:-1])
    return set(zip(predecessors, followers))

def basic_clean(text):
    """Repair ``text`` with ftfy, decode HTML entities twice, and strip the ends."""
    repaired = ftfy.fix_text(text)
    # Two passes so that doubly-escaped entities are fully decoded.
    unescaped_once = html.unescape(repaired)
    unescaped_twice = html.unescape(unescaped_once)
    return unescaped_twice.strip()

def whitespace_clean(text):
    """Collapse every run of whitespace to a single space and trim both ends."""
    return re.sub(r'\s+', ' ', text).strip()

class SimpleTokenizer(object):
    """Byte-level BPE tokenizer that turns text into CLIP token ids.

    The vocabulary is the 256 byte stand-in characters, the same characters
    with an end-of-word marker ``</w>``, one entry per BPE merge, and the two
    special tokens ``<|startoftext|>`` and ``<|endoftext|>``.
    """

    def __init__(self, bpe_path = None):
        """Load the merge table and build the encoder/decoder lookups.

        Args:
            bpe_path: ``None`` to read ``bpe_simple_vocab_16e6.txt`` from the
                working directory, a path string to open with ``blobfile``,
                or an already-open text file object exposing ``read()``.
        """
        if bpe_path is None:
            bpe_path = 'bpe_simple_vocab_16e6.txt'
        if isinstance(bpe_path, str):
            # The handle is opened here, so it is also closed here.
            with blobfile.BlobFile(bpe_path, 'r') as bpe_file:
                merges_text = bpe_file.read()
        else:
            # Caller-supplied file object: read it, leave closing to the caller.
            merges_text = bpe_path.read()
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v:k for k, v in self.byte_encoder.items()}
        merges = merges_text.split('\n')
        # Skip the first line (presumably a header) and keep at most
        # 49152-256-2 merge rules.
        merges = merges[1:49152-256-2+1]
        merges = [tuple(merge.split()) for merge in merges]
        vocab = list(self.byte_encoder.values())
        vocab = vocab + [v+'</w>' for v in vocab]
        for merge in merges:
            vocab.append(''.join(merge))
        vocab.extend(['<|startoftext|>', '<|endoftext|>'])
        self.encoder = dict(zip(vocab, range(len(vocab))))
        self.decoder = {v:k for k,v in self.encoder.items()}
        # Lower rank means the merge is applied earlier.
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        # The special tokens are pre-seeded so bpe() returns them unsplit.
        self.cache = {'<|startoftext|>':'<|startoftext|>', '<|endoftext|>':'<|endoftext|>'}
        self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)

    def bpe(self, token):
        """Apply the BPE merges to ``token``; return its sub-words joined by spaces.

        Results are memoised in ``self.cache``.
        """
        if token in self.cache:
            return self.cache[token]
        # Split into single characters, tagging the last one as end-of-word.
        word = tuple(token[:-1]) + ( token[-1] + '</w>',)
        pairs = get_pairs(word)

        if not pairs:
            return token+'</w>'

        while True:
            # Pick the adjacent pair with the best (lowest) merge rank.
            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    # `first` does not occur again: copy the tail unchanged.
                    new_word.extend(word[i:])
                    break
                new_word.extend(word[i:j])
                i = j

                if word[i] == first and i < len(word)-1 and word[i+1] == second:
                    new_word.append(first+second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        """Clean and lower-case ``text`` and return its list of BPE token ids."""
        bpe_tokens = []
        text = whitespace_clean(basic_clean(text)).lower()
        for token in re.findall(self.pat, text):
            # Map the token's UTF-8 bytes to their printable stand-in characters.
            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens

    def decode(self, tokens):
        """Turn token ids back into text; each ``</w>`` marker becomes a space."""
        text = ''.join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
        return text

    def tokenize(self, text, n_text = 76, pad = True):
        """Encode ``text`` wrapped in start/end tokens.

        At most ``n_text - 1`` text tokens are kept. With ``pad`` true the
        result is a list holding one row of ``n_text + 1`` ids, zero-padded on
        the right; otherwise the flat, unpadded id list is returned.
        """
        sot = self.encoder['<|startoftext|>']
        eot = self.encoder['<|endoftext|>']
        tokens = self.encode(text)
        tokens = [sot]+tokens[:n_text-1]+[eot]
        if pad:
            return [tokens + [0]*(n_text+1-len(tokens))]
        else:
            return tokens

    def sot(self):
        """Return the id of the ``<|startoftext|>`` token."""
        return self.encoder['<|startoftext|>']

    def eot(self):
        """Return the id of the ``<|endoftext|>`` token."""
        return self.encoder['<|endoftext|>']


In [None]:
from lucid.modelzoo.vision_base import Model
from lucid.optvis import render
import tensorflow as tf
from lucid.misc.io import load, save


class CLIPImage(Model):
    """Lucid ``Model`` for the RN50_4x image graph read from the local ``image32.pb``."""
    image_value_range = (0, 255)
    input_name = 'input_image'

    def __init__(self):
        # Parse the graph from local storage; model_path below only records
        # where the file was downloaded from.
        with tf.gfile.GFile('image32.pb', "rb") as f:
            self.graph_def = tf.GraphDef()
            self.graph_def.ParseFromString(f.read())
        self.model_name = "RN50_4x"
        self.image_shape = [288, 288, 3]
        self.model_path = "https://openaipublic.blob.core.windows.net/clip/tf/RN50_4x/084ee9c176da32014b0ebe42cd7ca66e/image32.pb"

    def load(self, inp = None):
        """Import the graph into the default graph.

        Args:
            inp: optional input tensor; when omitted a float32 placeholder of
                shape ``(None, H, W, 3)`` is created from ``image_shape``.

        Returns:
            ``(inp, T)`` where ``T`` is the accessor returned by
            ``render.import_model``.
        """
        # Identity test: `==` may be overloaded by tensor/array arguments.
        if inp is None:
            self.inp = tf.placeholder(shape = (None,self.image_shape[0], self.image_shape[1], 3), dtype = tf.float32)
        else:
            self.inp = inp
        self.T   = render.import_model(self, self.inp, self.inp)
        return self.inp, self.T


class CLIPText(Model):
    """Lucid ``Model`` for the RN50_4x text graph read from the local ``text32.pb``."""
    input_name = 'tokens'

    def __init__(self):
        # Parse the graph from local storage; model_path below only records
        # where the file was downloaded from.
        with tf.gfile.GFile('text32.pb', "rb") as f:
            self.graph_def = tf.GraphDef()
            self.graph_def.ParseFromString(f.read())
        self.model_name = "RN50_4x_text"
        self.model_path = "https://openaipublic.blob.core.windows.net/clip/tf/RN50_4x/da21bc82c7bba068aa8163333438354c/text32.pb"

    def load(self, O = None):
        """Import the graph into the default graph under the ``text`` scope.

        Args:
            O: optional int token tensor; when omitted an int32 placeholder of
                shape ``[None, None]`` is created.

        Returns:
            ``(O, T)`` where ``T(name)`` returns the tensor ``text/<name>:0``
            from the default graph.
        """
        # Identity test: `==` may be overloaded by tensor/array arguments.
        if O is None:
            self.O = tf.placeholder(tf.int32, [None, None])
        else:
            self.O = O
        tf.import_graph_def(self.graph_def, {self.input_name: self.O}, name = "text")
        gph = tf.get_default_graph()
        self.T = lambda x: gph.get_tensor_by_name("text/" + x + ":0")
        return self.O, self.T

# example_usage.py

In [None]:
# from tokenizer import SimpleTokenizer
# from model import CLIPImage, CLIPText
import tensorflow as tf
from lucid.misc.io import load
import numpy as np

def imresize(img, size, scale=255):
    """Resize ``img`` to ``size`` with PIL bicubic resampling.

    The array is multiplied by ``scale`` and cast to uint8 for PIL, then the
    resized result is returned as float32 divided by ``scale`` again.
    """
    from PIL import Image
    as_uint8 = (img*scale).astype(np.uint8)
    resized = Image.fromarray(as_uint8).resize(size, Image.BICUBIC)
    return np.array(resized).astype(np.float32)/scale

tokenizer = SimpleTokenizer()

# Start from an empty graph, then import both CLIP towers into it.
tf.reset_default_graph()
inp_text, T_text = CLIPText().load()
inp_img,  T_img  = CLIPImage().load()

sess = tf.Session()

captions = ["This is a dog", "This is a cat", "This is a dog and a cat"]
# tokenize() returns a one-row batch; keep just that row for every caption.
tokens   = [tokenizer.tokenize(caption)[0] for caption in captions]

img    = imresize(load("https://i.imgur.com/EwD46UF.jpg"), [288,288])

text_embd = sess.run(T_text("text_post/l2_normalize"), {inp_text: tokens})
img_embd  = sess.run(T_img("l2_normalize"), {inp_img: [img]})

# One score per caption: dot product of each text embedding with the image embedding.
scores = (text_embd @ img_embd.T)[:,0]

for score, caption in zip(scores, captions):
    print(caption, score)

# example_facets.py

In [None]:
from tqdm import tqdm
# from model import CLIPImage, CLIPText
import tensorflow as tf
import os
import numpy as np

from lucid.optvis import objectives, param
import lucid.optvis.render as render
from lucid.optvis.objectives import wrap_objective, diversity
import lucid.optvis.transform as transform
from lucid.misc.io import load, save


@wrap_objective
def l2(batch=None):
  """Objective: negative mean squared deviation of the "input" tensor from 0.5."""
  def negative_mean_square(T):
    deviation = T("input") - 0.5
    return -tf.reduce_mean(deviation**2)
  return negative_mean_square

@wrap_objective
def vector(layer, d, batch=None):
  """Objective: mean of the ReLU'd ``layer`` activations contracted with ``d``.

  ``d`` is contracted against axis 1 of the 4-D layer tensor (presumably the
  channel axis - confirm against the graph's data layout).
  """
  def projected_activation(T):
    rectified = tf.nn.relu(T(layer))
    direction = tf.constant(d)
    projected = tf.einsum("ijkl,j->ikl", rectified, direction)
    per_example = tf.reduce_mean(projected, [1,2])
    # The exponent 1/1 evaluates to 1.0; it is kept so the graph is unchanged.
    return tf.reduce_mean(per_example)**(1/1)
  return projected_activation

@wrap_objective
def attr(obj, style_attrs, layers, strength):
    # Objective that weights the attribution of `obj` at each of `layers`
    # (activation * ReLU(gradient of obj w.r.t. that activation)) by
    # `style_attrs` and sums the result.
    #   obj:         objective whose gradient is taken; called as obj(T).
    #   style_attrs: array with a .shape attribute, turned into a constant.
    #   layers:      layer names passed to T(); the first one fixes the
    #                batch size and the spatial size used for resizing.
    #   strength:    (start, end) of the per-batch-element weight ramp.
    def inner(T):
        style = tf.constant(style_attrs)
        obj_t = obj(T)  # scalar/tensor value of the wrapped objective
        layer_t = T(layers[0])
        # One weight per entry along the first axis of layer_t, ramping
        # linearly from strength[0] to strength[1].
        w = tf.linspace(strength[0], strength[1], tf.shape(layer_t)[0])
        # NOTE(review): batch_n is never used below; the unpacking also
        # requires layer_t to have exactly four dimensions.
        batch_n, _, _, _ = layer_t.get_shape().as_list()
        # Move axis 1 last, resize to layer_t's axes 2 and 3, then move it
        # back (presumably NCHW <-> NHWC for tf.image.resize - confirm).
        style = tf.transpose(style, (0,2,3,1))
        style = tf.image.resize(style, (tf.shape(layer_t)[2],tf.shape(layer_t)[3]))
        style = tf.transpose(style, (0,3,1,2))
        flat_attrs = []
        grads = tf.gradients(obj_t, [T(layer) for layer in layers])
        for layer, grad_t in zip(layers, grads):
            layer_t = T(layer)
            # Only positive gradients contribute; stop_gradient keeps the
            # gradient itself out of later differentiation.
            attr_t = layer_t * tf.nn.relu(tf.stop_gradient(grad_t))
            # NOTE(review): if style_attrs is neither 2-D nor 4-D, flat_attr_t
            # is not assigned in this iteration. The 4-axis transpose above
            # also looks incompatible with 2-D style_attrs - confirm whether
            # the 2-D branch is reachable.
            if len(style_attrs.shape) == 2:
                flat_attr_t = tf.reduce_sum(attr_t, axis=(2,3))
            elif len(style_attrs.shape) == 4:
                flat_attr_t = attr_t
            flat_attrs.append(flat_attr_t)
        # Concatenate the per-layer attributions along the last axis.
        flat_attr_t = tf.concat(flat_attrs, -1)
        return tf.reduce_sum(w[:,None,None,None]*flat_attr_t*style)
    return inner

def render_facet(model, neuron_obj, layers, style_attrs, strength = (0.1, 0.3), l2_weight = 10.0, resolution = 128, alpha = False):
    """Optimise a batch of nine images for a direction plus facet attributions.

    The objective is ``vector`` on the last graph node whose name contains
    "image_block_4", plus one scaled ``attr`` term per (style, layer) pair,
    plus ``l2_weight`` times the ``l2`` objective. With ``alpha`` true, two
    alpha-channel terms are subtracted and the image parameterisation gets an
    alpha channel. Returns whatever ``render.render_vis`` returns.
    """

    def mean_alpha():
        # Root-mean-square of the "input" tensor's channels from index 3 on.
        def rms_alpha(T):
            return tf.sqrt(tf.reduce_mean(T("input")[..., 3:] ** 2))
        return objectives.Objective(rms_alpha)

    scale_choices = [0.995**n for n in range(-5,80)] + [0.998**n for n in 2*list(range(20,40))]
    angle_choices = list(range(-20,20))+list(range(-10,10))+list(range(-5,5))+5*[0]

    standard_transforms = [transform.pad(2, mode='constant', constant_value=.5)]
    # Ten independent 4-pixel jitters in a row.
    standard_transforms += [transform.jitter(4) for _ in range(10)]
    standard_transforms += [
        transform.random_scale(scale_choices),
        transform.random_rotate(angle_choices),
        transform.jitter(2),
        transform.crop_or_pad_to(resolution, resolution),
    ]

    param_kwargs = {}
    if alpha:
        standard_transforms.append(transform.collapse_alpha_random())
        param_kwargs["alpha"] = True
    param_f = lambda: param.image(resolution, batch=9, **param_kwargs)

    optimizer = tf.train.AdamOptimizer(0.02)
    block4_names = [n.name for n in model.graph_def.node if "image_block_4" in n.name]
    ultimate_layer = block4_names[-1]
    base_obj = vector(ultimate_layer, neuron_obj)
    # Every facet term is built from the base objective, not the running sum.
    facet_terms = [(5/len(layers))*attr(base_obj, style, [layer], strength) for style, layer in zip(style_attrs, layers)]
    obj = base_obj
    for facet_term in facet_terms:
        obj = obj + facet_term
    obj = obj + l2_weight*l2()
    if alpha:
        obj -= mean_alpha()
        obj -=  1e2 * objectives.blur_alpha_each_step()
    return render.render_vis(model, obj, param_f, transforms=standard_transforms, optimizer=optimizer, thresholds=(1024*4,))

def one_hot(ind, size=2560):
    """Return a float32 vector of length ``size`` that is 1 at ``ind`` and 0 elsewhere.

    Args:
        ind: index (or anything numpy accepts as an index) to set to 1.
        size: length of the vector; the default keeps the original
            hard-coded length of 2560.
    """
    z = np.zeros(size, dtype=np.float32)
    z[ind] = 1
    return z

facets = ["face", "text", "logo", "pose", "arch", "nature", "indoor"]
model  = CLIPImage()
d      = one_hot(100)


def loadnpy(url):
    """Download a ``.npy`` file over HTTP and return the array it contains.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    from io import BytesIO
    import requests
    response = requests.get(url)
    # Fail with a clear HTTP error instead of handing an error page to np.load.
    response.raise_for_status()
    return np.load(BytesIO(response.content))


# The layer names depend only on the model, so compute them once: every other
# node whose name contains both "image_block_3" and "Relu_2".
layernames  = [n.name for n in model.graph_def.node if ("image_block_3" in n.name) and ("Relu_2" in n.name)][::2]

for facet in facets:
    style_attrs = [loadnpy(f"https://openaipublic.blob.core.windows.net/clip/facets/{model.model_name}/{layername}/{facet}_spatial.npy") for layername in layernames]
    for l2_weight in [10]:
        img = render_facet(model, 
                           d, 
                           layernames, 
                           style_attrs, 
                           l2_weight = l2_weight, 
                           strength = (0.1, 5.0), 
                           alpha = False, 
                           resolution = 256)
    # Saving is disabled:
    # save(img[0][-1], f"/content/{facet}.png")
    # Only the first facet is rendered; remove this break to process them all.
    break

In [None]:
# Display the `model` object left over from the previous cell; if the cells
# ran in order this is the CLIPImage instance created there.
model

# This is the main contribution to the Surreal GAN project.
# from keras.layers.merge import concatenate
# stacked_model = define_stacked_model(members)
# NOTE(review): `plot_model` is not imported in any cell shown here -
# presumably keras.utils.plot_model, which is meant for Keras models; confirm
# that it accepts this lucid Model before relying on these calls.
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)
plot_model(model, show_shapes=True, show_layer_names=True)