# KoCLIP

## 1. Setup

In [28]:
import json
import easydict
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import (
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    CLIPModel,
    CLIPProcessor,
    CLIPTextModel,
)

In [2]:
# Notebook-wide settings; later cells read them as attributes (e.g. cfg.clip_model).
cfg = easydict.EasyDict({
    'device': 'cuda:0',
    'clip_model': "openai/clip-vit-base-patch32",
    'korean_model': 'klue/roberta-small',
})

## 2. Data

In [4]:
# Load the tokenizer belonging to the CLIP checkpoint named in cfg.clip_model.
tokenizer = AutoTokenizer.from_pretrained(cfg.clip_model)

In [5]:
# Load the pretrained CLIP model from the same checkpoint (no .to(device) is applied here).
model = CLIPModel.from_pretrained(cfg.clip_model)

In [10]:
# Tokenize two sample English sentences as one padded batch of PyTorch tensors.
inputs = tokenizer(['hi', 'how are you'], return_tensors='pt', padding=True)

In [13]:
# Run the CLIP text encoder on the sample batch, then pass its pooled output
# through the text projection layer to obtain the text embeddings.
out = model.text_model(**inputs)
emb = model.text_projection(out.pooler_output)

In [14]:
# Inspect the embedding shape (the recorded output for this cell is [2, 512]).
emb.shape

torch.Size([2, 512])

In [None]:
class Dataset:
    """Placeholder for the dataset class used by this notebook.

    The original cell held only the header ``class Dataset()`` with no colon
    and no body, which is a SyntaxError. This stub keeps the name defined
    until the real implementation is written; it has no behavior yet.
    """

In [23]:
# Load the caption records from the MSCOCO Korean JSON file.
# The context manager closes the file handle as soon as parsing finishes
# (the bare open() call left it to the garbage collector), and the explicit
# UTF-8 encoding avoids depending on the platform default when decoding
# non-ASCII text.
with open('../data/MSCOCO_train_val_Korean.json', encoding='utf-8') as fp:
    trans = json.load(fp)

In [None]:
# Peek at the first loaded record to see which fields it carries.
trans[0]

In [None]:
# Partition the loaded records by split: a record whose file path contains
# 'train' goes to the training list, every other record to the validation list.
trans_train, trans_val = [], []

# NOTE: the loop variable `t` stays bound to the last record after the loop
# and is read again by a later cell, so it must keep this name.
for t in tqdm(trans):
    (trans_train if 'train' in t['file_path'] else trans_val).append(t)

In [None]:
# Show how many records landed in each split (train, val).
len(trans_train), len(trans_val)

## 3. Model

In [29]:
# Load the MSCOCO test CSV into a DataFrame.
d = pd.read_csv('../data/mscoco_test.csv')

In [30]:
# Draw 16 random row positions in [0, len(d)); each draw is independent, so
# the same position can appear more than once.
batch_idxs = np.random.randint(low=0, high=len(d), size=16)

In [32]:
# Select the sampled rows by position to form one batch.
batch = d.iloc[batch_idxs]

In [36]:
# Tokenize the batch's 'en' column as one padded batch of PyTorch tensors.
en_inputs= tokenizer(batch['en'].tolist(), return_tensors='pt', padding=True)

{'input_ids': tensor([[49406,   320,  4456,   593, 31124,   267,  2972,   267, 19667,   537,
           320,  3979,   556,  1047,   531,  2131,   269, 49407, 49407],
        [49406,   320,  3638,  1901,  2012,  2292,  4919,  1131,   531,   320,
          4163,  9973,  2292,   269, 49407, 49407, 49407, 49407, 49407],
        [49406,   320,  9629,   530,   518,  1570,  1265,  3209,   333,   638,
           320,   786,   269, 49407, 49407, 49407, 49407, 49407, 49407],
        [49406,  1237,  1449,  2368,  4919,   530,  2184,   539,   320,  4879,
          7619,   518, 12608,   269, 49407, 49407, 49407, 49407, 49407],
        [49406,   320,  3032,   530,  2349,  8687,  4919,   525,  1253,   539,
           320,  1579,  5135,   269, 49407, 49407, 49407, 49407, 49407],
        [49406,   320,  2913,  7286,   320, 36841, 16736,   620,   539,   911,
          4932,  1519, 35072,   911,  8225,   269, 49407, 49407, 49407],
        [49406,  1237,  2034,  1629,   320,  2054,  1455,  1063,  1952,   

In [None]:
# Reload CLIP, this time moving it to cfg.device, and load the processor
# from the same checkpoint.
model = CLIPModel.from_pretrained(cfg.clip_model).to(cfg.device)
processor = CLIPProcessor.from_pretrained(cfg.clip_model)

In [20]:
# Inspect the token-embedding weight matrix of the CLIP text encoder.
model.text_model.embeddings.token_embedding.weight

Parameter containing:
tensor([[-3.9053e-03, -6.3254e-03,  7.3507e-03,  ..., -1.0660e-02,
         -2.2764e-02, -1.0908e-02],
        [-2.6081e-02,  8.7953e-03, -1.1737e-02,  ..., -1.2019e-02,
         -2.4059e-02, -2.1929e-02],
        [-1.9648e-02, -6.6711e-03, -9.0593e-03,  ...,  4.5782e-03,
         -2.0692e-02, -8.7150e-03],
        ...,
        [ 8.5028e-03,  1.0219e-03,  2.0366e-02,  ...,  1.4868e-02,
          1.7627e-02, -1.4752e-03],
        [-1.6741e-03,  7.3048e-05, -4.1996e-03,  ..., -3.4096e-03,
         -3.9295e-03, -5.5289e-05],
        [-6.0260e-03,  2.0210e-03,  4.9674e-04,  ..., -3.3459e-03,
         -9.8587e-03, -2.3390e-04]], requires_grad=True)

In [None]:
# Load the Korean encoder checkpoint named in cfg.korean_model.
korean_model = AutoModel.from_pretrained(cfg.korean_model)

In [None]:
# Keep only the first 512 rows of the pooler's dense weight and the first 512
# entries of its bias, replacing the parameters' data in place.
# NOTE(review): presumably 512 is chosen to match the width of the CLIP text
# embeddings inspected earlier in this notebook — confirm. The layer's recorded
# out_features is likely left unchanged by swapping .data; verify if anything
# reads it.
korean_model.pooler.dense.weight.data = korean_model.pooler.dense.weight[:512].data
korean_model.pooler.dense.bias.data = korean_model.pooler.dense.bias[:512].data
# Move the model to the target device; binding to `_` keeps the cell from
# echoing the model.
_ = korean_model.to(cfg.device)

In [None]:
# Swap the Korean encoder into the CLIP model and the Korean tokenizer into
# the processor.
# NOTE(review): `korean_tokenizer` is first assigned in the following cell, so
# run top to bottom this line raises NameError — execute that cell first.
model.text_model = korean_model
processor.tokenizer = korean_tokenizer

In [None]:
# Bias-free linear map from the Korean encoder's hidden size to the CLIP
# model's projection dimension, placed on cfg.device.
# NOTE(review): in the visible notebook it is referenced only from a
# commented-out line.
kor_projection = nn.Linear(korean_model.config.hidden_size, model.projection_dim, bias=False).to(cfg.device)
# Tokenizer belonging to the Korean encoder checkpoint.
korean_tokenizer = AutoTokenizer.from_pretrained(cfg.korean_model)

In [None]:
# Pick caption text from record `t` (presumably the last record left bound by
# the train/val split loop — confirm the cell execution order).
# NOTE(review): the first assignment is overwritten on the next line, so only
# t['caption_ko'] is actually used — confirm whether the 'captions' line can go.
text = t['captions']
text = t['caption_ko']
# Tokenize the same text twice: once through the processor and once through
# the Korean tokenizer directly; both results are moved to cfg.device.
inputs = processor(text=text, return_tensors='pt', padding=True).to(cfg.device)
korean_inputs = korean_tokenizer(text, return_tensors='pt', padding=True).to(cfg.device)

In [None]:
# Encode the processor inputs with model.text_model and project the pooled output.
out = model.text_model(**inputs)
embeds = model.text_projection(out.pooler_output)

In [None]:
# Inspect the shape of the projected embeddings.
embeds.shape

In [None]:
# Inspect the shape of the first element of the text model's output
# (presumably the per-token hidden states — confirm).
model.text_model(**inputs)[0].shape

In [None]:
# Reference embeddings: text model plus projection on the processor inputs.
out = model.text_model(**inputs)
embeds = model.text_projection(out.pooler_output)

# Embeddings from `kor_model`, read from its `.logits`.
# NOTE(review): `kor_model` is not defined anywhere in the visible notebook —
# presumably a sequence-classification model like the one loaded in a later
# cell; confirm before running.
kor_out = kor_model(**korean_inputs)
kor_embeds = kor_out.logits
# Alternative kept for reference: project the pooled output instead.
# kor_embeds = kor_projection(kor_out.pooler_output)

In [None]:
# Mean-squared error between the Korean-side embeddings and the reference embeddings.
loss = F.mse_loss(kor_embeds, embeds)

In [None]:
# Display the loss value.
loss

In [None]:
# Load the Korean checkpoint with a sequence-classification head configured
# for 512 labels, and move it to cfg.device.
a = AutoModelForSequenceClassification.from_pretrained(cfg.korean_model, num_labels=512).to(cfg.device)

In [None]:
# Forward the processor-tokenized inputs through the classification model.
out = a(**inputs)

In [None]:
# Inspect the second element of the model output.
out[1]

In [None]:
# Save the CLIP model (with whatever modifications were applied above) under 'clip'.
model.save_pretrained('clip')

In [None]:
# Save the processor to the same 'clip' location.
processor.save_pretrained('clip')

In [None]:
# Inspect the Korean encoder's configuration.
korean_model.config

In [None]:
# Replace the text sub-config of the CLIP config with the Korean encoder's config.
model.config.text_config = korean_model.config

In [None]:
# Write the updated config to 'clip'.
model.config.save_pretrained('clip')

In [None]:
# Reload a CLIP model from the saved 'clip' location (this rebinds `a`).
a = CLIPModel.from_pretrained('clip')

In [None]:
# Check which text config the reloaded model carries.
a.config.text_config

In [None]:
# Check which text encoder module the reloaded model contains.
a.text_model