In [17]:
import os
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import japanese_clip as ja_clip
from PIL import Image
from tqdm import tqdm
from transformers import MLukeTokenizer, LukeModel

In [5]:
# Fetch the sample image with the shell `wget` command and save it as test_image.jpg
# in the current working directory (requires network access and `wget` on PATH).
!wget https://www.shonan-it.ac.jp/media/20241126_g03.jpg -O test_image.jpg
# Path of the downloaded image; opened later with PIL.
test_image_path = "test_image.jpg"
# Japanese sentence that is encoded by the text encoders further down.
test_sentence = "2024年11月14日、「湘南工科大学 産学交流フォーラム2024」が開催され、大学院生による研究発表に対する「優秀プレゼンテーション賞」の表彰が初めて実施されました。"

--2024-12-05 12:44:08--  https://www.shonan-it.ac.jp/media/20241126_g03.jpg
Resolving www.shonan-it.ac.jp (www.shonan-it.ac.jp)... 150.60.144.101, 10.2.10.10, 10.2.10.20
Connecting to www.shonan-it.ac.jp (www.shonan-it.ac.jp)|150.60.144.101|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 56759 (55K) [image/jpeg]
Saving to: ‘test_image.jpg’


2024-12-05 12:44:08 (1.49 MB/s) - ‘test_image.jpg’ saved [56759/56759]



In [29]:
# Use the GPU when one is available; the CLIP model and its inputs are placed on this device.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the "rinna/japanese-clip-vit-b-16" model. The second return value is the
# callable applied to the PIL image below to produce the model's image input.
# NOTE(review): float16 is requested even when `device` is "cpu"; half precision
# on CPU may be slow or unsupported depending on the PyTorch build — confirm.
clip_model, clip_preprocesser = ja_clip.load("rinna/japanese-clip-vit-b-16",
                                             cache_dir="/tmp/japanese_clip",
                                             torch_dtype = torch.float16,
                                             device = device)
clip_tokenizer = ja_clip.load_tokenizer()

# Tokenize the test sentence (sequence length capped at 77 tokens).
encoded_sentences = ja_clip.tokenize(
        texts = [test_sentence],
        max_seq_len = 77,
        device = device,
        tokenizer = clip_tokenizer,
    )

# Open the image inside a context manager so the underlying file handle is
# released as soon as preprocessing has produced the tensor.
with Image.open(test_image_path) as image:
    # unsqueeze(0) adds the leading batch dimension before moving to the device.
    preprcessed_image = clip_preprocesser(image).unsqueeze(0).to(device)

# Inference only: no gradients are needed.
with torch.no_grad():
    clip_test_image_features = clip_model.get_image_features(preprcessed_image)
    # Take the first (only) row of the batch as a NumPy vector.
    clip_test_image_feature = clip_test_image_features.cpu().numpy()[0]
    clip_test_sentence_features = clip_model.get_text_features(**encoded_sentences)
    clip_test_sentence_feature = clip_test_sentence_features.cpu().numpy()[0]


# Display both feature shapes as the cell output.
clip_test_image_feature.shape, clip_test_sentence_feature.shape

((512,), (512,))

In [30]:
class SentenceLukeJapanese:
    """Sentence encoder backed by ``sonoisa/sentence-luke-japanese-base-lite``.

    The tokenizer and model are loaded once in ``__init__``; :meth:`encode`
    turns a list of sentences into one mean-pooled embedding per sentence.
    """

    def __init__(self, device = None):
        """Load the tokenizer and model and move the model to ``device``.

        Args:
            device: Anything accepted by ``torch.device``. When ``None``, CUDA
                is used if available, otherwise the CPU.
        """
        self.tokenizer = MLukeTokenizer.from_pretrained("sonoisa/sentence-luke-japanese-base-lite")
        # NOTE(review): float16 weights are loaded regardless of the target
        # device; half precision on CPU may be slow or unsupported on some
        # PyTorch builds — confirm.
        self.model = LukeModel.from_pretrained("sonoisa/sentence-luke-japanese-base-lite",
                                               torch_dtype = torch.float16)
        self.model.eval()

        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)
        self.model.to(self.device)

    def _mean_pooling(self, model_output, attention_mask):
        """Average token embeddings over dimension 1, ignoring masked positions.

        Args:
            model_output: Model output whose first element holds the token
                embeddings (assumed (batch, seq, hidden) — TODO confirm).
            attention_mask: Mask with one entry per token; positions with 0
                contribute nothing to the average.

        Returns:
            One pooled vector per row of the batch.
        """
        token_embeddings = model_output[0]
        # Broadcast the mask over the embedding dimension; .float() also
        # promotes the product below to float32.
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        summed = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp the token count so an all-zero mask cannot divide by zero.
        token_counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return summed / token_counts

    @torch.no_grad()
    def encode(self, sentences, batch_size = 256):
        """Encode ``sentences`` into a CPU tensor with one row per sentence.

        Args:
            sentences: Sequence of strings to embed.
            batch_size: Number of sentences tokenized and run through the
                model at a time.

        Returns:
            Tensor on the CPU with ``len(sentences)`` rows. For an empty input
            a ``(0, hidden_size)`` tensor is returned instead of raising.
        """
        # torch.stack / torch.cat reject an empty list, so answer the
        # empty-input case explicitly.
        if len(sentences) == 0:
            return torch.empty((0, self.model.config.hidden_size))

        batch_embeddings = []
        for start in range(0, len(sentences), batch_size):
            batch = sentences[start:start + batch_size]

            # Pad to the longest sentence in this batch only.
            encoded_input = self.tokenizer.batch_encode_plus(batch, padding="longest",
                                           truncation=True, return_tensors="pt").to(self.device)
            model_output = self.model(**encoded_input)
            pooled = self._mean_pooling(model_output, encoded_input["attention_mask"])

            # Move each batch to the CPU right away to keep device memory flat.
            batch_embeddings.append(pooled.to('cpu'))

        # Concatenating (batch, hidden) chunks equals stacking individual rows.
        return torch.cat(batch_embeddings, dim=0)

# Build the sentence encoder once (device is auto-selected), then embed the
# single test sentence and keep its vector as a NumPy array.
luke_model = SentenceLukeJapanese()
luke_test_sentence_feature = luke_model.encode(
    [test_sentence],
).cpu().numpy()[0]

In [18]:
class CustomModel(nn.Module):
    """Fully connected scorer over three concatenated feature vectors.

    The three inputs are concatenated along dimension 1, passed through three
    1024-unit linear layers with LeakyReLU activations, and reduced to a single
    sigmoid output per row.
    """

    def __init__(self, cif_dim, csf_dim, lsf_dim):
        """Create the layers.

        Args:
            cif_dim: Width of the first input (``cif``).
            csf_dim: Width of the second input (``csf``).
            lsf_dim: Width of the third input (``lsf``).
        """
        super().__init__()

        # Remember the input widths.
        self.cif_dim = cif_dim
        self.csf_dim = csf_dim
        self.lsf_dim = lsf_dim

        # Fully connected stack; the first layer consumes the concatenated input.
        hidden_width = 1024
        self.fc1 = nn.Linear(cif_dim + csf_dim + lsf_dim, hidden_width)
        self.fc2 = nn.Linear(hidden_width, hidden_width)
        self.fc3 = nn.Linear(hidden_width, hidden_width)
        self.output_layer = nn.Linear(hidden_width, 1)

    def forward(self, cif, csf, lsf):
        """Return the sigmoid score for each row of the concatenated inputs."""
        # Join the three feature blocks side by side.
        hidden = torch.cat((cif, csf, lsf), dim=1)

        # Each hidden layer is followed by a LeakyReLU.
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = F.leaky_relu(layer(hidden))

        # Output layer followed by a sigmoid.
        return torch.sigmoid(self.output_layer(hidden))

In [28]:
# Display the shapes of the three feature objects (CLIP image, CLIP text, LUKE text).
# NOTE(review): the saved output below shows 4-D torch.Size values rather than
# NumPy shape tuples, which suggests these names had been overwritten by another
# cell when this ran — re-run from a fresh kernel to confirm the real shapes.
clip_test_image_feature.shape, clip_test_sentence_feature.shape, luke_test_sentence_feature.shape

(torch.Size([1, 1, 1, 512]),
 torch.Size([1, 1, 1, 512]),
 torch.Size([1, 1, 1, 768]))

In [None]:
# --- Configuration ------------------------------------------------------------
EXPERIENCE_NUMBER = "002"
RESULT_DIR = f"../../results/Boke_Judge/{EXPERIENCE_NUMBER}/"
# Single source of truth for the checkpoint file; this is the file name that is
# actually loaded below (previously this variable named a different file and
# was never used).
model_path = f"{RESULT_DIR}model_weights.pth"

# --- Feature extraction -------------------------------------------------------
# Features are computed before the classifier is built, and are kept as plain
# (dim,) NumPy vectors, so that re-running this cell always starts from the
# same shapes.
encoded_sentences = ja_clip.tokenize(
        texts = [test_sentence],
        max_seq_len = 77,
        device = device,
        tokenizer = clip_tokenizer,
    )
# Context manager releases the image file handle once preprocessing is done.
with Image.open(test_image_path) as image:
    preprcessed_image = clip_preprocesser(image).unsqueeze(0).to(device)
with torch.no_grad():
    clip_test_image_features = clip_model.get_image_features(preprcessed_image)
    clip_test_image_feature = clip_test_image_features.cpu().numpy()[0]
    clip_test_sentence_features = clip_model.get_text_features(**encoded_sentences)
    clip_test_sentence_feature = clip_test_sentence_features.cpu().numpy()[0]

luke_test_sentence_feature = luke_model.encode([test_sentence]).cpu().numpy()[0]

# --- Classifier ---------------------------------------------------------------
# Input widths are taken from the last axis of each feature vector.
model = CustomModel(clip_test_image_feature.shape[-1],
                    clip_test_sentence_feature.shape[-1],
                    luke_test_sentence_feature.shape[-1])

# torch.load unpickles the file: only load checkpoints from a trusted source.
model.load_state_dict(torch.load(model_path,
                                 map_location = torch.device('cpu')))
# The weights are loaded on the CPU; move the model to the same device as the
# inputs, otherwise the forward pass fails with a device mismatch on CUDA.
model.to(device)
model.eval()

# --- Inference ----------------------------------------------------------------
with torch.no_grad():
    # Add a batch axis and align dtype/device with the float32 classifier.
    # Separate names are used so the (dim,) feature vectors are not overwritten.
    clip_image_batch = torch.tensor(clip_test_image_feature[np.newaxis],
                                    dtype = torch.float32, device = device)
    clip_sentence_batch = torch.tensor(clip_test_sentence_feature[np.newaxis],
                                       dtype = torch.float32, device = device)
    luke_sentence_batch = torch.tensor(luke_test_sentence_feature[np.newaxis],
                                       dtype = torch.float32, device = device)

    outputs = model(clip_image_batch,
                    clip_sentence_batch,
                    luke_sentence_batch)

# Sigmoid score of shape (1, 1) for the image/sentence pair.
outputs

tensor([[0.9964]], device='cuda:0')