In [17]:
import os
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import japanese_clip as ja_clip
from PIL import Image
from tqdm import tqdm
from transformers import MLukeTokenizer, LukeModel

In [5]:
!wget https://www.shonan-it.ac.jp/media/20241126_g03.jpg -O test_image.jpg
test_image_path = "test_image.jpg"
test_sentence = "2024年11月14日、「湘南工科大学 産学交流フォーラム2024」が開催され、大学院生による研究発表に対する「優秀プレゼンテーション賞」の表彰が初めて実施されました。"

--2024-12-05 12:44:08--  https://www.shonan-it.ac.jp/media/20241126_g03.jpg
Resolving www.shonan-it.ac.jp (www.shonan-it.ac.jp)... 150.60.144.101, 10.2.10.10, 10.2.10.20
Connecting to www.shonan-it.ac.jp (www.shonan-it.ac.jp)|150.60.144.101|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 56759 (55K) [image/jpeg]
Saving to: ‘test_image.jpg’


2024-12-05 12:44:08 (1.49 MB/s) - ‘test_image.jpg’ saved [56759/56759]



In [29]:
# Load the rinna Japanese CLIP model together with its image preprocessor and
# tokenizer. The names `clip_model`, `clip_preprocesser` and `clip_tokenizer`
# are reused by later cells, so they are kept as-is.
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, clip_preprocesser = ja_clip.load("rinna/japanese-clip-vit-b-16",
                                             cache_dir="/tmp/japanese_clip",
                                             torch_dtype = torch.float16,
                                             device = device)
clip_tokenizer = ja_clip.load_tokenizer()

# Tokenize the single test sentence as a batch of one.
encoded_sentences = ja_clip.tokenize(
        texts = [test_sentence],
        max_seq_len = 77,
        device = device,
        tokenizer = clip_tokenizer,
    )

# PIL opens image files lazily and keeps the file handle alive; the context
# manager closes it as soon as the preprocessor has produced the tensor.
with Image.open(test_image_path) as image:
    preprcessed_image = clip_preprocesser(image).unsqueeze(0).to(device)

# Inference only: no gradients are needed for feature extraction.
with torch.no_grad():
    clip_test_image_features = clip_model.get_image_features(preprcessed_image)
    clip_test_sentence_features = clip_model.get_text_features(**encoded_sentences)

# Keep the single row of each batch as a NumPy vector.
clip_test_image_feature = clip_test_image_features.cpu().numpy()[0]
clip_test_sentence_feature = clip_test_sentence_features.cpu().numpy()[0]

# Last expression of the cell: display both feature shapes.
clip_test_image_feature.shape, clip_test_sentence_feature.shape

((512,), (512,))

In [30]:
class SentenceLukeJapanese:
    """Sentence encoder built on ``sonoisa/sentence-luke-japanese-base-lite``.

    Each sentence is tokenized, passed through the LUKE model and reduced to a
    single vector by averaging the token embeddings under the attention mask.
    """

    def __init__(self, device = None):
        """Load the tokenizer and model and move the model to ``device``.

        Args:
            device: Anything accepted by ``torch.device``. When ``None``,
                CUDA is used if available, otherwise the CPU.
        """
        self.tokenizer = MLukeTokenizer.from_pretrained("sonoisa/sentence-luke-japanese-base-lite")
        # NOTE(review): weights are loaded as float16 regardless of the target
        # device; confirm half precision is acceptable when running on CPU.
        self.model = LukeModel.from_pretrained("sonoisa/sentence-luke-japanese-base-lite",
                                               torch_dtype = torch.float16)
        self.model.eval()

        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)
        self.model.to(self.device)

    def _mean_pooling(self, model_output, attention_mask):
        """Average token embeddings over the positions where the mask is 1.

        Args:
            model_output: Model output whose first element holds the
                per-token embeddings.
            attention_mask: Mask with one entry per token position.

        Returns:
            One pooled vector per sequence. The mask is cast to float before
            the multiplication, and the denominator is clamped to avoid a
            division by zero for an all-zero mask.
        """
        token_embeddings = model_output[0]  # first element contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    @torch.no_grad()
    def encode(self, sentences, batch_size = 256):
        """Embed ``sentences`` in mini-batches.

        Args:
            sentences: Sequence of strings to embed.
            batch_size: Number of sentences tokenized and encoded per step.

        Returns:
            A CPU tensor with one row per sentence, in input order. An empty
            input yields an empty ``(0, hidden_size)`` float32 tensor instead
            of raising.
        """
        if len(sentences) == 0:
            # torch.stack / torch.cat reject an empty list, so answer directly.
            return torch.empty((0, self.model.config.hidden_size), dtype=torch.float32)

        batch_embeddings = []
        for batch_start in range(0, len(sentences), batch_size):
            batch = sentences[batch_start:batch_start + batch_size]

            encoded_input = self.tokenizer.batch_encode_plus(batch, padding="longest",
                                           truncation=True, return_tensors="pt").to(self.device)
            model_output = self.model(**encoded_input)
            pooled = self._mean_pooling(model_output, encoded_input["attention_mask"])

            # Move each batch off the device right away so device memory stays flat.
            batch_embeddings.append(pooled.to('cpu'))

        # Concatenating whole batches gives the same (N, dim) result as
        # stacking individual rows, without unpacking every row first.
        return torch.cat(batch_embeddings, dim=0)

# Build the sentence encoder once; later cells reuse `luke_model`.
luke_model = SentenceLukeJapanese()
# Embed the current test sentence as a batch of one and keep its single row.
luke_test_sentence_feature = (
    luke_model
    .encode(sentences=[test_sentence])
    .cpu()
    .numpy()[0]
)

In [18]:
class CustomModel(nn.Module):
    """MLP that maps three concatenated feature vectors to a score in (0, 1).

    The three inputs are joined along the last dimension, passed through three
    1024-unit fully connected layers with LeakyReLU, and squashed by a sigmoid.
    Layer attribute names are part of the saved state-dict format and must not
    be renamed.
    """

    def __init__(self, cif_dim, csf_dim, lsf_dim):
        """
        Args:
            cif_dim: Size of the first feature vector (``cif``).
            csf_dim: Size of the second feature vector (``csf``).
            lsf_dim: Size of the third feature vector (``lsf``).
        """
        super().__init__()
        # Remember the input dimensions.
        self.cif_dim = cif_dim
        self.csf_dim = csf_dim
        self.lsf_dim = lsf_dim

        # Fully connected layers.
        self.fc1 = nn.Linear(cif_dim + csf_dim + lsf_dim, 1024)
        self.fc2 = nn.Linear(1024, 1024)
        self.fc3 = nn.Linear(1024, 1024)
        self.output_layer = nn.Linear(1024, 1)

    def forward(self, cif, csf, lsf):
        """Score one or more (cif, csf, lsf) feature triples.

        Args:
            cif, csf, lsf: Tensors whose last dimensions are ``cif_dim``,
                ``csf_dim`` and ``lsf_dim``; leading dimensions must match.

        Returns:
            Tensor with the same leading dimensions and a last dimension of 1,
            with values in (0, 1).
        """
        # Join the inputs along the feature axis. Using the last dimension is
        # identical to dim=1 for (batch, features) inputs and additionally
        # accepts unbatched 1-D feature vectors.
        x = torch.cat([cif, csf, lsf], dim=-1)
        # Fully connected layers + LeakyReLU.
        x = F.leaky_relu(self.fc1(x))
        x = F.leaky_relu(self.fc2(x))
        x = F.leaky_relu(self.fc3(x))
        # Output layer + sigmoid.
        output = torch.sigmoid(self.output_layer(x))
        return output

In [45]:
!wget https://d2dcan0armyq93.cloudfront.net/photo/odai/600/04950fa024255b5c910abbc35650f2d3_600.jpg -O test_image.jpg
test_image_path = "test_image.jpg"
test_sentence = "今日もいい天気" # "藤井聡太普段"

--2024-12-05 13:05:51--  https://d2dcan0armyq93.cloudfront.net/photo/odai/600/04950fa024255b5c910abbc35650f2d3_600.jpg
Resolving d2dcan0armyq93.cloudfront.net (d2dcan0armyq93.cloudfront.net)... 13.249.166.90, 13.249.166.231, 13.249.166.109, ...
Connecting to d2dcan0armyq93.cloudfront.net (d2dcan0armyq93.cloudfront.net)|13.249.166.90|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 41849 (41K) [image/jpeg]
Saving to: ‘test_image.jpg’


2024-12-05 13:05:54 (10.7 MB/s) - ‘test_image.jpg’ saved [41849/41849]



In [63]:
!wget https://d2dcan0armyq93.cloudfront.net/photo/odai/600/a7d1e26dc93ee0a4d3b16f80dd0954ea_600.jpg -O test_image.jpg
test_image_path = "test_image.jpg"
test_sentence = "こいつ誰だ"

--2024-12-05 15:49:52--  https://d2dcan0armyq93.cloudfront.net/photo/odai/600/a7d1e26dc93ee0a4d3b16f80dd0954ea_600.jpg
Resolving d2dcan0armyq93.cloudfront.net (d2dcan0armyq93.cloudfront.net)... 13.249.166.109, 13.249.166.197, 13.249.166.231, ...
Connecting to d2dcan0armyq93.cloudfront.net (d2dcan0armyq93.cloudfront.net)|13.249.166.109|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 34341 (34K) [image/jpeg]
Saving to: ‘test_image.jpg’


2024-12-05 15:49:52 (50.9 MB/s) - ‘test_image.jpg’ saved [34341/34341]



In [60]:
!wget https://d2dcan0armyq93.cloudfront.net/photo/odai/600/fd8eb8d243d6fe080067d9483ede42fe_600.jpg -O test_image.jpg
test_image_path = "test_image.jpg"
test_sentence = "はい、アーンして"

--2024-12-05 15:49:22--  https://d2dcan0armyq93.cloudfront.net/photo/odai/600/fd8eb8d243d6fe080067d9483ede42fe_600.jpg
Resolving d2dcan0armyq93.cloudfront.net (d2dcan0armyq93.cloudfront.net)... 13.249.166.109, 13.249.166.197, 13.249.166.231, ...
Connecting to d2dcan0armyq93.cloudfront.net (d2dcan0armyq93.cloudfront.net)|13.249.166.109|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42570 (42K) [image/jpeg]
Saving to: ‘test_image.jpg’


2024-12-05 15:49:22 (10.8 MB/s) - ‘test_image.jpg’ saved [42570/42570]



In [51]:
!wget https://d2dcan0armyq93.cloudfront.net/photo/odai/600/89ed8f0f21e303e5c76e77ecf81c88e9_600.jpg -O test_image.jpg
test_image_path = "test_image.jpg"
test_sentence = "ゴジラになれますように"

--2024-12-05 15:04:58--  https://d2dcan0armyq93.cloudfront.net/photo/odai/600/89ed8f0f21e303e5c76e77ecf81c88e9_600.jpg
Resolving d2dcan0armyq93.cloudfront.net (d2dcan0armyq93.cloudfront.net)... 13.249.166.197, 13.249.166.90, 13.249.166.231, ...
Connecting to d2dcan0armyq93.cloudfront.net (d2dcan0armyq93.cloudfront.net)|13.249.166.197|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 35766 (35K) [image/jpeg]
Saving to: ‘test_image.jpg’


2024-12-05 15:04:58 (287 MB/s) - ‘test_image.jpg’ saved [35766/35766]



In [53]:
!wget https://www.shonan-it.ac.jp/media/20241126_g03.jpg -O test_image.jpg
test_image_path = "test_image.jpg"
test_sentence = "AIはボケられるのか⁉ ～AIを用いた画像に対する大喜利生成～"

--2024-12-05 15:32:01--  https://www.shonan-it.ac.jp/media/20241126_g03.jpg
Resolving www.shonan-it.ac.jp (www.shonan-it.ac.jp)... 150.60.144.101, 10.2.10.10, 10.2.10.20
Connecting to www.shonan-it.ac.jp (www.shonan-it.ac.jp)|150.60.144.101|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 56759 (55K) [image/jpeg]
Saving to: ‘test_image.jpg’


2024-12-05 15:32:01 (1.48 MB/s) - ‘test_image.jpg’ saved [56759/56759]



In [55]:
!wget https://www.shonan-it.ac.jp/media/20241128_g01.jpg -O test_image.jpg
test_image_path = "test_image.jpg"
test_sentence = "AIはボケられるのか⁉ ～AIを用いた画像に対する大喜利生成～"

--2024-12-05 15:33:59--  https://www.shonan-it.ac.jp/media/20241128_g01.jpg
Resolving www.shonan-it.ac.jp (www.shonan-it.ac.jp)... 150.60.144.101, 10.2.10.10, 10.2.10.20
Connecting to www.shonan-it.ac.jp (www.shonan-it.ac.jp)|150.60.144.101|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 46813 (46K) [image/jpeg]
Saving to: ‘test_image.jpg’


2024-12-05 15:33:59 (1.25 MB/s) - ‘test_image.jpg’ saved [46813/46813]



In [88]:
!wget https://www.shonan-it.ac.jp/media/20241204_w01.jpg -O test_image.jpg
test_image_path = "test_image.jpg"
test_sentence = "自宅の冷蔵庫を開けると、そこにはりんごとみかんとぶどうがありました"

--2024-12-05 15:56:14--  https://www.shonan-it.ac.jp/media/20241204_w01.jpg
Resolving www.shonan-it.ac.jp (www.shonan-it.ac.jp)... 150.60.144.101, 10.2.10.10, 10.2.10.20
Connecting to www.shonan-it.ac.jp (www.shonan-it.ac.jp)|150.60.144.101|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 77508 (76K) [image/jpeg]
Saving to: ‘test_image.jpg’


2024-12-05 15:56:14 (1.37 MB/s) - ‘test_image.jpg’ saved [77508/77508]



In [None]:
!wget https://d2dcan0armyq93.cloudfront.net/photo/odai/600/c42184648bce2e291c7ccc36495b34e7_600.jpg -O test_image.jpg
test_image_path = "test_image.jpg"
test_sentence = ""

--2024-12-05 15:54:45--  https://d2dcan0armyq93.cloudfront.net/photo/odai/600/c42184648bce2e291c7ccc36495b34e7_600.jpg
Resolving d2dcan0armyq93.cloudfront.net (d2dcan0armyq93.cloudfront.net)... 13.249.166.109, 13.249.166.231, 13.249.166.90, ...
Connecting to d2dcan0armyq93.cloudfront.net (d2dcan0armyq93.cloudfront.net)|13.249.166.109|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 29743 (29K) [image/jpeg]
Saving to: ‘test_image.jpg’


2024-12-05 15:54:45 (39.0 MB/s) - ‘test_image.jpg’ saved [29743/29743]



In [89]:
# --- Configuration -----------------------------------------------------------
EXPERIENCE_NUMBER = "003"
RESULT_DIR = f"../../results/Boke_Judge/{EXPERIENCE_NUMBER}/"
# Single source of truth for the checkpoint path. It previously named
# "model.weights.pth" and was never used, while the load call read
# "model_weights.pth".
model_path = f"{RESULT_DIR}model_weights.pth"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Scoring model -------------------------------------------------------------
# Input sizes: two 512-d CLIP features and one 768-d LUKE sentence feature.
model = CustomModel(512, 512, 768)

# map_location lets a checkpoint saved from a GPU load on a CPU-only host.
# weights_only=True restricts unpickling to tensors and plain containers.
# NOTE(review): assumes the file holds a plain state_dict — confirm.
model.load_state_dict(torch.load(model_path, map_location=device, weights_only=True))
model.eval()
model.to(device)

# --- Feature extraction for the current test pair ----------------------------
encoded_sentences = ja_clip.tokenize(
        texts = [test_sentence],
        max_seq_len = 77,
        device = device,
        tokenizer = clip_tokenizer,
    )

# PIL opens image files lazily and keeps the file handle alive; the context
# manager closes it as soon as the preprocessor has produced the tensor.
with Image.open(test_image_path) as image:
    preprcessed_image = clip_preprocesser(image).unsqueeze(0).to(device)

with torch.no_grad():
    clip_test_image_features = clip_model.get_image_features(preprcessed_image)
    clip_test_image_feature = clip_test_image_features.cpu().numpy()[0]
    clip_test_sentence_features = clip_model.get_text_features(**encoded_sentences)
    clip_test_sentence_feature = clip_test_sentence_features.cpu().numpy()[0]

luke_test_sentence_feature = luke_model.encode([test_sentence]).cpu().numpy()[0]

# --- Scoring -------------------------------------------------------------------
# Re-wrap each feature vector as a batch of one on the model's device.
clip_test_image_feature = torch.tensor(clip_test_image_feature[np.newaxis]).to(device)
clip_test_sentence_feature = torch.tensor(clip_test_sentence_feature[np.newaxis]).to(device)
luke_test_sentence_feature = torch.tensor(luke_test_sentence_feature[np.newaxis]).to(device)

with torch.no_grad():
    outputs = model(clip_test_image_feature,
                    clip_test_sentence_feature,
                    luke_test_sentence_feature)

# Last expression of the cell: display the sigmoid score.
outputs

  model.load_state_dict(torch.load(f"{RESULT_DIR}model_weights.pth"))


tensor([[0.4938]], device='cuda:0')