# 現実写真のみ、文字なし、固有名詞全部なし、キャプション、ほかの画像の大喜利

In [1]:
import os
import json
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [2]:
EXPERIENCE_NUMBER = "001"

USE_UNREAL_IMAGE = False
USE_WORD_IMAGE = False
USE_UNIQUE_NOUN_BOKE = False
USE_CAPTION = True
USE_MISS_BOKE = True

EPOCH = 25
BATCH_SIZE = 256

RESULT_DIR = f"../../results/Boke_Judge/{EXPERIENCE_NUMBER}/"
if not os.path.exists("../../results/Boke_Judge/"):
    os.mkdir("../../results/Boke_Judge/")
if not os.path.exists(RESULT_DIR):
    os.mkdir(RESULT_DIR)

DATA_DIR = "../../datas/boke_data_assemble/"
CLIP_IMAGE_FEATURE_DIR = "../../datas/encoded/clip_image_feature/"
CLIP_SENTENCE_FEATURE_DIR = "../../datas/encoded/clip_sentence_feature/"
LUKE_SENTENCE_FEATURE_DIR = "../../datas/encoded/luke_sentence_feature/"

# データセットの作成(初回のみ実行)

In [3]:
boke_datas = list()
caption_datas = list()

max_num_boke = 0
for JP in tqdm(os.listdir(DATA_DIR)):
    N = int(JP.split(".")[0])

    with open(f"{DATA_DIR}{JP}", "r") as f:
        a = json.load(f)

    image_information = a["image_infomation"]
    is_photographic_probability = image_information["is_photographic_probability"]
    ja_caption = image_information["ja_caption"]
    ocr = image_information["ocr"]

    # 現実写真以外を除去
    if not USE_UNREAL_IMAGE:
        if is_photographic_probability < 0.8: continue
        
    # 文字のある画像を除去
    if not USE_WORD_IMAGE:
        if len(ocr) != 0: continue

    bokes = a["bokes"]

    max_num_boke = max(max_num_boke, len(a["bokes"]))
    for i, B in enumerate(bokes):

        # 固有名詞を含む大喜利を除去
        if not USE_UNIQUE_NOUN_BOKE:
            if len(B["unique_nouns"]) != 0: continue

        boke_datas.append({
            "boke_number": int(f"1{N:07}{i:05}"),
            "image_number": int(f"3{N:07}")
        })

    caption_datas.append({
        "caption_number": int(f"2{N:07}"),
        "image_number": int(f"3{N:07}")
    })

len(boke_datas), len(caption_datas)

100%|██████████| 602566/602566 [14:05<00:00, 712.66it/s] 


(2183708, 230990)

In [4]:
train_boke_datas, test_boke_datas = train_test_split(boke_datas, test_size = 0.01)
train_caption_datas, test_caption_datas = train_test_split(caption_datas, test_size = 0.01)

with open(f"{RESULT_DIR}train_boke_datas.json", "w") as f:
    json.dump(train_boke_datas, f)
with open(f"{RESULT_DIR}train_caption_datas.json", "w") as f:
    json.dump(train_caption_datas, f)

with open(f"{RESULT_DIR}test_boke_datas.json", "w") as f:
    json.dump(test_boke_datas, f)
with open(f"{RESULT_DIR}test_caption_datas.json", "w") as f:
    json.dump(test_caption_datas, f)

# モデルの学習

In [3]:
with open(f"{RESULT_DIR}train_boke_datas.json", "r") as f:
    train_boke_datas = json.load(f)
with open(f"{RESULT_DIR}train_caption_datas.json", "r") as f:
    train_caption_datas = json.load(f)

with open(f"{RESULT_DIR}test_boke_datas.json", "r") as f:
    test_boke_datas = json.load(f)
with open(f"{RESULT_DIR}test_caption_datas.json", "r") as f:
    test_caption_datas = json.load(f)

len(train_boke_datas), len(train_caption_datas), len(test_boke_datas), len(test_caption_datas)

(2161870, 228680, 21838, 2310)

In [None]:
# def make_dataset(boke_datas, caption_datas):

#     def make_dataset_with_directory(file_numbers, directory):
#         def make_feature_dataset(file_numbers):
#             dataset = tf.data.Dataset.from_tensor_slices((file_numbers))
#             dataset = dataset.map(load_data_wrapper, 
#                                 num_parallel_calls = tf.data.experimental.AUTOTUNE)
#             dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
#             return dataset

#         def load_data_wrapper(file_number):
#             return tf.py_function(func = load_data, inp = [file_number], 
#                                   Tout = tf.float32)

#         def load_data(file_number):
#             # file_number = file_number.numpy().decode("utf-8")
#             feature_path = f"{directory}{file_number}.npy"
#             return np.load(feature_path)
        
#         return make_feature_dataset(file_numbers)
    
#     image_file_numbers = list()
#     sentence_file_numbers = list()
#     teacher_signals = list()

#     for D in boke_datas:
#         image_file_numbers.append(D["image_number"])
#         sentence_file_numbers.append(D["boke_number"])
#         teacher_signals.append(1)
    
#     if USE_CAPTION:
#         for D in caption_datas:
#             image_file_numbers.append(D["image_number"])
#             sentence_file_numbers.append(D["caption_number"])
#             teacher_signals.append(0)
    
#     if USE_MISS_BOKE:
#         miss_boke_datas = list()
#         tmp_idx = np.random.randint(0, len(boke_datas), size = (len(boke_datas), ))
#         for i, idx in tqdm(enumerate(tmp_idx)):
#             tmp_boke_number = boke_datas[idx]["boke_number"]
#             while str(tmp_boke_number)[:7] == str(boke_datas[i]["boke_number"])[:7]:
#                 idx = np.random.randint(0, len(boke_datas))
#                 tmp_boke_number = boke_datas[idx]["boke_number"]

#             miss_boke_datas.append({
#                 "boke_number": tmp_boke_number,
#                 "image_number": boke_datas[i]["image_number"]
#             })
        
#         for D in miss_boke_datas:
#             image_file_numbers.append(D["image_number"])
#             sentence_file_numbers.append(D["boke_number"])
#             teacher_signals.append(0)
    
#     print(f"num data: {len(teacher_signals)}")
    
#     tmp = list(zip(image_file_numbers, sentence_file_numbers, teacher_signals))
#     np.random.shuffle(tmp)
#     image_file_numbers, sentence_file_numbers, teacher_signals = zip(*tmp)
#     image_file_numbers = tf.constant(image_file_numbers, dtype = tf.int64)
#     sentence_file_numbers = tf.constant(sentence_file_numbers, dtype = tf.int64)
#     teacher_signals = tf.constant(teacher_signals, dtype = tf.int32)

#     clip_image_feature_dataset = make_dataset_with_directory(image_file_numbers, CLIP_IMAGE_FEATURE_DIR)
#     clip_sentence_feature_dataset = make_dataset_with_directory(sentence_file_numbers, CLIP_SENTENCE_FEATURE_DIR)
#     luke_sentence_feature_dataset = make_dataset_with_directory(sentence_file_numbers, LUKE_SENTENCE_FEATURE_DIR)
#     teacher_signal_dataset = tf.data.Dataset.from_tensor_slices(teacher_signals)

#     dataset = tf.data.Dataset.zip( ((clip_image_feature_dataset, clip_sentence_feature_dataset, luke_sentence_feature_dataset), teacher_signal_dataset) )
#     return dataset

# train_dataset = make_dataset(train_boke_datas, train_caption_datas)
# test_dataset = make_dataset(test_boke_datas, test_caption_datas)

# (CIF, CSF, LSF), TS = next(iter(train_dataset.batch(32)))
# CIF.shape, CSF.shape, LSF.shape, TS.shape

2161870it [00:01, 1096490.76it/s]


num data: 4552420


21838it [00:00, 1642138.67it/s]


num data: 45986


(TensorShape([32, 512]),
 TensorShape([32, 512]),
 TensorShape([32, 768]),
 TensorShape([32]))

In [10]:
def make_dataloader(boke_datas, caption_datas):
    class LoadNpyDataset(Dataset):
        def __init__(self, image_file_numbers, sentence_file_numbers, teacher_signals):
            if len(image_file_numbers) != len(sentence_file_numbers) and len(sentence_file_numbers) != len(teacher_signals):
                raise ValueError("データリストの長さが一致しません")

            self.image_file_numbers = image_file_numbers
            self.sentence_file_numbers = sentence_file_numbers
            self.teacher_signals = teacher_signals

        def __len__(self):
            return len(self.teacher_signals)

        def __getitem__(self, idx):
            clip_image_feature = np.load(f"{CLIP_IMAGE_FEATURE_DIR}{self.image_file_numbers[idx]}.npy")
            clip_sentence_feature = np.load(f"{CLIP_SENTENCE_FEATURE_DIR}{self.sentence_file_numbers[idx]}.npy")
            luke_sentence_feature = np.load(f"{LUKE_SENTENCE_FEATURE_DIR}{self.sentence_file_numbers[idx]}.npy")
            teacher_signal = self.teacher_signals[idx]

            return clip_image_feature, clip_sentence_feature, luke_sentence_feature, teacher_signal

    def collate_fn_tf(batch):
        clip_image_features = np.array([B[0] for B in batch])
        clip_sentence_features = np.array([B[1] for B in batch])
        luke_sentence_features = np.array([B[2] for B in batch])
        teacher_signals = np.array([float(B[3]) for B in batch])
        
        return clip_image_features, clip_sentence_features, luke_sentence_features, teacher_signals

    #
    image_file_numbers = list()
    sentence_file_numbers = list()
    teacher_signals = list()

    for D in boke_datas:
        image_file_numbers.append(D["image_number"])
        sentence_file_numbers.append(D["boke_number"])
        teacher_signals.append(1)

    if USE_CAPTION:
        for D in caption_datas:
            image_file_numbers.append(D["image_number"])
            sentence_file_numbers.append(D["caption_number"])
            teacher_signals.append(0)
    
    if USE_MISS_BOKE:
        miss_boke_datas = list()
        tmp_idx = np.random.randint(0, len(boke_datas), size = (len(boke_datas), ))
        for i, idx in tqdm(enumerate(tmp_idx)):
            tmp_boke_number = boke_datas[idx]["boke_number"]
            while str(tmp_boke_number)[:7] == str(boke_datas[i]["boke_number"])[:7]:
                idx = np.random.randint(0, len(boke_datas))
                tmp_boke_number = boke_datas[idx]["boke_number"]

            miss_boke_datas.append({
                "boke_number": tmp_boke_number,
                "image_number": boke_datas[i]["image_number"]
            })
        
        for D in miss_boke_datas:
            image_file_numbers.append(D["image_number"])
            sentence_file_numbers.append(D["boke_number"])
            teacher_signals.append(0)
    
    print(f"num data: {len(teacher_signals)}")

    tmp = list(zip(image_file_numbers, sentence_file_numbers, teacher_signals))
    np.random.shuffle(tmp)
    image_file_numbers, sentence_file_numbers, teacher_signals = zip(*tmp)

    dataset = LoadNpyDataset(image_file_numbers, sentence_file_numbers, teacher_signals)
    dataloader = DataLoader(
        dataset, 
        batch_size = BATCH_SIZE, 
        num_workers = 10, 
        collate_fn = collate_fn_tf
    )

    return dataloader

train_dataloader = make_dataloader(train_boke_datas, train_caption_datas)
test_dataloader = make_dataloader(test_boke_datas, test_caption_datas)

#
CIF, CSF, LSF, TS = next(iter(train_dataloader))
CIF.shape, CSF.shape, LSF.shape, TS.shape

2161870it [00:03, 668746.07it/s]


num data: 4552420


21838it [00:00, 47237.45it/s]

num data: 45986





((32, 512), (32, 512), (32, 768), (32,))

In [None]:
# def build_model():
#     # clip image feature
#     input1 = layers.Input(shape = CIF.shape[1:])
#     # clip sentence feature
#     input2 = layers.Input(shape = CSF.shape[1:])
#     # luke sentence feature
#     input3 = layers.Input(shape = LSF.shape[1:])

#     x = layers.Concatenate()([input1, input2, input3])
#     x = layers.Dense(units = 1024)(x)
#     x = layers.LeakyReLU()(x)
#     x = layers.Dense(units = 1024)(x)
#     x = layers.LeakyReLU()(x)
#     output = layers.Dense(units = 1, activation = "sigmoid")(x)

#     return models.Model([input1, input2, input3], output)

# model = build_model()
# model.compile(loss = losses.BinaryCrossentropy(),
#               optimizer = optimizers.AdamW(),
#               metrics = [metrics.BinaryAccuracy()])
# model.summary()

In [None]:
class CustomModel(nn.Module):
    def __init__(self, cif_dim, csf_dim, lsf_dim):
        super(CustomModel, self).__init__()
        # 入力次元を設定
        self.cif_dim = cif_dim
        self.csf_dim = csf_dim
        self.lsf_dim = lsf_dim
        
        # 全結合層
        self.fc1 = nn.Linear(cif_dim + csf_dim + lsf_dim, 1024)
        self.fc2 = nn.Linear(1024, 1024)
        self.output_layer = nn.Linear(1024, 1)
        
    def forward(self, cif, csf, lsf):
        # 入力を結合
        x = torch.cat([cif, csf, lsf], dim=1)
        # 全結合層 + LeakyReLU
        x = F.leaky_relu(self.fc1(x))
        x = F.leaky_relu(self.fc2(x))
        # 出力層 + シグモイド
        output = torch.sigmoid(self.output_layer(x))
        return output

model = CustomModel(CIF.shape[1], CSF.shape[1], LSF.shape[1])
print(model)

CustomModel(
  (fc1): Linear(in_features=1792, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=1024, bias=True)
  (output_layer): Linear(in_features=1024, out_features=1, bias=True)
)


In [None]:
# model = build_model()
# optimizer = optimizers.AdamW()

# train_loss_history = list()
# train_accuracy_history = list()
# test_loss_history = list()
# test_accuracy_history = list()

# for epoch in range(EPOCH):

#     # train
#     train_dataloader = make_dataloader(train_boke_datas, train_caption_datas)
#     train_loss_obj = metrics.Mean()
#     train_accuracy_obj = metrics.Mean()

#     pb = tqdm(train_dataloader)
#     for CIF, CSF, LSF, TS in pb:
#         CIF = tf.convert_to_tensor(CIF)
#         CSF = tf.convert_to_tensor(CSF)
#         LSF = tf.convert_to_tensor(LSF)
#         TS = tf.convert_to_tensor(TS)

#         loss, accuracy = model.train_on_batch([CIF, CSF, LSF], TS)
#         train_loss_obj(loss)
#         train_accuracy_obj(accuracy)
#         pb.set_postfix({"train_loss": float(train_loss_obj.result()),
#                         "train_accuracy": float(train_accuracy_obj.result())})
        
#         del CIF, CSF, LSF, TS

#     # test
#     test_dataloader = make_dataloader(test_boke_datas, test_caption_datas)
#     test_loss_obj = metrics.Mean()
#     test_accuracy_obj = metrics.Mean()

#     pb = tqdm(test_dataloader)
#     for CIF, CSF, LSF, TS in pb:
#         CIF = tf.constant(CIF)
#         CSF = tf.constant(CSF)
#         LSF = tf.constant(LSF)
#         TS = tf.constant(TS)
#         loss, accuracy = model.evaluate([CIF, CSF, LSF], TS,
#                                         verbose = False, batch_size = BATCH_SIZE)
#         train_loss_obj(loss)
#         train_accuracy_obj(accuracy)
#         pb.set_postfix({"train_loss": float(train_loss_obj.result()),
#                         "train_accuracy": float(train_accuracy_obj.result())})
    
#     train_loss = float(train_loss_obj.result())
#     train_accuracy = float(train_accuracy_obj.result())
#     test_loss = float(test_loss_obj.result())
#     test_accuracy = float(test_accuracy_obj.result())

#     print(f"epoch: {epoch}, train loss: {train_loss}, train accuracy: {train_accuracy}, test loss: {test_loss}, test accuracy: {test_accuracy}")

#     train_loss_history.append(train_loss)
#     train_accuracy_history.append(train_accuracy)
#     test_loss_history.append(test_loss)
#     test_accuracy_history.append(test_accuracy)

# with open(f"{RESULT_DIR}history.json", "w") as f:
#     json.dump({
#         "train_loss": train_loss_history,
#         "train_accuracy": train_accuracy_history,
#         "test_loss": test_loss_history,
#         "test_accuracy": test_accuracy_history
#     }, f)

2161870it [00:03, 629279.54it/s]


num data: 4552420


 16%|█▌        | 2832/17783 [05:04<4:39:06,  1.12s/it, train_loss=0.674, train_accuracy=0.674]

In [None]:
def train_step(model, optimizer, batch_data, batch_labels):
    model.train()  # モデルを訓練モードに設定
    optimizer.zero_grad()  # 勾配を初期化
    outputs = model(*batch_data).float()  # モデルで予測を計算
    loss = nn.BCELoss()(outputs, batch_labels.float())  # 損失関数（バイナリクロスエントロピー）
    accuracy = ((outputs > 0.5).float() == batch_labels).float().mean()  # 精度計算
    loss.backward()  # 勾配を計算
    optimizer.step()  # パラメータ更新
    return loss.item(), accuracy.item()

def evaluate(model, batch_data, batch_labels):
    model.eval()  # モデルを評価モードに設定
    with torch.no_grad():  # 評価時に勾配計算を行わない
        outputs = model(*batch_data)
        loss = nn.BCELoss()(outputs, batch_labels)
        accuracy = ((outputs > 0.5).float() == batch_labels).float().mean()
    return loss.item(), accuracy.item()

train_loss_history = []
train_accuracy_history = []
test_loss_history = []
test_accuracy_history = []

model = CustomModel(CIF.shape[1], CSF.shape[1], LSF.shape[1])
optimizer = optim.AdamW(model.parameters())

# 訓練・評価ループ
for epoch in range(EPOCHS):
    # 訓練
    train_loss_obj = 0.0
    train_accuracy_obj = 0.0
    model.train()  # 訓練モード
    train_dataloader = make_dataloader(train_boke_datas, train_caption_datas)
    pb = tqdm(train_dataloader, desc = f"Epoch {epoch+1}/{EPOCHS}")
    
    for CIF, CSF, LSF, TS in pb:
        CIF, CSF, LSF, TS = torch.tensor(CIF), torch.tensor(CSF), torch.tensor(LSF), torch.tensor(TS)
        batch_data = (CIF, CSF, LSF)  # 入力データ
        batch_labels = TS.unsqueeze(1)  # 出力ラベル（shape: [batch_size, 1]）
        
        loss, accuracy = train_step(model, optimizer, batch_data, batch_labels)
        train_loss_obj += loss
        train_accuracy_obj += accuracy
        pb.set_postfix({"train_loss": train_loss_obj / (pb.n + 1), "train_accuracy": train_accuracy_obj / (pb.n + 1)})

    # 訓練後の損失と精度
    train_loss = train_loss_obj / len(train_dataloader)
    train_accuracy = train_accuracy_obj / len(train_dataloader)

    # テスト
    test_loss_obj = 0.0
    test_accuracy_obj = 0.0
    model.eval()  # 評価モード
    test_dataloader = make_dataloader(test_boke_datas, test_caption_datas)
    pb = tqdm(test_dataloader, desc = "Evaluating")

    for CIF, CSF, LSF, TS in pb:
        CIF, CSF, LSF, TS = torch.tensor(CIF).float(), torch.tensor(CSF).float(), torch.tensor(LSF).float(), torch.tensor(TS).float()
        batch_data = (CIF, CSF, LSF)
        batch_labels = TS.unsqueeze(1).float()
        
        loss, accuracy = evaluate(model, batch_data, batch_labels)
        test_loss_obj += loss
        test_accuracy_obj += accuracy
        pb.set_postfix({"test_loss": test_loss_obj / (pb.n + 1), "test_accuracy": test_accuracy_obj / (pb.n + 1)})

    # テスト後の損失と精度
    test_loss = test_loss_obj / len(test_dataloader)
    test_accuracy = test_accuracy_obj / len(test_dataloader)

    # 結果表示
    print(f"Epoch: {epoch+1}/{EPOCHS}, "
          f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, "
          f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

    # 履歴の保存
    train_loss_history.append(train_loss)
    train_accuracy_history.append(train_accuracy)
    test_loss_history.append(test_loss)
    test_accuracy_history.append(test_accuracy)

# 結果をJSONファイルに保存
with open(f"{RESULT_DIR}history.json", "w") as f:
    json.dump({
        "train_loss": train_loss_history,
        "train_accuracy": train_accuracy_history,
        "test_loss": test_loss_history,
        "test_accuracy": test_accuracy_history
    }, f)


2161870it [00:03, 661153.03it/s]


num data: 4552420


Epoch 1/10:   6%|▌         | 8630/142264 [04:50<4:13:21,  8.79it/s, train_loss=0.666, train_accuracy=0.532]

In [12]:
torch.tensor(CIF, dtype = torch.float32)

  torch.tensor(CIF, dtype = torch.float32)


tensor([[-2.6309,  0.9189,  0.8711,  ..., -1.8018, -0.5161, -3.3066],
        [-6.3281, -2.5703,  4.1680,  ..., -6.1719,  8.7031,  4.3320],
        [-0.5420,  3.6680,  3.1328,  ...,  1.6611, -0.4119, -2.2832],
        ...,
        [ 2.5312,  0.2988,  0.8936,  ...,  0.9741,  1.0430, -7.9766],
        [-4.6367,  3.3340, -2.3906,  ..., -3.8438,  3.9785, -4.8164],
        [-2.1738, -2.4199, -2.4395,  ...,  1.8652, -8.0000,  1.5752]])