In [1]:
import os
import json
import numpy as np
import shutil
import argparse
import matplotlib.pyplot as plt
from PIL import Image
from tqdm import tqdm
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader
import torchvision.models as models
import torchvision.transforms as transforms

In [2]:
# Ohgiri (boke) caption generator model
class BokeGeneratorModel(nn.Module):
    """Predict per-position vocabulary logits for a caption, conditioned on an image feature.

    The image feature is projected by a linear layer and concatenated, at every
    time step, with the output of an LSTM run over the embedded tokens. Three
    linear layers then map the concatenation to vocabulary logits.
    """

    def __init__(self, num_word, image_feature_dim, sentence_length, feature_dim = 1024):
        """
            num_word: total number of words used for training (vocabulary size)
            image_feature_dim: dimensionality of the image feature vector
            sentence_length: nominal number of tokens in an input sentence; kept as an
                attribute, but forward() takes the actual length from its input
            feature_dim: width of the embedding, the LSTM state and the projected image feature
        """
        super(BokeGeneratorModel, self).__init__()
        self.num_word = num_word
        self.image_feature_dim = image_feature_dim
        self.sentence_length = sentence_length
        self.feature_dim = feature_dim
        
        self.fc1 = nn.Linear(image_feature_dim, feature_dim)
        # padding_idx = 0: token id 0 (<PAD>) maps to a zero vector that is not trained
        self.embedding = nn.Embedding(num_word, feature_dim, padding_idx = 0)
        self.lstm = nn.LSTM(input_size = feature_dim, hidden_size = feature_dim, 
                            batch_first = True)
        self.fc2 = nn.Linear(feature_dim + feature_dim, 2 * feature_dim)
        self.fc3 = nn.Linear(2 * feature_dim, 2 * feature_dim)
        self.fc4 = nn.Linear(2 * feature_dim, num_word)
    
    # The LSTM starts from a zero state; the image feature and the sentence
    # feature are concatenated right before the fully connected layers.
    def forward(self, image_features, sentences):
        """
            image_features: image feature vectors, shape (batch, image_feature_dim)
            sentences: integer token ids, shape (batch, seq_len)

            Returns unnormalized logits of shape (batch, seq_len, num_word).
        """
        x1 = F.leaky_relu(self.fc1(image_features))
        # Tile the image feature over the sequence length of *this* batch instead of
        # the fixed self.sentence_length, so shorter or longer inputs no longer
        # fail in the concatenation below.
        x1 = x1.unsqueeze(1).expand(-1, sentences.size(1), -1)

        x2 = self.embedding(sentences)
        x2, _ = self.lstm(x2)

        x = torch.cat((x1, x2), dim = -1)
        x = F.leaky_relu(self.fc2(x))
        x = F.leaky_relu(self.fc3(x))

        return self.fc4(x)

In [3]:
class ImageEncoder(nn.Module):
    """Convolutional encoder that turns a 3-channel image batch into a feature vector.

    Four stride-2 convolutions are followed by two linear layers. The first
    linear layer takes 16384 = 256 * 8 * 8 inputs, which matches a 128x128 image.
    """

    def __init__(self, image_feature_dim):
        """
            image_feature_dim: size of the feature vector produced by the last linear layer
        """
        super().__init__()

        # Every convolution uses stride 2, so each one halves height and width.
        self.conv1 = nn.Conv2d(3, 32, kernel_size = 3, stride = 2, padding = 1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size = 3, stride = 2, padding = 1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size = 3, stride = 2, padding = 1)
        self.conv4 = nn.Conv2d(128, 256, kernel_size = 3, stride = 2, padding = 1)

        self.fc1 = nn.Linear(16384, 16384)
        self.fc2 = nn.Linear(16384, image_feature_dim)
    
    def forward(self, images):
        """
            images: image batch with 3 channels

            Returns the leaky-ReLU activated output of the last linear layer.
        """
        # Channel / spatial sizes for a 128x128 input:
        # (32, 64, 64) -> (64, 32, 32) -> (128, 16, 16) -> (256, 8, 8)
        feature_map = images
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            feature_map = F.leaky_relu(conv(feature_map))

        # Collapse everything except the batch dimension.
        flat = feature_map.flatten(1)
        hidden = F.leaky_relu(self.fc1(flat))
        return F.leaky_relu(self.fc2(hidden))

class ImageDecoder(nn.Module):
    """Transposed-convolution decoder that maps a feature vector to a 3-channel image.

    A linear layer expands the feature to 16384 values, which are reshaped to
    (256, 8, 8) and upsampled by four stride-2 transposed convolutions.
    """

    def __init__(self, image_feature_dim):
        """
            image_feature_dim: size of the input feature vector
        """
        super().__init__()

        self.fc1 = nn.Linear(image_feature_dim, 16384)

        # Each transposed convolution doubles height and width.
        self.deconv1 = nn.ConvTranspose2d(256, 128, kernel_size = 3, stride = 2, padding = 1, output_padding = 1)
        self.deconv2 = nn.ConvTranspose2d(128, 64, kernel_size = 3, stride = 2, padding = 1, output_padding = 1)
        self.deconv3 = nn.ConvTranspose2d(64, 32, kernel_size = 3, stride = 2, padding = 1, output_padding = 1)
        self.deconv4 = nn.ConvTranspose2d(32, 32, kernel_size = 3, stride = 2, padding = 1, output_padding = 1)
        
        self.conv1 = nn.Conv2d(32, 3, kernel_size = 3, stride = 1, padding = 1)
    
    def forward(self, image_features):
        """
            image_features: feature vectors, shape (batch, image_feature_dim)

            Returns images of shape (batch, 3, 128, 128) with values in [0, 1] (sigmoid output).
        """
        expanded = F.leaky_relu(self.fc1(image_features))
        # (batch, 16384) -> (batch, 256, 8, 8)
        feature_map = expanded.unflatten(1, (256, 8, 8))

        # (256, 8, 8) -> (128, 16, 16) -> (64, 32, 32) -> (32, 64, 64) -> (32, 128, 128)
        for deconv in (self.deconv1, self.deconv2, self.deconv3, self.deconv4):
            feature_map = F.leaky_relu(deconv(feature_map))

        # (32, 128, 128) -> (3, 128, 128)
        return self.conv1(feature_map).sigmoid()

class Autoencoder(nn.Module):
    """Image autoencoder: an ImageEncoder followed by an ImageDecoder."""

    def __init__(self, image_feature_dim):
        """
            image_feature_dim: size of the feature vector passed from the encoder to the decoder
        """
        super().__init__()

        self.encoder = ImageEncoder(image_feature_dim)
        self.decoder = ImageDecoder(image_feature_dim)
    
    def forward(self, images):
        """Encode the images to feature vectors and decode them back to images."""
        image_features = self.encoder(images)
        return self.decoder(image_features)

In [4]:
# Ohgiri (boke) generation AI
class GUMI_AE:
    """Generate an ohgiri (boke) caption for an image.

    The encoder half of an autoencoder turns the image into a feature vector,
    and a BokeGeneratorModel decodes a sentence from it one token at a time.
    """

    # Reserved token ids, matching the layout documented for index_to_word.
    PAD_ID = 0
    START_ID = 1
    END_ID = 2

    def __init__(self, weight_path, ae_weight_path, index_to_word, sentence_length, feature_dim = 1024, ae_feature_dim = 16384):
        """
            weight_path: path to the trained weights (state_dict) of the caption generator model
            ae_weight_path: path to the trained weights (state_dict) of the autoencoder
            index_to_word: dict mapping word id -> word (0:<PAD>, 1:<START>, 2:<END>)
            sentence_length: number of tokens fed to the generator model
            feature_dim: feature width of the generator model
            ae_feature_dim: size of the image feature produced by the encoder
        """
        self.index_to_word = index_to_word
        self.sentence_length = sentence_length

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.boke_generate_model = BokeGeneratorModel(
                                        num_word = len(index_to_word), 
                                        image_feature_dim = ae_feature_dim, 
                                        sentence_length = sentence_length, 
                                        feature_dim = feature_dim)
        # map_location = "cpu": checkpoints saved from a GPU still load on a CPU-only host.
        # weights_only = True: the file is used purely as a state_dict, so refuse to
        # unpickle arbitrary objects.
        self.boke_generate_model.load_state_dict(
            torch.load(weight_path, map_location = "cpu", weights_only = True))
        self.boke_generate_model.to(self.device)
        self.boke_generate_model.eval()

        self.autoencoder = Autoencoder(image_feature_dim = ae_feature_dim)
        self.autoencoder.load_state_dict(
            torch.load(ae_weight_path, map_location = "cpu", weights_only = True))
        # Only the encoder half is needed for generation, so only it is moved to the device.
        self.encoder = self.autoencoder.encoder
        self.encoder = self.encoder.to(self.device)
        self.encoder.eval()

        # Image preprocessing
        self.image_preprocesser = transforms.Compose([
            transforms.Resize((128, 128)),
            transforms.ToTensor(),
        ])
    
    def __call__(self, image_path, argmax = False, top_k = 5):
        """
            image_path: path of the image to generate a caption for
            argmax: if True pick the most probable word, otherwise use top-k sampling
            top_k: number of candidate words considered by top-k sampling

            Returns the generated sentence as a string, without <START> / <END>.
        """
        # Force RGB so grayscale / RGBA / CMYK files still give the 3 channels the
        # encoder expects; the context manager closes the file handle.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        preprocessed_image = self.image_preprocesser(image).to(self.device)

        generated_text = [self.START_ID] # <START> token
        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            image_feature = self.encoder( preprocessed_image.unsqueeze(0) ) # (1, ae_feature_dim)

            for i in range(1, self.sentence_length):
                # len(generated_text) == i here, so padding brings it to sentence_length.
                padded = generated_text + [self.PAD_ID] * (self.sentence_length - i)
                tokens = torch.tensor(padded, dtype = torch.long, device = self.device).unsqueeze(0) # (1, sentence_length)
                pred = self.boke_generate_model(image_feature, tokens) # (1, sentence_length, num_word)

                # Logits at position i - 1 predict the token at position i.
                chosen_id = self._choose_token(pred[0][i - 1], argmax, top_k)

                generated_text.append(chosen_id)
                if chosen_id == self.END_ID:
                    break

        # Drop <START>, and drop the trailing <END> only if one was actually produced;
        # when generation stops at the length limit the last token is a real word.
        word_ids = generated_text[1:]
        if word_ids and word_ids[-1] == self.END_ID:
            word_ids = word_ids[:-1]
        return "".join(self.index_to_word[word_id] for word_id in word_ids)

    @staticmethod
    def _choose_token(logits, argmax, top_k):
        """Pick one token id from a 1-D logits vector, greedily or by top-k sampling."""
        if argmax:
            # Pick the most probable word
            return int(torch.argmax(logits).item())

        # Top-k sampling; never ask for more candidates than the vocabulary holds.
        k = min(top_k, logits.numel())
        top_k_logits, top_k_indices = torch.topk(logits, k)
        top_k_probs = torch.softmax(top_k_logits, dim = -1)
        return int(np.random.choice(top_k_indices.detach().cpu().numpy(), 
                                    p = top_k_probs.detach().cpu().numpy()))

In [14]:
# Load the id -> word vocabulary.
# JSON is read explicitly as UTF-8 so the result does not depend on the locale,
# and the keys (always strings in JSON) are converted back to int word ids.
with open('/home/user/workspace/Master_Thesis/results/GUMI_AMAE/False_False_False_0_32_4_31_128_128_25_32_0.0001_16384_1000.0_3_3_25_64_0.0001_1024/index_to_word.json', 'r', encoding = 'utf-8') as f:
    index_to_word = {
        int(K): V for K, V in json.load(f).items()
    }

In [15]:
# Trained weights of the caption generator and of the autoencoder it was trained on.
boke_weight_path = '/home/user/workspace/Master_Thesis/results/GUMI_AMAE/False_False_False_0_32_4_31_128_128_25_32_0.0001_16384_1000.0_3_3_25_64_0.0001_1024/best_model.pth'
ae_weight_path = "/home/user/workspace/Master_Thesis/results/Augmix_Autoencoder/False_False_False_0_32_4_31_128_128_25_32_0.0001_16384_1000.0_3_3/best_model.pth"

model = GUMI_AE(
    weight_path = boke_weight_path,
    ae_weight_path = ae_weight_path,
    index_to_word = index_to_word,
    sentence_length = 32,
    feature_dim = 1024,
    ae_feature_dim = 16384,
)

  self.boke_generate_model.load_state_dict(torch.load(weight_path))
  self.autoencoder.load_state_dict(torch.load(ae_weight_path))


In [16]:
# Read the previously generated ohgiri records; new results are added to them below.
with open("generated_ohgiri.json", "r") as f:
    datas = json.loads(f.read())

In [17]:
# Generate one caption per record and store it under this model's key.
for D in tqdm(datas):
    image_path = f"../../datas/boke_image/{D['image_number']}.jpg"
    D["GUMI_AMAE_1000.0"] = model(image_path)

100%|██████████| 200/200 [00:06<00:00, 28.93it/s]


In [18]:
# Show only the first few records; dumping the full list floods the cell output.
datas[:3]

[{'image_number': 821461,
  'human': 'やる気あんの？',
  'star': 0,
  'caption': '暗い部屋で他人と話している男',
  'GUMI_AMAE_100.0': '「俺のプリン食べたやろー！」',
  'GUMI_AMAE_10.0': 'この後、彼は見た',
  'GUMI_AMAE_1.0': '「お前の母ちゃん、出ベソ！」',
  'GUMI_T_3': '「お前、何の話してるか分かってんだよ？」',
  'Neural_Joking_Machine': '「あの、すみません。道を聞いただけで、何か変なものがさ～」',
  'llama': '暗い部屋で他人と話している男画像に対して一言。',
  'GUMI_AE': 'あ〜あ〜。また派手な人が出てきたな〜…',
  'GUMI_AMAE_1000.0': 'この中に１人妹がいる。'},
 {'image_number': 100060,
  'human': 'そんな事より緑が豊か',
  'star': 0,
  'caption': '多数のタイヤがフィールドに存在している。',
  'GUMI_AMAE_100.0': '「お前は、この戦いにはついてこれそう」「いやいや、そちらはプライドも捨てちゃいまええやろ」',
  'GUMI_AMAE_10.0': 'え、俺の子？',
  'GUMI_AMAE_1.0': '「おい、お前ら、俺の靴磨き返せコラァ！」',
  'GUMI_T_3': '「「「「「あ～あ。あの、あの、もうちょっと…」」」」',
  'Neural_Joking_Machine': 'この惑星の住人は、去年のゴルフボールだけが連鎖',
  'llama': 'フィールドを走行するたくさんのタイヤ',
  'GUMI_AE': '「お前は俺を怒らせた・・・」と言われたので。',
  'GUMI_AMAE_1000.0': '「俺、この戦争が終わったらゲートボールに行ってやる」'},
 {'image_number': 465363,
  'human': '人々は彼をこう呼ぶ！ルンペンと！',
  'star': 0,
  'caption': '女性がを持って通りを歩いている姿。',
  'GUMI_AMAE_100.0'

In [10]:
# Serialize before opening the file: opening with "w" truncates it immediately,
# so an error raised inside json.dump would destroy the existing results.
serialized_datas = json.dumps(datas)
with open("generated_ohgiri.json", "w") as f:
    f.write(serialized_datas)