### Environment Preparation

In [None]:
# !pip install swifter pandas numpy
# !pip install -q -U keras-tuner
# !pip install tensorflow-gpu keras keras-transformer keras_bert
# !pip install pydot pydotplus
# !pip install h5py==2.10.0 --force-reinstall
# !apt-get install graphviz -y

### Download required embeddings

In [None]:
# !wget -P data/ https://storage.googleapis.com/bert_models/2020_02_20/uncased_L-12_H-768_A-12.zip
# !wget -P data/ https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip
# !wget -P data/ https://nlp.stanford.edu/data/glove.840B.300d.zip

In [None]:
# !unzip data/uncased_L-12_H-768_A-12.zip 
# !unzip data/crawl-300d-2M.vec.zip
# !unzip data/glove.840B.300d.zip 

### Run Preprocessing File

In [None]:
# !python preprocess.py

### Imports


In [7]:
import os
import gc
import json
import pandas as pd
import numpy as np
from random import choice, seed
import tensorflow as tf
from keras.models import Sequential, Model
from keras.layers import Input, CuDNNLSTM as LSTM, Dropout, BatchNormalization
from keras.layers import Dense, Concatenate, Embedding, Bidirectional, Lambda, Conv1D
from keras.layers import Add, TimeDistributed, GlobalMaxPooling1D
from tensorflow.compat.v1.keras.optimizers import Adam, Nadam
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import Callback
import tensorflow.compat.v1.keras.backend as K
import keras_tuner as kt
import keras
from keras_bert.loader import load_trained_model_from_checkpoint
from keras_bert import AdamWarmup, calc_train_steps
from keras.models import Model
from keras.models import load_model
from keras_bert import get_custom_objects
from keras_bert import Tokenizer
from collections import defaultdict
from eval import read_submission, get_ndcg
from tqdm import tqdm, trange
import pickle
import dask.dataframe as dd
import joblib


# --- Paths -------------------------------------------------------------------
# NOTE(review): these are absolute paths from the original author's machine;
# adjust them (or make them relative like the other "data/..." paths) when
# running elsewhere.
BERT_PRETRAINED_DIR = "/root/Applied_AI_Lab_WiSe2021_Passau/ai-light/data/uncased_L-12_H-768_A-12"
VAL_ANS_PATH = '/root/Applied_AI_Lab_WiSe2021_Passau/ai-light/data/valid_answer.json'
LABEL_PATH = '/root/Applied_AI_Lab_WiSe2021_Passau/ai-light/data/multimodal_labels.txt'

# --- Training constants ------------------------------------------------------
MAX_EPOCH = 20
MAX_LEN = 10          # maximum number of query tokens kept (see cfg['maxlen'])
B_SIZE = 256          # default batch size of the data generator
FOLD_IDS = [-1]
FOLD_NUM = 20
THRE = 0.5
SHUFFLE = True        # default shuffle flag of the data generator
MAX_BOX = 5           # maximum number of image boxes kept per product
MAX_CHAR = 5          # maximum number of tokens kept per box label
PREFIX = "[image-bert-concat-query]-wwm_uncased_L12-768_v3_1M_example"
SEED = 2021
# NOTE(review): with B_SIZE = 256 this evaluates to 0; nothing visible in this
# notebook reads it, so the value is left untouched - confirm before using it.
ACCUM_STEP = int(128 // B_SIZE)
SAVE_EPOCHS=[10, 20, 35, 50, 80, 100]   # epochs after which an extra checkpoint is written
IMAGE_LABEM_CONCAT_TOKEN = "###"        # separator between box labels in 'label_words'
CONCAT_TOKE = "[unused0]"

# --- Shared configuration dictionary ----------------------------------------
cfg = {}
cfg["verbose"] = PREFIX
cfg["base_dir"] = BERT_PRETRAINED_DIR
cfg['maxlen'] = MAX_LEN
cfg["max_box"] = MAX_BOX
cfg["max_char"] = MAX_CHAR
# The fit cell reads cfg["batch_size"]; it was never defined, which raised a
# KeyError. Keep it in sync with the generator's default batch size.
cfg["batch_size"] = B_SIZE
cfg["lr"] = 1e-4
cfg['min_lr'] = 6e-8
cfg["opt"] = "nadam"
cfg["loss_w"] =  20.
cfg["trainable"] = True
cfg["bert_trainable"] = True
cfg["mix_mode"] = ""   # add concat average
cfg["unit1_1"] = 128
cfg["accum_step"] = ACCUM_STEP
cfg["cls_num"] = 2
cfg["raw_filename"] = "{}_{}oof{}"

In [3]:
def get_vocab():
    """Read the BERT vocabulary file and return a token -> line-index mapping.

    The file name is 'vocab_chinese.txt' when the run name in
    ``cfg["verbose"]`` contains "albert" (case-insensitive), otherwise
    'vocab.txt'; both are looked up inside ``BERT_PRETRAINED_DIR``.

    Returns:
        dict: stripped token text -> zero-based line number. If a token occurs
        more than once, the last occurrence wins.
    """
    vocab_name = 'vocab_chinese.txt' if "albert" in cfg["verbose"].lower() else 'vocab.txt'
    dict_path = os.path.join(BERT_PRETRAINED_DIR, vocab_name)

    with open(dict_path, mode="r", encoding="utf8") as f:
        tokens = [raw.strip() for raw in f]

    return {token: position for position, token in enumerate(tokens)}


# Token -> id vocabulary, shared by every tokenisation step below.
word_index = get_vocab()
# Id of the "[PAD]" token; used as the padding value for token-id sequences.
cfg["x_pad"] = word_index["[PAD]"]
print(cfg['x_pad'])
# keras_bert tokenizer built on the same vocabulary.
tokenizer = Tokenizer(word_index)


def get_label(path):
    """Parse a tab-separated label file into two lookup dictionaries.

    The first line of the file is treated as a header and skipped. Every other
    non-blank line must look like ``<integer id>\\t<label name>``; any further
    tab-separated fields are ignored.

    Args:
        path: path of the label file (read as UTF-8).

    Returns:
        tuple: ``(label2id, id2label)`` - label name -> int id and the inverse.

    Raises:
        ValueError: if an id field is not an integer.
        IndexError: if a non-blank line has no tab-separated label field.
    """
    label2id, id2label = {}, {}
    with open(path, encoding="utf8") as f:
        next(f, None)  # skip the header line
        for line in f:
            line = line.split('\n')[0]
            # Blank lines (typically a trailing newline at the end of the file)
            # used to raise IndexError; skip them instead.
            if not line.strip():
                continue
            fields = line.split('\t')
            label_id, label = int(fields[0]), fields[1]
            label2id[label] = label_id
            id2label[label_id] = label
    return label2id, id2label


# Label name <-> id mappings read from the label file.
label2id, id2label = get_label(LABEL_PATH)
# All label names, as a set.
label_set = set(label2id.keys())
print(label_set)

0
{'baby products', 'furniture', 'others', 'hair', 'underwear', 'makeup, perfume, beauty tools and essential oils', 'storage supplies', 'sporting goods', 'household fabric', 'home decoration', 'home / personal cleaning tools', 'household electrical appliances', 'bed linens', 'clothes (accessories, baby clothing, etc.)', 'kitchenware', 'stationery', 'hand', 'skirt & dress', 'digital supplies', 'human face', 'arm', 'outdoor product', 'luggage, leather goods', 'top clothes (coat, jacket, shirt, etc.)', 'shoes', 'snacks, nuts, liquor and tea', 'bottom clothes (trousers, pants, etc.)', 'motorcycle, motorcycle accessories, vehicles, bicycle and riding equipment', 'personal care', 'lighting', 'toys', 'accessories (jewelry, clothing accessories, belts, hats, scarves, etc.)', 'bottle drink'}


In [3]:
# Alternative (disabled): load the full training set.
# with open('../data/train_data.pkl', 'rb') as outp:
#     train_data= joblib.load(outp)
# Training sample. NOTE(review): this comment used to say "100K sample" but the
# file name says 1M - confirm which sample size is intended.
# joblib.load unpickles the file, so only load files from a trusted source.
with open('data/1M_data.pkl', 'rb') as outp:
    train_data = joblib.load(outp)

# Small validation sample; the fit cells later replace `val_data` with the
# contents of data/val_data.pkl.
with open("data/sample_val.pkl", "rb") as f:
    val_data = joblib.load(f)

# Alternative (disabled): load the full validation set with pickle.
# with open('data/val_data.pkl', 'rb') as outp:
#     val_data = pickle.load(outp)

In [4]:
def get_coefs(word, *arr):
    """Pair a word with its coefficients converted to a float16 array.

    All positional arguments after ``word`` are packed into one tuple and
    converted together, so passing a single list yields an array with a
    leading axis of length 1.

    Returns:
        tuple: ``(word, numpy.ndarray of dtype float16)``.
    """
    vector = np.asarray(arr, dtype=np.float16)
    return word, vector


def load_embed(path, dim=300, word_index=None):
    """Read a whitespace-separated text embedding file into a dictionary.

    Each usable line is ``<word> <v1> ... <v_dim>``. Lines whose number of
    values differs from ``dim`` (for example a header line) are printed and
    skipped; blank lines are skipped silently.

    Args:
        path: path of the embedding text file (read as UTF-8).
        dim: expected number of values per word.
        word_index: optional vocabulary; when given and non-empty, only words
            contained in it are kept.

    Returns:
        dict: word -> float16 array as produced by ``get_coefs(word, values)``.
        Because the value list is passed as one argument, each array has
        shape ``(1, dim)``.
    """
    embedding_index = {}
    with open(path, mode="r", encoding="utf8") as f:
        # Iterate over the file object instead of readlines(): these files are
        # several GB and do not need to be held in memory at once.
        for line in f:
            fields = line.strip().split()
            if not fields:
                # A blank line used to raise IndexError on fields[0].
                continue
            word, arr = fields[0], fields[1:]
            if len(arr) != dim:
                print("[!] l = {}".format(fields))
                continue
            if word_index and word not in word_index:
                continue
            word, arr = get_coefs(word, arr)
            embedding_index[word] = arr
    return embedding_index


def build_matrix(path, word_index=None, max_features=None, dim=300):
    """Create an embedding matrix whose rows follow the ids in ``word_index``.

    Args:
        path: embedding text file, read through ``load_embed``.
        word_index: mapping word -> row id.
        max_features: largest row id that is filled; defaults to
            ``len(word_index) + 1``.
        dim: embedding width.

    Returns:
        tuple: ``(matrix, unknown_words)`` where ``matrix`` is a zero-initialised
        array of shape ``(max_features + 1, dim)`` and ``unknown_words`` lists
        the in-range words that have no vector in the file (their rows stay 0).
    """
    embedding_index = load_embed(path, dim=dim, word_index=word_index)
    if max_features is None:
        max_features = len(word_index) + 1

    embedding_matrix = np.zeros((max_features + 1, dim))
    unknown_words = []

    for word, row in word_index.items():
        if row > max_features:
            continue
        if word in embedding_index:
            embedding_matrix[row] = embedding_index[word]
        else:
            unknown_words.append(word)
    return embedding_matrix, unknown_words


def load_word_embed(word_embed_glove="data/glove.840B.300d.txt", 
                    word_embed_crawl="data/crawl-300d-2M.vec",
               save_filename="word_embedding_matrix",
               word_index=None):
    """Return the GloVe + fastText embedding matrix, using an on-disk cache.

    If ``save_filename + ".npy"`` exists it is loaded and cast to float32.
    Otherwise a 300-d matrix is built from each embedding file with
    ``build_matrix``, the two are concatenated along the feature axis, and the
    result is saved under ``save_filename`` (numpy appends ".npy") before
    being returned without a dtype cast.
    """
    cache_path = save_filename + ".npy"
    if os.path.exists(cache_path):
        return np.load(cache_path).astype("float32")

    glove_matrix, _ = build_matrix(word_embed_glove, word_index=word_index, dim=300)
    crawl_matrix, _ = build_matrix(word_embed_crawl, word_index=word_index, dim=300)
    word_embedding_matrix = np.concatenate([glove_matrix, crawl_matrix], axis=1)

    gc.collect()
    np.save(save_filename, word_embedding_matrix)
    return word_embedding_matrix


# Build the concatenated embedding matrix for the vocabulary (or load the cached .npy file).
word_embedding_matrix = load_word_embed(word_index=word_index)

In [5]:
word_embedding_matrix.shape

(30524, 600)

In [5]:
def build_model(cfg, summary=False, word_embedding_matrix=None):
    """Build and compile the BERT + word-embedding query/image matching model.

    Model inputs, in order: query token ids, query segment ids, image box
    features ``(boxes, 2048)``, image box mask, box positions ``(boxes, 5)``
    and box-label token ids ``(boxes, cfg["max_char"])``. The output is a
    2-way softmax trained with sparse categorical cross-entropy.

    Args:
        cfg: configuration dict; reads "base_dir", "bert_trainable",
            "cls_num", "opt", "trainable", "max_char", "x_pad", "unit1_1",
            "num_example", "lr" and "min_lr".
        summary: print ``model.summary()`` when true.
        word_embedding_matrix: pretrained embedding matrix for the shared
            embedding layer. When None it is loaded with ``load_word_embed``
            (previously this case failed with a NameError on ``embed_layer``).

    Returns:
        The compiled Keras model.
    """

    def _get_model(base_dir, cfg_=None):
        """Load the pretrained BERT checkpoint stored in ``base_dir``."""
        config_file = os.path.join(base_dir, 'bert_config.json')
        checkpoint_file = os.path.join(base_dir, 'bert_model.ckpt')
        if not os.path.exists(config_file):
            # Fall back to the alternative file names.
            config_file = os.path.join(base_dir, 'bert_config_large.json')
            checkpoint_file = os.path.join(base_dir, 'roberta_l24_large_model')
        model = load_trained_model_from_checkpoint(config_file,
                                                   checkpoint_file,
                                                   training=False,
                                                   trainable=cfg_["bert_trainable"],
                                                   output_layer_num=cfg_["cls_num"],
                                                   seq_len=None)
        return model

    def get_opt(num_example, warmup_proportion=0.1, lr=2e-5, min_lr=None):
        """Return Nadam when cfg["opt"] is "nadam", otherwise AdamWarmup."""
        if cfg["opt"].lower() == "nadam":
            opt = Nadam(lr=lr)
        else:
            total_steps, warmup_steps = calc_train_steps(
                num_example=num_example,
                batch_size=B_SIZE,
                epochs=MAX_EPOCH,
                warmup_proportion=warmup_proportion,
            )

            opt = AdamWarmup(total_steps, warmup_steps, lr=lr, min_lr=min_lr)

        return opt

    model1 = _get_model(cfg["base_dir"], cfg)
    # NOTE(review): layers[-7] selects an intermediate BERT output by position;
    # the exact layer depends on keras_bert's layer layout - confirm.
    model1 = Model(inputs=model1.inputs[: 2], outputs=model1.layers[-7].output)

    # The embedding layer is required below, so load the matrix when the
    # caller did not supply one.
    if word_embedding_matrix is None:
        word_embedding_matrix = load_word_embed(word_index=word_index)
    embed_layer = Embedding(input_dim=word_embedding_matrix.shape[0],
                            output_dim=word_embedding_matrix.shape[1],
                            weights=[word_embedding_matrix],
                            trainable=cfg["trainable"],
                            name="embed_layer"
                            )

    inp_token1 = Input(shape=(None, ), dtype=np.int32, name="query_token_input")
    inp_segm1 = Input(shape=(None, ), dtype=np.float32, name="query_segm_input")

    inp_image = Input(shape=(None, 2048), dtype=np.float32, name="image_input")
    inp_image_mask = Input(shape=(None, ), dtype=np.float32, name="image_mask_input")
    inp_pos = Input(shape=(None, 5), dtype=np.float32, name="image_pos_input")
    inp_image_char = Input(shape=(None, cfg["max_char"]), dtype=np.int32, name='image_char_input')

    # --- Query branch: masked word embeddings -> BiLSTM -> BatchNorm --------
    # mask is 1.0 for real tokens and 0.0 for padding.
    mask = Lambda(lambda x: K.cast(K.not_equal(x, cfg["x_pad"]), 'float32'), name="token_mask")(inp_token1)
    word_embed = embed_layer(inp_token1)
    word_embed = Lambda(lambda x: x[0] * K.expand_dims(x[1], axis=-1))([word_embed, mask])
    word_embed = Bidirectional(LSTM(cfg["unit1_1"], return_sequences=True), merge_mode="sum")(word_embed)
    word_embed = BatchNormalization()(word_embed)
    word_embed = Lambda(lambda x: x[0] * K.expand_dims(x[1], axis=-1))([word_embed, mask])

    # BERT sequence output concatenated with the BiLSTM features; the vector
    # at the first position is used as the query representation.
    sequence_output = model1([inp_token1, inp_segm1])
    sequence_output = Concatenate(axis=-1)([sequence_output, word_embed])
    text_pool = Lambda(lambda x: x[:, 0, :])(sequence_output)

    # --- Image branch -------------------------------------------------------
    # Box-label tokens reuse the query embedding layer, followed by a
    # Conv1D + global max pooling applied to every box.
    character_embedding_layer = TimeDistributed(Sequential([
        embed_layer,
        # Embedding(input_dim=100, output_dim=char_embedding_size, input_length=chars_per_word),
        Conv1D(filters=128, kernel_size=3, name="char_embed_conv1d"),
        GlobalMaxPooling1D()
    ]), name='CharEmbedding')
    character_embedding_layer.build(input_shape=(None, None, cfg["max_char"]))
    image_char_embed = character_embedding_layer(inp_image_char)
    image_embed = Concatenate(axis=-1)([image_char_embed, inp_image])
    image_embed = Dense(256, activation='relu', name='image_embed')(image_embed)
    image_embed = Lambda(lambda x: x[0] * K.expand_dims(x[1], axis=-1))([image_embed, inp_image_mask])
    pos_embed = Dense(256, activation='relu', name='pos_embed')(inp_pos)
    pos_embed = Lambda(lambda x: x[0] * K.expand_dims(x[1], axis=-1))([pos_embed, inp_image_mask])
    embed = Add()([image_embed, pos_embed])  # (batch, boxes, 256): both Dense layers have 256 units

    image_embed = Bidirectional(LSTM(512, return_sequences=True), merge_mode="sum")(embed)
    image_embed = BatchNormalization()(image_embed)
    image_embed = Lambda(lambda x: x[0] * K.expand_dims(x[1], axis=-1))([image_embed, inp_image_mask])

    # The first box position is used as the image representation.
    image_pool = Lambda(lambda x: x[:, 0, :])(image_embed)

    # --- Classification head ------------------------------------------------
    pool = Concatenate(axis=-1)([image_pool, text_pool])
    pool = Dense(1024, activation="relu")(pool)
    pool = Dropout(0.3)(pool)
    pool = Dense(512, activation="relu")(pool)
    pool = Dense(128, activation="relu")(pool)

    output = Dense(2, activation='softmax', name='output')(pool)

    opt = get_opt(num_example=cfg["num_example"], lr=cfg['lr'], min_lr=cfg['min_lr'])
    model = Model(inputs=[inp_token1, inp_segm1,
                          inp_image, inp_image_mask,
                          inp_pos, inp_image_char], outputs=[output])

    model.compile(optimizer=opt, loss={
                'output': 'sparse_categorical_crossentropy'
            }, metrics=['accuracy'])
    if summary:
        model.summary()

    return model


In [6]:
def build_baseline_model(cfg, summary=False, word_embedding_matrix=None):
    """Build and compile the baseline (no BERT) query/image matching model.

    Same inputs and head as the BERT model, but the query representation
    comes only from the pretrained word embeddings followed by a BiLSTM. The
    segment input is kept in the model signature so the same data generator
    can feed both models; it is not connected to the output.

    Args:
        cfg: configuration dict; reads "opt", "trainable", "max_char",
            "x_pad", "unit1_1", "num_example", "lr" and "min_lr".
        summary: print ``model.summary()`` when true.
        word_embedding_matrix: pretrained embedding matrix. It used to be
            ignored and always reloaded from disk; it is now used when given
            and loaded with ``load_word_embed`` only when None.

    Returns:
        The compiled Keras model.
    """

    def get_opt(num_example, warmup_proportion=0.1, lr=2e-5, min_lr=None):
        """Return Nadam when cfg["opt"] is "nadam", otherwise AdamWarmup."""
        if cfg["opt"].lower() == "nadam":
            opt = Nadam(lr=lr)
        else:
            total_steps, warmup_steps = calc_train_steps(
                num_example=num_example,
                batch_size=B_SIZE,
                epochs=MAX_EPOCH,
                warmup_proportion=warmup_proportion,
            )

            opt = AdamWarmup(total_steps, warmup_steps, lr=lr, min_lr=min_lr)

        return opt

    # Honour the caller's matrix; fall back to the cached/ built one.
    if word_embedding_matrix is None:
        word_embedding_matrix = load_word_embed(word_index=word_index)
    embed_layer = Embedding(input_dim=word_embedding_matrix.shape[0],
                            output_dim=word_embedding_matrix.shape[1],
                            weights=[word_embedding_matrix],
                            trainable=cfg["trainable"],
                            name="embed_layer"
                            )

    inp_token1 = Input(shape=(None, ), dtype=np.int32, name="query_token_input")
    inp_segm1 = Input(shape=(None, ), dtype=np.float32, name="query_segm_input")

    inp_image = Input(shape=(None, 2048), dtype=np.float32, name="image_input")
    inp_image_mask = Input(shape=(None, ), dtype=np.float32, name="image_mask_input")
    inp_pos = Input(shape=(None, 5), dtype=np.float32, name="image_pos_input")
    inp_image_char = Input(shape=(None, cfg["max_char"]), dtype=np.int32, name='image_char_input')

    # --- Query branch: masked word embeddings -> BiLSTM -> BatchNorm --------
    # mask is 1.0 for real tokens and 0.0 for padding.
    mask = Lambda(lambda x: K.cast(K.not_equal(x, cfg["x_pad"]), 'float32'), name="token_mask")(inp_token1)
    word_embed = embed_layer(inp_token1)
    word_embed = Lambda(lambda x: x[0] * K.expand_dims(x[1], axis=-1))([word_embed, mask])

    # hp_units_lstm = hp.Int('lstm_units1', min_value=64, max_value=512, step=32)
    word_embed = Bidirectional(LSTM(cfg["unit1_1"], return_sequences=True), merge_mode="sum")(word_embed)
    word_embed = BatchNormalization()(word_embed)
    # word_embed = Dropout(0.3)(word_embed)
    word_embed = Lambda(lambda x: x[0] * K.expand_dims(x[1], axis=-1))([word_embed, mask])

    # The vector at the first position is used as the query representation.
    text_pool = Lambda(lambda x: x[:, 0, :])(word_embed)

    # --- Image branch -------------------------------------------------------
    # Box-label tokens reuse the query embedding layer, followed by a
    # Conv1D + global max pooling applied to every box.
    character_embedding_layer = TimeDistributed(Sequential([
        embed_layer,
        # Embedding(input_dim=100, output_dim=char_embedding_size, input_length=chars_per_word),
        Conv1D(filters=128, kernel_size=3, name="char_embed_conv1d"),
        GlobalMaxPooling1D()
    ]), name='CharEmbedding')
    character_embedding_layer.build(input_shape=(None, None, cfg["max_char"]))
    image_char_embed = character_embedding_layer(inp_image_char)
    image_embed = Concatenate(axis=-1)([image_char_embed, inp_image])
    image_embed = Dense(256, activation='relu', name='image_embed')(image_embed)
    image_embed = Lambda(lambda x: x[0] * K.expand_dims(x[1], axis=-1))([image_embed, inp_image_mask])
    pos_embed = Dense(256, activation='relu', name='pos_embed')(inp_pos)
    pos_embed = Lambda(lambda x: x[0] * K.expand_dims(x[1], axis=-1))([pos_embed, inp_image_mask])
    embed = Add()([image_embed, pos_embed])  # (batch, boxes, 256): both Dense layers have 256 units

    image_embed = Bidirectional(LSTM(512, return_sequences=True), merge_mode="sum")(embed)
    image_embed = BatchNormalization()(image_embed)
    image_embed = Lambda(lambda x: x[0] * K.expand_dims(x[1], axis=-1))([image_embed, inp_image_mask])

    # The first box position is used as the image representation.
    image_pool = Lambda(lambda x: x[:, 0, :])(image_embed)

    # --- Classification head ------------------------------------------------
    pool = Concatenate(axis=-1)([image_pool, text_pool])
    pool = Dense(1024, activation="relu")(pool)
    pool = Dropout(0.3)(pool)
    pool = Dense(512, activation="relu")(pool)
    pool = Dense(128, activation="relu")(pool)

    output = Dense(2, activation='softmax', name='output')(pool)

    opt = get_opt(num_example=cfg["num_example"], lr=cfg['lr'], min_lr=cfg['min_lr'])
    model = Model(inputs=[inp_token1, inp_segm1,
                          inp_image, inp_image_mask,
                          inp_pos, inp_image_char], outputs=[output])

    model.compile(optimizer=opt, loss={
                'output': 'sparse_categorical_crossentropy'
            }, metrics=['accuracy'])
    if summary:
        model.summary()

    return model


In [7]:
def token2id_X(X, x_dict, maxlen=None):
    """Tokenise ``X`` and map the tokens to ids.

    Args:
        X: text to tokenise with the module-level ``tokenizer``.
        x_dict: token -> id mapping; unknown tokens map to ``x_dict["[UNK]"]``.
        maxlen: optional maximum number of tokens. Longer sequences keep the
            first token, the next ``maxlen - 2`` tokens and the last token
            (with keras_bert these ends are expected to be the [CLS]/[SEP]
            markers - see the keras_bert Tokenizer documentation).

    Returns:
        tuple: ``(ids, seg)`` - token ids and an all-zero segment list of the
        same length.
    """
    tokens = list(tokenizer.tokenize(X))
    # Only truncate when the sequence is actually too long. The previous
    # unconditional slicing appended the last token a second time whenever the
    # sequence was shorter than maxlen, because tokens[1: maxlen - 1] already
    # contained it.
    if maxlen and len(tokens) > maxlen:
        tokens = tokens[: 1] + tokens[1: maxlen - 1] + tokens[-1:]
    seg = [0] * len(tokens)
    ids = [x_dict[tok] if tok in x_dict else x_dict["[UNK]"] for tok in tokens]
    return ids, seg


def seq_padding(X, maxlen=None, padding_value=None, debug=False):
    """Pad or truncate every sequence in ``X`` to a common length.

    Args:
        X: list of sequences (lists or arrays).
        maxlen: target length; defaults to the longest sequence in ``X``.
        padding_value: element appended to short sequences (a scalar, or an
            array when the sequence elements are vectors).
        debug: print the input and the padded result when true.

    Returns:
        numpy.ndarray built from the padded/truncated sequences.
    """
    if maxlen is None:
        maxlen = max(len(seq) for seq in X)

    rows = []
    for seq in X:
        missing = maxlen - len(seq)
        if missing > 0:
            rows.append(np.concatenate([seq, [padding_value] * missing]))
        else:
            rows.append(seq[: maxlen])
    pad_X = np.array(rows)

    if debug:
        print("[!] before pading {}\n".format(X))
        print("[!] after pading {}\n".format(pad_X))
    return pad_X
    

def MyChoice(Myset):
    """For every element of ``Myset`` pick a random *different* element.

    Args:
        Myset: collection of distinct hashable items (a set in this notebook).

    Returns:
        list: one randomly chosen other element per item, in the iteration
        order of ``Myset``. When fewer than two distinct items exist no
        alternative can be chosen and an empty list is returned (this used to
        raise IndexError for a one-element set).
    """
    # Materialise once instead of building a new set and list per element.
    items = list(Myset)
    if len(set(items)) < 2:
        return []
    return [choice([other for other in items if other != item]) for item in items]


class data_generator:
    """Endless batch generator producing positive and negative query/image pairs.

    Every yielded batch holds up to ``batch_size // 2`` positive samples
    (label 1) plus one negative per positive (label 0). A negative reuses the
    positive's image inputs but takes the query text of another row of the
    same batch.

    Yields:
        tuple: ``([T1, T2, Image1, image1_mask, Pos1, image1_char], Y)`` in the
        input order of the models: query token ids, query segment ids, box
        features, box mask, box positions, box-label token ids, and labels of
        shape ``(batch, 1)``.
    """

    def __init__(self, data, batch_size=B_SIZE, shuffle=SHUFFLE):
        self.data = data
        self.batch_size = batch_size
        # Number of full-size batches needed to cover len(data) rows.
        # NOTE(review): each batch contains only batch_size // 2 rows of
        # `data` (the rest are negatives), so `steps` batches cover about half
        # of the data - confirm this is the intended epoch length.
        self.steps = len(self.data) // self.batch_size
        self.shuffle = shuffle

        if len(self.data) % self.batch_size != 0:
            self.steps += 1

    def __len__(self):
        return self.steps

    def __iter__(self):
        """Yield batches forever, reshuffling the row order on every pass."""
        if len(self.data) == 0:
            # Nothing to yield; without this guard the loop below would spin forever.
            return

        while True:
            idxs = list(range(len(self.data)))
            if self.shuffle:
                np.random.shuffle(idxs)
            T1, T2, Image1, Pos1, image1_mask, image1_char = [], [], [], [], [], []
            S1, S2, Image2, Pos2, image2_mask, image2_char = [], [], [], [], [], []  # negative samples
            batch_ids = []  # row ids of the positives, in the order they were appended

            for i in idxs:
                d = self.data.iloc[i]
                text = d['words']
                label_words = d['label_words']

                t1, t2 = token2id_X(text, x_dict=word_index, maxlen=cfg["maxlen"])
                image = np.array(d['features'], dtype="float32")
                image = image[: cfg["max_box"]]
                img_mask = [1 for _ in image[: cfg["max_box"]]]

                pos = np.array(d['pos'], dtype="float32")
                pos = pos[: cfg["max_box"]]

                # One token-id row per box label, padded/truncated to max_char.
                image_char = [token2id_X(ent, x_dict=word_index)[0] for ent in label_words.split(IMAGE_LABEM_CONCAT_TOKEN)]
                image_char = image_char[: cfg["max_box"]]
                image_char = pad_sequences(image_char,
                                           maxlen=cfg["max_char"],
                                           dtype='int32',
                                           padding='post',
                                           truncating='post',
                                           value=cfg["x_pad"])

                assert image.shape[0] == pos.shape[0]
                assert image.shape[0] == cfg["max_box"] or image.shape[0] == len(label_words.split(IMAGE_LABEM_CONCAT_TOKEN))
                assert image_char.shape == (image.shape[0], cfg["max_char"])

                T1.append(t1)
                T2.append(t2)
                Image1.append(image)
                image1_mask.append(img_mask)
                Pos1.append(pos)
                image1_char.append(image_char)
                batch_ids.append(i)

                if len(T1) == self.batch_size//2 or i == idxs[-1]:
                    # Add negative samples. MyChoice returns its picks in the
                    # iteration order of the set, which is not the order in
                    # which the positives were appended; map every row id to
                    # its pick so a sample is never paired with its own query.
                    negatives = {}
                    if len(batch_ids) > 1:
                        id_pool = set(batch_ids)
                        negatives = dict(zip(id_pool, MyChoice(id_pool)))

                    for k, own_id in enumerate(batch_ids):
                        neg_id = negatives.get(own_id)
                        if neg_id is None:
                            # Single-sample batch: no other query is available.
                            continue
                        d_new = self.data.iloc[neg_id]
                        neg_t1, neg_t2 = token2id_X(d_new['words'], x_dict=word_index, maxlen=cfg["maxlen"])
                        S1.append(neg_t1)
                        S2.append(neg_t2)

                        # The negative keeps the image side of positive k.
                        Image2.append(Image1[k])
                        Pos2.append(Pos1[k])
                        image2_mask.append(image1_mask[k])
                        image2_char.append(image1_char[k])

                    Y = [1] * len(T1) + [0] * len(S1)

                    T1 = seq_padding(T1 + S1, padding_value=cfg["x_pad"])
                    T2 = seq_padding(T2 + S2, padding_value=cfg["x_pad"])

                    Image1 = seq_padding(Image1 + Image2,
                                         padding_value=np.zeros(shape=(2048, ))
                                         )

                    Pos1 = seq_padding(Pos1 + Pos2,
                                       padding_value=np.zeros(shape=(5, ))
                                       )
                    image1_mask = seq_padding(image1_mask + image2_mask,
                                              padding_value=0)

                    image1_char = seq_padding(image1_char + image2_char,
                                              padding_value=np.zeros(shape=(cfg["max_char"])), debug=False)

                    Y = np.array(Y).reshape((len(T1), -1))

                    # Mix positives and negatives inside the batch.
                    idx = np.arange(len(T1))
                    np.random.shuffle(idx)

                    T1 = T1[idx]
                    T2 = T2[idx]
                    Image1 = Image1[idx]
                    image1_mask = image1_mask[idx]
                    Pos1 = Pos1[idx]
                    image1_char = image1_char[idx]
                    Y = Y[idx]

                    yield [T1, T2, Image1, image1_mask, Pos1, image1_char], Y
                    T1, T2, Image1, Pos1, image1_mask, image1_char = [], [], [], [], [], []
                    S1, S2, Image2, Pos2, image2_mask, image2_char = [], [], [], [], [], []  # negative samples
                    batch_ids = []

                        

In [9]:
train_D = data_generator(train_data)
val_D = data_generator(val_data)
_i  = 0
for d in train_D:
    _i += 1
    if  _i > 10:
        break
    print('x',d[0][0].shape, d[0][1].shape,d[0][2].shape, d[0][3].shape, d[0][4].shape, d[0][5].shape, d[1].shape)

_i  = 0
for d in val_D:
    _i += 1
    if  _i > 10:
        break
    print('x',d[0][0].shape, d[0][1].shape,d[0][2].shape, d[0][3].shape, d[0][4].shape, d[0][5].shape, d[1].shape)

x (256, 10) (256, 10) (256, 5, 2048) (256, 5) (256, 5, 5) (256, 5, 5) (256, 1)
x (256, 10) (256, 10) (256, 5, 2048) (256, 5) (256, 5, 5) (256, 5, 5) (256, 1)
x (256, 10) (256, 10) (256, 5, 2048) (256, 5) (256, 5, 5) (256, 5, 5) (256, 1)
x (256, 10) (256, 10) (256, 5, 2048) (256, 5) (256, 5, 5) (256, 5, 5) (256, 1)
x (256, 10) (256, 10) (256, 5, 2048) (256, 5) (256, 5, 5) (256, 5, 5) (256, 1)
x (256, 10) (256, 10) (256, 5, 2048) (256, 5) (256, 5, 5) (256, 5, 5) (256, 1)
x (256, 10) (256, 10) (256, 5, 2048) (256, 5) (256, 5, 5) (256, 5, 5) (256, 1)
x (256, 10) (256, 10) (256, 5, 2048) (256, 5) (256, 5, 5) (256, 5, 5) (256, 1)
x (256, 10) (256, 10) (256, 5, 2048) (256, 5) (256, 5, 5) (256, 5, 5) (256, 1)
x (256, 10) (256, 10) (256, 5, 2048) (256, 5) (256, 5, 5) (256, 5, 5) (256, 1)
x (256, 10) (256, 10) (256, 5, 2048) (256, 5) (256, 5, 5) (256, 5, 5) (256, 1)
x (256, 10) (256, 10) (256, 5, 2048) (256, 5) (256, 5, 5) (256, 5, 5) (256, 1)
x (256, 10) (256, 10) (256, 5, 2048) (256, 5) (256, 

In [10]:
class Evaluate(Callback):
    """Keras callback that scores the model with nDCG@5 after every epoch.

    After each epoch the validation frame ``val_data`` is scored row by row,
    the five best products per query are written to
    'result/val_submission.csv' and compared with the reference answers in
    ``VAL_ANS_PATH``. The best-scoring model is saved to ``filename + ".h5"``;
    extra snapshots are saved after the epochs listed in ``SAVE_EPOCHS``.
    """

    def __init__(self, filename=None):
        # Let the Keras base class initialise its own attributes.
        super().__init__()
        self.score = []      # (epoch, score) history
        self.best = 0.       # best nDCG@5 seen so far
        self.filename = filename

    def on_epoch_begin(self, epoch, logs=None):
        """Before the first epoch, check that the model can be saved and reloaded."""
        if epoch == 0:
            print("[!] test load&save model")
            f = self.filename + ".h5"
            custom_objects = get_custom_objects()
            self.model.save(f, include_optimizer=False, overwrite=True)
            # The loaded model is discarded; only a successful load matters.
            if "bert" in cfg["verbose"]:
                load_model(f, custom_objects=custom_objects)
            else:
                load_model(f)

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate, record the score and save checkpoints."""
        score = self.evaluate(self.model)
        self.score.append((epoch, score))
        if logs is not None:
            logs['nDCG@5'] = score
        tf.summary.scalar('nDCG@5', score, step=epoch)

        if epoch + 1 in SAVE_EPOCHS:
            self.model.save(self.filename + "_{}.h5".format(epoch + 1), include_optimizer=False, overwrite=True)
        if score > self.best:
            self.model.save(self.filename + ".h5", include_optimizer=False)
            self.best = score
            print("[!] epoch = {}, new NDCG best score = {}".format(epoch + 1,  score))
        print('[!] epoch = {}, score = {}, NDCG best score: {}\n'.format(epoch + 1, score, self.best))

    def eval_preprocess(self, row):
        """Build the model inputs for one validation row and return the prediction."""
        d = row
        text = d['query']
        label_words = d['label_words']
        t1, t2 = token2id_X(text, x_dict=word_index, maxlen=cfg["maxlen"])

        image = np.array(d['feature_convert'], dtype="float32")
        image = image[: cfg["max_box"]]
        img_mask = [1 for _ in image[: cfg["max_box"]]]
        pos = np.array(d['pos'], dtype="float32")
        pos = pos[: cfg["max_box"]]

        image_char = [token2id_X(ent, x_dict=word_index)[0] for ent in label_words.split(IMAGE_LABEM_CONCAT_TOKEN)]
        image_char = image_char[: cfg["max_box"]]
        image_char = pad_sequences(image_char,
                                   maxlen=cfg["max_char"],
                                   dtype='int32',
                                   padding='post',
                                   truncating='post',
                                   value=cfg["x_pad"])
        # Batch of one sample.
        output = self.model.predict([np.asarray([t1]), np.asarray([t2]), np.asarray([image]), np.asarray([img_mask]), np.asarray([pos]), np.asarray([image_char])])
        return output

    def evaluate(self, model):
        """Score ``model`` on ``val_data`` and return the mean nDCG@5.

        NOTE(review): every query needs at least five candidate rows in
        ``val_data``; fewer raise IndexError when the top five are selected.
        """
        self.model = model
        k = 5
        result = defaultdict(list)
        val_results = val_data.apply(self.eval_preprocess, axis=1)
        query_ids = val_data["query_id"].values
        product_ids = val_data["product_id"].values

        # Iterate positionally: `val_results[i]` is a label lookup and fails
        # (or picks the wrong row) when val_data does not have a 0..n-1 index.
        for q_id, p_id, out in tqdm(zip(query_ids, product_ids, val_results), total=len(val_data)):
            # out[0][1]: second softmax output of the single-sample batch.
            result[q_id].append((p_id, out[0][1]))

        columns = {'query-id': []}
        for rank in range(k):
            columns['product{}'.format(rank + 1)] = []
        for key, rlist in result.items():
            rlist.sort(key=lambda x: x[1], reverse=True)
            columns['query-id'].append(key)
            for rank in range(k):
                columns['product{}'.format(rank + 1)].append(rlist[rank][0])
        sub = pd.DataFrame(columns)

        os.makedirs('result', exist_ok=True)
        sub.to_csv('result/val_submission.csv', index=False)

        # Close the reference file deterministically.
        with open(VAL_ANS_PATH) as ref_file:
            reference = json.load(ref_file)

        # read predictions
        predictions = read_submission('result/val_submission.csv', reference, k)

        # compute score for each query
        score_sum = 0.
        for ref_qid in reference.keys():
            ground_truth_ids = set([str(ref_pid) for ref_pid in reference[ref_qid]])
            ref_vec = [1.0] * len(ground_truth_ids)
            pred_vec = [1.0 if pred_pid in ground_truth_ids else 0.0 for pred_pid in predictions[ref_qid]]
            score_sum += get_ndcg(pred_vec, ref_vec, k)
        # the higher score, the better
        score = score_sum / len(reference)

        return score

## Train Baseline

In [None]:

# Baseline run: set the output name, seed every RNG and build the model.
gc.collect()
fold_id = -1
print("\n\n[!] fold_id = {} starting".format(fold_id))
# cfg["filename"] = cfg["raw_filename"].format(cfg["verbose"], FOLD_NUM, fold_id)

# Path prefix used by the Evaluate callback for the saved .h5 files.
cfg["filename"] = 'models/1M_baseline'

# Number of training rows; read by the optimizer setup inside the model builder.
cfg["num_example"] = len(train_data)
print(len(train_data))

# tf.compat.v1.keras.backend.clear_session()
gc.collect()
# Seed Python, NumPy and TensorFlow (fold_id is -1, so the seed is SEED + 1).
seed(SEED - fold_id)
np.random.seed(SEED - fold_id)
tf.compat.v1.random.set_random_seed(SEED - fold_id)
train_D = data_generator(train_data)
print(cfg)
model = build_baseline_model(cfg, summary=True, 
                    word_embedding_matrix=word_embedding_matrix,
                    )
# Writes an architecture diagram; the install cell lists pydot and graphviz for this.
tf.keras.utils.plot_model(model, to_file="models/model_baseline.png", show_shapes=True)


### Model Fitting

In [None]:
# Callbacks: nDCG@5 evaluation / checkpointing and TensorBoard logging.
evaluator = Evaluate(filename=cfg["filename"])
log_dir = "logs/fit/1M_basaline"
tensorboard_callback = tf.compat.v1.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# The Evaluate callback reads the global `val_data`; use the full validation set.
with open("data/val_data.pkl", "rb") as f:
    val_data = joblib.load(f)

# steps_per_epoch previously read cfg["batch_size"], a key that is never set
# (KeyError). Use the generator's own length, like the BERT fit cell does.
model.fit(iter(train_D),
          steps_per_epoch=len(train_D),
          epochs=MAX_EPOCH,
          callbacks=[evaluator, tensorboard_callback],
          shuffle=True
          )
print("\n\n[!] fold_id = {} finish".format(fold_id))
# Drop the in-memory model; the prediction cell reloads the saved .h5 file.
del model, evaluator

In [None]:
%tensorboard --logdir logs/fit

### Predict

In [None]:
# Load the test set. pickle.load executes code embedded in the file, so only
# load files from a trusted source.
# NOTE(review): the other data files are read with joblib.load - confirm this
# file was written with plain pickle.
with open('data/testA_data.pkl', 'rb') as outp:
    test_data = pickle.load(outp)

# Reload the model file written by the Evaluate callback for the current run.
f = cfg["filename"] + ".h5"
print(f)
# keras_bert layers are passed as custom objects when the run name mentions "bert".
if "bert" in cfg["verbose"]:
    custom_objects = get_custom_objects()
    model = load_model(f, custom_objects=custom_objects)  
else:
    model = load_model(f)
print("finish")

In [None]:
# Score every (query, product) row of the test set, one model call per row.
gc.collect()
result = defaultdict(list)  # query id -> list of (product id, score)
for i in trange(len(test_data)):
    d = test_data.iloc[i]
    qid = d['query_id']
    pid = d['product_id']
    text = d['query']
    label_words = d['label_words']
    t1, t2 = token2id_X(text, x_dict=word_index, maxlen=cfg["maxlen"])

    # Keep at most max_box boxes; every kept box gets mask value 1.
    image = np.array(d['feature_convert'], dtype="float32")
    image = image[: cfg["max_box"]]
    img_mask = [1 for _ in image[: cfg["max_box"]]]                   
    pos = np.array(d['pos'], dtype="float32")
    pos = pos[: cfg["max_box"]]

    # One token-id row per box label, padded/truncated to max_char.
    image_char = [token2id_X(ent, x_dict=word_index)[0] for ent in label_words.split(IMAGE_LABEM_CONCAT_TOKEN)]
    image_char = image_char[: cfg["max_box"]]
    image_char = pad_sequences(image_char, 
                               maxlen=cfg["max_char"], 
                               dtype='int32',
                               padding='post',
                               truncating='post',
                               value=cfg["x_pad"]) 
    # Batch of one sample; output[0][1] is the second softmax output.
    output = model.predict([np.asarray([t1]), np.asarray([t2]), np.asarray([image]), np.asarray([img_mask]), np.asarray([pos]), np.asarray([image_char])])
    result[qid].append((pid, output[0][1]))


In [None]:
# Keep the five highest-scoring products per query and write the submission file.
# NOTE(review): a query with fewer than five scored products raises IndexError here.
query_id,product1,product2,product3,product4,product5 = [],[],[],[],[],[]
for key in result.keys():
    rlist = result[key]
    # Sort candidates by score, best first.
    rlist.sort(key=lambda x: x[1], reverse=True)
    query_id.append(key)
    product1.append(rlist[0][0])
    product2.append(rlist[1][0])
    product3.append(rlist[2][0])
    product4.append(rlist[3][0])
    product5.append(rlist[4][0])

sub = pd.DataFrame({'query-id':query_id,
                    'product1':product1,
                    'product2':product2,
                    'product3':product3,
                    'product4':product4,
                    'product5':product5,

})

# index=0 is falsy, so the row index is not written.
sub.to_csv('result/submission_1M.csv',index=0)

In [None]:
sub.head(5)

### Eval

In [None]:
!python eval.py data/testA_answer.json result/submission_1M.csv result/testA_result_1M.json

## Train Bert Model

In [None]:

# BERT run: set the output name, reset the Keras session, seed every RNG and
# build the model.
gc.collect()
fold_id = -1
# cfg["filename"] = cfg["raw_filename"].format(cfg["verbose"], FOLD_NUM, fold_id)

# Path prefix used by the Evaluate callback for the saved .h5 files.
# NOTE(review): the name says 100K while the loaded training file is named
# 1M_data.pkl - confirm.
cfg["filename"] = 'models/100K_with_bert'

# Number of training rows; read by the optimizer setup inside the model builder.
cfg["num_example"] = len(train_data)
print(len(train_data))

# Reset the Keras session state before building a second model.
tf.compat.v1.keras.backend.clear_session()
gc.collect()
# Seed Python, NumPy and TensorFlow (fold_id is -1, so the seed is SEED + 1).
seed(SEED - fold_id)
np.random.seed(SEED - fold_id)
tf.compat.v1.random.set_random_seed(SEED - fold_id)
train_D = data_generator(train_data)
print(cfg)
model = build_model(cfg, summary=True, 
                    word_embedding_matrix=word_embedding_matrix,
                    )
tf.keras.utils.plot_model(model, to_file="models/model_with_bert.png", show_shapes=True)


In [None]:
# Callbacks: nDCG@5 evaluation / checkpointing and TensorBoard logging.
evaluator = Evaluate(filename=cfg["filename"])
log_dir = "logs/fit/100k_with_bert"
tensorboard_callback = tf.compat.v1.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# The Evaluate callback reads the global `val_data`; use the full validation set.
with open("data/val_data.pkl", "rb") as f:
    val_data = joblib.load(f)
    
# The generator is endless, so the epoch length is fixed by steps_per_epoch.
model.fit(train_D.__iter__(),
                          steps_per_epoch=len(train_D),
                          epochs=MAX_EPOCH,
                          callbacks=[evaluator, tensorboard_callback],
                          shuffle=True
                          )

In [None]:
%tensorboard 

## Hyperparameter Opt

In [None]:
def build_model(hp):
    """Build and compile the text/image matching model for one Keras Tuner trial.

    This is the BERT-free variant: the query is encoded with the pretrained word
    embedding and a BiLSTM, the image boxes with their 2048-d features, their
    label words (through the same embedding) and their 5-d position vectors.

    NOTE: this definition shadows the earlier ``build_model(cfg, summary=...,
    word_embedding_matrix=...)`` used in the "Train Bert Model" section; re-run
    that earlier definition before executing the training cell again.

    Tuned hyperparameters (names as registered on ``hp``):
        activation1..activation5  activation of each Dense layer
        dropout_prop1/2           dropout rate after the position and first head layer
        dense_units3..5           width of the three head Dense layers (64..2048, step 32)
        learning_rate             Adam learning rate
    LSTM widths, the Conv1D filter count and the 256-d projections are fixed.

    Reads the globals ``cfg`` (keys "trainable", "max_char", "x_pad", "unit1_1")
    and ``word_index``.

    Args:
        hp: the ``HyperParameters`` object supplied by the tuner.

    Returns:
        A compiled Keras ``Model`` with six inputs and a 2-way softmax output
        named "output", trained with sparse categorical cross-entropy.
    """
    activation_choices = ['relu', 'tanh', 'sigmoid', 'selu', 'elu']
    dropout_choices = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

    def _mask_steps(sequence, step_mask):
        # Zero out padded time steps: broadcast the (batch, steps) mask over features.
        return Lambda(lambda x: x[0] * K.expand_dims(x[1], axis=-1))([sequence, step_mask])

    # NOTE(review): the embedding matrix is reloaded on every trial; if
    # load_word_embed is expensive, consider loading it once outside the builder.
    word_embedding_matrix = load_word_embed(word_index=word_index)
    embed_layer = Embedding(input_dim=word_embedding_matrix.shape[0],
                            output_dim=word_embedding_matrix.shape[1],
                            weights=[word_embedding_matrix],
                            trainable=cfg["trainable"],
                            name="embed_layer"
                        )

    inp_token1 = Input(shape=(None, ), dtype=np.int32, name="query_token_input")
    # Declared as a model input but not connected to any layer in this variant
    # (the BERT branch that consumed it is disabled); presumably kept so the data
    # generator's batch layout stays unchanged — confirm.
    inp_segm1 = Input(shape=(None, ), dtype=np.float32, name="query_segm_input")

    inp_image = Input(shape=(None, 2048), dtype=np.float32, name="image_input")
    inp_image_mask = Input(shape=(None, ), dtype=np.float32, name="image_mask_input")
    inp_pos = Input(shape=(None, 5), dtype=np.float32, name="image_pos_input")
    inp_image_char = Input(shape=(None, cfg["max_char"]), dtype=np.int32, name='image_char_input')

    # --- Query branch: embedding -> BiLSTM (summed directions) -> BatchNorm ---
    # The mask is 1.0 wherever the token differs from the padding id.
    mask = Lambda(lambda x: K.cast(K.not_equal(x, cfg["x_pad"]), 'float32'), name="token_mask")(inp_token1)
    word_embed = embed_layer(inp_token1)
    word_embed = _mask_steps(word_embed, mask)

    word_embed = Bidirectional(LSTM(cfg["unit1_1"], return_sequences=True), merge_mode="sum")(word_embed)
    word_embed = BatchNormalization()(word_embed)
    word_embed = _mask_steps(word_embed, mask)

    # The first time step is used as the pooled query representation.
    text_pool = Lambda(lambda x: x[:, 0, :])(word_embed)

    # --- Image branch ---
    # Label words of every box go through the shared word embedding, a Conv1D
    # and max-pooling; TimeDistributed applies this to each box separately.
    character_embedding_layer = TimeDistributed(Sequential([
        embed_layer,
        Conv1D(filters=128, kernel_size=3, padding='same', name="char_embed_conv1d"),
        GlobalMaxPooling1D()
    ]), name='CharEmbedding')
    character_embedding_layer.build(input_shape=(None, None, cfg["max_char"]))
    image_char_embed = character_embedding_layer(inp_image_char)
    image_embed = Concatenate(axis=-1)([image_char_embed, inp_image])
    hp_activation = hp.Choice('activation1', values=activation_choices)
    image_embed = Dense(256, activation=hp_activation, name='image_embed')(image_embed)
    image_embed = _mask_steps(image_embed, inp_image_mask)

    hp_activation = hp.Choice('activation2', values=activation_choices)
    hp_dropout_prop = hp.Choice('dropout_prop1', values=dropout_choices)
    pos_embed = Dense(256, activation=hp_activation, name='pos_embed')(inp_pos)
    pos_embed = Dropout(hp_dropout_prop)(pos_embed)
    pos_embed = _mask_steps(pos_embed, inp_image_mask)
    # Both projections are 256 wide, so the sum is (batch, boxes, 256).
    embed = Add()([image_embed, pos_embed])

    image_embed = Bidirectional(LSTM(512, return_sequences=True), merge_mode="sum")(embed)
    image_embed = BatchNormalization()(image_embed)
    image_embed = _mask_steps(image_embed, inp_image_mask)

    # The first box is used as the pooled image representation.
    image_pool = Lambda(lambda x: x[:, 0, :])(image_embed)

    # --- Classification head: three tuned Dense layers, then 2-way softmax ---
    pool = Concatenate(axis=-1)([image_pool, text_pool])

    hp_units = hp.Int('dense_units3', min_value=64, max_value=2048, step=32)
    hp_activation = hp.Choice('activation3', values=activation_choices)
    hp_dropout_prop = hp.Choice('dropout_prop2', values=dropout_choices)
    pool = Dense(hp_units, activation=hp_activation)(pool)
    pool = Dropout(hp_dropout_prop)(pool)
    hp_units1 = hp.Int('dense_units4', min_value=64, max_value=2048, step=32)
    hp_activation1 = hp.Choice('activation4', values=activation_choices)
    pool = Dense(hp_units1, activation=hp_activation1)(pool)
    hp_units2 = hp.Int('dense_units5', min_value=64, max_value=2048, step=32)
    hp_activation2 = hp.Choice('activation5', values=activation_choices)
    pool = Dense(hp_units2, activation=hp_activation2)(pool)

    output = Dense(2, activation='softmax', name='output')(pool)

    model = Model(inputs=[inp_token1, inp_segm1,
                          inp_image, inp_image_mask,
                          inp_pos, inp_image_char], outputs=[output])
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4, 1e-5])
    model.compile(optimizer=Adam(learning_rate=hp_learning_rate), loss={
                'output': 'sparse_categorical_crossentropy'
            }, metrics=['accuracy'])

    return model

#### Random Search

In [None]:
# Seed NumPy and TensorFlow before the random search starts.
for set_seed in (np.random.seed, tf.compat.v1.random.set_random_seed):
    set_seed(SEED)

# 50 random trials, ranked by validation accuracy; overwrite=True discards any
# earlier results stored under the same directory/project.
random_search_settings = dict(
    objective='val_accuracy',
    max_trials=50,
    directory='tmp/hyperparameter_tuning',
    project_name='multimodal_hyperparameter_tuning',
    overwrite=True,
    seed=1,
)
tuner = kt.RandomSearch(build_model, **random_search_settings)

In [None]:
# TensorBoard logs for the random search go to a relative directory.
log_dir = "tmp/hparam_logs"
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir)
# Stop a trial once validation accuracy has not improved for five epochs.
stop_early = keras.callbacks.EarlyStopping(patience=5, monitor='val_accuracy')

In [None]:
# Run the random search. train_D / val_D already yield complete batches, so the
# epoch length is given in steps and no batch_size is passed (Keras asks not to
# specify batch_size for generator input).
tuner.search(
    train_D.__iter__(),
    epochs=20,
    steps_per_epoch=len(train_D),
    validation_data=val_D.__iter__(),
    validation_steps=len(val_D),
    callbacks=[stop_early, tensorboard_callback],
)


#### Hyperband

In [None]:
# Hyperband search maximising validation accuracy; overwrite=True starts from
# scratch and discards earlier results in this directory/project.
hyperband_settings = dict(
    objective=kt.Objective("val_accuracy", direction="max"),
    max_epochs=20,
    factor=3,
    directory='tmp/hyperband_hyperparameter_tuning',
    project_name='multimodal_hyperband_hyperparameter_tuning',
    overwrite=True,
    seed=1,
)
tuner = kt.Hyperband(build_model, **hyperband_settings)


In [None]:
# Callbacks for the Hyperband search: early stopping on validation accuracy and
# TensorBoard logging to a separate directory from the random search.
stop_early = keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5)
log_dir = "tmp/hparam_hyperband_logs"
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir)

# train_D / val_D already yield complete batches, so the epoch length is given
# in steps and no batch_size is passed (Keras asks not to specify batch_size
# for generator input).
tuner.search(
    train_D.__iter__(),
    epochs=20,
    steps_per_epoch=len(train_D),
    validation_data=val_D.__iter__(),
    validation_steps=len(val_D),
    callbacks=[stop_early, tensorboard_callback],
)



In [None]:
%tensorboard --logdir /tmp/hparam_logs

#### Select the best model

In [None]:
# Re-create the Hyperband tuner with the same directory/project as the search
# above; overwrite=False keeps the stored trials so they can be queried.
tuner = kt.Hyperband(
    build_model,
    directory='tmp/hyperband_hyperparameter_tuning',
    project_name='multimodal_hyperband_hyperparameter_tuning',
    overwrite=False,
    objective=kt.Objective("val_accuracy", direction="max"),
    max_epochs=20,
    factor=3,
    seed=1,
)


In [None]:
# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters()[0]

# `values` is the public name -> chosen value mapping; the private `_hps`
# attribute holds the search-space definitions, not the selected values.
print(f"""
The hyperparameter search is complete. The optimal parameters are {best_hps.values}
""")

In [None]:
# Rebuild the model with the best hyperparameters found by the tuner and train
# it from scratch for MAX_EPOCH epochs.
hyperband_model = tuner.hypermodel.build(best_hps)

# val_data is not passed to fit(); NOTE(review): presumably the Evaluate
# callback reads it as a global — confirm before reordering these statements.
with open("data/val_data.pkl", "rb") as f:
    val_data = joblib.load(f)

# `Evaluate` is defined earlier in the notebook; "hyperband_best_model" is the
# filename it is given (presumably the checkpoint name prefix — confirm).
evaluator = Evaluate(filename="hyperband_best_model")
log_dir = "logs/fit/Hyperband_best_hyparam" 
tensorboard_callback = tf.compat.v1.keras.callbacks.TensorBoard(log_dir=log_dir)

# train_D yields ready-made batches, so the epoch length is given in steps.
hyperband_model.fit(train_D.__iter__(),
                          steps_per_epoch=len(train_D),
                          epochs=MAX_EPOCH,
                          callbacks=[evaluator, tensorboard_callback],
                          shuffle=True
                          )

In [None]:
%tensorboard

## Evaluation

In [17]:
# Checkpoints to evaluate, grouped by model family. Each list is ordered by
# training-set size and is zipped with the matching *_data_size list.
_MODELS_DIR = "/root/Applied_AI_Lab_WiSe2021_Passau/ai-light/models"
bert_models = [
    _MODELS_DIR + "/" + checkpoint
    for checkpoint in ("10K_with_bert_20.h5", "100K_with_bert_20.h5", "bert_1M_example.h5")
]
without_models = [
    _MODELS_DIR + "/" + checkpoint
    for checkpoint in ("10K_baseline_10.h5", "100K_baseline_10.h5")
]

In [18]:
# Training-set size labels, in the same order as the checkpoint lists; they
# become part of the result file names and prediction keys.
bert_data_size = ["10K", "100K", "1M"]
# The baseline family has no 1M checkpoint (the slice makes an independent list).
without_data_size = bert_data_size[:2]

In [10]:
# Load the two preprocessed test sets.
# NOTE: pickle.load executes arbitrary code from the file; only load files you produced yourself.
with open('data/testA_data.pkl', 'rb') as test_a_file:
    testA_data = pickle.load(test_a_file)

with open('data/testB_data.pkl', 'rb') as test_b_file:
    testB_data = pickle.load(test_b_file)

In [12]:
# Score every BERT-based checkpoint on test sets A and B. For each split this
# writes the top-5 submission CSV, runs eval.py on it, and keeps the raw
# (product_id, score) lists per query for the ensembling step.
bert_final_A_B_predictions = {}
bert_results_prefix = "/root/Applied_AI_Lab_WiSe2021_Passau/ai-light/final_results/bert_"
for model_path, size in zip(bert_models, bert_data_size):
    print(f"Evaluating model {model_path} trained on {size} data")
    # keras_bert layers must be registered to deserialize the checkpoint.
    custom_objects = get_custom_objects()
    # model_path stays the checkpoint path; model is the loaded network.
    model = load_model(model_path, custom_objects=custom_objects)
    result_A = defaultdict(list)
    result_B = defaultdict(list)
    # Both splits go through the same pipeline, one after the other.
    for split, test_data, split_result in (("A", testA_data, result_A),
                                           ("B", testB_data, result_B)):
        for i in trange(len(test_data)):
            d = test_data.iloc[i]
            qid = d['query_id']
            pid = d['product_id']
            text = d['query']
            label_words = d['label_words']
            t1, t2 = token2id_X(text, x_dict=word_index, maxlen=cfg["maxlen"])

            # Keep at most cfg["max_box"] boxes; every kept box is unmasked.
            image = np.array(d['feature_convert'], dtype="float32")
            image = image[: cfg["max_box"]]
            img_mask = [1 for _ in image]
            pos = np.array(d['pos'], dtype="float32")
            pos = pos[: cfg["max_box"]]

            # One token-id row per box label, padded/truncated to cfg["max_char"].
            image_char = [token2id_X(ent, x_dict=word_index)[0] for ent in label_words.split(IMAGE_LABEM_CONCAT_TOKEN)]
            image_char = image_char[: cfg["max_box"]]
            image_char = pad_sequences(image_char,
                                       maxlen=cfg["max_char"],
                                       dtype='int32',
                                       padding='post',
                                       truncating='post',
                                       value=cfg["x_pad"])
            # Each example is predicted as a batch of one.
            output = model.predict([np.asarray([t1]), np.asarray([t2]), np.asarray([image]), np.asarray([img_mask]), np.asarray([pos]), np.asarray([image_char])])
            # Second output column of the single example is the ranking score
            # (presumably the probability of a match — confirm against training labels).
            split_result[qid].append((pid, output[0][1]))

        # Rank candidates per query, best score first, and keep the top five.
        # Indexing goes up to rank 4, so every query needs at least five candidates.
        query_id,product1,product2,product3,product4,product5 = [],[],[],[],[],[]
        for key in split_result.keys():
            rlist = split_result[key]
            rlist.sort(key=lambda x: x[1], reverse=True)
            query_id.append(key)
            product1.append(rlist[0][0])
            product2.append(rlist[1][0])
            product3.append(rlist[2][0])
            product4.append(rlist[3][0])
            product5.append(rlist[4][0])

        sub = pd.DataFrame({'query-id':query_id,
                            'product1':product1,
                            'product2':product2,
                            'product3':product3,
                            'product4':product4,
                            'product5':product5,

        })

        submission_stem = bert_results_prefix + size + "_submission_" + split
        sub.to_csv(submission_stem + ".csv", index=False)
        eval_status = os.system("python eval.py data/test" + split + "_answer.json " + submission_stem + ".csv " + submission_stem + ".json")
        if eval_status != 0:
            # Report a failed scoring run, but keep evaluating the remaining models.
            print(f"eval.py exited with status {eval_status} for {submission_stem}.csv")
    bert_final_A_B_predictions["bert"+size+"_A"] = result_A
    bert_final_A_B_predictions["bert"+size+"_B"] = result_B
    

Evaluating model /root/Applied_AI_Lab_WiSe2021_Passau/ai-light/models/10K_with_bert_20.h5 trained on 10K data


  0%|          | 0/28830 [00:00<?, ?it/s]2022-02-02 08:04:11.263644: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8301
2022-02-02 08:04:12.110595: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2022-02-02 08:04:12.111422: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2022-02-02 08:04:12.111497: W tensorflow/stream_executor/gpu/asm_compiler.cc:80] Couldn't get ptxas version string: INTERNAL: Couldn't invoke ptxas --version
2022-02-02 08:04:12.113641: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2022-02-02 08:04:12.114214: W tensorflow/stream_executor/gpu/redzone_allocator.cc:314] INTERNAL: Failed to launch ptxas
Relying on driver to perform ptx compilation. 
Modify $PATH to customize ptxas location.
This message will be only logged once.
Could not load symbol 

Read standard from data/testA_answer.json
Read user submit file from /root/Applied_AI_Lab_WiSe2021_Passau/ai-light/final_results/bert_10K_submission_A.csv
The evaluation finished successfully.


100%|██████████| 29005/29005 [1:01:30<00:00,  7.86it/s]


Read standard from data/testB_answer.json
Read user submit file from /root/Applied_AI_Lab_WiSe2021_Passau/ai-light/final_results/bert_10K_submission_B.csv
The evaluation finished successfully.
Evaluating model /root/Applied_AI_Lab_WiSe2021_Passau/ai-light/models/100K_with_bert_20.h5 trained on 100K data


100%|██████████| 28830/28830 [1:01:55<00:00,  7.76it/s]


Read standard from data/testA_answer.json
Read user submit file from /root/Applied_AI_Lab_WiSe2021_Passau/ai-light/final_results/bert_100K_submission_A.csv
The evaluation finished successfully.


100%|██████████| 29005/29005 [1:01:17<00:00,  7.89it/s]


Read standard from data/testB_answer.json
Read user submit file from /root/Applied_AI_Lab_WiSe2021_Passau/ai-light/final_results/bert_100K_submission_B.csv
The evaluation finished successfully.
Evaluating model /root/Applied_AI_Lab_WiSe2021_Passau/ai-light/models/bert_1M_example.h5 trained on 1M data


100%|██████████| 28830/28830 [1:01:13<00:00,  7.85it/s]


Read standard from data/testA_answer.json
Read user submit file from /root/Applied_AI_Lab_WiSe2021_Passau/ai-light/final_results/bert_1M_submission_A.csv
The evaluation finished successfully.


100%|██████████| 29005/29005 [1:01:19<00:00,  7.88it/s]


Read standard from data/testB_answer.json
Read user submit file from /root/Applied_AI_Lab_WiSe2021_Passau/ai-light/final_results/bert_1M_submission_B.csv
The evaluation finished successfully.


In [13]:
import pickle

# Persist the raw per-query predictions; the context manager closes (and
# flushes) the file as soon as the dump finishes.
with open("/root/Applied_AI_Lab_WiSe2021_Passau/ai-light/final_results/bert_final_A_B_predictions.pkl", "wb") as predictions_file:
    pickle.dump(bert_final_A_B_predictions, predictions_file)

In [20]:
# Score every baseline (no-BERT) checkpoint on test sets A and B. For each
# split this writes the top-5 submission CSV, runs eval.py on it, and keeps the
# raw (product_id, score) lists per query for the ensembling step.
without_final_A_B_predictions = {}
without_results_prefix = "/root/Applied_AI_Lab_WiSe2021_Passau/ai-light/final_results/without_"
for model_path, size in zip(without_models, without_data_size):
    print(f"Evaluating model {model_path} trained on {size} data")
    # model_path stays the checkpoint path; model is the loaded network.
    model = load_model(model_path)
    result_A = defaultdict(list)
    result_B = defaultdict(list)
    # Both splits go through the same pipeline, one after the other.
    for split, test_data, split_result in (("A", testA_data, result_A),
                                           ("B", testB_data, result_B)):
        for i in trange(len(test_data)):
            d = test_data.iloc[i]
            qid = d['query_id']
            pid = d['product_id']
            text = d['query']
            label_words = d['label_words']
            t1, t2 = token2id_X(text, x_dict=word_index, maxlen=cfg["maxlen"])

            # Keep at most cfg["max_box"] boxes; every kept box is unmasked.
            image = np.array(d['feature_convert'], dtype="float32")
            image = image[: cfg["max_box"]]
            img_mask = [1 for _ in image]
            pos = np.array(d['pos'], dtype="float32")
            pos = pos[: cfg["max_box"]]

            # One token-id row per box label, padded/truncated to cfg["max_char"].
            image_char = [token2id_X(ent, x_dict=word_index)[0] for ent in label_words.split(IMAGE_LABEM_CONCAT_TOKEN)]
            image_char = image_char[: cfg["max_box"]]
            image_char = pad_sequences(image_char,
                                       maxlen=cfg["max_char"],
                                       dtype='int32',
                                       padding='post',
                                       truncating='post',
                                       value=cfg["x_pad"])
            # Each example is predicted as a batch of one.
            output = model.predict([np.asarray([t1]), np.asarray([t2]), np.asarray([image]), np.asarray([img_mask]), np.asarray([pos]), np.asarray([image_char])])
            # Second output column of the single example is the ranking score
            # (presumably the probability of a match — confirm against training labels).
            split_result[qid].append((pid, output[0][1]))

        # Rank candidates per query, best score first, and keep the top five.
        # Indexing goes up to rank 4, so every query needs at least five candidates.
        query_id,product1,product2,product3,product4,product5 = [],[],[],[],[],[]
        for key in split_result.keys():
            rlist = split_result[key]
            rlist.sort(key=lambda x: x[1], reverse=True)
            query_id.append(key)
            product1.append(rlist[0][0])
            product2.append(rlist[1][0])
            product3.append(rlist[2][0])
            product4.append(rlist[3][0])
            product5.append(rlist[4][0])

        sub = pd.DataFrame({'query-id':query_id,
                            'product1':product1,
                            'product2':product2,
                            'product3':product3,
                            'product4':product4,
                            'product5':product5,

        })

        submission_stem = without_results_prefix + size + "_submission_" + split
        sub.to_csv(submission_stem + ".csv", index=False)
        eval_status = os.system("python eval.py data/test" + split + "_answer.json " + submission_stem + ".csv " + submission_stem + ".json")
        if eval_status != 0:
            # Report a failed scoring run, but keep evaluating the remaining models.
            print(f"eval.py exited with status {eval_status} for {submission_stem}.csv")
    without_final_A_B_predictions["without"+size+"_A"] = result_A
    without_final_A_B_predictions["without"+size+"_B"] = result_B
    

Evaluating model /root/Applied_AI_Lab_WiSe2021_Passau/ai-light/models/10K_baseline_10.h5 trained on 10K data


  0%|          | 2/28830 [00:01<3:48:03,  2.11it/s]



  0%|          | 3/28830 [00:02<5:43:43,  1.40it/s]



100%|██████████| 28830/28830 [58:18<00:00,  8.24it/s]  


Read standard from data/testA_answer.json
Read user submit file from /root/Applied_AI_Lab_WiSe2021_Passau/ai-light/final_results/without_10K_submission_A.csv
The evaluation finished successfully.


100%|██████████| 29005/29005 [57:58<00:00,  8.34it/s]  


Read standard from data/testB_answer.json
Read user submit file from /root/Applied_AI_Lab_WiSe2021_Passau/ai-light/final_results/without_10K_submission_B.csv
The evaluation finished successfully.
Evaluating model /root/Applied_AI_Lab_WiSe2021_Passau/ai-light/models/100K_baseline_10.h5 trained on 100K data


100%|██████████| 28830/28830 [53:03<00:00,  9.06it/s]  


Read standard from data/testA_answer.json
Read user submit file from /root/Applied_AI_Lab_WiSe2021_Passau/ai-light/final_results/without_100K_submission_A.csv
The evaluation finished successfully.


100%|██████████| 29005/29005 [52:30<00:00,  9.21it/s]  


Read standard from data/testB_answer.json
Read user submit file from /root/Applied_AI_Lab_WiSe2021_Passau/ai-light/final_results/without_100K_submission_B.csv
The evaluation finished successfully.


In [21]:
# Persist the raw per-query predictions; the context manager closes (and
# flushes) the file as soon as the dump finishes.
with open("/root/Applied_AI_Lab_WiSe2021_Passau/ai-light/final_results/without_final_A_B_predictions.pkl", "wb") as predictions_file:
    pickle.dump(without_final_A_B_predictions, predictions_file)

In [None]:
# Reload the baseline predictions written above, closing the file afterwards.
# NOTE: pickle.load executes arbitrary code from the file; only load files you produced yourself.
with open("/root/Applied_AI_Lab_WiSe2021_Passau/ai-light/final_results/without_final_A_B_predictions.pkl", "rb") as predictions_file:
    without = pickle.load(predictions_file)

## Ensembling

In [8]:
import pickle

In [9]:
# Load the stored predictions of both model families; the context managers
# close the files instead of leaving the handles to the garbage collector.
# NOTE: pickle.load executes arbitrary code from the file; only load files you produced yourself.
with open("/root/Applied_AI_Lab_WiSe2021_Passau/ai-light/final_results/bert_final_A_B_predictions.pkl", "rb") as predictions_file:
    bert = pickle.load(predictions_file)
with open("/root/Applied_AI_Lab_WiSe2021_Passau/ai-light/final_results/without_final_A_B_predictions.pkl", "rb") as predictions_file:
    without = pickle.load(predictions_file)

In [10]:
# Combine both families into one lookup keyed by "<family><size>_<split>";
# on a key clash the `without` entry would win, as with successive update() calls.
all_models = {**bert, **without}
all_models.keys()

dict_keys(['bert10K_A', 'bert10K_B', 'bert100K_A', 'bert100K_B', 'bert1M_A', 'bert1M_B', 'without10K_A', 'without10K_B', 'without100K_A', 'without100K_B'])

In [99]:
# Ensemble members and their weights. Only the entries listed here take part;
# the commented-out values are kept for experimenting with other combinations.
# NOTE(review): the weights look like each model's eval score on test B — confirm.
score_dict = dict(
    # bert10K_B=0.3611299223044956,
    bert100K_B=0.4322373308206258,
    # bert1M_B=0.5444838877873618,
    # without10K_B=0.32694372741460237,
    without100K_B=0.4189474418223332,
)

In [100]:
# Weighted average of the selected models' scores, per query.
sum_weights = sum(score_dict.values())
print(sum_weights)
result = {}

for model_key in score_dict.keys():
    for i, candidates in all_models[model_key].items():
        if i in result:
            # Already ensembled while visiting an earlier member.
            continue
        # Sorting by product id puts every member's scores in the same order.
        # Sorted copies are used so the stored predictions are not reordered.
        reference = sorted(candidates, key=lambda x: x[0], reverse=True)
        final_score = np.zeros(len(reference), dtype=np.float64)
        for key in score_dict.keys():
            # .get avoids inserting empty entries into defaultdict predictions.
            sorted_prod = sorted(all_models[key].get(i, []), key=lambda x: x[0], reverse=True)
            if len(sorted_prod) != len(reference):
                raise ValueError(
                    f"Model {key!r} has {len(sorted_prod)} candidates for query {i!r}, "
                    f"expected {len(reference)}"
                )
            score = np.array([j[1] for j in sorted_prod], dtype=np.float64)
            final_score += score_dict[key] * score
        final_score /= sum_weights
        result[i] = [(j[0], final_score[k]) for k, j in enumerate(reference)]

# Turn the ensembled scores into a top-5 submission and score it on test B.
query_id = []
product1, product2, product3, product4, product5 = [], [], [], [], []
top5_columns = (product1, product2, product3, product4, product5)
for key, rlist in result.items():
    # Best ensembled score first.
    # Indexing goes up to rank 4, so every query needs at least five candidates.
    rlist.sort(key=lambda x: x[1], reverse=True)
    query_id.append(key)
    for rank, column in enumerate(top5_columns):
        column.append(rlist[rank][0])

sub = pd.DataFrame({
    'query-id': query_id,
    'product1': product1,
    'product2': product2,
    'product3': product3,
    'product4': product4,
    'product5': product5,
})

ensembled_csv = "/root/Applied_AI_Lab_WiSe2021_Passau/ai-light/final_results/ensembled_B.csv"
sub.to_csv(ensembled_csv, index=False)
# Last expression of the cell: the exit status of eval.py is displayed.
os.system("python eval.py data/testB_answer.json " + ensembled_csv + " /root/Applied_AI_Lab_WiSe2021_Passau/ai-light/final_results/ensemble_submission_B.json")

0.851184772642959
Read standard from data/testB_answer.json
Read user submit file from /root/Applied_AI_Lab_WiSe2021_Passau/ai-light/final_results/ensembled_B.csv
The evaluation finished successfully.


0