In [1]:
import sys
from absl import flags
# Replace the process argument list with a single placeholder entry and have
# absl parse it. The list echoed under this cell is the value returned by the
# FLAGS(...) call.
# NOTE(review): presumably this keeps absl from tripping over the Jupyter
# kernel's own command-line arguments once flag-defining modules (such as the
# tokenization module imported later) are loaded -- confirm.
sys.argv=['preserve_unused_tokens=False']
flags.FLAGS(sys.argv)

['preserve_unused_tokens=False']

In [2]:
pip install bert-tensorflow

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Download Google's BERT tokenization script from the tensorflow/models repository.
# NOTE(review): the URL tracks the `master` branch, so neither the contents nor
# the existence of the file at this path is pinned. The import cell below uses
# `from bert import tokenization`, so this downloaded copy may not be the
# module that actually gets imported -- confirm.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [4]:
from random import randint
from numpy import array
from numpy import argmax
import keras.backend as K
from tensorflow.keras import models
from numpy import array_equal
import numpy as np
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LSTM, Bidirectional
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras import Input
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.layers import RepeatVector
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import load_model
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
import tensorflow_hub as hub
from sklearn.model_selection import train_test_split
import pandas as pd
from bert import tokenization
from tensorflow.keras.layers import Lambda
from tensorflow.keras import backend as K
import re
from functools import partial
from tensorflow.keras.callbacks import ModelCheckpoint


In [5]:
# --- Notebook-wide settings -------------------------------------------------
# Of these, only `batch_sz` is read by the cells visible in this notebook
# (it is passed to model.fit). `units` is re-assigned locally inside
# build_model, and the BERT layer / encoder calls below pass literal values
# rather than `trainable` / `max_seq_length`.

# Training
batch_sz = 8
BUFFER_SIZE = 200
trainable = False

# Text side
max_seq_length = 128
embedding_dim = 768

# Image / attention side
image_shape = (256, 256, 3)
attention_features_shape = 64
units = 512

In [6]:
# Pre-computed image features, loaded from disk; the cell displays the array's
# shape as its output.
img_feature = np.load(
    'Image_Embeddings/Emb_feature/new_test_features/.npy'
)
img_feature.shape

(8313, 100, 1024)

In [7]:
# Load the dataset and build one text field per row that joins the generated
# image caption with the text, each introduced by a fixed prefix.
FMS = pd.read_csv("FMS_final.csv")

caption_part = 'In the picture ' + FMS["gen_caption"]
text_part = ' And the text says: ' + FMS["text"]
FMS["textNdesc"] = caption_part + text_part

# Show the first composed example followed by the number of rows.
print(FMS["textNdesc"][0], '\n\n')
print(len(FMS.index))

In the picture man in black and white cap is holding up sign . And the text says: its their character not their color that matters 


8313


In [8]:
# dev_img_feature = np.load('Image_Embeddings/Emb_feature/dev_test_features/final_images.npy')
# np.shape(dev_img_feature)

In [9]:
# dev_FMS = pd.read_csv("FMS_final_dev.csv")
# dev_FMS['textNdesc'] = 'In the picture '+ dev_FMS.gen_caption + ' And the text says: ' + dev_FMS.text
# print(dev_FMS.textNdesc[0], '\n\n')
# print(len(dev_FMS))

In [10]:
def bert_encode(texts, tokenizer, max_len=512):
    """Convert raw strings into fixed-length BERT input arrays.

    Each text is tokenized, cut to at most ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]`` and mapped to ids. Ids are right-padded with 0 up to
    ``max_len``; the mask holds 1 for real tokens and 0 for padding; the
    segment ids are all 0.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: Length of every encoded row, special tokens included.

    Returns:
        Tuple ``(token_ids, masks, segment_ids)`` of numpy arrays, one row per
        input text.
    """
    token_rows, mask_rows, segment_rows = [], [], []
    body_budget = max_len - 2  # room left after [CLS] and [SEP]

    for raw_text in texts:
        pieces = tokenizer.tokenize(raw_text)[:body_budget]
        sequence = ["[CLS]"] + pieces + ["[SEP]"]

        n_real = len(sequence)
        n_pad = max_len - n_real

        ids = tokenizer.convert_tokens_to_ids(sequence)
        ids.extend([0] * n_pad)

        token_rows.append(ids)
        mask_rows.append([1] * n_real + [0] * n_pad)
        segment_rows.append([0] * max_len)

    return np.array(token_rows), np.array(mask_rows), np.array(segment_rows)

In [11]:
class BahdanauAttention(tf.keras.Model):
    """Additive attention built from three dense projections.

    ``call`` scores every position along axis 1 of ``features`` against one
    ``hidden`` vector and returns the weighted sum of the features together
    with the weights used.
    """

    def __init__(self, units):
        """Create the projection layers.

        Args:
            units: Output width of the two projections (one applied to the
                features, one to the hidden state) that are summed before the
                tanh.
        """
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        """Return ``(context_vector, attention_weights)``.

        Args:
            features: Tensor whose axis 1 enumerates the positions attended
                over (build_model passes a (batch, 100, 1024) tensor).
            hidden: Tensor that gets an axis inserted at position 1 so it
                broadcasts against the projected features.
        """
        expanded_hidden = tf.expand_dims(hidden, 1)

        projected_features = self.W1(features)
        projected_hidden = self.W2(expanded_hidden)
        energies = self.V(tf.nn.tanh(projected_features + projected_hidden))

        # Normalise across positions (axis 1), then collapse that axis.
        attention_weights = tf.nn.softmax(energies, axis=1)
        context_vector = tf.reduce_sum(attention_weights * features, axis=1)

        return context_vector, attention_weights

In [12]:
class LuongAttention(tf.keras.layers.Layer):
    """Dot-product attention layer.

    The score of each position is the dot product between the query and the
    value at that position. ``W1``, ``W2`` and ``V`` are created in the
    constructor but are not used by ``call``; they would only be needed by the
    additive variant ``V(tanh(W1(query) + W2(values)))``.
    """

    def __init__(self, units):
        """Create the (currently unused) dense projections of width ``units``."""
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Return ``(context_vector, attention_weights)``.

        Args:
            query: Hidden state, shape (batch_size, hidden_size).
            values: Sequence attended over, shape
                (batch_size, max_len, hidden_size).
        """
        # (batch_size, hidden_size) -> (batch_size, 1, hidden_size)
        query_row = tf.expand_dims(query, 1)

        # (batch_size, max_len, hidden_size) -> (batch_size, hidden_size, max_len)
        values_t = tf.transpose(values, perm=[0, 2, 1])

        # Dot product per position: (batch_size, 1, max_len), then moved to
        # (batch_size, max_len, 1) so the softmax below runs across positions.
        raw_scores = tf.matmul(query_row, values_t)
        score = tf.transpose(raw_scores, perm=[0, 2, 1])

        # attention_weights shape == (batch_size, max_len, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after the sum == (batch_size, hidden_size)
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)

        return context_vector, attention_weights

In [22]:
def build_model(bert_layer, max_len):
    """Build and compile the text + image binary classifier.

    The text branch feeds three integer inputs of length ``max_len`` through
    ``bert_layer`` and keeps position 0 of the returned sequence output. The
    image branch projects a (100, 1024) feature input through a bias-free
    dense layer and ReLU, then attends over it with ``BahdanauAttention``
    using an all-zero query. The attention context and the text vector are
    concatenated into a length-1 sequence, passed through a GRU and a dense
    head, and end in a single sigmoid unit.

    Args:
        bert_layer: Callable accepting ``[word_ids, mask, segment_ids]`` and
            returning a pair whose second element is the per-token sequence
            output.
        max_len: Length of each of the three text inputs.

    Returns:
        A compiled ``Model`` with inputs
        ``[input_word_ids, input_mask, segment_ids, image_input]``, trained
        with binary cross-entropy and reporting accuracy and AUC.
    """
    # All-zero attention query; its leading dimension of 1 broadcasts against
    # the batch dimension of the projected image features inside the attention.
    enc_hidden = tf.zeros((1, 1024))

    # Text branch.
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")
    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Keep only position 0 of the sequence output (the slot of the [CLS] token
    # that bert_encode puts first), then restore a length-1 time axis.
    clf_output = sequence_output[:, 0, :]
    dec_input = tf.expand_dims(clf_output, axis=1)

    # Image branch.
    image_input = tf.keras.Input(shape=(100, 1024), name="image_input")
    encoder_fc = tf.keras.layers.Dense(1024, kernel_initializer='glorot_uniform', use_bias=False)
    image_input_intermediate = encoder_fc(image_input)
    image_input_intermediate = tf.nn.relu(image_input_intermediate)

    # Decoder layers.
    units = 1024
    decoder_gru = tf.keras.layers.GRU(
        units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform',
    )
    decoder_fc1 = tf.keras.layers.Dense(units)
    decoder_fc2 = tf.keras.layers.Dense(64, activation='relu', name='Dense_3_layer')
    decoder_dropout = tf.keras.layers.Dropout(0.3)
    decoder_fc3 = tf.keras.layers.Dense(1, activation='sigmoid')

    attention = BahdanauAttention(1024)

    # The attention weights are not used further in this model.
    context_vector, _ = attention(image_input_intermediate, enc_hidden)
    inputs = tf.concat([tf.expand_dims(context_vector, 1), dec_input], axis=-1)

    # The final GRU state is not used; only the per-step output is.
    output, _ = decoder_gru(inputs)
    decoder_fc1_output = decoder_fc1(output)
    # Collapse the (batch, time) axes so the head emits one row per time step
    # (the sequence built above has a single step).
    decoder_fc1_output = tf.reshape(decoder_fc1_output, (-1, decoder_fc1_output.shape[2]))
    decoder_fc2_output = decoder_fc2(decoder_fc1_output)
    decoder_dropout_output = decoder_dropout(decoder_fc2_output)
    decoder_output = decoder_fc3(decoder_dropout_output)

    model = Model(
        [input_word_ids, input_mask, segment_ids, image_input],
        outputs=decoder_output,
        name='model_encoder_decoder',
    )
    # `learning_rate` is the supported argument name; `lr` is a deprecated
    # alias that newer Keras releases no longer accept.
    model.compile(
        Adam(learning_rate=0.00001),
        loss='binary_crossentropy',
        metrics=['accuracy', 'AUC'],
    )

    return model



In [14]:
%%time
# Load the BERT encoder from TF Hub as a Keras layer.
# `model_url2` is defined here but not used by any cell visible in this notebook.
# NOTE(review): `trainable=True` is passed literally, while the config cell sets
# `trainable = False`; confirm which one is intended.
model_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
model_url2 = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-768_A-12/1"
bert_layer = hub.KerasLayer(model_url, trainable=True)

Wall time: 2min 29s


In [15]:
# Build the tokenizer from the vocabulary file and casing flag that ship with
# the loaded hub layer.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(
    vocab_file=vocab_file,
    do_lower_case=do_lower_case,
)

In [16]:
# Encode the composed caption + text strings into the three BERT input arrays
# (100 positions each) and pull out the labels.
train_input = bert_encode(FMS["textNdesc"].values, tokenizer, max_len=100)
train_labels = FMS["label"].values

In [17]:
# test_input = bert_encode(dev_FMS.textNdesc.values, tokenizer, max_len=100)
# test_labels = dev_FMS.label.values

In [23]:
# Instantiate the classifier with the same sequence length used when encoding
# the text, then print its layer summary.
model = build_model(bert_layer=bert_layer, max_len=100)
model.summary()



Model: "model_encoder_decoder"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 image_input (InputLayer)       [(None, 100, 1024)]  0           []                               
                                                                                                  
 dense_18 (Dense)               (None, 100, 1024)    1048576     ['image_input[0][0]']            
                                                                                                  
 input_word_ids (InputLayer)    [(None, 100)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 100)]        0           []                               
                                                                              

In [24]:
%%time
checkpoint = ModelCheckpoint('model7.h5', monitor='val_loss', save_best_only=True)

train_history = model.fit(
    [train_input, img_feature], train_labels,
    validation_split=0.3,
    epochs=100 ,
    callbacks=[checkpoint],
    batch_size=batch_sz,
    steps_per_epoch=12   
)

Epoch 1/100


In [None]:
import tensorflow as tf

# Report how many GPU devices TensorFlow can see.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

In [None]:
# Restore the weights from the checkpoint file written during training, then
# score the same arrays that were passed to fit. `train_input` is the 3-tuple
# from bert_encode; it is unpacked so the inputs form a flat list matching the
# model's four declared inputs.
model.load_weights('model7.h5')
test_pred = model.predict([*train_input, img_feature])



In [None]:
from sklearn.metrics import roc_auc_score, balanced_accuracy_score
# Compare the predicted probabilities with the labels using ROC AUC.
# NOTE(review): `test_pred` was produced from `train_input` / `img_feature`,
# the same arrays passed to model.fit, so this score is not a held-out test
# metric -- confirm whether a separate evaluation set should be used here.
y_actual = list(train_labels)
# Flatten the predictions to one value per sample.
y_prob = list(test_pred.reshape(len(test_pred), ))
print("AUC: ", roc_auc_score(y_actual, y_prob))

AUC:  0.82401700015715
