In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm import tqdm
import tensorflow as tf
import cv2
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import time
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, LSTM, Input, Embedding, Conv2D,Concatenate,Flatten,Add,Dropout,GRU,AdditiveAttention
import random
import datetime
from nltk.translate.bleu_score import sentence_bleu
from math import log
import warnings
warnings.filterwarnings('ignore')

In [2]:
dataset = pd.read_csv('Data.csv')
dataset.head()

Unnamed: 0,Person_id,Image1,Image2,Report
0,Scanned Images/CXR1_1_IM-0001_0,Scanned Images/CXR1_1_IM-0001-3001.png,Scanned Images/CXR1_1_IM-0001-4001.png,startseq the cardiac silhouette and mediastinu...
1,Scanned Images/CXR10_IM-0002_0,Scanned Images/CXR10_IM-0002-1001.png,Scanned Images/CXR10_IM-0002-2001.png,startseq the cardiomediastinal silhouette with...
2,Scanned Images/CXR100_IM-0002_0,Scanned Images/CXR100_IM-0002-1001.png,Scanned Images/CXR100_IM-0002-2001.png,startseq both lungs are clear and epanded . he...
3,Scanned Images/CXR1000_IM-0003_0,Scanned Images/CXR1000_IM-0003-1001.png,Scanned Images/CXR1000_IM-0003-2001.png,startseq there increased opacity within the ri...
4,Scanned Images/CXR1000_IM-0003_1,Scanned Images/CXR1000_IM-0003-1001.png,Scanned Images/CXR1000_IM-0003-3001.png,startseq there increased opacity within the ri...


In [3]:
X_train_img, X_cv_img, y_train_rep, y_cv_rep = train_test_split(dataset['Person_id'], dataset['Report'],
                                                                test_size = 0.3094146209873213, random_state=97)

In [4]:
X_train_img.shape, X_cv_img.shape

((2560,), (1147,))

In [5]:
# to make them compatible with batch_size
X_cv_img = X_cv_img.iloc[:-11]
y_cv_rep = y_cv_rep.iloc[:-11]

In [6]:
X_cv_img.shape, y_cv_rep.shape  

((1136,), (1136,))

In [7]:
max_capt_len = 155
pad_size = max_capt_len 

In [8]:
tokenizer = Tokenizer(filters='!"#$%&()*+,-/:;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(y_train_rep.values)

train_rep_tok = tokenizer.texts_to_sequences(y_train_rep)
cv_rep_tok = tokenizer.texts_to_sequences(y_cv_rep)

train_rep_padded = pad_sequences(train_rep_tok, maxlen=155, padding='post')
cv_rep_padded = pad_sequences(cv_rep_tok, maxlen=155, padding='post')

tokenizer.word_index['<pad>'] = 0
tokenizer.index_word[0] = '<pad>'

In [9]:
f = open('Image_features_attention.pickle','rb') # contains the features from chexNet
Xnet_Features = pickle.load(f)
f.close()

In [10]:
f = open('GLOVE_VECTORS.pickle','rb') # 300d glove vectors  
glove_vectors = pickle.load(f)
f.close()

In [11]:
a = Xnet_Features['Scanned Images/CXR1_1_IM-0001_0'][0]
a.shape

TensorShape([98, 1024])

In [12]:
BATCH_SIZE = 16
BUFFER_SIZE = 500

In [13]:
def load_image(id_, report):
    '''Loads the Image Features with their corresponding Ids'''
    img_feature = Xnet_Features[id_.decode('utf-8')][0]
    return img_feature, report

In [14]:
def create_dataset(img_name_train,reps):
  
    dataset = tf.data.Dataset.from_tensor_slices((img_name_train, reps))

  # Use map to load the numpy files in parallel
    dataset = dataset.map(lambda item1, item2: tf.numpy_function(load_image, [item1, item2], [tf.float32, tf.int32]),
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)

  # Shuffle and batch
    dataset = dataset.shuffle(500).batch(BATCH_SIZE).prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return dataset

In [15]:
train_dataset = create_dataset(X_train_img.values, train_rep_padded)
cv_dataset = create_dataset(X_cv_img.values, cv_rep_padded)

In [16]:
vocab_size = len(tokenizer.word_index.keys()) + 1

embedding_matrix = np.zeros((vocab_size,300))
for word, i in tokenizer.word_index.items():
    if word in glove_vectors.keys():
        vec = glove_vectors[word]
        embedding_matrix[i] = vec
    else:
        continue

In [28]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, units):
        super(Encoder, self).__init__()
        self.units = units
       # self.bs = batch_size
        
    def build(self, input_shape):
        self.maxpool = tf.keras.layers.MaxPool1D()
        self.dense = Dense(self.units, kernel_initializer=tf.keras.initializers.glorot_uniform(seed = 56), name='dense_encoder')
        
    def call(self, input_, training=True):
        
        x = self.maxpool(input_)
        x = self.dense(x)
        
        return x
    
    def get_states(self, bs):
        
        return tf.zeros((bs, self.units))

In [29]:
class OneStepDecoder(tf.keras.layers.Layer):
    def __init__(self, vocab_size, att_units, dec_units):
        super(OneStepDecoder, self).__init__()
        self.vocab_size = vocab_size
       # self.emb_dim = emb_dim
        self.att_units = att_units
        self.dec_units = dec_units
        
    def build(self, input_shape):
        self.embedding = Embedding(self.vocab_size, output_dim=300, input_length=max_capt_len, mask_zero=True,
                                   weights = [embedding_matrix],
                                   name="embedding_layer_decoder")
        self.gru = GRU(self.dec_units, return_sequences=True, return_state=True, name="Decoder_GRU")
        self.fc = Dense(self.vocab_size)
        
        self.V = Dense(1)
        self.W = Dense(self.att_units)
        self.U = Dense(self.att_units)
        
    def call(self, dec_input, hidden_state, enc_output):
       

        hidden_with_time = tf.expand_dims(hidden_state, 1)
        
        attention_weights = self.V(tf.nn.tanh(self.U(enc_output) + self.W(hidden_with_time)))
        
        attention_weights = tf.nn.softmax(attention_weights, 1)
        
        context_vector = attention_weights * enc_output
        
        context_vector = tf.reduce_sum(context_vector, axis=1)
       

        x = self.embedding(dec_input)
        
        x = tf.concat([tf.expand_dims(context_vector, axis=1),x], axis=-1)
        
        output, h_state = self.gru(x, initial_state = hidden_state)
        
        output = tf.reshape(output, (-1, output.shape[2]))
        
        x = self.fc(output)
        
        return x, h_state, attention_weights

In [30]:
class Decoder(tf.keras.layers.Layer):
    
    def __init__(self, vocab_size, input_length, dec_units, att_units):
        super(Decoder, self).__init__()
        self.vocab_size = vocab_size
    #    self.embedding_dim = embedding_dim
        self.input_length = input_length
        self.dec_units = dec_units
        self.att_units = att_units
        self.onestep_decoder = OneStepDecoder(self.vocab_size, self.att_units, self.dec_units)
    @tf.function    
    def call(self, dec_input, hidden_state, enc_output):
        all_outputs = tf.TensorArray(tf.float32, dec_input.shape[1], name='output_arrays')
        
        for timestep in range(dec_input.shape[1]):
            
            output, hidden_state, attention_weights = self.onestep_decoder(dec_input[:, timestep:timestep+1], 
                                                                           hidden_state, enc_output)
            
            all_outputs = all_outputs.write(timestep, output)
            
        all_outputs = tf.transpose(all_outputs.stack(), [1,0,2])
        return all_outputs

In [31]:
class Attention_Model(tf.keras.Model):
    def __init__(self, vocab, units, max_capt_len, att_units, batch_size):
        super(Attention_Model, self).__init__()
        self.batch_size = batch_size
        self.encoder = Encoder(units)
        self.decoder = Decoder(vocab_size, max_capt_len, units, att_units)
        
    def call(self, data):
        enc_input, dec_input = data[0], data[1]
    
        enc_output = self.encoder(enc_input)
        enc_state = self.encoder.get_states(self.batch_size)
        dec_output = self.decoder(dec_input, enc_state, enc_output)

        return dec_output

In [32]:
units = 256
att_units = 10

In [33]:
model1 = Attention_Model(vocab_size, units, max_capt_len, att_units, BATCH_SIZE)

In [23]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='auto')

def maskedLoss(y_true, y_pred):
    #getting mask value
    mask = tf.math.logical_not(tf.math.equal(y_true, 0))
    
    #calculating the loss
    loss_ = loss_function(y_true, y_pred)
    
    #converting mask dtype to loss_ dtype
    mask = tf.cast(mask, dtype=loss_.dtype)
    
    #applying the mask to loss
    loss_ = loss_*mask
    
    #getting mean over all the values
    loss_ = tf.reduce_mean(loss_)
    return loss_

In [26]:
model1.compile(optimizer=optimizer, loss=maskedLoss)

In [36]:
EPOCHS = 10

In [37]:
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
train_log_dir = 'Tensorboard/attention_OneStep/fit2/' + current_time + '/train'
val_log_dir = 'Tensorboard/attention_OneStep/fit2/' + current_time + '/test'
train_summary_writer = tf.summary.create_file_writer(train_log_dir)
val_summary_writer = tf.summary.create_file_writer(val_log_dir)

In [38]:
epoch_train_loss = []
epoch_val_loss = []

for epoch in range(EPOCHS):
    start = time.time()
    print("EPOCH: ", epoch+1)
    batch_loss_tr = 0
    batch_loss_val = 0
#    print('Training...')
    for img, rep in train_dataset:
        res = model1.train_on_batch([img, rep[:,:-1]], rep[:,1:])
        batch_loss_tr += res
        
    train_loss = batch_loss_tr/(X_train_img.shape[0]/BATCH_SIZE)

    with train_summary_writer.as_default():
        tf.summary.scalar('loss', train_loss, step = epoch)
    
#    print("VALIDATING..")
    for img, rep in cv_dataset:
        res = model1.test_on_batch([img, rep[:,:-1]], rep[:,1:])
        batch_loss_val += res
        
    val_loss = batch_loss_val/(X_cv_img.shape[0]/BATCH_SIZE)

    with val_summary_writer.as_default():
        tf.summary.scalar('loss', val_loss, step = epoch)    
        
    epoch_train_loss.append(train_loss)

    epoch_val_loss.append(val_loss)
    
    print('Training Loss: {},  Validation Loss: {}'.format(train_loss, val_loss))
    print('Time Taken for this Epoch : {} sec'.format(time.time()-start))   
    model1.save_weights('Weights/Attention/OneStep/epoch_'+ str(epoch+1) + '.h5')

EPOCH:  1
Training Loss: 0.25472485851496457,  Validation Loss: 0.2090852936388741
Time Taken for this Epoch : 192.1460621356964 sec
EPOCH:  2
Training Loss: 0.18820408494211732,  Validation Loss: 0.17770616594754474
Time Taken for this Epoch : 41.08613109588623 sec
EPOCH:  3
Training Loss: 0.1592887477017939,  Validation Loss: 0.15150838513189638
Time Taken for this Epoch : 40.9430148601532 sec
EPOCH:  4
Training Loss: 0.13630630285479128,  Validation Loss: 0.13288516477799753
Time Taken for this Epoch : 41.36209177970886 sec
EPOCH:  5
Training Loss: 0.12193650875706226,  Validation Loss: 0.12260464258806807
Time Taken for this Epoch : 40.99419140815735 sec
EPOCH:  6
Training Loss: 0.11179993164259941,  Validation Loss: 0.1139189470821703
Time Taken for this Epoch : 40.9139723777771 sec
EPOCH:  7
Training Loss: 0.1041051099076867,  Validation Loss: 0.10752762502557794
Time Taken for this Epoch : 41.55908155441284 sec
EPOCH:  8
Training Loss: 0.09720139517448842,  Validation Loss: 0.10

In [62]:
def inference_concat(inputs):
    
    in_ = len(inputs.split()) - 1
    inputs = Xnet_Features[inputs]
    enc_state = tf.zeros((1, 256))
    enc_output = model1.layers[0](inputs)
    input_state = enc_state
    pred = []
    cur_vec = np.array([tokenizer.word_index['startseq']]).reshape(-1,1)

    for i in range(155):

        inf_output, input_state, attention_weights = model1.layers[1].onestep_decoder(cur_vec, input_state, enc_output)

        cur_vec = np.reshape(np.argmax(inf_output), (1, 1))
        if cur_vec[0][0] != 0:
            pred.append(cur_vec)
        else:
            break

    final = ' '.join([tokenizer.index_word[e[0][0]] for e in pred if e[0][0] != 0 and e[0][0] != 7])
    return final#, att_weights

In [63]:
a = inference_concat(X_cv_img.values[867])

In [64]:
y_cv_rep.values[867]  # original

'startseq heart size within normal limits . no focal consolidation . no pneumothora pleural effusion . no bony abnormalities . endseq'

In [65]:
a  # predicted

'the heart size and pulmonary vascularity appear within normal limits . the lungs are clear . no pleural effusion pneumothora . no acute bony abnormality .'

The attention model is already giving decent outputs within just 10 epochs of training!!

### You can try with other examples.