In [69]:
import os
import re
import functools

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import tensorflow as tf

%load_ext autoreload
%autoreload 2
from PIL import Image
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from collections import Counter
from wordcloud import WordCloud, STOPWORDS
from Encoder import Encoder
from dataset import Dataset
import string

from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from keras.preprocessing.image import load_img
from spacy.lang.en.stop_words import STOP_WORDS
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Dense,GlobalAveragePooling2D, Input, Embedding 
from tensorflow.keras.layers import LSTM,Dot,Reshape,Concatenate,BatchNormalization 
from tensorflow.keras.layers import GlobalMaxPooling2D, Dropout, Add, MaxPooling2D, GRU, AveragePooling2D

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Loading Data

In [3]:
def parse_input(x, report_dir="/users/vspvikram/Downloads/AML_project/ecgen-radiology"):
    """Return the cleaned IMPRESSION text of radiology report ``x``.

    Parameters
    ----------
    x : str
        Report identifier; the report is read from ``<report_dir>/<x>.xml``.
    report_dir : str, optional
        Folder holding the XML reports.

    Returns
    -------
    str or None
        The lower-cased impression text with every non-word character
        replaced by a space, or ``None`` when the report file does not exist
        or contains no element labelled ``IMPRESSION``.
    """
    report_path = os.path.join(report_dir, "%s.xml" % x)
    if not os.path.exists(report_path):
        return None

    # The context manager closes the handle; BeautifulSoup reads the whole
    # file inside its constructor, so the soup stays usable afterwards.
    with open(report_path, 'r') as f:
        soup = BeautifulSoup(f, "html.parser")

    impression = soup.find(label="IMPRESSION")
    if impression is None:
        # No impression section: treat the report as missing instead of
        # failing with AttributeError on None.get_text().
        return None

    reports = impression.get_text().lower()
    return re.sub(r'[\W]', ' ', reports)
    
# One entry per file in the image folder. The report number is whatever follows
# the first three characters of the text before the first underscore
# (e.g. "CXR10_IM-0002-2001.png" -> "10"); it selects the XML report to parse.
image = pd.Series(os.listdir('/users/vspvikram/Downloads/AML_project/NLMCXR_png'))
number = image.map(lambda fname: fname.partition('_')[0][3:])
report = number.map(parse_input)

In [4]:
# One row per image: file name, report number and cleaned caption.
# Images without a caption are dropped. 'Number' holds strings, so the sort
# is lexicographic (1, 10, 100, ...).
df = (
    pd.DataFrame({'Image': image, 'Number': number, 'Caption': report})
    .dropna()
    .sort_values(['Number'])
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,Image,Number,Caption
0,CXR1_1_IM-0001-4001.png,1,normal chest x xxxx
1,CXR1_1_IM-0001-3001.png,1,normal chest x xxxx
2,CXR10_IM-0002-2001.png,10,no acute cardiopulmonary process
3,CXR10_IM-0002-1001.png,10,no acute cardiopulmonary process
4,CXR100_IM-0002-2001.png,100,no active disease


# Sample Images and report

In [None]:
# Display three randomly picked X-rays, each followed by its caption.
for i in range(3):
    file_name = np.random.choice(df['Image'])
    img = Image.open(os.path.join('/users/vspvikram/Downloads/AML_project/NLMCXR_png/', file_name))
    plt.imshow(img)
    plt.show()
    # .item() requires exactly one matching row for the file name
    print(df.loc[df['Image'] == file_name, 'Caption'].item())
    print()

# Visualization

In [None]:
# Bag of words over the distinct captions, used for the word cloud below.
texts = df.Caption.unique()
# Join with an explicit space: plain concatenation fuses the last word of one
# caption with the first word of the next whenever a caption has no trailing
# whitespace (and is quadratic in the number of captions).
texts = " ".join(texts).split()
# Build the stop-word set once instead of once per word.
english_stopwords = set(stopwords.words('english'))
texts = [i for i in texts if i not in english_stopwords]
words_counter = Counter(texts)

In [None]:
# Word cloud of the stop-word-filtered caption vocabulary.
plt.figure(figsize=(12, 12))
wordcloud = WordCloud(width=1600, height=1600,
                      background_color='white',
                      min_font_size=10)
wordcloud = wordcloud.generate(" ".join(texts))

plt.imshow(wordcloud)
plt.show()

In [None]:
# How many reports come with 1, 2, 3, ... images.
n_img = (
    df.groupby(['Number'])
    .size()            # images per report
    .value_counts()    # number of reports for each image count
    .reset_index()
)
n_img.columns = ['Number of images', 'Count']

plt.figure(figsize=(12, 8))
plt.bar(n_img['Number of images'], n_img['Count'])
plt.xlabel("Number of images for each report")
plt.ylabel("Count")
plt.title("Distribution of number of images for every caption")
plt.show()

In [5]:
df2 = df.copy()
df2['Img_count'] = df2.groupby('Number')['Image'].transform('count')

# keep only the reports that come with exactly two images
df2 = df2[df2.Img_count == 2]

# One row per report with both file names joined by ", ". The column is named
# at creation time instead of relying on the implicit `0` label.
Image_names = (
    df2.groupby('Number')['Image']
    .agg(', '.join)
    .reset_index(name='Images')
)
df2 = df2[['Number', 'Caption']].drop_duplicates().merge(Image_names, on='Number')

# Split on the full ", " separator so that Image2 carries no leading blank
# (splitting on "," alone left " CXR..." and forced callers to lstrip()).
Images = df2['Images'].str.split(', ', n=1, expand=True)
df2['Image1'] = Images[0]
df2['Image2'] = Images[1]
df2 = df2.drop(columns=['Images'])
df2

Unnamed: 0,Number,Caption,Image1,Image2
0,1,normal chest x xxxx,CXR1_1_IM-0001-4001.png,CXR1_1_IM-0001-3001.png
1,10,no acute cardiopulmonary process,CXR10_IM-0002-2001.png,CXR10_IM-0002-1001.png
2,100,no active disease,CXR100_IM-0002-2001.png,CXR100_IM-0002-1001.png
3,1001,diffuse fibrosis no visible focal acute disease,CXR1001_IM-0004-1001.png,CXR1001_IM-0004-1002.png
4,1002,status post left mastectomy heart size normal...,CXR1002_IM-0004-2001.png,CXR1002_IM-0004-1001.png
...,...,...,...,...
3203,994,negative chest,CXR994_IM-2478-2001.png,CXR994_IM-2478-1001.png
3204,995,post operative chest with no acute disease,CXR995_IM-2478-1002.png,CXR995_IM-2478-1001.png
3205,996,heart size is normal and lungs are clear no p...,CXR996_IM-2479-1001.png,CXR996_IM-2479-2001.png
3206,997,no acute cardiopulmonary abnormality,CXR997_IM-2479-1001.png,CXR997_IM-2479-2001.png


In [5]:
# The 20 most frequent caption strings among the two-image reports.
df2['Caption'].value_counts().head(20)

no acute cardiopulmonary abnormality                 265
no active disease                                    102
no acute cardiopulmonary abnormalities                97
no acute cardiopulmonary findings                     96
no acute disease                                      92
no acute cardiopulmonary disease                      77
no acute cardiopulmonary process                      56
1  no acute radiographic cardiopulmonary process      49
no evidence of active disease                         41
no acute pulmonary disease                            39
no acute cardiopulmonary abnormality                  38
1  no evidence of active disease                      38
no acute cardiopulmonary abnormality                  36
normal chest                                          32
no acute cardiopulmonary findings                     29
no acute findings                                     28
negative for acute abnormality                        27
no acute process               

In [6]:
# For every report, the number of reports sharing exactly the same caption text.
df2['Caption_count'] = df2.groupby(by='Caption')['Number'].transform('count')
df2.head()

Unnamed: 0,Number,Caption,Image1,Image2,Caption_count
0,1,normal chest x xxxx,CXR1_1_IM-0001-4001.png,CXR1_1_IM-0001-3001.png,2
1,10,no acute cardiopulmonary process,CXR10_IM-0002-2001.png,CXR10_IM-0002-1001.png,56
2,100,no active disease,CXR100_IM-0002-2001.png,CXR100_IM-0002-1001.png,102
3,1001,diffuse fibrosis no visible focal acute disease,CXR1001_IM-0004-1001.png,CXR1001_IM-0004-1002.png,1
4,1002,status post left mastectomy heart size normal...,CXR1002_IM-0004-2001.png,CXR1002_IM-0004-1001.png,1


**Most of the captions above have nearly the same meaning (different wordings of "no acute disease"). Reports that describe an actual disease are therefore rare, and the captions are far from uniformly distributed.**

In [None]:
# Frequency table restricted to captions shared by fewer than 6 reports.
# Original note: 39 types of captions are available in the dataset.
df2.loc[df2['Caption_count'] < 6, 'Caption_count'].value_counts()

In [None]:

# 'index'         : a caption frequency k (k reports share the same caption)
# 'Caption_count' : number of reports whose caption has frequency k
# The columns are named explicitly: the implicit names produced by
# value_counts().reset_index() differ between pandas versions (pandas >= 2.0
# no longer yields an 'index' column).
df_counts_unique = (
    df2.Caption_count.value_counts()
    .rename_axis('index')
    .reset_index(name='Caption_count')
)
# Dividing the number of reports by k gives the number of distinct captions
# that occur exactly k times.
df_counts_unique['Caption_count_correct'] = df_counts_unique['Caption_count']/df_counts_unique['index']

plt.figure(figsize=(12,8))
ax = sns.barplot(x='index', y= 'Caption_count_correct', data = df_counts_unique)
plt.xlabel("Number of counts for each unique caption ")
plt.ylabel("Count")
plt.title("Frequencies of counts of unique caption")
plt.show()

In [None]:
# Same chart without the captions that occur only once, so the remaining
# bars become readable.
df_counts_unique_wt1 = df_counts_unique.loc[df_counts_unique['index'] != 1]

plt.figure(figsize=(12, 8))
ax = sns.barplot(data=df_counts_unique_wt1, x='index', y='Caption_count_correct')
plt.xlabel("Number of counts for each unique caption ")
plt.ylabel("Count")
plt.title("Unique instances for same Number of counts of unique caption")
plt.show()

# Creating train test Dataset

In [7]:


# Captions shared by 2-5 reports are kept out of the stratified split below;
# 20% of them are sampled (without replacement) into the test set instead.
ovsmpl_df = df2[(df2.Caption_count < 6) & (df2.Caption_count > 1)]
resampled = resample(ovsmpl_df, n_samples = int(0.2*ovsmpl_df.shape[0]),
                     replace=False, random_state=42)

# Same treatment for captions that occur exactly once.
ovsmpl_df_count1 = df2[df2.Caption_count == 1]
resampled_count1 = resample(ovsmpl_df_count1, n_samples = int(0.2*ovsmpl_df_count1.shape[0]),
                            replace=False, random_state=43)

# Everything else (caption count >= 6) is split 80/20, stratified by caption.
df3 = df2.drop(ovsmpl_df.index, axis=0)
df3 = df3.drop(ovsmpl_df_count1.index, axis=0)

train, test = train_test_split(df3, stratify = df3['Caption'].values, test_size=0.2, random_state=42)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
test = pd.concat([test, resampled, resampled_count1]).reset_index(drop=True)

# The rare-caption rows that were not sampled into the test set go to train.
ovsmpl_df = ovsmpl_df.drop(resampled.index, axis=0)
ovsmpl_df_count1 = ovsmpl_df_count1.drop(resampled_count1.index, axis=0)

train = pd.concat([train, ovsmpl_df, ovsmpl_df_count1])

In [None]:
# Sanity check on the split sizes.
print('Train shape: {}\nTest shape:  {}'.format(train.shape, test.shape))

In [None]:
# Peek at the first five training rows.
train.head(n=5)

In [None]:
def _load_image(image_path, input_size):
    """Read one image, divide it by 255 and return it as a 1-item batch."""
    image = cv2.imread(image_path, cv2.IMREAD_UNCHANGED)
    if image is None:
        # cv2.imread returns None for a missing or unreadable file.
        raise FileNotFoundError("Could not read image: %s" % image_path)
    # cv2.resize takes dsize as (width, height) while the reshape below lays
    # the array out as (input_size[0], input_size[1]) = (rows, cols).
    # Identical to the previous behaviour for square sizes.
    image = cv2.resize(image/255, (input_size[1], input_size[0]), interpolation = cv2.INTER_NEAREST)
    return np.array(image).reshape(1, input_size[0], input_size[1], input_size[2])


def get_img_features(df, input_size):
    """Encode both images of every report in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``Image1``, ``Image2`` (file names, leading blanks
        are stripped) and ``Number``.
    input_size : tuple
        ``(rows, cols, channels)`` the images are resized/reshaped to.

    Returns
    -------
    dict
        ``int(Number)`` -> features of image 1 and image 2 concatenated along
        axis 1. Empty when ``df`` has no rows.

    Raises
    ------
    FileNotFoundError
        If an image file cannot be read.
    """
    path = "/users/vspvikram/Downloads/AML_project/NLMCXR_png/"
    Xnet_features = {}
    if len(df) == 0:
        return Xnet_features

    # Build the encoder once: constructing it inside the loop reloaded the
    # network for every single report.
    en_model = Encoder()
    en_model.build(input_shape = (None, input_size[0], input_size[1], input_size[2]))

    for name1, name2, number in zip(df["Image1"], df["Image2"], df["Number"]):
        image1 = _load_image(os.path.join(path, name1.lstrip()), input_size)
        image2 = _load_image(os.path.join(path, name2.lstrip()), input_size)

        image1_features = en_model.predict(image1)
        image2_features = en_model.predict(image2)
        Xnet_features[int(number)] = np.concatenate((image1_features, image2_features), axis=1)
    return Xnet_features

In [None]:
# Smoke test of the feature extractor on two test reports.
en_output = get_img_features(df=test.iloc[1:3], input_size=(224, 224, 3))

In [None]:
# Length of the concatenated feature vector stored for report 131.
en_output[131].shape[1]

In [None]:
class Dataset():
    """Index-based access to (image1, image2, tokenized caption) triples.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``Image1``, ``Image2``, ``Number`` and ``Caption``.
    path : str
        Folder that contains the image files.
    max_pad : int
        Length every tokenized caption is padded to (``maxlen`` of
        ``pad_sequences``).
    tokenizer : object
        Provides ``texts_to_sequences``; used to turn captions into ids.
    input_size : tuple
        ``(rows, cols, channels)`` the images are resized/reshaped to.
    """
    def __init__(self, df,  path, max_pad,
                 tokenizer, input_size = (224,224,3)):
        self.image1 = df.Image1
        self.image2 = df.Image2

        self.number = df.Number
        self.path = path
        self.input_size = input_size
        self.pad = max_pad

        self.tokenizer = tokenizer
        self.caption = df.Caption

    def __len__(self):
        """Number of samples; needed by loaders that call ``len(dataset)``."""
        return len(self.caption)

    def _load_image(self, file_name):
        """Read one image, divide it by 255 and return it as a 1-item batch."""
        image_path = os.path.join(self.path, file_name.lstrip())
        image = cv2.imread(image_path, cv2.IMREAD_UNCHANGED)
        if image is None:
            # cv2.imread returns None for a missing or unreadable file.
            raise FileNotFoundError("Could not read image: %s" % image_path)
        # cv2.resize takes dsize as (width, height) while the reshape lays the
        # array out as (rows, cols); identical to before for square sizes.
        image = cv2.resize(image/255, (self.input_size[1], self.input_size[0]),
                           interpolation = cv2.INTER_NEAREST)
        return np.array(image).reshape(1, self.input_size[0], self.input_size[1], self.input_size[2])

    def __getitem__(self, i):
        """Return ``(image1, image2, caption)`` for positional index ``i``.

        Both images have shape ``(1, rows, cols, channels)``; the caption is
        the padded id sequence of shape ``(1, max_pad)``.
        """
        image1 = self._load_image(self.image1.iloc[i])
        image2 = self._load_image(self.image2.iloc[i])

        # a one-element slice keeps the list-of-texts form the tokenizer expects
        caption = self.tokenizer.texts_to_sequences(self.caption.iloc[i:i+1])
        caption = pad_sequences(caption, maxlen = self.pad, padding = 'post')

        return image1, image2, caption

    def text_preprocess(self, capt):
        """Lower-case ``capt``, strip punctuation, and drop one-letter words and stop words."""
        capt = capt.split()

        capt = [word.lower() for word in capt]

        # remove punctuation from every word; like $#&
        table = str.maketrans('', '', string.punctuation)
        capt = [word.translate(table) for word in capt]

        # remove hanging letters like s or t, then stop words
        capt = [word for word in capt if len(word)>1]
        capt = [word for word in capt if word not in STOP_WORDS]

        return ' '.join(capt)

In [9]:
# Distinct whitespace-separated words across all training captions.
vocabulary = {word for caption in train['Caption'] for word in caption.split()}
len(vocabulary)

1300

In [80]:
from tensorflow.keras.preprocessing.text import Tokenizer
# Toy sentence; a later cell runs it through the fitted tokenizer.
a = "Now you4 sh&ould go bAck! go now now now go you4"
# Word-level tokenizer fitted on the training captions only.
token = Tokenizer(num_words=15000)
token.fit_on_texts(train['Caption'])

In [81]:
# Inspect the fitted tokenizer: filters, lower-casing, OOV token and word counts.
token.get_config()

{'num_words': 15000,
 'filters': '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
 'lower': True,
 'split': ' ',
 'char_level': False,
 'oov_token': None,
 'document_count': 2567,
 'word_counts': '{"no": 2088, "acute": 1551, "cardiopulmonary": 1091, "abnormality": 563, "disease": 698, "radiographic": 96, "process": 224, "findings": 312, "evidence": 212, "of": 610, "active": 237, "heart": 227, "size": 212, "is": 260, "normal": 359, "and": 395, "the": 525, "lungs": 271, "are": 176, "clear": 248, "negative": 98, "for": 155, "1": 504, "pulmonary": 319, "comparison": 59, "xxxx": 750, "well": 48, "expanded": 43, "mediastinal": 120, "contour": 67, "within": 109, "limits": 107, "identified": 89, "stable": 173, "chest": 262, "preoperative": 14, "abnormalities": 134, "intrathoracic": 15, "finding": 29, "exam": 44, "x": 54, "emphysema": 46, "without": 107, "or": 266, "cardiac": 38, "pleural": 223, "cardiomediastinal": 18, "silhouette": 47, "vasculature": 24, "appears": 23, "there": 85, "focal": 103, "air"

In [10]:
# Encode the toy sentence. The tokenizer has no oov_token, so words that were
# not seen during fitting are dropped from the resulting sequence.
token.texts_to_sequences([a])

[[831, 831, 831, 831]]

In [50]:
# Small five-row sample used to exercise the dataset and loader classes.
df_1 = train.copy()
df_1 = df_1.iloc[1:6]
# Pad length measured in tokens. The previous len(caption) counted characters,
# which padded every sequence far beyond the longest tokenized caption.
max_pad = max(len(seq) for seq in token.texts_to_sequences(df_1['Caption']))
img_path = "/users/vspvikram/Downloads/AML_project/NLMCXR_png/"
data = Dataset(df_1,img_path, max_pad, token)

In [12]:

# Toy check of the batching trick used by the data loaders: zip(*a) regroups
# the samples field by field, and each field is stacked into one array.
a = [([1, 2], [3, 4]), ([5, 6], [7, 8]), ([9, 10], [11, 12])]
b = [np.stack(field) for field in zip(*a)]
b

[array([[ 1,  2],
        [ 5,  6],
        [ 9, 10]]),
 array([[ 3,  4],
        [ 7,  8],
        [11, 12]])]

In [39]:
# Random integer ids in [0, 1000), 32 rows of 10 (unseeded scratch data).
input_array = np.random.randint(0, 1000, size=(32, 10))
input_array

array([[143, 730, 428, 946,  45, 856, 916, 694, 787, 222],
       [350, 967, 905, 947, 683, 611, 752, 972, 314,   6],
       [836, 356, 941, 248,  88, 942, 898, 250, 153, 952],
       [426, 183, 900, 116, 302, 585, 739, 859, 444, 833],
       [645, 392, 614, 731,  53,  32,  20, 219, 122, 122],
       [333, 695, 122, 744, 203, 501,   7, 701,  26, 433],
       [814,  94, 910, 965, 106,  16, 791, 877, 241, 312],
       [472, 172, 520, 965, 255, 845,  44, 571, 487, 249],
       [200, 656, 728, 309, 739, 802, 738,  69, 940, 819],
       [104, 308, 987, 121, 820, 461, 269, 376, 140, 313],
       [868, 854, 537, 888, 989,  51, 981, 964, 601, 485],
       [773, 614, 924, 522, 506, 650, 349, 619, 100, 819],
       [263, 231, 779, 423, 620,   2, 313, 945, 729, 690],
       [622, 384, 971,  34, 154, 126, 625, 555, 601, 896],
       [ 85, 243, 796, 674, 159, 642, 423, 636, 385, 779],
       [991, 548,  66, 228, 260, 184, 823,  32, 815, 295],
       [947, 392, 456, 902, 455, 413, 861, 480, 694, 312

In [40]:
# Confirm the (rows, columns) shape of the scratch array.
input_array.shape

(32, 10)

# Creating data loader class

In [35]:
class DataLoader(tf.keras.utils.Sequence):
    """Batches an indexable dataset for Keras.

    Parameters
    ----------
    dataset : object
        Supports ``len()`` and integer indexing; every item is a tuple of
        arrays with matching shapes across items.
    batch_size : int
        Number of items per batch.
    drop_remainder : bool
        ``True`` (default, previous behaviour) ignores a trailing partial
        batch; ``False`` serves it as a final, smaller batch.
    """
    def __init__(self, dataset, batch_size=1, drop_remainder=True):
        super().__init__()
        self.dataset = dataset
        self.batch_size = batch_size
        self.drop_remainder = drop_remainder
        self.indices = np.arange(len(dataset))

    def __getitem__(self, i):
        """Return batch ``i`` as a list with one stacked array per item field."""
        if i >= len(self):
            raise IndexError("batch index %d out of range" % i)
        ind_start = i*self.batch_size
        # clamp so that a final partial batch stops at the last sample
        ind_end = min((i+1)*self.batch_size, len(self.indices))

        output = [self.dataset[self.indices[j]] for j in range(ind_start, ind_end)]
        # regroup sample-wise tuples field by field and stack each field
        output = [np.stack(item, axis=0) for item in zip(*output)]

        return output

    def __len__(self):
        """Number of batches per epoch."""
        n_samples = len(self.indices)
        if self.drop_remainder:
            return n_samples//self.batch_size
        return -(-n_samples//self.batch_size)  # ceiling division
    
   

In [36]:
dataloader = DataLoader(data, batch_size=4)

In [37]:
len(dataloader)

4

# Creating model training class

In [58]:

class DataLoader2(tf.keras.utils.Sequence):
    """Serves only the first image of every dataset item, batched for ``predict``.

    Parameters
    ----------
    dataset : object
        Supports ``len()`` and integer indexing; element 0 of every item is
        an image array with a leading axis of size 1.
    batch_size : int
        Number of items per batch.
    """
    def __init__(self, dataset, batch_size=1):
        super().__init__()
        self.dataset = dataset
        self.batch_size = batch_size
        self.indices = np.arange(len(dataset))

    def __getitem__(self, i):
        """Return the first images of batch ``i`` joined along their leading axis."""
        ind_start = i*self.batch_size
        ind_end = (i+1)*self.batch_size

        samples = [self.dataset[self.indices[j]] for j in range(ind_start, ind_end)]
        # The previous `output[0][0]` kept only the first sample of the batch,
        # silently discarding the others whenever batch_size > 1. Joining along
        # the existing leading axis gives the same array for batch_size == 1.
        return np.concatenate([sample[0] for sample in samples], axis=0)

    def __len__(self):
        """Number of full batches per epoch (a trailing partial batch is ignored)."""
        return len(self.dataset)//self.batch_size
    
# Module-level settings. Encoder2 joins `path` and `weight_file` to locate the
# weights it loads when it is constructed.
classes = 14
input_shape = (None,) + (224, 224, 3)
weight_file = "best_weights.h5"
path = "/users/vspvikram/Downloads/AML_project/CheXNet-Keras-master/"

class Encoder2(tf.keras.Model):
    """Frozen DenseNet121 feature extractor.

    Loads the weights found at ``os.path.join(path, weight_file)`` (module-level
    names) and exposes the output of the network's second-to-last layer.
    """
    def __init__(self, classes=14, input_shape = (224, 224, 3)):
        super().__init__(trainable=False)
        weights_location = os.path.join(path, weight_file)
        backbone = tf.keras.applications.densenet.DenseNet121(
            weights=weights_location,
            input_shape=input_shape,
            classes=classes,
        )
        # cut the network just before its final layer
        penultimate = backbone.layers[-2]
        self.model = tf.keras.Model(inputs=backbone.input, outputs=penultimate.output)
        self.model.trainable = False

    def call(self, x):
        """Forward ``x`` through the truncated network."""
        return self.model(x)

In [59]:
# Run the frozen encoder over the five-row sample, one first-image per batch.
model2 = Encoder2()
model2.compile()
dataloader = DataLoader2(data, batch_size=1)
model2.predict(dataloader)

2021-12-12 09:01:04.283120: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2021-12-12 09:01:04.283254: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2021-12-12 09:01:04.587449: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


array([[3.3021401e-04, 1.5556754e-03, 3.3723761e-03, ..., 8.0976397e-01,
        8.6365545e-01, 6.6404277e-01],
       [0.0000000e+00, 1.8343396e-03, 1.5356751e-03, ..., 8.0483556e-01,
        8.3966011e-01, 6.6455615e-01],
       [3.2652434e-04, 2.0345622e-03, 2.6755284e-03, ..., 8.3007032e-01,
        8.7301546e-01, 6.8006986e-01],
       [2.4232979e-05, 1.5020234e-03, 2.0617729e-03, ..., 8.4052682e-01,
        8.3416975e-01, 6.9074172e-01],
       [3.8804300e-04, 2.3525273e-03, 1.8727980e-03, ..., 8.3511353e-01,
        9.1837698e-01, 6.8563384e-01]], dtype=float32)

# Attention  encoder decoder model

## Image encoder layer

In [60]:
class ImageEncoder(tf.keras.layers.Layer):
    '''Keras layer that wraps the project's ``Encoder`` model.

    Example (output recorded in the original notebook):

        abc = ImageEncoder()
        abc(dataloader[0])

        <tf.Tensor: shape=(1, 1024), dtype=float32, numpy=
        array([[3.3021401e-04, 1.5556754e-03, 3.3723761e-03, ..., 8.0976397e-01,
                8.6365545e-01, 6.6404277e-01]], dtype=float32)>

    Parameters
    ----------
    name : str
        Layer name; it is now forwarded to the base ``Layer`` (it used to be
        accepted but ignored).
    '''
    def __init__(self, name = 'image_encoder_block'):
        super().__init__(name=name)
        self.chexnet = Encoder()
        self.chexnet.compile()

    def call(self, data):
        '''Return the wrapped encoder's output for ``data``.'''
        return self.chexnet(data)

## Attention Model

In [67]:
# Shared hyper-parameters of the attention model.
dropout_rate = 0.2            # rate of the encoder Dropout layer
embedding_dim = 300           # width of the word-embedding matrix rows
dense_dim = lstm_units = 512  # dense projection width; LSTM units are tied to it

## Encoder

In [70]:
def encoder(image1, image2, dense_dim = dense_dim, dropout_rate = dropout_rate):
    '''Encode two images into one joint feature tensor.

    Both images go through the same ImageEncoder and the same ReLU dense
    projection; the two projections are concatenated along axis 1, batch
    normalised and passed through dropout.

    Usage:
        input_size = (224, 224)
        image1 = Input(shape = (input_size + (3,))) #shape = 224,224,3
        image2 = Input(shape = (input_size + (3,)))
        en_output = encoder(image1, image2)
    '''
    # shared layers: both images use the same weights
    im_encoder = ImageEncoder()
    raw_feat1 = im_encoder(image1)
    bk_dense = Dense(units=dense_dim, activation='relu', name='bk_dense')

    projected = [bk_dense(raw_feat1)]
    projected.append(bk_dense(im_encoder(image2)))

    # merge the two views, then normalise and regularise
    merged = Concatenate(axis=1)(projected)
    normalised = BatchNormalization(name='encoder_batch_norm')(merged)
    return Dropout(dropout_rate, name='encoder_dropout')(normalised)

## Global Attention layer

In [79]:
class global_attention(tf.keras.layers.Layer):
    '''Additive (tanh-scored) attention over the encoder output.

    The score is V(tanh(W1(encoder_output) + W2(decoder_h))), normalised with a
    softmax over axis 1; the context vector is the weighted sum of
    encoder_output over that axis.
    NOTE(review): assumes encoder_output is (batch, steps, features) and
    decoder_h is (batch, features) - confirm against the caller.
    '''
    def __init__(self, dense_dim = dense_dim):
        super().__init__()
        self.W1 = Dense(units=dense_dim)
        self.W2 = Dense(units=dense_dim)
        self.V = Dense(units=1)

    def call(self, encoder_output, decoder_h):
        '''Return (context_vector, attention_weights).'''
        # give the decoder state an axis 1 so it broadcasts against the encoder output
        query = tf.expand_dims(decoder_h, axis=1)
        scores = self.V(tf.nn.tanh(self.W1(encoder_output) + self.W2(query)))
        attention_weights = tf.nn.softmax(scores, axis=1)
        context_vector = tf.reduce_sum(attention_weights*encoder_output, axis=1)

        return context_vector, attention_weights

In [83]:
# Load the GloVe vectors: each line is "<word> <v1> <v2> ...".
glove = {}
vocab_size = len(token.word_counts)
with open('/users/vspvikram/Downloads/AML_project/glove.6B/glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        word = line.split()
        glove[word[0]] = np.array(word[1:], dtype=np.float32)


# NOTE(review): `embedding_dims` is not used in this cell (the code below reads
# `embedding_dim`); kept in case a later cell depends on it.
embedding_dims = 300
# One row per tokenizer index plus row 0; rows of words without a GloVe
# vector stay all-zero.
embedding_matrix = np.zeros(shape=(vocab_size + 1, embedding_dim))

for word, i in token.word_index.items():
    embedding_vector = glove.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector[:embedding_dim] # final shape: (1301, 300)

# One Step decoder