1 - Get the data

In [None]:
# Shell escape (`!`): clone the course repository into the current working directory.
! git clone https://github.com/amanchadha/coursera-ai-for-medicine-specialization.git

2 - Explore the data

In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from tensorflow import keras
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional

In [None]:
# Load the report table from a CSV expected in the current working directory.
# NOTE(review): the repository is cloned into its own sub-directory; confirm this
# relative path actually resolves to the file inside the clone.
data = pd.read_csv('stanford_report_test.csv' )

In [None]:
# (rows, columns) of the table as loaded.
data.shape

In [None]:
# Replace the index with a fresh default integer index; drop=True discards the old one.
data = data.reset_index(drop=True)

In [None]:
# Preview the first five rows.
data.head(5)

3 - Data cleaning

In [None]:
# Keep only the free-text report plus the pathology label columns that are modelled.
# `.copy()` detaches the subset from the frame it was taken from, so the in-place
# cleaning that follows works on an independent object instead of a derived slice
# (which pandas may flag with SettingWithCopyWarning).
data = data[['Report Impression', 'Cardiomegaly', 'Lung Lesion', 'Airspace Opacity',
             'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis', 'Pneumothorax',
             'Pleural Effusion', 'Pleural Other', 'Fracture']].copy()

In [None]:
# (rows, columns) after keeping only the report text and label columns.
data.shape

In [None]:
# Preview the first five rows of the reduced table.
data.head(5)

In [None]:
# Map both missing values (NaN) and -1 values to 0.
# Rebinding the result (instead of `inplace=True`) avoids mutating a frame that was
# derived from another one and keeps the cell safe to re-run.
# NOTE(review): fillna applies to every column, so a missing 'Report Impression'
# also becomes 0.0 here — confirm that is intended.
data = data.fillna(0.0).replace(-1.0, 0.0)

In [None]:
# Distinct values present in each label column (the report text column is skipped).
unique_dic = {
    column: data[column].unique()
    for column in data.columns
    if column != 'Report Impression'
}

In [None]:
# Display the distinct values found per label column.
unique_dic

In [None]:
# Every column except the report text is treated as a label to predict.
categories = list(data.columns[data.columns != 'Report Impression'])

In [None]:
# Display the list of label column names.
categories

In [None]:
# Number of reports labelled positive (== 1) for each pathology.
# A boolean-mask count yields 0 when a pathology has no positive report, whereas
# value_counts()[1] would raise a KeyError in that case.
positive_counts = [int((data[name] == 1).sum()) for name in categories]

plt.figure(figsize=(12,5))
plt.barh(y=categories, width=positive_counts)
plt.xlabel("Number of positive reports")
plt.ylabel("Pathology")
plt.title("Positive reports per pathology")
plt.show()

In [None]:
# You can see that pathologies like Airspace Opacity, Pleural Effusion, and Edema are present
# in many of the reports while Lung Lesion and Pneumonia are not as common in this dataset.

In [None]:
# Count of each label value per pathology; the dict is shown as the cell output.
data_distribution = {name: data[name].value_counts() for name in categories}
data_distribution

In [None]:
# Grouped bar chart: positive vs. negative report counts for every pathology.
# Boolean-mask counts give 0 (not a KeyError) when a value is absent from a column.
positives = [int((data[name] == 1).sum()) for name in categories]
negatives = [int((data[name] == 0).sum()) for name in categories]

# The two bars of a group are centred at x-0.2 and x+0.2, so each must be 0.4 wide
# to sit side by side; the default width would make them overlap.
bar_width = 0.4

plt.figure(figsize=(25,7))
X_axis = np.arange(len(categories))
plt.bar(X_axis - 0.2, height=positives, width=bar_width, label='True')
plt.bar(X_axis + 0.2, height=negatives, width=bar_width, label='False')
plt.xticks(X_axis, categories)
plt.xlabel("Pathology")
plt.ylabel("Number of reports")
plt.title("Positive vs. negative reports per pathology")
plt.legend()
plt.show()

In [None]:
# To rebalance each pathology independently, build one frame per pathology that
# keeps the report text and drops every *other* label column.
data_ = {
    target: data.drop([other for other in categories if other != target], axis=1)
    for target in categories
}

In [None]:
def handle_imbalanced_values(random_state=42):
    """Balance every per-pathology frame in the global ``data_`` by resampling positives.

    For each pathology, the rows labelled 1 are resampled with replacement so
    that there are exactly as many of them as rows labelled 0. The rebuilt
    frame (positives first, then negatives, fresh integer index) replaces the
    entry in ``data_``.

    Args:
        random_state: Seed forwarded to ``DataFrame.sample`` so the resampling
            is reproducible. Pass ``None`` to get a different draw on each call.

    NOTE(review): this runs before the train/test split, so copies of the same
    report can end up on both sides of the split and inflate validation
    metrics — consider resampling the training portion only.
    """
    for name in categories[:len(data_)]:
        frame = data_[name]
        positives = frame[frame[name] == 1]
        negatives = frame[frame[name] == 0]
        # With no positives there is nothing to draw from, and with no negatives
        # there is no target size; leave such a pathology untouched rather than
        # failing inside `sample` or on a missing count.
        if positives.empty or negatives.empty:
            continue
        positives = positives.sample(len(negatives), replace=True, random_state=random_state)
        data_[name] = pd.concat([positives, negatives], axis=0).reset_index(drop=True)

In [None]:
# Rebalance every per-pathology frame in `data_` (modifies `data_` in place).
handle_imbalanced_values()

In [None]:
# Label counts per pathology after rebalancing; the dict is shown as the cell output.
data_distribution = {name: data_[name][name].value_counts() for name in categories}
data_distribution

In [None]:
# so the data now is balanced

In [None]:
# Replace each balanced frame with a (report texts, labels) pair of Series.
# Entries that are already tuples are skipped so that re-running this cell is
# harmless; indexing a tuple with a column name would otherwise raise a TypeError.
for name in categories[:len(data_)]:
    entry = data_[name]
    if isinstance(entry, tuple):
        continue
    data_[name] = (entry['Report Impression'], entry[name])

In [None]:
# Sanity check for one pathology: shape of the text Series, then of the label Series.
print(data_['Edema'][0].shape)
print(data_['Edema'][1].shape)

In [None]:
# we will do some data preprocessing 

In [None]:
# Show the 'Edema' report stored under index label 0, before any text cleaning.
print(data_['Edema'][0][0])

In [None]:
def data_preprocessing(data_):
    """Clean and stem the report texts of every pathology.

    Each report has its non-letter characters replaced by spaces, is
    lower-cased, split on whitespace, Porter-stemmed word by word and joined
    back with single spaces.

    Args:
        data_: Mapping ``name -> (texts, labels)``. Only ``texts`` (an iterable
            of reports) is read.

    Returns:
        dict mapping every name in ``data_`` to a NumPy array of cleaned
        report strings, in the same order as the input texts.
    """
    stemmer = PorterStemmer()
    non_letters = re.compile('[^a-zA-Z]')
    data_pre = {}
    # Iterate over the mapping that was passed in rather than a notebook-level
    # list of names, so the function works on any subset of pathologies.
    for name, entry in data_.items():
        texts = entry[0]
        corpus = []
        # Walk the values directly: positional look-ups like texts[i] on a Series
        # are label-based and break when the index is not 0..n-1.
        for text in texts:
            # str() guards against non-string cells (e.g. a missing report that
            # was filled with a number upstream), which re.sub would reject.
            words = non_letters.sub(' ', str(text)).lower().split()
            corpus.append(' '.join(stemmer.stem(word) for word in words))
        data_pre[name] = np.array(corpus)
    return data_pre

In [None]:
# Clean and stem the report texts of every pathology.
data_pre = data_preprocessing(data_)

In [None]:
# Inspect the cleaned 'Edema' report at position 1.
data_pre['Edema'][1]

In [None]:
# Vocabulary size: passed to one_hot when encoding words and to the Embedding layer
# as its input dimension, so both must agree.
voc_size = 1500

In [None]:
# Encode every cleaned report as a list of integer word indices with Keras' one_hot.
# NOTE(review): one_hot hashes words with Python's built-in hash, which is reported
# not to be stable across interpreter sessions — confirm reproducibility needs.
onehot_dic = {}
for name in categories[:len(data_pre)]:
    # `onehot` stays a top-level name so the last encoding can be inspected afterwards.
    onehot = [one_hot(document, voc_size) for document in data_pre[name]]
    onehot_dic[name] = onehot

In [None]:
# Inspect the first encoded report of the last pathology: its length, then its indices.
last_encoded = onehot_dic[categories[-1]]
print(len(last_encoded[0]))
print(last_encoded[0])

In [None]:
# Bring every encoded report to a common length (padding added at the end) so each
# pathology's reports form a rectangular array. `max_lenght` is also read when the
# Embedding layer is built, so the name is kept as-is.
max_lenght = 181
embedded_docs_dic = {
    name: pad_sequences(onehot_dic[name], padding='post', maxlen=max_lenght)
    for name in categories
}

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of each pathology's (padded report, label) pairs.
# Every entry is stored as [X_train, X_test, y_train, y_test].
x_y_dic = {}
for name in categories:
    features = np.asarray(embedded_docs_dic[name]).astype(np.float32)
    labels = np.asarray(data_[name][1])
    x_y_dic[name] = list(
        train_test_split(features, labels, test_size=0.2, random_state=42)
    )

4 - Train the model

In [None]:
def train_model(embedding_vector_features ,X_train,y_train,X_test,y_test ):
    """Build and fit a bidirectional-LSTM binary classifier on encoded reports.

    The network is Embedding -> Bidirectional(LSTM(100)) -> Dropout(0.3) ->
    Dense(1, sigmoid), compiled with binary cross-entropy, Adam and accuracy,
    and trained for 5 epochs with batches of 64.

    Args:
        embedding_vector_features: Output dimension of the Embedding layer.
        X_train, y_train: Training inputs and labels.
        X_test, y_test: Held-out inputs and labels, used as validation data
            during fitting.

    Returns:
        (model, history): the fitted model and the object returned by ``fit``.
    """
    layers = [
        # Vocabulary size and sequence length come from the notebook-level settings.
        Embedding(voc_size, embedding_vector_features, input_length=max_lenght),
        Bidirectional(LSTM(100)),
        keras.layers.Dropout(0.3),
        Dense(1, activation='sigmoid'),
    ]
    classifier = Sequential(layers)
    classifier.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    fit_history = classifier.fit(
        X_train,
        y_train,
        validation_data=(X_test, y_test),
        epochs=5,
        batch_size=64,
    )
    return classifier, fit_history

In [None]:
def train_each_model_alone():
    """Train one independent classifier per pathology.

    Reads each pathology's [X_train, X_test, y_train, y_test] split from the
    notebook-level ``x_y_dic`` and fits a fresh model on it with a
    40-dimensional embedding.

    Returns:
        (models, histories): two dicts keyed by pathology name holding the
        fitted model and its training history respectively.
    """
    embedding_vector_features = 40
    models, histories = {}, {}
    for name in categories:
        X_train, X_test, y_train, y_test = x_y_dic[name]
        models[name], histories[name] = train_model(
            embedding_vector_features, X_train, y_train, X_test, y_test
        )
    return models, histories

In [None]:
# Fit one model per pathology; both results are dicts keyed by pathology name.
models , histories = train_each_model_alone()

5 - Compute the score of each model

In [None]:
def score(models ):
    """Evaluate every pathology's model on its held-out split.

    Args:
        models: Mapping ``pathology name -> fitted model``.

    Returns:
        1-D NumPy array with, per pathology in ``categories`` order, the
        element at position 1 of ``model.evaluate(X_test, y_test)`` — the
        first compiled metric (accuracy for models built by ``train_model``).
    """
    return np.array([
        models[name].evaluate(x_y_dic[name][1], x_y_dic[name][3])[1]
        for name in categories
    ])

In [None]:
# Held-out score of each pathology's model, in `categories` order.
scores = score(models)

In [None]:
# Average the per-pathology scores into one summary figure.
mean_score = scores.sum() / scores.size
print(mean_score)

6 - Predict with every model, then compute the confusion matrices and
 classification reports

In [None]:
def predict(models , data) :
    """Produce hard 0/1 predictions from every model on its held-out inputs.

    Args:
        models: Mapping ``name -> model``; each model must expose ``predict``.
        data: Mapping ``name -> sequence`` whose element at position 1 is the
            input array to predict on.

    Returns:
        dict mapping every name in ``models`` to an integer array with the
        same shape as the model output: 1 where the output is strictly above
        0.5, 0 elsewhere.
    """
    predictions_dic = {}
    # Iterate over the models that were actually passed in rather than a
    # notebook-level list of names, so a subset of pathologies also works.
    for name, model in models.items():
        probabilities = np.asarray(model.predict(data[name][1]))
        predictions_dic[name] = np.where(probabilities > 0.5, 1, 0)
    return predictions_dic

In [None]:
# Hard 0/1 predictions of every model on its held-out inputs, keyed by pathology.
predictions = predict(models , x_y_dic)

In [None]:
from sklearn.metrics import confusion_matrix
def compute_confusion_matrix(models ,data):
    """Compute one confusion matrix per pathology on the held-out split.

    Args:
        models: Mapping ``pathology name -> fitted model``.
        data: Mapping ``pathology name -> [X_train, X_test, y_train, y_test]``.

    Returns:
        NumPy array stacking one 2x2 matrix per pathology, in ``categories``
        order.
    """
    # Derive the predictions from the arguments instead of reading a
    # notebook-level variable, so the result always matches `models` and `data`.
    predicted = predict(models, data)
    # labels=[0, 1] pins every matrix to 2x2 even when one class is missing from
    # a split, which keeps the stacked array rectangular.
    matrixs = [
        confusion_matrix(data[name][3], predicted[name], labels=[0, 1])
        for name in categories
    ]
    return np.array(matrixs)
    

In [None]:
# Confusion matrices for all pathologies, stacked in `categories` order.
matrixs = compute_confusion_matrix(models,x_y_dic )

In [None]:
from matplotlib import pyplot as plt
import seaborn as sn

# One heatmap per pathology. The matrices are stored in `categories` order, so
# pairing them with the names lets each figure carry a title; rows of a
# confusion matrix are true labels and columns are predicted labels.
for name, matrix in zip(categories, matrixs):
    sn.heatmap(matrix, annot=True, fmt='d')
    plt.title(name)
    plt.xlabel("Predicted label")
    plt.ylabel("True label")
    plt.show()

In [None]:
from sklearn.metrics import classification_report
def compute_classification_report(models,data):
    """Build one text classification report per pathology on the held-out split.

    Args:
        models: Mapping ``pathology name -> fitted model``.
        data: Mapping ``pathology name -> [X_train, X_test, y_train, y_test]``.

    Returns:
        NumPy array of report strings, in ``categories`` order.
    """
    # Derive the predictions from the arguments instead of reading a
    # notebook-level variable, so the reports always match `models` and `data`.
    predicted = predict(models, data)
    reports = [
        classification_report(data[name][3], predicted[name])
        for name in categories
    ]
    return np.array(reports)

In [None]:
# Print each pathology's name followed by its classification report.
reports = compute_classification_report(models , x_y_dic)
for name, report in zip(categories, reports):
    print(name)
    print(report)

In [None]:
def plot_accuracy(history) :
    """Plot training/validation accuracy and loss curves for one fitted model.

    Args:
        history: Object whose ``history`` attribute is a dict holding the
            per-epoch lists 'accuracy', 'val_accuracy', 'loss' and 'val_loss'.
    """
    train_acc = history.history['accuracy']
    train_loss = history.history['loss']
    val_acc = history.history['val_accuracy']
    val_loss = history.history['val_loss']

    # Take the x-axis from the recorded history: a hard-coded epoch count makes
    # `plot` fail with a shape mismatch whenever it differs from the number of
    # epochs that were actually trained.
    epochs = list(range(len(train_acc)))

    fig , ax = plt.subplots(1,2)
    fig.set_size_inches(20,10)

    # Left panel: accuracy.
    ax[0].plot(epochs , train_acc , 'go-' , label = 'Training Accuracy')
    ax[0].plot(epochs , val_acc , 'ro-' , label = 'Validation Accuracy')
    ax[0].set_title('Training & Validation Accuracy')
    ax[0].legend()
    ax[0].set_xlabel("Epochs")
    ax[0].set_ylabel("Accuracy")

    # Right panel: loss.
    ax[1].plot(epochs , train_loss , 'g-o' , label = 'Training Loss')
    ax[1].plot(epochs , val_loss , 'r-o' , label = 'Validation Loss')
    ax[1].set_title('Training & Validation Loss')
    ax[1].legend()
    ax[1].set_xlabel("Epochs")
    ax[1].set_ylabel("Loss")
    plt.show()

In [None]:
# Plot the learning curves of the model trained for the first pathology.
plot_accuracy(histories[categories[0]])

# As the results show, this method performs well on report analysis.