# MSCI 598: Custom Project
## Medical Abstract Classification


In [15]:
import pandas as pd
import numpy as np 
import tensorflow as tf
import os
import re
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow_estimator.python.estimator.canned.dnn import dnn_logit_fn_builder
import tensorflow_hub as hub
import tensorflow_text as text

In [2]:
# SOURCE for preprocessing functions: 
# Abstract Segmentation NLP notebook (https://www.kaggle.com/anshulmehtakaggl/abstract-segmentation-nlp)

# Helper to read a text file.
# (Previously this function was defined twice in a row; one definition is enough.)
def get_lines(filename):
    """Return every line of a text file as a list of strings.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one string per line, exactly as produced by
        ``file.readlines()`` (each line keeps its trailing newline).
    """
    with open(filename, "r") as f:
        return f.readlines()

# Preprocessing functions
# preprocess_data returns a list of dictionaries, one per labelled abstract line.
# Dict format --> {'target': label, 'text': lower-cased sentence,
#                  'line_number': position in the abstract, 'total_lines': last position}
def _abstract_to_records(abstract_text):
    """Turn the buffered "LABEL<TAB>sentence" lines of one abstract into records."""
    abstract_line_split = abstract_text.splitlines()
    # Kept as "number of lines minus one", i.e. the zero-based index of the
    # last line, which is what the original code stored under 'total_lines'.
    last_line_index = len(abstract_line_split) - 1
    records = []
    for abstract_line_number, abstract_line in enumerate(abstract_line_split):
        # Each labelled line is "<LABEL>\t<sentence>"
        target_text_split = abstract_line.split("\t")
        records.append({
            "target": target_text_split[0],
            "text": target_text_split[1].lower(),
            "line_number": abstract_line_number,
            "total_lines": last_line_index,
        })
    return records


def preprocess_data(filename):
    """Parse a labelled-abstract text file into per-sentence records.

    The file is read with ``get_lines``. A line starting with ``###`` opens a
    new abstract, a whitespace-only line closes the current one, and every
    other line is a tab-separated ``LABEL<TAB>sentence`` pair.

    Args:
        filename: Path of the text file to parse.

    Returns:
        A list of dicts with the keys ``target`` (label), ``text``
        (lower-cased sentence), ``line_number`` (zero-based position of the
        sentence inside its abstract) and ``total_lines`` (zero-based index
        of the abstract's last sentence).

    Raises:
        IndexError: If a labelled line contains no tab character.
    """
    abstract_samples = []
    # Text of the abstract currently being collected
    abstract_lines = ""
    for line in get_lines(filename):
        if line.startswith("###"):
            # New abstract header: emit whatever was collected but never
            # terminated by a blank line instead of silently discarding it.
            abstract_samples.extend(_abstract_to_records(abstract_lines))
            abstract_lines = ""
        elif line.isspace():
            # Blank separator: the current abstract is complete. Clear the
            # buffer so a second blank line cannot emit the same abstract again.
            abstract_samples.extend(_abstract_to_records(abstract_lines))
            abstract_lines = ""
        else:
            # Anything else is a labelled sentence of the current abstract
            abstract_lines += line
    # The last abstract may not be followed by a blank line
    abstract_samples.extend(_abstract_to_records(abstract_lines))
    return abstract_samples


In [3]:
# Reading in and preprocessing data.
# The dataset folder defaults to the original local path; set the
# MED_ABSTRACTS_DIR environment variable to point at another copy of the data.
data_dir = os.environ.get("MED_ABSTRACTS_DIR", "D:/med_abstracts/20k_abstracts_numbers_with_@/")
# os.path.join keeps the paths valid whether or not data_dir ends with a separator
filenames=[os.path.join(data_dir, filename) for filename in os.listdir(data_dir)]
print(filenames)

# Parse each split into a list of per-sentence records
train_samples=preprocess_data(os.path.join(data_dir, "train.txt"))
val_samples=preprocess_data(os.path.join(data_dir, "dev.txt"))
test_samples=preprocess_data(os.path.join(data_dir, "test.txt"))


['D:/med_abstracts/20k_abstracts_numbers_with_@/dev.txt', 'D:/med_abstracts/20k_abstracts_numbers_with_@/test.txt', 'D:/med_abstracts/20k_abstracts_numbers_with_@/train.txt']


In [4]:
# Build one DataFrame per split from the parsed records
train_df, test_df, val_df = (
    pd.DataFrame(records)
    for records in (train_samples, test_samples, val_samples)
)

# Label distribution of the training split (computed but not shown, because it
# is not the last expression of the cell)
train_df["target"].value_counts()

# Extract the sentence text of every split as a plain Python list
train_sentences, test_sentences, val_sentences = (
    frame["text"].tolist() for frame in (train_df, test_df, val_df)
)

In [5]:
# Turning the target labels into numeric data.
# The labels are encoded twice: one-hot (for the Keras models) and as plain
# integers (for the scikit-learn metrics).
#
# Both encoders are fitted on the TRAINING labels only and then merely applied
# to the validation and test labels. Re-fitting on each split would let every
# split build its own label -> column mapping, which silently misaligns the
# encodings whenever a split does not contain exactly the same set of labels.

# Dense output is requested because the arrays are fed to TensorFlow
one_hot_encoder=OneHotEncoder(sparse=False)
# The encoder expects a 2D array of shape (n_samples, n_features), hence the reshape
train_labels_one_hot=one_hot_encoder.fit_transform(train_df["target"].to_numpy().reshape(-1,1))
val_labels_one_hot=one_hot_encoder.transform(val_df["target"].to_numpy().reshape(-1,1))
test_labels_one_hot=one_hot_encoder.transform(test_df["target"].to_numpy().reshape(-1,1))

le=LabelEncoder()
train_labels_encoded=le.fit_transform(train_df["target"])
test_labels_encoded=le.transform(test_df["target"])
val_labels_encoded=le.transform(val_df["target"])

# Retrieving the classes learned from the training labels
num_classes=len(le.classes_)
class_names=le.classes_
num_classes,class_names

(5,
 array(['BACKGROUND', 'CONCLUSIONS', 'METHODS', 'OBJECTIVE', 'RESULTS'],
       dtype=object))

In [6]:
# Text vectorization: map every sentence to a fixed-length sequence of token ids.
# (The embedding layer that consumes these ids is created in a later cell.)

# Number of whitespace-separated tokens in each training sentence, and the mean
sent_lens = list(map(len, map(str.split, train_sentences)))
avg_sent_lens = np.mean(sent_lens)

# Sentences differ in length, so they are padded/truncated to one common
# length: the 95th percentile of the training sentence lengths.
output_seq_len = int(np.percentile(sent_lens, 95))

# Vocabulary size cap (68000 is the vocabulary size quoted in the research paper)
max_tokens = 68000

text_vectorizer = TextVectorization(
    max_tokens=max_tokens,
    output_sequence_length=output_seq_len,
)

# The vocabulary is learned from the training sentences only, so nothing from
# the validation or test splits is seen at this stage.
text_vectorizer.adapt(train_sentences)

In [7]:
# Inspect the vocabulary learned from the training sentences
train_vocab = text_vectorizer.get_vocabulary()

# Printed, one per line: the vocabulary size, the first five entries and the
# last five entries of the vocabulary list
print(len(train_vocab), train_vocab[:5], train_vocab[-5:], sep="\n")

# Show the configuration of the text vectorizer
text_vectorizer.get_config()

64841
['', '[UNK]', 'the', 'and', 'of']
['aainduced', 'aaigroup', 'aachener', 'aachen', 'aaacp']


{'name': 'text_vectorization',
 'trainable': True,
 'batch_input_shape': (None,),
 'dtype': 'string',
 'max_tokens': 68000,
 'standardize': 'lower_and_strip_punctuation',
 'split': 'whitespace',
 'ngrams': None,
 'output_mode': 'int',
 'output_sequence_length': 55,
 'pad_to_max_tokens': False,
 'sparse': False,
 'ragged': False,
 'vocabulary': None,
 'idf_weights': None}

In [8]:
# Token embedding layer: one trainable 128-dimensional vector per vocabulary
# entry. A larger output_dim means a richer embedding but more parameters.
# mask_zero=True treats token id 0 as padding.
token_embed = layers.Embedding(
    input_dim=len(train_vocab),
    output_dim=128,
    mask_zero=True,
    name="token_embedding",
)

In [9]:
# Wrap each split in a tf.data.Dataset of (sentence, one-hot label) pairs.
# References:
#   https://www.tensorflow.org/guide/data_performance
#   https://www.tensorflow.org/guide/data
train_dataset, val_dataset, test_dataset = (
    tf.data.Dataset.from_tensor_slices(features_and_labels)
    for features_and_labels in (
        (train_sentences, train_labels_one_hot),
        (val_sentences, val_labels_one_hot),
        (test_sentences, test_labels_one_hot),
    )
)
# Each element is one text sample (a scalar string) paired with its one-hot
# label vector of length 5, e.g. (0, 0, 0, 0, 1).
train_dataset

<TensorSliceDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(5,), dtype=tf.float64, name=None))>

In [10]:
# Group every split into batches of 32 and prefetch upcoming batches so data
# preparation can overlap with model computation.
# NOTE: this cell rebinds the same dataset names, so re-running it without first
# re-running the cell that creates the datasets would batch the already-batched
# data again and change the element shapes.
train_dataset, val_dataset, test_dataset = (
    split.batch(32).prefetch(tf.data.experimental.AUTOTUNE)
    for split in (train_dataset, val_dataset, test_dataset)
)
# After batching, an element is (batch of strings, batch of one-hot labels)
train_dataset

<PrefetchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None, 5), dtype=tf.float64, name=None))>

In [60]:
# Model 1: token embedding + 1D convolution classifier, built with the Keras
# functional API (layers are chained from the input tensor to the output tensor)
inputs = layers.Input(shape=(1,), dtype=tf.string)

# Raw string -> token ids -> token embeddings
text_vectors = text_vectorizer(inputs)
token_embedding = token_embed(text_vectors)

# Convolution over the token axis, then averaging over that axis
x = layers.Conv1D(
    64,
    kernel_size=5,
    padding="same",
    activation="relu",
)(token_embedding)
x = layers.GlobalAveragePooling1D()(x)

# One softmax probability per class
outputs = layers.Dense(num_classes, activation="softmax")(x)

model_1 = tf.keras.Model(inputs, outputs)

# Categorical cross-entropy matches the one-hot encoded labels
model_1.compile(
    loss="categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)

In [61]:
# Print the layer-by-layer architecture and parameter counts of model_1
model_1.summary()

Model: "functional_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 1)]               0         
_________________________________________________________________
text_vectorization_4 (TextVe (None, 55)                0         
_________________________________________________________________
token_embedding (Embedding)  (None, 55, 128)           8299648   
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 55, 64)            41024     
_________________________________________________________________
global_average_pooling1d_2 ( (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 325       
Total params: 8,340,997
Trainable params: 8,340,997
Non-trainable params: 0
____________________________________________

In [62]:
# Train model_1 for 5 epochs on the batched training set, evaluating on the
# validation set after every epoch; the returned History object is kept
history_model_1=model_1.fit(train_dataset,epochs=5,validation_data=val_dataset)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [63]:
# Evaluate model_1 on the validation set; the result is [loss, accuracy],
# following the loss and metric given to compile()
model_1.evaluate(val_dataset)



[0.7110169529914856, 0.7919700741767883]

In [64]:
# Predict class probabilities for every sentence of the validation set
model_1_prediction_probability=model_1.predict(val_dataset)
model_1_prediction_probability
# Each row holds 5 probabilities, one per class (softmax output layer).
# The index of the largest probability in a row is the class the model
# assigns to that sentence.

array([[7.0300645e-01, 1.2680595e-03, 2.2879089e-01, 5.0042268e-02,
        1.6892424e-02],
       [5.5768144e-01, 3.1261150e-02, 6.2722201e-03, 3.9774191e-01,
        7.0433393e-03],
       [3.3693090e-02, 1.3626387e-03, 1.5214458e-03, 9.6337378e-01,
        4.9019684e-05],
       ...,
       [5.3408460e-09, 1.6305681e-07, 2.4358046e-04, 9.0132506e-09,
        9.9975628e-01],
       [9.9451326e-02, 3.6207575e-01, 2.5237784e-01, 1.8783767e-02,
        2.6731133e-01],
       [1.7279190e-04, 9.9964249e-01, 1.6835335e-04, 5.8312901e-07,
        1.5736716e-05]], dtype=float32)

In [65]:
# Convert the probabilities to class indices: arg-max over the class axis of each row
model_1_prediction=tf.argmax(model_1_prediction_probability,axis=1)
model_1_prediction

<tf.Tensor: shape=(30212,), dtype=int64, numpy=array([0, 0, 3, ..., 4, 1, 1], dtype=int64)>

In [68]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
# Helper for computing several evaluation metrics at once
def calculate_results(y_true, y_pred):
    """Compute accuracy, precision, recall and F1 for a set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        A dict with the keys ``accuracy`` (scaled by 100, i.e. a percentage),
        ``precision``, ``recall`` and ``f1``. The last three are computed with
        ``average="weighted"`` and are left on their original scale.
    """
    # Accuracy is reported as a percentage
    accuracy_percent = accuracy_score(y_true, y_pred) * 100
    # The fourth returned value is not used
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="weighted"
    )
    return {
        "accuracy": accuracy_percent,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# Score model_1 on the validation set: integer-encoded true labels against the
# predicted class indices
model_1_results=calculate_results(y_true=val_labels_encoded,y_pred=model_1_prediction)
model_1_results


{'accuracy': 79.19700781146565,
 'precision': 0.7896447524512055,
 'recall': 0.7919700781146565,
 'f1': 0.789347087728524}

In [16]:
# TF Hub handles: the BERT encoder (PubMed "experts" model) and the uncased
# English BERT text-preprocessing model
tfhub_handle_encoder = 'https://tfhub.dev/google/experts/bert/pubmed/2'
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
# Load the preprocessing model as a Keras layer
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

In [27]:
# Sequence length from data notebook
SEQ_LENGTH = 128
def make_bert_preprocess_model(sentence_features, seq_length=128):
    """Build a Keras model that converts raw string features into BERT inputs.

    Args:
        sentence_features: list of names, one per string-valued input feature.
            One scalar-string Keras Input is created for each name.
        seq_length: integer sequence length of the packed BERT inputs.

    Returns:
        A Keras Model that takes the string inputs (in the order given by
        sentence_features) and returns the output of the preprocessing
        model's ``bert_pack_inputs`` function.
    """
    # One string input per named feature
    input_segments = []
    for feature_name in sentence_features:
        input_segments.append(
            tf.keras.layers.Input(shape=(), dtype=tf.string, name=feature_name)
        )

    # Load the preprocessing SavedModel and tokenize every segment
    bert_preprocess = hub.load(tfhub_handle_preprocess)
    tokenizer = hub.KerasLayer(bert_preprocess.tokenize, name='tokenizer')
    segments = list(map(tokenizer, input_segments))

    # No explicit trimming is applied here: the tokenized segments go to the
    # packing step unchanged, and fitting them into seq_length is left to it.
    truncated_segments = segments

    # Packing is delegated to the SavedModel's own bert_pack_inputs function
    packer = hub.KerasLayer(
        bert_preprocess.bert_pack_inputs,
        arguments={"seq_length": seq_length},
        name='packer',
    )
    return tf.keras.Model(input_segments, packer(truncated_segments))

In [None]:
# Sanity-check the BERT preprocessing model on two toy sentences.
# `sentence_features` is a list of input NAMES (one Keras Input is created per
# name), not the sentences themselves. Passing the full list of training
# sentences would try to create one input per sentence, so two feature names
# are passed instead, matching the two text segments below.
test_preprocess_model = make_bert_preprocess_model(['my_input1', 'my_input2'])
test_text = [np.array(['some random test sentence']),
             np.array(['another sentence'])]
text_preprocessed = test_preprocess_model(test_text)

# Show the packed BERT inputs (first 16 positions of the first example)
print('Keys           : ', list(text_preprocessed.keys()))
print('Shape Word Ids : ', text_preprocessed['input_word_ids'].shape)
print('Word Ids       : ', text_preprocessed['input_word_ids'][0, :16])
print('Shape Mask     : ', text_preprocessed['input_mask'].shape)
print('Input Mask     : ', text_preprocessed['input_mask'][0, :16])
print('Shape Type Ids : ', text_preprocessed['input_type_ids'].shape)
print('Type Ids       : ', text_preprocessed['input_type_ids'][0, :16])

In [19]:
# Load the BERT encoder from TF Hub as a Keras layer
bert_model = hub.KerasLayer(tfhub_handle_encoder)

# Run the encoder on the preprocessed toy inputs
bert_results = bert_model(text_preprocessed)

# Inspect the two encoder outputs used below: "pooled_output" and
# "sequence_output" (shapes, plus the first 12 entries of the first example)
print(f'Loaded BERT: {tfhub_handle_encoder}')
print(f'Pooled Outputs Shape:{bert_results["pooled_output"].shape}')
print(f'Pooled Outputs Values:{bert_results["pooled_output"][0, :12]}')
print(f'Sequence Outputs Shape:{bert_results["sequence_output"].shape}')
print(f'Sequence Outputs Values:{bert_results["sequence_output"][0, :12]}')

Loaded BERT: https://tfhub.dev/google/experts/bert/pubmed/2
Pooled Outputs Shape:(1, 768)
Pooled Outputs Values:[ 0.42222086 -0.48337984  0.36544615 -0.8634969  -0.14478478 -0.63755333
 -0.35506263  0.86173105  0.8384985  -0.6099298   0.70421    -0.10690586]
Sequence Outputs Shape:(1, 128, 768)
Sequence Outputs Values:[[ 0.45039162 -0.52738523  0.3831572  ... -0.599706   -0.443533
  -0.06585392]
 [-0.6186835  -1.8081663   0.44193897 ... -0.3325248  -0.9120241
  -3.5160751 ]
 [ 0.03471279 -1.3693182   0.9009443  ... -0.23209126 -0.01226728
  -2.5252082 ]
 ...
 [ 1.1027188  -1.5887632   0.37287942 ... -0.37087232  0.18958531
   0.8353863 ]
 [ 0.9942305  -1.274762    0.27017665 ... -0.462246    0.02420896
   0.9638618 ]
 [ 1.077033   -1.3870149   0.28120837 ... -0.6093174  -0.06627202
   0.5650061 ]]


In [20]:
def build_classifier_model(num_outputs=1):
    """Build a text classifier on top of the TF Hub BERT encoder.

    The model takes raw strings, runs them through the TF Hub preprocessing
    layer and the trainable BERT encoder, applies dropout to the encoder's
    ``pooled_output`` and ends in a dense layer without activation, so the
    model returns raw logits.

    Args:
        num_outputs: Number of units (logits) in the final dense layer. The
            default of 1 keeps the original single-logit head; pass the number
            of classes to get one logit per class for the multi-class task.

    Returns:
        A ``tf.keras.Model`` mapping a batch of strings to logits of shape
        (batch, num_outputs).
    """
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)
    # trainable=True: the encoder weights are fine-tuned together with the head
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)
    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(0.1)(net)
    # activation=None: the layer emits logits; apply sigmoid/softmax outside
    net = tf.keras.layers.Dense(num_outputs, activation=None, name='classifier')(net)
    return tf.keras.Model(text_input, net)

# Smoke-test the untrained classifier on one sentence.
# `text_test` was never defined in this notebook (NameError on a fresh kernel),
# so it is taken from the validation sentences here.
text_test = val_sentences[:1]
classifier_model = build_classifier_model()
bert_raw_result = classifier_model(tf.constant(text_test))
# The model returns raw logits; the sigmoid squashes them into (0, 1)
print(tf.sigmoid(bert_raw_result))