In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataframe = pd.read_csv('../utility/filtered.csv')
dataframe.sample(10)

Unnamed: 0,medical_specialty,transcription,label
1230,Gastroenterology,"PREOPERATIVE DIAGNOSIS: , Acute appendicitis.,...",4
330,Radiology,"DUPLEX ULTRASOUND OF LEGS,RIGHT LEG:, Duplex ...",2
303,Radiology,"EXAM: , Five views of the right knee.,HISTORY:...",2
657,Orthopedic,"PREOPERATIVE DIAGNOSIS: , Right hand Dupuytren...",3
1088,Gastroenterology,"PROCEDURE IN DETAIL: , Following a barium enem...",4
113,Urology,"ADMITTING DIAGNOSES:, Solitary left kidney wi...",1
519,Orthopedic,"PREOPERATIVE DIAGNOSES: , Left elbow fracture ...",3
499,Orthopedic,"PREOPERATIVE DIAGNOSIS: , Patellar tendon ret...",3
298,Radiology,INTENSITY-MODULATED RADIATION THERAPY SIMULATI...,2
196,Radiology,"PROTOCOL:, Bruce.,PERTINENT MEDICATION: , Non...",2


In [3]:
dataframe = dataframe.dropna(how='any',axis=0) 
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1231 entries, 0 to 1238
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   medical_specialty  1231 non-null   object
 1   transcription      1231 non-null   object
 2   label              1231 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 38.5+ KB


In [4]:
dataframe['label'].value_counts()

label
3    355
2    273
4    224
0    223
1    156
Name: count, dtype: int64

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [6]:
x_input_ids = np.zeros((len(dataframe),512))
x_attn_masks= np.zeros((len(dataframe),512))
x_attn_masks.shape

(1231, 512)

In [7]:
def get_train_data(dataframe, ids, masks, tokenizer):
    for i in range(0,1231):
        tokenized_text = tokenizer.encode_plus(
            dataframe['transcription'].iloc[i],
            add_special_tokens = True,
            return_token_type_ids= False,
            truncation=True,
            max_length=512,
            padding='max_length',
            return_tensors='tf'
        )
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

In [8]:
x_input_ids, x_attn_masks = get_train_data(dataframe, x_input_ids, x_attn_masks,tokenizer)

In [9]:
print(x_input_ids)
print(x_attn_masks)

[[  101. 10507.  1024. ...  1997.  1996.   102.]
 [  101.  7709.  1024. ...     0.     0.     0.]
 [  101. 12407.  1024. ...     0.     0.     0.]
 ...
 [  101.  3653. 25918. ... 28413.  3424.   102.]
 [  101.  3653. 25918. ...     0.     0.     0.]
 [  101.  3653. 25918. ...  2025.  2228.   102.]]
[[1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 0. 0. 0.]
 [1. 1. 1. ... 0. 0. 0.]
 ...
 [1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 0. 0. 0.]
 [1. 1. 1. ... 1. 1. 1.]]


In [10]:
labels = np.zeros((len(dataframe),5))
labels.shape

(1231, 5)

In [11]:
labels[np.arange(len(dataframe)),dataframe.label.values] = 1
labels

array([[1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.]])

In [12]:
dataset = tf.data.Dataset.from_tensor_slices((x_input_ids,x_attn_masks,labels))
dataset.take(1)

<_TakeDataset element_spec=(TensorSpec(shape=(512,), dtype=tf.float64, name=None), TensorSpec(shape=(512,), dtype=tf.float64, name=None), TensorSpec(shape=(5,), dtype=tf.float64, name=None))>

In [13]:
def mapLabelToTranscrioption(ip_id,attn_mask,labels):
    return{
        'input_ids':ip_id,
        'attention_mask': attn_mask
    }, labels

In [14]:
dataset = dataset.map(mapLabelToTranscrioption)
dataset

<_MapDataset element_spec=({'input_ids': TensorSpec(shape=(512,), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(512,), dtype=tf.float64, name=None)}, TensorSpec(shape=(5,), dtype=tf.float64, name=None))>

In [15]:
dataset = dataset.shuffle(1000).batch(4, drop_remainder=True)

In [16]:
p = 0.8
train_size = int((len(dataframe)//4)*p)
train_size

245

In [17]:
train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)

In [18]:
from transformers import TFBertModel
bert_model = TFBertModel.from_pretrained('bert-base-uncased', )

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [19]:
input_ids = tf.keras.layers.Input(shape=(512,), name='input_ids', dtype='int32')
attn_masks = tf.keras.layers.Input(shape=(512,), name='attention_mask', dtype='int32')

bert_embds = bert_model.bert(input_ids, attention_mask=attn_masks)[1] # 0 -> activation layer (3D), 1 -> pooled output layer (2D)
intermediate_layer = tf.keras.layers.Dense(768, activation='relu', name='intermediate_layer')(bert_embds)
output_layer = tf.keras.layers.Dense(5, activation='softmax', name='output_layer')(intermediate_layer) # softmax -> calcs probs of classes

clinical_model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
clinical_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 512)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 512)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  109482240   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 512,                                           

In [20]:
optim = tf.keras.optimizers.legacy.Adam(learning_rate=1e-5, decay=1e-6)
loss_func = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')
# TP = tf.math.count_nonzero(predicted * actual)
# TN = tf.math.count_nonzero((predicted - 1) * (actual - 1))
# FP = tf.math.count_nonzero(predicted * (actual - 1))
# FN = tf.math.count_nonzero((predicted - 1) * actual)
# precision = TP / (TP + FP)
# recall = TP / (TP + FN)
# f1_score = 2 * precision * recall / (precision + recall)

In [21]:
clinical_model.compile(optimizer=optim, loss=loss_func, metrics=[acc])

In [22]:
train_dataset

<_TakeDataset element_spec=({'input_ids': TensorSpec(shape=(4, 512), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(4, 512), dtype=tf.float64, name=None)}, TensorSpec(shape=(4, 5), dtype=tf.float64, name=None))>

In [23]:
hist = clinical_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=1
)



In [24]:
clinical_model.save('clinical_notes_model')



INFO:tensorflow:Assets written to: clinical_notes_model\assets


INFO:tensorflow:Assets written to: clinical_notes_model\assets
