In [1]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt

In [2]:
# Report the TensorFlow and TF-Hub versions this notebook is running against.
for banner, lib in (("TF version: ", tf), ("TF HUB version: ", hub)):
    print(banner, lib.__version__)

TF version:  2.7.0
TF HUB version:  0.12.0


In [3]:
import pandas as pd
import numpy as np

In [4]:
import psycopg2
import pandas as pd
import sys
import spacy
import re
import stanfordnlp
import time
import scispacy
from tqdm import tqdm
from heuristic_sentence_splitter import sent_tokenize_rules

In [5]:
# Read the annotated corpus and pull out the two columns used below:
# 'label' (class name per row) and 'sentence' (the text to embed).
df = pd.read_csv('./datatotal.csv')
labels, sentences = df['label'], df['sentence']

In [6]:
# Encode the textual class names as integer ids.
_LABEL_TO_ID = {'pain': 0, 'no pain': 1, 'no mention pain': 2}

label_list = []
for label in labels:
    class_id = _LABEL_TO_ID.get(label)
    # Any value that is not one of the three known names is skipped.
    # NOTE(review): a skipped row makes label_list shorter than the sentence
    # column, so later positional pairing can drift — confirm that the rows
    # actually used contain only the three known labels.
    if class_id is not None:
        label_list.append(class_id)

In [7]:
# Plain Python list of the sentence column, in row order.
sentence_list = sentences.tolist()

In [8]:
# Truncate both lists to their first 662 entries.
# NOTE(review): the reason for the 662 cut-off is not visible in this
# notebook — confirm it matches the number of usable rows in the CSV.
n_keep = 662
label_list, sentence_list = label_list[:n_keep], sentence_list[:n_keep]

In [9]:
# BERT embedding
# TF-Hub handles for the text preprocessor and the BERT encoder.
PREPROCESSOR_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/1"
ENCODER_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3"

# Load both layers (preprocessor first, then encoder).
preprocessor = hub.KerasLayer(PREPROCESSOR_URL)
encoder = hub.KerasLayer(ENCODER_URL, trainable=True)

# Wire the two layers onto a symbolic scalar-string input.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
encoder_inputs = preprocessor(text_input)  # dict with keys: 'input_mask', 'input_type_ids', 'input_word_ids'
outputs = encoder(encoder_inputs)
pooled_output = outputs["pooled_output"]      # [batch_size, 768].
sequence_output = outputs["sequence_output"]  # [batch_size, seq_length, 768].

In [10]:
# Embed every sentence with BERT in mini-batches of at most `max_batch_size`.
#
# The result buffers are allocated once and filled slice by slice.  Growing
# them with np.append would allocate and copy the whole array on every batch,
# and would need a dummy first row that has to be stripped afterwards.
max_batch_size = 132
L = len(sentence_list)

# One row per sentence.  768 is the encoder's hidden size and 128 the sequence
# length the preprocessor output is expected to have; a different shape makes
# the slice assignments below fail loudly.  The default float64 dtype is kept.
pooled_embedding = np.zeros((L, 768))
sequence_embedding = np.zeros((L, 128, 768))

head = 0
tail = max_batch_size
# A single loop handles full batches and the shorter final batch alike; with
# an empty sentence_list it runs zero times, so the encoder is never called
# on an empty batch.
for head in range(0, L, max_batch_size):
    tail = head + max_batch_size
    batch_sentence_list = sentence_list[head:tail]
    # These names intentionally shadow the symbolic tensors created when the
    # model was wired up; from here on they hold the last batch's values.
    encoder_inputs = preprocessor(batch_sentence_list)
    outputs = encoder(encoder_inputs)
    pooled_output = outputs["pooled_output"]
    sequence_output = outputs["sequence_output"]
    # Slices clamp at the end of the buffer, so the last (short) batch fits.
    pooled_embedding[head:tail] = np.asarray(pooled_output)
    sequence_embedding[head:tail] = np.asarray(sequence_output)

In [11]:
# classification using SVM
from sklearn.model_selection import KFold
from sklearn import svm, metrics

In [12]:
# Inputs for the classifier: one pooled BERT embedding per sentence (Xi) and
# the matching integer class ids (yi).
Xi = pooled_embedding
yi = np.array(label_list)

# Features and labels are paired purely by position in the following cells.
# A length mismatch would either pair sentences with the wrong labels or fail
# later with an unrelated IndexError, so stop here with a clear message.
if len(Xi) != len(yi):
    raise ValueError(
        "embedding/label count mismatch: %d embeddings vs %d labels"
        % (len(Xi), len(yi)))

In [13]:
import random

In [14]:
# Reproducibly permute the samples: seed Python's RNG, shuffle an index
# vector in place, then apply that one ordering to both features and labels.
random.seed(10)
shuffle_id = np.arange(yi.shape[0])
random.shuffle(shuffle_id)
X, y = Xi[shuffle_id, :], yi[shuffle_id]

In [15]:
# Drop any length-1 axes from the feature matrix.
X = X.squeeze()

In [16]:
# 5-fold cross-validation of a default SVC on the embeddings.
# Per fold, accuracy / sensitivity / specificity are derived from the
# confusion matrix restricted to classes 0 and 1, with class 1 treated as the
# positive class.
# NOTE(review): with the label encoding used in this notebook, class 1 is
# 'no pain', so "sensitivity" is the recall of 'no pain' — confirm intended.
# NOTE(review): samples whose true or predicted class is 2 do not enter any of
# the three metrics, including accuracy.
clf = svm.SVC()
kf = KFold(n_splits = 5)
accuracy_kf = []
sensitivity_kf = []
specificity_kf = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # svm
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # Pin the label order: without `labels=` the matrix is indexed by whatever
    # classes happen to occur in this fold, so [0, 0] / [1, 1] could refer to
    # the wrong classes (or not exist) when a fold lacks class 0 or 1.
    confusion = metrics.confusion_matrix(y_test, y_pred, labels=[0, 1])
    # Row = true class, column = predicted class.
    TN, FP, FN, TP = confusion.ravel()
    accuracy = (TP+TN) / float(TP+TN+FN+FP)
    sensitivity = TP / float(TP+FN)
    specificity = TN / float(TN+FP)
    accuracy_kf.append(accuracy)
    sensitivity_kf.append(sensitivity)
    specificity_kf.append(specificity)

In [17]:
# Turn the per-fold score lists into NumPy arrays.
accuracy_kf, sensitivity_kf, specificity_kf = (
    np.array(fold_scores)
    for fold_scores in (accuracy_kf, sensitivity_kf, specificity_kf)
)

In [18]:
# Average each metric over the folds and print the one-line summary.
acc, sens, spec = (
    np.mean(fold_scores)
    for fold_scores in (accuracy_kf, sensitivity_kf, specificity_kf)
)
summary = '5-fold cross validation, acc: %.4f, sensitivity: %.4f, specificity: %.4f'%(acc, sens, spec)
print(summary)

5-fold cross validation, acc: 0.8535, sensitivity: 0.8491, specificity: 0.8569


In [20]:
# tf.keras.utils.plot_model(bert_encoder, show_shapes=True, dpi=48)