# 🛠 Exercises

## 0. Prerequisites

In [1]:
# import libraries
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_hub as hub

In [2]:
# get helper functions
!wget https://raw.githubusercontent.com/yhs2773/TensorFlow-for-Deep-Learning/main/helper_functions.py
from helper_functions import calculate_results

--2024-01-04 12:28:23--  https://raw.githubusercontent.com/yhs2773/TensorFlow-for-Deep-Learning/main/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11093 (11K) [text/plain]
Saving to: ‘helper_functions.py’


2024-01-04 12:28:23 (78.5 MB/s) - ‘helper_functions.py’ saved [11093/11093]



In [3]:
# get data
!git clone https://github.com/Franck-Dernoncourt/pubmed-rct.git
!ls pubmed-rct

Cloning into 'pubmed-rct'...
remote: Enumerating objects: 39, done.[K
remote: Counting objects: 100% (14/14), done.[K
remote: Compressing objects: 100% (9/9), done.[K
remote: Total 39 (delta 8), reused 5 (delta 5), pack-reused 25[K
Receiving objects: 100% (39/39), 177.08 MiB | 20.01 MiB/s, done.
Resolving deltas: 100% (15/15), done.
Updating files: 100% (13/13), done.
PubMed_200k_RCT				       PubMed_20k_RCT_numbers_replaced_with_at_sign
PubMed_200k_RCT_numbers_replaced_with_at_sign  README.md
PubMed_20k_RCT


In [4]:
# set directory for the 20k dataset
data_dir = "pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/"

In [5]:
# list of target directories
filenames = [data_dir + filename for filename in os.listdir(data_dir)]
filenames

['pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/test.txt',
 'pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/dev.txt',
 'pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/train.txt']

In [6]:
# function to read lines of a document
def get_lines(filename):
    with open(filename, "r") as f:
        return f.readlines()

In [7]:
train_lines = get_lines(data_dir + "train.txt")
train_lines[:20]

['###24293578\n',
 'OBJECTIVE\tTo investigate the efficacy of @ weeks of daily low-dose oral prednisolone in improving pain , mobility , and systemic low-grade inflammation in the short term and whether the effect would be sustained at @ weeks in older adults with moderate to severe knee osteoarthritis ( OA ) .\n',
 'METHODS\tA total of @ patients with primary knee OA were randomized @:@ ; @ received @ mg/day of prednisolone and @ received placebo for @ weeks .\n',
 'METHODS\tOutcome measures included pain reduction and improvement in function scores and systemic inflammation markers .\n',
 'METHODS\tPain was assessed using the visual analog pain scale ( @-@ mm ) .\n',
 'METHODS\tSecondary outcome measures included the Western Ontario and McMaster Universities Osteoarthritis Index scores , patient global assessment ( PGA ) of the severity of knee OA , and @-min walk distance ( @MWD ) .\n',
 'METHODS\tSerum levels of interleukin @ ( IL-@ ) , IL-@ , tumor necrosis factor ( TNF ) - , and 

In [8]:
# function to preprocess data
def preprocess_text_with_line_numbers(filename):
    input_lines = get_lines(filename)
    abstract_lines = ""
    abstract_samples = []

    for line in input_lines:
        if line.startswith("###"):
            abstract_id = line
            abstract_lines = ""
        elif line.isspace():
            abstract_line_split = abstract_lines.splitlines()

            for abstract_line_number, abstract_line in enumerate(abstract_line_split):
                line_data = {}
                target_text_split = abstract_line.split("\t")
                line_data['target'] = target_text_split[0]
                line_data['text'] = target_text_split[1].lower()
                line_data['line_number'] = abstract_line_number
                line_data['total_lines'] = len(abstract_line_split) - 1
                abstract_samples.append(line_data)
        else:
            abstract_lines += line

    return abstract_samples

In [9]:
# preprocess data
train_samples = preprocess_text_with_line_numbers(data_dir + "train.txt")
val_samples = preprocess_text_with_line_numbers(data_dir + "dev.txt")
test_samples = preprocess_text_with_line_numbers(data_dir + "test.txt")

In [10]:
# turn into data frames
train_df = pd.DataFrame(train_samples)
val_df = pd.DataFrame(val_samples)
test_df = pd.DataFrame(test_samples)

In [11]:
# get lists of sentences
train_sentences = train_df['text'].tolist()
val_sentences = val_df['text'].tolist()
test_sentences = test_df['text'].tolist()

In [12]:
# one-hot encode labels
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse_output=False)
train_oh = ohe.fit_transform(train_df['target'].to_numpy().reshape(-1, 1))
val_oh = ohe.transform(val_df['target'].to_numpy().reshape(-1, 1))
test_oh = ohe.transform(test_df['target'].to_numpy().reshape(-1, 1))

In [13]:
# label encode labels (instrumental in getting class names)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train_le = le.fit_transform(train_df['target'].to_numpy())
val_le = le.transform(val_df['target'].to_numpy())
test_le = le.transform(test_df['target'].to_numpy())

In [14]:
# get num_classes and class_names
num_classes = len(le.classes_)
class_names = le.classes_
num_classes, class_names

(5,
 array(['BACKGROUND', 'CONCLUSIONS', 'METHODS', 'OBJECTIVE', 'RESULTS'],
       dtype=object))

In [15]:
# download pre-trained USE
tf_hub_embedding_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                        trainable=False,
                                        name='universal_sentence_encoder')

In [16]:
# function to split sentences into characters
def split_chars(text):
    return " ".join(list(text))

In [17]:
# split sentence into characters
train_chars = [split_chars(sentence) for sentence in train_sentences]
val_chars = [split_chars(sentence) for sentence in val_sentences]
test_chars = [split_chars(sentence) for sentence in test_sentences]

# check the distribution of character length
char_lens = [len(sentence) for sentence in train_sentences]
output_seq_char_len = int(np.percentile(char_lens, 95))
output_seq_char_len

290

In [18]:
import string

alphabet = string.ascii_lowercase
alphabet

'abcdefghijklmnopqrstuvwxyz'

In [19]:
# create char-level token vectorizer
char_vectorizer = tf.keras.layers.TextVectorization(max_tokens=len(alphabet) + 2,
                                                    output_sequence_length=output_seq_char_len,
                                                    name='char_vectorizer')

# adap character vectorizer
char_vectorizer.adapt(train_chars)

In [20]:
# get char vocab
char_vocab = char_vectorizer.get_vocabulary()

In [21]:
# char embedding layer
char_embed = tf.keras.layers.Embedding(input_dim=len(alphabet) + 2,
                                       output_dim=25,
                                       name='char_embed')

In [22]:
# check the distribution of line_number
int(np.percentile(train_df.line_number, 98))

15

In [23]:
# create line_number one-hot
train_line_numbers_oh = tf.one_hot(train_df['line_number'].to_numpy(), depth=15)
val_line_numbers_oh = tf.one_hot(val_df["line_number"].to_numpy(), depth=15)
test_line_numbers_oh = tf.one_hot(test_df["line_number"].to_numpy(), depth=15)

In [24]:
# check the distribution of total_lines
np.percentile(train_df.total_lines, 98)

20.0

In [25]:
# create total_lines one-hot
train_total_lines_oh = tf.one_hot(train_df['total_lines'].to_numpy(), depth=20)
val_total_lines_oh = tf.one_hot(val_df["total_lines"].to_numpy(), depth=20)
test_total_lines_oh = tf.one_hot(test_df["total_lines"].to_numpy(), depth=20)

In [30]:
# datasets
# train dataset
train_features = tf.data.Dataset.from_tensor_slices((train_line_numbers_oh,
                                                     train_total_lines_oh,
                                                     train_sentences,
                                                     train_chars))
train_labels = tf.data.Dataset.from_tensor_slices(train_oh)
train_ds = tf.data.Dataset.zip((train_features, train_labels))
train_ds = train_ds.batch(32).prefetch(tf.data.AUTOTUNE)

# validation dataset
val_features = tf.data.Dataset.from_tensor_slices((val_line_numbers_oh,
                                                   val_total_lines_oh,
                                                   val_sentences,
                                                   val_chars))
val_labels = tf.data.Dataset.from_tensor_slices(val_oh)
val_ds = tf.data.Dataset.zip((val_features, val_labels))
val_ds = val_ds.batch(32).prefetch(tf.data.AUTOTUNE)

## 1. Train `model_5` on all of the data in the training dataset for as many epochs until it stops improving. Since this might take a while, you might want to use:
- [`tf.keras.callbacks.ModelCheckpoint`](https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/ModelCheckpoint) to save the model's best weights only.
- [`tf.keras.callbacks.EarlyStopping`](https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/EarlyStopping) to stop the model from training once the validation loss has stopped improving for ~3 epochs.

In [26]:
# replicate model_5
# token input model
token_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
token_embeddings = tf_hub_embedding_layer(token_inputs)
token_outputs = tf.keras.layers.Dense(128, activation='relu')(token_embeddings)
token_model = tf.keras.Model(inputs=token_inputs, outputs=token_outputs)

# char input model
char_inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
char_vectors = char_vectorizer(char_inputs)
char_embeddings = char_embed(char_vectors)
char_bi_lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32))(char_embeddings)
char_model = tf.keras.Model(inputs=char_inputs, outputs=char_bi_lstm)

# line numbers model
line_number_inputs = tf.keras.layers.Input(shape=(15,), dtype=tf.int32)
x = tf.keras.layers.Dense(32, activation='relu')(line_number_inputs)
line_number_model = tf.keras.Model(inputs=line_number_inputs, outputs=x)

# total lines model
total_lines_inputs = tf.keras.layers.Input(shape=(20,), dtype=tf.int32)
y = tf.keras.layers.Dense(32, activation='relu')(total_lines_inputs)
total_lines_model = tf.keras.Model(inputs=total_lines_inputs, outputs=y)

# token and char hybrid embedding
combined_embeddings = tf.keras.layers.Concatenate()([token_model.output, char_model.output])
z = tf.keras.layers.Dense(256, activation='relu')(combined_embeddings)
z = tf.keras.layers.Dropout(0.5)(z)

# concat combined embedding with line number and total lines models
z = tf.keras.layers.Concatenate()([line_number_model.output, total_lines_model.output, z])

# output layer
output_layer = tf.keras.layers.Dense(num_classes, activation='softmax')(z)

# model_5
model_5 = tf.keras.Model(inputs=[line_number_model.input,
                                 total_lines_model.input,
                                 token_model.input,
                                 char_model.input],
                         outputs=output_layer)

In [27]:
# model summary
model_5.summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 1)]                  0         []                            
                                                                                                  
 input_1 (InputLayer)        [(None,)]                    0         []                            
                                                                                                  
 char_vectorizer (TextVecto  (None, 290)                  0         ['input_2[0][0]']             
 rization)                                                                                        
                                                                                                  
 universal_sentence_encoder  (None, 512)                  2567978   ['input_1[0][0]']       

In [28]:
# model callbacks
ckpt_path = 'model_5/model_5.ckpt'
mckpt = tf.keras.callbacks.ModelCheckpoint(filepath=ckpt_path,
                                           save_best_only=True,
                                           save_weights_only=True)

es = tf.keras.callbacks.EarlyStopping(patience=3,
                                      restore_best_weights=True)

In [29]:
# compile
model_5.compile(optimizer='adam',
                loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.2),
                metrics=['accuracy'])

In [31]:
# train
history_5 = model_5.fit(train_ds,
                        epochs=500,
                        validation_data=val_ds,
                        validation_steps=int(len(val_ds) * 0.5),
                        callbacks=[mckpt, es])

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500


In [32]:
# evaluate
model_5.evaluate(val_ds)



[0.8966736793518066, 0.8558520078659058]

In [34]:
# predict and calculate results
model_5_pred_probs = model_5.predict(val_ds)
model_5_preds = tf.argmax(model_5_pred_probs, axis=1)
results_5 = calculate_results(y_true=val_le, y_pred=model_5_preds)
results_5



{'accuracy': 85.58519793459553,
 'precision': 0.8575051236519442,
 'recall': 0.8558519793459552,
 'f1': 0.8527649681186131}

## 2. Check out the [Keras guide on using pre-trained GloVe embeddings](https://keras.io/examples/nlp/pretrained_word_embeddings/). Can you get this working with one of our models?
- Hint: You'll want to incorporate it with a custom token [Embedding](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Embedding) layer.
- It's up to you whether or not you fine-tune the GloVe embeddings or leave them frozen.

In [35]:
# download and unzip pre-trained GloVe embeddings
!wget https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
!unzip -q glove.6B.zip -d glove_embeddings

--2024-01-04 12:59:20--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2024-01-04 13:02:00 (5.14 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]



In [36]:
# create text vectorizer
text_vectorizer = tf.keras.layers.TextVectorization(max_tokens=68000, output_sequence_length=55)
text_vectorizer.adapt(train_sentences)
rct_20k_text_vocab = text_vectorizer.get_vocabulary()                       # dictionary of integer to word
word_index = dict(zip(rct_20k_text_vocab, range(len(rct_20k_text_vocab))))  # dictionary of word to index

In [37]:
# embedding dictionary
path_to_glove_file = "glove_embeddings/glove.6B.100d.txt"

embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print(f"Found {len(embeddings_index)} word vectors.")

Found 400000 word vectors.


In [38]:
# prepare embedding matrix
num_tokens = len(rct_20k_text_vocab) + 2
embedding_dim = 100
hits = 0
misses = 0

embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print(f"Converted {hits} words ({misses} misses)")

Converted 29730 words (35111 misses)


In [56]:
# create embedding layer
embedding_layer = tf.keras.layers.Embedding(num_tokens,
                                            embedding_dim,
                                            trainable=False)    # set to false to freeze the weights

# load pre-trained weights to our embedding layer
embedding_layer.build((1,))
embedding_layer.set_weights([embedding_matrix])

In [57]:
# tune model_5
# glove input model (instead of token input model) (only section that needs to be changed)
glove_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
glove_vectors = text_vectorizer(glove_inputs)
glove_embeddings = embedding_layer(glove_vectors)
glove_avgpool = tf.keras.layers.GlobalAveragePooling1D()(glove_embeddings)
glove_outputs = tf.keras.layers.Dense(128, activation='relu')(glove_avgpool)
glove_model = tf.keras.Model(inputs=glove_inputs, outputs=glove_outputs)

# char input model
char_inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
char_vectors = char_vectorizer(char_inputs)
char_embeddings = char_embed(char_vectors)
char_bi_lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32))(char_embeddings)
char_model = tf.keras.Model(inputs=char_inputs, outputs=char_bi_lstm)

# line number model
line_number_inputs = tf.keras.layers.Input(shape=(15,), dtype=tf.int32)
x = tf.keras.layers.Dense(32, activation='relu')(line_number_inputs)
line_number_model = tf.keras.Model(inputs=line_number_inputs, outputs=x)

# total lines model
total_lines_inputs = tf.keras.layers.Input(shape=(20,), dtype=tf.int32)
y = tf.keras.layers.Dense(32, activation='relu')(total_lines_inputs)
total_lines_model = tf.keras.Model(inputs=total_lines_inputs, outputs=y)

# glove and char hybrid model
combined_embeddings = tf.keras.layers.Concatenate()([glove_model.output, char_model.output])
z = tf.keras.layers.Dense(256, activation='relu')(combined_embeddings)
z = tf.keras.layers.Dropout(0.5)(z)

# combine hybrid model with line number and total lines models
z = tf.keras.layers.Concatenate()([line_number_model.output, total_lines_model.output, z])

# output layer
output_layer = tf.keras.layers.Dense(len(class_names), activation='softmax')(z)

# build a full model
model_6 = tf.keras.Model(inputs=[line_number_model.input,
                                 total_lines_model.input,
                                 glove_model.input,
                                 char_model.input],
                         outputs=output_layer)

In [58]:
# summary
model_6.summary()

Model: "model_22"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_18 (InputLayer)       [(None,)]                    0         []                            
                                                                                                  
 text_vectorization (TextVe  (None, 55)                   0         ['input_18[0][0]']            
 ctorization)                                                                                     
                                                                                                  
 input_19 (InputLayer)       [(None, 1)]                  0         []                            
                                                                                                  
 embedding_2 (Embedding)     (None, 55, 100)              6484300   ['text_vectorization[4]

In [59]:
# model callbacks
ckpt_path = 'model_6/model_6.ckpt'
mckpt = tf.keras.callbacks.ModelCheckpoint(filepath=ckpt_path,
                                           save_best_only=True,
                                           save_weights_only=True)

In [60]:
# compile
model_6.compile(optimizer=tf.keras.optimizers.Adam(),
                loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.2),
                metrics=['accuracy'])

In [61]:
# train
history_6 = model_6.fit(train_ds,
                        epochs=200,
                        validation_data=val_ds,
                        validation_steps=int(len(val_ds) * 0.5),
                        callbacks=[mckpt, es])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200


In [62]:
# evaluate
model_6.evaluate(val_ds)



[0.9008594751358032, 0.8523103594779968]

In [63]:
model_6_pred_probs = model_6.predict(val_ds)
model_6_preds = tf.argmax(model_6_pred_probs, axis=1)
results_6 = calculate_results(y_true=val_le, y_pred=model_6_preds)



## 3. Try replacing the TensorFlow Hub Universal Sentence Encoder pre-trained embedding for the [TensorFlow Hub BERT PubMed expert](https://tfhub.dev/google/experts/bert/pubmed/2) (a language model pre-trained on PubMed texts) pre-trained embedding. Does this affect results?
- Note: Using the BERT PubMed expert pre-trained embedding requires an extra preprocessing step for sequences (as detailed in the [TensorFlow Hub guide](https://tfhub.dev/google/experts/bert/pubmed/2)).
- Does the BERT model beat the results mentioned in this paper? https://arxiv.org/pdf/1710.06071.pdf

In [67]:
!pip install -q tensorflow_text

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [68]:
# Load the BERT encoder and preprocessing models
import tensorflow_text as text # Registers the ops.

preprocess = hub.load('https://kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/en-uncased-preprocess/versions/3')
bert = hub.load('https://www.kaggle.com/models/google/experts-bert/frameworks/TensorFlow2/variations/pubmed/versions/2')

In [81]:
# reconfigure model_6 to update the token input model
bert_inputs = tf.keras.Input(shape=[None], dtype=tf.string)
bert_inputs = preprocess([])
bert_outputs = bert(bert_inputs, training=False)
bert_model = tf.keras.Model(inputs=bert_inputs, outputs=bert_outputs['pooled_output'])

bert_model.summary()

TypeError: ignored

In [78]:
# Define some sentences to feed into the model
sentences = [
  "Here We Go Then, You And I is a 1999 album by Norwegian pop artist Morten Abel. It was Abel's second CD as a solo artist.",
  "The album went straight to number one on the Norwegian album chart, and sold to double platinum.",
  "Ceylon spinach is a common name for several plants and may refer to: Basella alba Talinum fruticosum",
  "A solar eclipse occurs when the Moon passes between Earth and the Sun, thereby totally or partly obscuring the image of the Sun for a viewer on Earth.",
  "A partial solar eclipse occurs in the polar regions of the Earth when the center of the Moon's shadow misses the Earth.",
]

# Convert the sentences to bert inputs
bert_inputs = preprocess(sentences)

# Feed the inputs to the model to get the pooled and sequence outputs
bert_outputs = bert(bert_inputs, training=False)
pooled_output = bert_outputs['pooled_output']
sequence_output = bert_outputs['sequence_output']

print('\nSentences:')
print(sentences)
print('\nPooled output:')
print(pooled_output)
print('\nSequence output:')
print(sequence_output)


Sentences:
["Here We Go Then, You And I is a 1999 album by Norwegian pop artist Morten Abel. It was Abel's second CD as a solo artist.", 'The album went straight to number one on the Norwegian album chart, and sold to double platinum.', 'Ceylon spinach is a common name for several plants and may refer to: Basella alba Talinum fruticosum', 'A solar eclipse occurs when the Moon passes between Earth and the Sun, thereby totally or partly obscuring the image of the Sun for a viewer on Earth.', "A partial solar eclipse occurs in the polar regions of the Earth when the center of the Moon's shadow misses the Earth."]

Pooled output:
tf.Tensor(
[[ 0.1677935  -0.39312428  0.537474   ...  0.5847805  -0.43331927
  -0.6014683 ]
 [ 0.41831735 -0.11058065  0.37715095 ...  0.42176116 -0.25798553
   0.09233515]
 [-0.55007035  0.36924163 -0.06870158 ... -0.5558884  -0.7557076
  -0.4532629 ]
 [ 0.05911616  0.08547181 -0.5964454  ... -0.55465883 -0.7894638
  -0.7985187 ]
 [ 0.30475476 -0.02680803 -0.615

In [None]:
# char input model
char_inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
char_vectors = char_vectorizer(char_inputs)
char_embeddings = char_embed(char_vectors)
char_bi_lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32))(char_embeddings)
char_model = tf.keras.Model(inputs=char_inputs, outputs=char_bi_lstm)

# line number model
line_number_inputs = tf.keras.layers.Input(shape=(15,), dtype=tf.int32)
x = tf.keras.layers.Dense(32, activation='relu')(line_number_inputs)
line_number_model = tf.keras.Model(inputs=line_number_inputs, outputs=x)

# total lines model
total_lines_inputs = tf.keras.layers.Input(shape=(20,), dtype=tf.int32)
y = tf.keras.layers.Dense(32, activation='relu')(total_lines_inputs)
total_lines_model = tf.keras.Model(inputs=total_lines_inputs, outputs=y)

# glove and char hybrid model
combined_embeddings = tf.keras.layers.Concatenate()([bert_model.output, char_model.output])
z = tf.keras.layers.Dense(256, activation='relu')(combined_embeddings)
z = tf.keras.layers.Dropout(0.5)(z)

# combine hybrid model with line number and total lines models
z = tf.keras.layers.Concatenate()([line_number_model.output, total_lines_model.output, z])

# output layer
output_layer = tf.keras.layers.Dense(len(class_names), activation='softmax')(z)

# build a full model
model_7 = tf.keras.Model(inputs=[line_number_model.input,
                                 total_lines_model.input,
                                 bert_model.input,
                                 char_model.input],
                         outputs=output_layer)

In [None]:
# update checkpoint
ckpt_path = 'model_7/model_7.ckpt'

mckpt = tf.keras.callbacks.ModelCheckpoint(filepath=ckpt_path,
                                           save_best_only=True,
                                           save_weights_only=True)

In [None]:
# compile
model_7.compile(optimizer='adam',
                loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.2),
                metrics=['accuracy'])

In [None]:
# train
history_7 = model_7.fit(train_ds,
                        epochs=300,
                        validation_data=val_ds,
                        validation_steps=int(len(val_ds) * 0.5),
                        callbacks=[mckpt, es])

In [None]:
# evaluate
model_7.evaluate(val_ds)

In [None]:
# get results
model_7_preds = tf.argmax(model_7.predict(val_ds), axis=1)
results_7 = calculate_results(y_true=val_le, y_pred=model_7_preds)

## 4. What happens if you were to merge our `line_number` and `total_lines` features for each sequence? For example, created a `X_of_Y` feature instead? Does this affect model performance?
- Another example: `line_number=1` and `total_lines=11` turns into `line_of_X=1_of_11`.

In [None]:
train_df

## 5. Write a function (or series of functions) to take a sample abstract string, preprocess it (in the same way our model has been trained), make a prediction on each sequence in the abstract, and return the abstract in the format:
- `PREDICTED_LABEL: SEQUENCE`
- `PREDICTED_LABEL: SEQUENCE`
- `PREDICTED_LABEL: SEQUENCE`
- `PREDICTED_LABEL: SEQUENCE`
- ...
    - You can find your own unstructured RCT abstract from PubMed or try this one from: [*Baclofen promotes alcohol abstinence in alcohol dependent cirrhotic patients with hepatitis C virus (HCV) infection*](https://pubmed.ncbi.nlm.nih.gov/22244707/).