1. Import Necessary Libraries

In [1]:
# Install required packages
!pip install sentence-transformers
!pip install transformers
!pip install scikit-learn
!pip install datasets
!pip install tensorflow
!pip install focal-loss

# Import necessary libraries
import pandas as pd
from sentence_transformers import SentenceTransformer, losses, InputExample, models
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import numpy as np
import tensorflow as tf
from datasets import Dataset
from focal_loss import BinaryFocalLoss

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.met

  from tqdm.autonotebook import tqdm, trange


In [2]:
!pip install comet_ml

Collecting comet_ml
  Downloading comet_ml-3.44.3-py3-none-any.whl.metadata (3.9 kB)
Collecting everett<3.2.0,>=1.0.1 (from everett[ini]<3.2.0,>=1.0.1->comet_ml)
  Downloading everett-3.1.0-py2.py3-none-any.whl.metadata (17 kB)
Collecting python-box<7.0.0 (from comet_ml)
  Downloading python_box-6.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.8 kB)
Collecting requests-toolbelt>=0.8.0 (from comet_ml)
  Downloading requests_toolbelt-1.0.0-py2.py3-none-any.whl.metadata (14 kB)
Collecting semantic-version>=2.8.0 (from comet_ml)
  Downloading semantic_version-2.10.0-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting sentry-sdk>=1.1.0 (from comet_ml)
  Downloading sentry_sdk-2.12.0-py2.py3-none-any.whl.metadata (9.8 kB)
Collecting simplejson (from comet_ml)
  Downloading simplejson-3.19.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.1 kB)
Collecting wurlitzer>=1.0.2 (from comet_ml)
  Downloading wurl

In [3]:
from comet_ml import Experiment
from comet_ml.integration.pytorch import log_model
experiment = Experiment(
  api_key="j7D2qTwhmdLpvtUg9wlAC5nfD",
  project_name="general",
  workspace="omjamil-microsoft-com"
)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/omjamil-microsoft-com/general/fc88beba594446de813ad9627488cc70



#Load the data

In [4]:
#connect to google drive
from google.colab import drive
drive.mount('/content/drive')

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/content' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Mounted at /content/drive


In [5]:
data = pd.read_csv('/content/drive/My Drive/Project 266 files/training_data_modified_final.csv')

Remove Unclassiable Rows and Keep only relevant columns

In [6]:
data_cleaned = data[data['classification_raw'] != 'Unclassifiable']
data_cleaned = data_cleaned[['sentence', 'objectivity_classification']]

In [7]:
data_cleaned.value_counts('objectivity_classification')

Unnamed: 0_level_0,count
objectivity_classification,Unnamed: 1_level_1
True,12967
False,8126


In [8]:
# Convert 'objectivity_classification' to boolean: True for Objective, False for Subjective
data_cleaned['objectivity_classification'] = data_cleaned['objectivity_classification'].apply(lambda x: x == True)

In [9]:
# Split the data into training and testing sets
X = data_cleaned['sentence'].tolist()
y = data_cleaned['objectivity_classification'].tolist()
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [10]:
X_train[:5]

['in december 2007 the elder kim received a personal letter from thenpresident george w bush asking that the country dismantle its nuclear weapons program after north korea pledged during talks in geneva to do so',
 'and they began to identify organizations that were sympathetic to this holistic approach',
 'colleagues this morning ocasiocortez wrote on twitter where she has 24 million followers',
 'even with the current expansion nearly 10 years old the us economy is showing resilience',
 'according to media reports an airstrike hit a clinic treating some of the victims a few hours later']

In [11]:
y_train[:5]

[True, True, True, True, True]

load a pretrained sentence transformer model

In [12]:
# Load a pretrained Sentence Transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [13]:
# encode the sentences into embeddings
x_train_embeddings = model.encode(X_train)
x_test_embeddings = model.encode(X_test)
x_val_embeddings = model.encode(X_val)

In [14]:
#Train a logistic regression model on the embeddings
Classifier = LogisticRegression()
Classifier.fit(x_train_embeddings, y_train)


In [15]:
# make predictions on the test data set
y_pred = Classifier.predict(x_test_embeddings)

In [16]:
# evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [17]:
# Print the evaluation metrics
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Accuracy: 0.8601564351742119
Precision: 0.864954128440367
Recall: 0.9139201240791004
F1 Score: 0.888763197586727


In [18]:
# Load the fine-tuned model (TensorFlow version)
tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')
model = TFAutoModelForSequenceClassification.from_pretrained('distilbert/distilbert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [19]:
# Prepare inputs for training
train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=128)
y_train_int = np.array([int(label) for label in y_train])
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    y_train_int
))

In [20]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_int), y=y_train_int)
class_weights_dict = dict(enumerate(class_weights))


In [21]:
# Compile the model with the custom optimizer
model.compile(optimizer='Adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])


In [22]:
from transformers import TFAutoModelForSequenceClassification

# Load the model
model = TFAutoModelForSequenceClassification.from_pretrained('distilbert/distilbert-base-uncased', num_labels=2)  # num_labels=2 for binary classification

# Compile the model
model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.00001),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [23]:
#Prepare the Validation Encodings and Dataset:
val_encodings = tokenizer(X_val, truncation=True, padding=True, max_length=128)
y_val_int = np.array([int(label) for label in y_val])
val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    y_val_int
))

In [24]:
model.fit(train_dataset.shuffle(1000).batch(16), epochs=6,validation_data=val_dataset.batch(16),class_weight=class_weights_dict)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<tf_keras.src.callbacks.History at 0x799c101d6230>

LET'S START FINE-TUNING

In [25]:
# Instantiate DistilBERT tokenizer...we use the Fast version to optimize runtime
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [26]:
# Define the maximum number of words to tokenize (DistilBERT can tokenize up to 512)
MAX_LENGTH = 128


# Define function to encode text data in batches
def batch_encode(tokenizer, texts, batch_size=256, max_length=MAX_LENGTH):
    """""""""
    A function that encodes a batch of texts and returns the texts'
    corresponding encodings and attention masks that are ready to be fed
    into a pre-trained transformer model.

    Input:
        - tokenizer:   Tokenizer object from the PreTrainedTokenizer Class
        - texts:       List of strings where each string represents a text
        - batch_size:  Integer controlling number of texts in a batch
        - max_length:  Integer controlling max number of words to tokenize in a given text
    Output:
        - input_ids:       sequence of texts encoded as a tf.Tensor object
        - attention_mask:  the texts' attention mask encoded as a tf.Tensor object
    """""""""

    input_ids = []
    attention_mask = []

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        inputs = tokenizer.batch_encode_plus(batch,
                                             max_length=MAX_LENGTH,
                                             padding='max_length', #implements dynamic padding
                                             truncation=True,
                                             return_attention_mask=True,
                                             return_token_type_ids=False
                                             )
        input_ids.extend(inputs['input_ids'])
        attention_mask.extend(inputs['attention_mask'])

    return tf.convert_to_tensor(input_ids), tf.convert_to_tensor(attention_mask)


# Encode X_train
X_train_ids, X_train_attention = batch_encode(tokenizer, X_train)

# Encode X_valid
X_valid_ids, X_valid_attention = batch_encode(tokenizer, X_val)

# Encode X_test
X_test_ids, X_test_attention = batch_encode(tokenizer, X_test)

In [27]:
from transformers import TFDistilBertModel, DistilBertConfig

DISTILBERT_DROPOUT = 0.2
DISTILBERT_ATT_DROPOUT = 0.2

# Configure DistilBERT's initialization
config = DistilBertConfig(dropout=DISTILBERT_DROPOUT,
                          attention_dropout=DISTILBERT_ATT_DROPOUT,
                          output_hidden_states=True)

# The bare, pre-trained DistilBERT transformer model outputting raw hidden-states
# and without any specific head on top.
distilBERT = TFDistilBertModel.from_pretrained('distilbert-base-uncased', config=config)

# Make DistilBERT layers untrainable
for layer in distilBERT.layers:
    layer.trainable = False

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


ADD TWO NEW CNN LAYERS AS A NEW CLASSIFICATION HEAD

In [28]:
MAX_LENGTH = 128
LAYER_DROPOUT = 0.2
LEARNING_RATE = 5e-5
RANDOM_STATE = 42

def build_model(transformer, max_length=MAX_LENGTH, lr = LEARNING_RATE, dropout = LAYER_DROPOUT, dense_units = (256,32)):
    """
    Template for building a model off of the BERT or DistilBERT architecture
    for a binary classification task.

    Input:
      - transformer:  a base Hugging Face transformer model object (BERT or DistilBERT)
                      with no added classification head attached.
      - max_length:   integer controlling the maximum number of encoded tokens
                      in a given sequence.

    Output:
      - model:        a compiled tf.keras.Model with added classification layers
                      on top of the base pre-trained model architecture.
    """

    # Define weight initializer with a random seed to ensure reproducibility
    weight_initializer = tf.keras.initializers.GlorotNormal(seed=RANDOM_STATE)

    # Define input layers
    input_ids_layer = tf.keras.layers.Input(shape=(max_length,),
                                            name='input_ids',
                                            dtype='int32')
    input_attention_layer = tf.keras.layers.Input(shape=(max_length,),
                                                  name='input_attention',
                                                  dtype='int32')

    # DistilBERT outputs a tuple where the first element at index 0
    # represents the hidden-state at the output of the model's last layer.
    # It is a tf.Tensor of shape (batch_size, sequence_length, hidden_size=768).
    last_hidden_state = transformer([input_ids_layer, input_attention_layer])[0]

    # We only care about DistilBERT's output for the [CLS] token,
    # which is located at index 0 of every encoded sequence.
    # Splicing out the [CLS] tokens gives us 2D data.
    cls_token = last_hidden_state[:, 0, :]

    x = tf.keras.layers.Dense(dense_units[0], activation='relu', kernel_initializer='he_normal')(cls_token)
    x = tf.keras.layers.Dropout(dropout)(x)
    x = tf.keras.layers.Dense(dense_units[1], activation='relu', kernel_initializer='he_normal')(x)
    x = tf.keras.layers.Dropout(dropout)(x)

    # Define a single node that makes up the output layer (for binary classification)
    output = tf.keras.layers.Dense(1,
                                   activation='sigmoid',
                                   kernel_initializer=weight_initializer,
                                   kernel_constraint=None,
                                   bias_initializer='zeros'
                                   )(cls_token)

    # Define the model
    model = tf.keras.Model([input_ids_layer, input_attention_layer], output)

    # Compile the model
    model.compile(tf.keras.optimizers.Adam(learning_rate=lr),
                  loss=BinaryFocalLoss(gamma=2),
                  metrics=['accuracy'])

    return model

In [29]:

learning_rates = [1e-5, 5e-5, 1e-4]
dropout_rates = [0.1, 0.2, 0.3]
dense_units = [(256, 32), (128, 64), (64, 32)]

EPOCHS = 6
BATCH_SIZE = 64
NUM_STEPS = len(X_train) // BATCH_SIZE
y_train_array = np.array(y_train)
y_val_array = np.array(y_val)

# Train the model
for lr in learning_rates:
    for dropout in dropout_rates:
        for units in dense_units:
            experiment.set_name(f"LR_{lr}_Dropout_{dropout}_Units_{units}")
            print(f"LR_{lr}_Dropout_{dropout}_Units_{units}")
            model = build_model(distilBERT, lr=lr, dropout=dropout, dense_units=units)
            model.fit(
                x = [X_train_ids, X_train_attention],
                y = y_train_array,
                epochs = EPOCHS,
                batch_size = BATCH_SIZE,
                steps_per_epoch = NUM_STEPS,
                validation_data = ([X_valid_ids, X_valid_attention], y_val_array),
                verbose=2)
            experiment.end()

LR_1e-05_Dropout_0.1_Units_(256, 32)
Epoch 1/6
197/197 - 37s - loss: 0.1598 - accuracy: 0.6411 - val_loss: 0.1518 - val_accuracy: 0.6850 - 37s/epoch - 187ms/step
Epoch 2/6
197/197 - 27s - loss: 0.1554 - accuracy: 0.6606 - val_loss: 0.1475 - val_accuracy: 0.7118 - 27s/epoch - 139ms/step
Epoch 3/6
197/197 - 27s - loss: 0.1515 - accuracy: 0.6790 - val_loss: 0.1437 - val_accuracy: 0.7298 - 27s/epoch - 138ms/step
Epoch 4/6
197/197 - 27s - loss: 0.1475 - accuracy: 0.6997 - val_loss: 0.1402 - val_accuracy: 0.7459 - 27s/epoch - 138ms/step
Epoch 5/6
197/197 - 27s - loss: 0.1444 - accuracy: 0.7104 - val_loss: 0.1370 - val_accuracy: 0.7623 - 27s/epoch - 139ms/step
Epoch 6/6
197/197 - 27s - loss: 0.1420 - accuracy: 0.7234 - val_loss: 0.1340 - val_accuracy: 0.7760 - 27s/epoch - 138ms/step


[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : LR_1e-05_Dropout_0.1_Units_(256, 32)
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/omjamil-microsoft-com/general/fc88beba594446de813ad9627488cc70
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET INFO:[0m     Name : LR_1e-05_Dropout_0.1_Units_(256, 32)
[1;38;5;39mCOMET INFO:[0m   Uploads:
[1;38;5;39mCOMET INFO:[0m     environment details : 1
[1;38;5;39mCOMET INFO:[0m     filename            : 1
[1;38;5;39mCOMET INFO:[0m     installed packages  : 1
[1;38;5;39mCOMET INFO:[0m     notebook            : 2
[1;38;

LR_1e-05_Dropout_0.1_Units_(128, 64)
Epoch 1/6
197/197 - 35s - loss: 0.1596 - accuracy: 0.6461 - val_loss: 0.1518 - val_accuracy: 0.6852 - 35s/epoch - 179ms/step
Epoch 2/6
197/197 - 27s - loss: 0.1554 - accuracy: 0.6616 - val_loss: 0.1476 - val_accuracy: 0.7108 - 27s/epoch - 139ms/step
Epoch 3/6
197/197 - 27s - loss: 0.1517 - accuracy: 0.6802 - val_loss: 0.1438 - val_accuracy: 0.7296 - 27s/epoch - 138ms/step
Epoch 4/6
197/197 - 27s - loss: 0.1484 - accuracy: 0.6964 - val_loss: 0.1403 - val_accuracy: 0.7478 - 27s/epoch - 138ms/step
Epoch 5/6
197/197 - 27s - loss: 0.1439 - accuracy: 0.7150 - val_loss: 0.1371 - val_accuracy: 0.7642 - 27s/epoch - 139ms/step
Epoch 6/6
197/197 - 27s - loss: 0.1415 - accuracy: 0.7269 - val_loss: 0.1340 - val_accuracy: 0.7760 - 27s/epoch - 138ms/step
LR_1e-05_Dropout_0.1_Units_(64, 32)
Epoch 1/6
197/197 - 35s - loss: 0.1607 - accuracy: 0.6407 - val_loss: 0.1519 - val_accuracy: 0.6850 - 35s/epoch - 178ms/step
Epoch 2/6
197/197 - 27s - loss: 0.1563 - accuracy: 0

In [30]:
FT_EPOCHS = 4
BATCH_SIZE = 64
NUM_STEPS = len(X_train) // BATCH_SIZE

# Unfreeze distilBERT layers and make available for training
for layer in distilBERT.layers:
    layer.trainable = True

# Recompile model after unfreezing
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
              loss=BinaryFocalLoss(gamma=2),
              metrics=['accuracy'])

# Train the model
train_history2 = model.fit(
    x = [X_train_ids, X_train_attention],
    y =  y_train_array,
    epochs = FT_EPOCHS,
    batch_size = BATCH_SIZE,
    steps_per_epoch = NUM_STEPS,
    validation_data = ([X_valid_ids, X_valid_attention], y_val_array),
    verbose=2
)

Epoch 1/4
197/197 - 85s - loss: 0.0721 - accuracy: 0.8809 - val_loss: 0.0604 - val_accuracy: 0.9031 - 85s/epoch - 429ms/step
Epoch 2/4
197/197 - 64s - loss: 0.0497 - accuracy: 0.9220 - val_loss: 0.0559 - val_accuracy: 0.9111 - 64s/epoch - 327ms/step
Epoch 3/4
197/197 - 65s - loss: 0.0371 - accuracy: 0.9417 - val_loss: 0.0909 - val_accuracy: 0.8836 - 65s/epoch - 328ms/step
Epoch 4/4
197/197 - 65s - loss: 0.0214 - accuracy: 0.9685 - val_loss: 0.0826 - val_accuracy: 0.9033 - 65s/epoch - 328ms/step
