# Text classification model based on BERT and LSTM

Much of the code in this notebook is adapted from
https://github.com/artelab/Image-and-Text-fusion-for-UPMC-Food-101-using-BERT-and-CNNs

In [None]:
# Install BERT for tf2 module
!pip install bert-for-tf2
# Install sentencepiece library for text cleaning
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bert-for-tf2
  Downloading bert-for-tf2-0.14.9.tar.gz (41 kB)
[K     |████████████████████████████████| 41 kB 18 kB/s 
[?25hCollecting py-params>=0.9.6
  Downloading py-params-0.10.2.tar.gz (7.4 kB)
Collecting params-flow>=0.8.0
  Downloading params-flow-0.8.2.tar.gz (22 kB)
Building wheels for collected packages: bert-for-tf2, params-flow, py-params
  Building wheel for bert-for-tf2 (setup.py) ... [?25l[?25hdone
  Created wheel for bert-for-tf2: filename=bert_for_tf2-0.14.9-py3-none-any.whl size=30534 sha256=7bf5e6fd472d4e2a7f6a1ae342a41ca0acd515759c88d57320264ec1e4703319
  Stored in directory: /root/.cache/pip/wheels/ab/a4/72/df07592cea3ae06b5e846f5e52262f8b16748e829ca354b7df
  Building wheel for params-flow (setup.py) ... [?25l[?25hdone
  Created wheel for params-flow: filename=params_flow-0.8.2-py3-none-any.whl size=19472 sha256=1d279e6482e1fd50dd22531d5ec39ad6b4dc351

In [None]:
# Import all necessary libraries
try:
    %tensorflow_version 2.x
except Exception:
    pass

import tensorflow as tf
import tensorflow_hub as hub
import keras
from keras import layers
from keras import callbacks
from keras import optimizers
from keras import utils
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
import bert
import os
import numpy as np
import re
import pandas as pd
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


In [None]:
# Mount Google Drive at /content/drive (Colab-only API); the project folder and
# the saved checkpoints used later in this notebook live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# !unzip /content/drive/MyDrive/train_titles.csv.zip


In [None]:
# !unzip /content/drive/MyDrive/test_titles.csv.zip

In [None]:
# Work from the project folder on Drive so that the relative paths used below
# (the *.tsv data files and the BERT_LSTM/ directory) resolve against it.
%cd /content/drive/MyDrive/NLP_project
%pwd

/content/drive/MyDrive/NLP_project


'/content/drive/MyDrive/NLP_project'

In [None]:
# %cd /content

In [None]:
# Load the training and validation splits (tab-separated) and rename the label
# column "class" to "food", the name the rest of the notebook uses.
# colnames=['filename', 'text', 'class']
train = pd.read_csv('train_dataframe.tsv', sep='\t').rename(columns={"class": 'food'})
val = pd.read_csv('val_dataframe.tsv', sep='\t').rename(columns={"class": 'food'})

# Fold the validation rows into the training frame. `val` itself is left as
# loaded (with `filename` still a regular column).
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
train = pd.concat([train, val], ignore_index=True)
train = train.set_index('filename')

In [None]:
# Sort values by 'image_path'
#test = test.sort_values('image_path')
#train = train.sort_values('image_path')

In [None]:
# Preview the combined train + validation frame (indexed by filename).
train

Unnamed: 0_level_0,text,food
filename,Unnamed: 1_level_1,Unnamed: 2_level_1
allen_wrench_1_1.png,A set of monkey wrenches sorted by size,allen_wrench
allen_wrench_1_1.png,A set of hex wrenches.,allen_wrench
allen_wrench_1_1.png,A set of allen keys in a case.,allen_wrench
allen_wrench_1_1.png,a set of drill bits,allen_wrench
allen_wrench_1_1.png,This is a small set of tools.,allen_wrench
...,...,...
water_bottle_5_1.png,It is a water bottle that is about 6 in tall.,water_bottle
water_bottle_5_1.png,This is a clear plastic water bottle. The labe...,water_bottle
water_bottle_5_1.png,water bottle which is easy to carry in travel ...,water_bottle
water_bottle_5_1.png,This is a small bottle of water,water_bottle


In [None]:
# Shuffle both frames. A fixed seed makes the row order reproducible across
# runs, and with it the slice that `validation_split` carves out of the
# training arrays when the model is fitted.
SHUFFLE_SEED = 42
train = train.sample(frac=1, random_state=SHUFFLE_SEED)
val = val.sample(frac=1, random_state=SHUFFLE_SEED)

In [None]:
# Inspect the last rows of the shuffled training frame.
train.tail()

Unnamed: 0_level_0,text,food
filename,Unnamed: 1_level_1,Unnamed: 2_level_1
flashlight_3_4.png,It is a rectangular orange and black flashlight.,flashlight
toothpaste_3_2.png,,toothpaste
marker_3_1.png,This is a green dry erase marker.,marker
onion_4_4.png,"This is a medium, uncut, whole, white onion.",onion
coffee_mug_2_4.png,It is a black and orange mug.,coffee_mug


In [None]:
# Report how many rows each frame holds
print(f"train samples: {len(train)}")
print(f"val samples: {len(val)}")

train samples: 14660
val samples: 1840


In [None]:
# Text-cleaning helpers

TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return `text` with every `<...>` markup tag removed."""
    return TAG_RE.sub('', text)


def preprocess_text(sen):
    """Normalise one raw description into lowercase, letters-only text.

    Steps, in order: strip `<...>` tags, replace every non-letter character
    with a space, drop single letters that stand between whitespace, collapse
    whitespace runs into one space, lowercase.

    Args:
        sen: The raw text. Non-string values are converted with `str()`,
            except missing values (`None` or a float NaN), which yield `''`.

    Returns:
        The cleaned string. A leading or trailing single space is not stripped.
    """
    # A missing caption reaches this function as None/NaN; str(nan) would
    # otherwise inject the literal word "nan" into the model input.
    if sen is None or (isinstance(sen, float) and sen != sen):
        return ''

    # Removing html tags
    sentence = remove_tags(str(sen))

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal (only letters with whitespace on both sides)
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence.lower()


# Element-wise version for arrays of raw texts
vec_preprocess_text = np.vectorize(preprocess_text)

In [None]:
# Number of distinct labels in the training frame
nClasses_train = train["food"].nunique()

In [None]:
# Number of distinct labels in `val` (the validation frame, despite the variable's name)
nClasses_test = val["food"].nunique()

In [None]:
# Display the number of training classes; it sets the width of the softmax layer.
nClasses_train

47

In [None]:
# Clean every caption, then integer-encode the labels and expand them to one-hot rows
processed_train = vec_preprocess_text(train.text.values)

encoder = LabelEncoder()
encoded_labels_train = encoder.fit_transform(train.food.values)
labels_train = utils.to_categorical(encoded_labels_train, num_classes=nClasses_train)

print(f"Processed text sample: {processed_train[0]}")
print(f"Shape of train labels: {labels_train.shape}")

Processed text sample: there is bowl the rim of the bowl is square 
Shape of train labels: (14660, 47)


In [None]:
# Load the BERT BASE (uncased) encoder from TensorFlow Hub with frozen weights,
# then build the tokenizer from the vocabulary file and casing flag the
# loaded module exposes.
BERT_HUB_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"

bert_layer = hub.KerasLayer(BERT_HUB_URL, trainable=False)
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()

BertTokenizer = bert.bert_tokenization.FullTokenizer
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

In [None]:
# Preprocessing of texts according to BERT

def get_masks(text, max_length):
    """Build the padding mask for one text: 1 per kept token, 0 per pad slot.

    The text is tokenized with the module-level `tokenizer`, wrapped in
    [CLS]/[SEP] and cut to at most `max_length` tokens; the mask is then
    zero-padded up to `max_length`.
    """
    wrapped = ["[CLS]"] + tokenizer.tokenize(text) + ["[SEP]"]
    # Slicing is a no-op whenever the sequence already fits.
    kept = len(wrapped[:max_length])
    ones = [1] * kept
    zeros = [0] * (max_length - kept)
    return np.asarray(ones + zeros)
vec_get_masks = np.vectorize(get_masks, signature = '(),()->(n)')

def get_segments(text, max_length):
    """Build the segment ids for one text, zero-padded to `max_length`.

    The text is tokenized with the module-level `tokenizer`, wrapped in
    [CLS]/[SEP] and cut to at most `max_length` tokens. Tokens up to and
    including the first "[SEP]" get segment 0, any token after it gets
    segment 1, and padding positions are 0.
    """
    tokens = ["[CLS]"] + tokenizer.tokenize(text) + ["[SEP]"]
    # Slicing is a no-op whenever the sequence already fits.
    tokens = tokens[:max_length]

    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        # Everything after a separator belongs to the second sequence.
        if token == "[SEP]":
            current_segment_id = 1
    return np.asarray(segments + [0] * (max_length - len(tokens)))
vec_get_segments = np.vectorize(get_segments, signature = '(),()->(n)')

def get_ids(text, tokenizer, max_length):
    """Convert one text into token ids, padded or truncated to `max_length`.

    Args:
        text: The (already cleaned) input string.
        tokenizer: Object providing `tokenize(text)` and
            `convert_tokens_to_ids(tokens)`.
        max_length: Exact length of the returned array.

    Returns:
        1-D array of `max_length` ids: [CLS], the text tokens, [SEP], then
        zeros as padding.
    """
    # Reserve two slots for the special tokens so that truncation trims the
    # text itself instead of cutting off the closing [SEP].
    budget = max(max_length - 2, 0)
    tokens = ["[CLS]"] + tokenizer.tokenize(text)[:budget] + ["[SEP]"]
    # Only matters when max_length < 2: keep the output length exact.
    tokens = tokens[:max_length]

    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = np.asarray(token_ids + [0] * (max_length - len(token_ids)))
    return input_ids
vec_get_ids = np.vectorize(get_ids, signature = '(),(),()->(n)')


def prepare(text_array, tokenizer, max_length = 128):
    """Turn a batch of texts into the three per-token input arrays.

    Args:
        text_array: Array of input strings.
        tokenizer: Tokenizer forwarded to `vec_get_ids`.
        max_length: Number of token positions per text (default 128).

    Returns:
        Tuple `(ids, segments, masks)` - note that segments come before
        masks. Each array has its size-1 axes squeezed out.
    """
    token_ids = vec_get_ids(text_array, tokenizer, max_length)
    segment_ids = vec_get_segments(text_array, max_length)
    padding_masks = vec_get_masks(text_array, max_length)
    return token_ids.squeeze(), segment_ids.squeeze(), padding_masks.squeeze()

In [None]:
# Token positions per caption; this must be set according to your dataset
max_length = 40

# prepare() returns (ids, segments, masks), in that order
ids_train, segments_train, masks_train = prepare(
    processed_train, tokenizer, max_length
)

In [None]:
# Three integer inputs of length max_length (token ids, padding mask, segment
# ids) fed through the BERT layer.
# NOTE(review): the "Classification Model" cell that follows re-creates
# input_word_ids / input_mask / segment_ids and calls bert_layer again,
# overwriting every name assigned here, so this cell looks redundant. The mask
# input is also named "input_masks" here but "input_mask" there - confirm
# before removing.
input_word_ids = layers.Input(shape=(max_length,), dtype=tf.int32,
                                       name="input_word_ids")
input_mask = layers.Input(shape=(max_length,), dtype=tf.int32,
                                   name="input_masks")
segment_ids = layers.Input(shape=(max_length,), dtype=tf.int32,
                                    name="segment_ids")
den_out, seq_out = bert_layer([input_word_ids, input_mask, segment_ids])

In [None]:
# Classification Model: frozen BERT encoder -> LSTM -> dense softmax head.
# Three integer inputs of length max_length: token ids, padding mask, segment ids.
input_word_ids = layers.Input(shape=(max_length,), dtype=tf.int32,
                                       name="input_word_ids")
input_mask = layers.Input(shape=(max_length,), dtype=tf.int32,
                                   name="input_mask")
segment_ids = layers.Input(shape=(max_length,), dtype=tf.int32,
                                    name="segment_ids")
# The BERT layer yields two tensors; per model.summary() they are shaped
# (None, 768) and (None, max_length, 768). Only the second (seq_out) is used
# below; den_out is not referenced again in this notebook.
den_out, seq_out = bert_layer([input_word_ids, input_mask, segment_ids])

# The LSTM reads the per-token outputs; dropout surrounds the hidden dense layer.
X = layers.LSTM(128)(seq_out)
X = layers.Dropout(0.5)(X)
X = layers.Dense(256, activation="relu")(X)
X = layers.Dropout(0.5)(X)
# One softmax unit per training class.
output = layers.Dense(nClasses_train, activation = 'softmax')(X)

# Input order here (ids, mask, segments) is the order model.fit / model.evaluate must use.
model = keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=[output])

In [None]:
# Optimiser: Adam with a 1e-3 learning rate
opt = optimizers.Adam(learning_rate=1e-3)

# Targets are one-hot rows, hence categorical cross-entropy; accuracy is tracked too
model.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 40)]         0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 40)]         0           []                               
                                                                                                  
 segment_ids (InputLayer)       [(None, 40)]         0           []                               
                                                                                                  
 keras_layer (KerasLayer)       [(None, 768),        109482241   ['input_word_ids[0][0]',         
                                 (None, 40, 768)]                 'input_mask[0][0]',         

In [None]:
# Early stopping on validation accuracy with a patience of 2 epochs.
# NOTE(review): `es` is reassigned by the EarlyStopping created in the next
# cell, and neither instance is in the callbacks list passed to model.fit.
es = callbacks.EarlyStopping(monitor='val_accuracy', patience=2, restore_best_weights=True)

In [None]:
# Setup callbacks, logs and early stopping condition.
# Checkpoints and the CSV log are both written under BERT_LSTM/; create the
# directory up front so that a fresh checkout does not fail on a missing folder.
os.makedirs("BERT_LSTM", exist_ok=True)

# Save a checkpoint only when validation accuracy improves; the epoch number
# and the score are embedded in the file name.
checkpoint_path = "BERT_LSTM/weights-improvement-{epoch:02d}-{val_accuracy:.2f}.hdf5"
cp = callbacks.ModelCheckpoint(checkpoint_path, monitor='val_accuracy',
                               save_best_only=True, verbose=1, mode='max')
# Per-epoch metrics, read back later for the training curves.
csv_logger = callbacks.CSVLogger('BERT_LSTM/BERT_LSTM.log')
# No `monitor` is given here, so the callback's default metric is watched.
# NOTE(review): `es` is not in the callbacks list passed to model.fit.
es = callbacks.EarlyStopping(patience = 3, restore_best_weights=True)

In [None]:
# Cut the learning rate tenfold whenever validation accuracy fails to improve
# for one epoch, with a floor of 1e-5
reduce_lr = callbacks.ReduceLROnPlateau(
    monitor='val_accuracy',
    factor=0.1,
    patience=1,
    min_lr=1e-5,
)

In [None]:
# history = model.fit([ids_train, masks_train, segments_train], 
#           labels_train,
#           epochs = 16,
#           batch_size = 512,
#           callbacks = [csv_logger, reduce_lr])

In [None]:
# Train with 20% of the supplied rows held out for validation.
# The input list follows the model's input order: ids, mask, segments.
history = model.fit(
    [ids_train, masks_train, segments_train],
    labels_train,
    batch_size=512,
    epochs=16,
    validation_split=0.2,
    callbacks=[cp, csv_logger, reduce_lr],
    verbose=1,
)

Epoch 1/16
Epoch 1: val_accuracy improved from -inf to 0.37756, saving model to BERT_LSTM/weights-improvement-01-0.38.hdf5
Epoch 2/16
Epoch 2: val_accuracy improved from 0.37756 to 0.66064, saving model to BERT_LSTM/weights-improvement-02-0.66.hdf5
Epoch 3/16
Epoch 3: val_accuracy improved from 0.66064 to 0.76467, saving model to BERT_LSTM/weights-improvement-03-0.76.hdf5
Epoch 4/16
Epoch 4: val_accuracy improved from 0.76467 to 0.80048, saving model to BERT_LSTM/weights-improvement-04-0.80.hdf5
Epoch 5/16
Epoch 5: val_accuracy improved from 0.80048 to 0.81821, saving model to BERT_LSTM/weights-improvement-05-0.82.hdf5
Epoch 6/16
Epoch 6: val_accuracy improved from 0.81821 to 0.83083, saving model to BERT_LSTM/weights-improvement-06-0.83.hdf5
Epoch 7/16
Epoch 7: val_accuracy improved from 0.83083 to 0.84720, saving model to BERT_LSTM/weights-improvement-07-0.85.hdf5
Epoch 8/16
Epoch 8: val_accuracy improved from 0.84720 to 0.85573, saving model to BERT_LSTM/weights-improvement-08-0.86.

In [None]:
# Load the per-epoch metrics that the CSVLogger callback wrote during training
df = pd.read_csv('BERT_LSTM/BERT_LSTM.log')

In [None]:
# Training and validation accuracy per epoch
fig = go.Figure()
fig.add_trace(go.Scatter(x=df['epoch'], y=df['accuracy'],
                    mode='lines',
                    name='training'))

# val_accuracy is measured on the validation_split hold-out during fit, not on
# the test set, so the trace is labelled accordingly.
fig.add_trace(go.Scatter(x=df['epoch'], y=df['val_accuracy'],
                    mode='lines',
                    name='validation'))

fig.update_layout(font_size=15)

fig.update_xaxes(showgrid=True, gridwidth=0.5, gridcolor='Gray', title_text='Epoch')
fig.update_yaxes(showgrid=True, gridwidth=0.5, gridcolor='Gray', title_text='Accuracy')

In [None]:
# Training and validation loss per epoch
fig = go.Figure()
fig.add_trace(go.Scatter(x=df['epoch'], y=df['loss'],
                    mode='lines',
                    name='training'))

# val_loss is measured on the validation_split hold-out during fit, not on the
# test set, so the trace is labelled accordingly.
fig.add_trace(go.Scatter(x=df['epoch'], y=df['val_loss'],
                    mode='lines',
                    name='validation'))

fig.update_layout(font_size=15)

fig.update_xaxes(showgrid=True, gridwidth=0.5, gridcolor='Gray', title_text="Epoch")
fig.update_yaxes(showgrid=True, gridwidth=0.5, gridcolor='Gray', title_text="Loss")

In [None]:
# Load the test split and run it through the same cleaning and encoding as
# the training data.
# NOTE: the frame is stored in `val`, replacing the validation frame loaded earlier.
val = pd.read_csv('test_dataframe.tsv', sep='\t').rename(columns={"class": 'food'})
val = val.set_index('filename')
processed_test = vec_preprocess_text(val.text.values)

# Reuse the label mapping fitted on the training labels. Re-fitting the encoder
# here would assign different integer ids whenever the test split's label set
# differs from the training one. transform() raises ValueError on a label that
# was never seen during fitting.
encoded_labels_test = encoder.transform(val.food.values)
# The one-hot width must equal the model's output size, i.e. nClasses_train.
labels_test = utils.to_categorical(encoded_labels_test, nClasses_train)

# prepare() returns (ids, segments, masks), in that order
ids_test, segments_test, masks_test = prepare(processed_test,
                                               tokenizer,
                                               max_length)

In [None]:
# Sanity check: first cleaned test caption and the one-hot label matrix shape
print(f"Processed text sample: {processed_test[0]}")
print(f"Shape of test labels: {labels_test.shape}")

Processed text sample: nan
Shape of test labels: (1840, 47)


In [None]:
# Restore a saved checkpoint for evaluation (epoch 14, val_accuracy 0.89
# according to the file name pattern used by the ModelCheckpoint callback).
# NOTE(review): absolute, run-specific path - update it after retraining.
model.load_weights('/content/drive/MyDrive/NLP_project/BERT_LSTM/weights-improvement-14-0.89.hdf5')

In [None]:
# Re-compile with the same loss, metric and a fresh Adam optimiser before evaluating
opt = optimizers.Adam(learning_rate=1e-3)
model.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Score the restored model on the test arrays; the result is [loss, accuracy]
model.evaluate(
    [ids_test, masks_test, segments_test],
    labels_test,
    batch_size=512,
)



[0.3029901385307312, 0.9206521511077881]