# Text Classification with BERT

Import the `transformers` package from Hugging Face (install it first, e.g. with `%pip install transformers`, if it is not available).

In [None]:
import transformers

Load the Data

In [None]:
import pandas as pd
import random

# Read the augmented text samples and expose their 'label2' column under the
# name 'label'.
df1 = (
    pd.read_csv("./augmented_text.csv", encoding='unicode_escape')
    .rename(columns={'label2': 'label'})
)


In [None]:
# Print the column names, dtypes and non-null counts of the text data.
df1.info()

In [None]:
# Display the first two rows of the text data.
df1.head(2)

In [None]:
# Load the label table and display its first two rows.
import pandas as pd
df2 = pd.read_csv("./assets/DATASET1.csv")
df2.head(2)

In [None]:
# Left join on 'label': every row of the text data is kept and gains the
# columns of the label table for its matching 'label' value.
df = df1.merge(df2, how="left", on="label")
df.info()

In [None]:
# Display the first rows of the merged frame.
df.head()

Preprocessing 

In [None]:
import re
import nltk
nltk.download('punkt')
# Recent NLTK releases load the word tokenizer data from 'punkt_tab'
# instead of 'punkt', so fetch both resources.
nltk.download('punkt_tab')
from nltk import word_tokenize

def clean_text(text, max_tokens=256):
  """Normalise one raw text sample before it is passed to the tokenizer.

  The text is lower-cased, every character that is not an ASCII letter,
  an apostrophe or a hyphen is replaced by a space, and the result is
  split with NLTK's ``word_tokenize``. Only the first ``max_tokens``
  tokens are kept and joined back with single spaces.

  Args:
    text: The raw text. Non-string values (for example the NaN that pandas
      uses for missing cells) are treated as empty text.
    max_tokens: Maximum number of word tokens to keep (default 256).

  Returns:
    The cleaned text as a single space-separated string; ``""`` for
    non-string input.
  """
  if not isinstance(text, str):
    # Missing cells arrive as float NaN; `.lower()` would raise on them.
    return ""
  text = text.lower()
  # Raw string: "\-" is not a valid escape sequence in a normal string literal.
  text = re.sub(r"[^a-zA-Z'\-]", " ", text)
  return " ".join(word_tokenize(text)[:max_tokens])

df["text_clean"] = df.text.apply(clean_text)


Defining observations (`X`) and labels (`y`)


In [None]:
# Observations: the cleaned texts as a plain Python list.
X = df["text_clean"].to_list()
# Labels: one indicator column per distinct value of 'code'.
y = pd.get_dummies(df["code"])
# Column position in the one-hot encoding -> label name, used later to turn a
# predicted class index back into its label.
mapping = dict(enumerate(y.columns))

In [None]:
#mapping

Split the dataset


In [None]:
from sklearn.model_selection import train_test_split

# Keep 20% of the data aside as the test set, used only for the final evaluation.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=None
)

# Split the remaining data into training (70%) and validation (30%) sets.
# The split must start from X_train_val / y_train_val: splitting the full
# X / y again would put test samples into the training and validation sets.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.3, random_state=0
)

Tokenization: create a `tokenizer` variable and instantiate `DistilBertTokenizer`

In [None]:
import tensorflow as tf
from transformers import DistilBertTokenizer
# Load the tokenizer that belongs to the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')  # alternative checkpoint that was considered: 'bert-base-uncased'


Tokenize the dataset

In [None]:
# Encode the three splits with identical settings: at most 200 tokens per
# text, with truncation and padding enabled.
train_encodings, val_encodings, test_encodings = (
    tokenizer(texts, max_length=200, truncation=True, padding=True)
    for texts in (X_train, X_val, X_test)
)

Prepare the datasets for training




In [None]:
# Pair each split's encodings (converted to a plain dict) with its one-hot
# labels, so that every dataset element is one (features, label) example.
train_dataset, val_dataset, test_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(encodings), labels))
    for encodings, labels in (
        (train_encodings, y_train),
        (val_encodings, y_val),
        (test_encodings, y_test),
    )
)

## Training

Load the DistilBERT model



In [None]:
from transformers import TFDistilBertForSequenceClassification

# The classification head gets one output per column of the one-hot label
# frame `y`.
model = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(y.columns),
)


### Training arguments

Define the training arguments and compile the model:

*   Optimizer: Adam
*   Loss function: CategoricalCrossentropy(from_logits=True)
*   Metrics: accuracy
*   Compile the model 
*   Model summary

In [None]:
# Adam optimizer with a learning rate of 3e-5.
OPTIMIZER =  tf.keras.optimizers.Adam(learning_rate=3e-5)
# Categorical cross-entropy for the one-hot labels; from_logits=True tells the
# loss that the model outputs are raw logits rather than probabilities.
LOSS = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
# Track accuracy during training and evaluation.
METRICS = ['accuracy']

model.compile(optimizer=OPTIMIZER, loss=LOSS, metrics=METRICS)
# Print the layers and parameter counts of the compiled model.
model.summary()


Training

In [None]:
# Number of examples per batch when the datasets are batched.
BATCH_SIZE = 8
# Number of passes over the training data.
EPOCHS = 8

In [None]:
import os
# Path under which the model weights are checkpointed during training.
checkpoint_path = "training_1/cp.ckpt"
# Directory part of the checkpoint path ("training_1").
checkpoint_dir = os.path.dirname(checkpoint_path)


In [None]:
# Create a callback that saves the model's weights (weights only, not the
# full model) to `checkpoint_path` and logs each save (verbose=1).
# NOTE(review): the path has no epoch placeholder and save_best_only is not
# set, so each save presumably replaces the previous one and only the most
# recent weights remain on disk — confirm this is intended.
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)


In [None]:
# Train on the first GPU and keep the per-epoch metrics in `history`.
# NOTE(review): assumes a GPU is visible to TensorFlow — confirm on the target machine.
# NOTE(review): the training dataset is batched without `.shuffle()`, so the
# batches are composed identically in every epoch — consider shuffling.
with tf.device('/GPU:0'):
    history = model.fit(
        train_dataset.batch(BATCH_SIZE),
        epochs=EPOCHS,
        validation_data=val_dataset.batch(BATCH_SIZE),
        callbacks=[cp_callback]  # write the weights checkpoint during training
    )

### Plot the learning curve of your model

In [None]:
import tensorflow
from matplotlib import pyplot as plt

def plot_history(history):
    """Draw the learning curves recorded by `fit`.

    Takes the History object returned by the model's `fit` method and plots,
    side by side, the accuracy (left panel) and the loss (right panel) of the
    training and validation sets against the epoch.
    """
    fig, axs = plt.subplots(1, 2, figsize=(12, 6))

    # (axes, key in history.history, y-axis label, y-axis limits) per panel.
    panels = (
        (axs[0], 'accuracy', 'Accuracy', [0, 1.4]),
        (axs[1], 'loss', 'Loss', [0, 10]),
    )
    for ax, key, ylabel, ylim in panels:
        ax.plot(history.history[key], label='training set')
        # Validation metrics are stored under the same key prefixed with 'val_'.
        ax.plot(history.history['val_' + key], label='validation set')
        ax.set(xlabel='Epoch', ylabel=ylabel, ylim=ylim)
        ax.legend(loc='lower right')

plot_history(history)

In [None]:
# Load the weights stored at `checkpoint_path` back into the model.
model.load_weights(checkpoint_path)

## Model Evaluation

We can now evaluate our model on the test set. Use the `model.evaluate()` function.

In [None]:
# Evaluate on `test_dataset`; with the single compiled metric, `evaluate`
# returns the loss followed by the accuracy.
loss, accuracy = model.evaluate(test_dataset.batch(BATCH_SIZE))
print(f"Loss: {loss}")
print(f"Accuracy: {accuracy}")

## Test your model

In [None]:
import pandas as pd
import googletrans
from googletrans import Translator

# Create the translator client.
translator = Translator()
# French sample sentence, translated to English (dest='en') before classification.
text = 'Le boitier métallique sur le toit de cabine est à refixer correctement'
# NOTE(review): in some googletrans releases `translate` is a coroutine and
# must be awaited — confirm against the installed version.
translated = translator.translate(text, dest='en')

# Keep only the translated string for the next cells.
text = translated.text


In [None]:
import numpy as np

# Alternative sample input:
#text = 'Lights turn off after 2 minutes'

# Apply the same cleaning and tokenizer settings that were used for training.
text = clean_text(text)
encodings = tokenizer([text], max_length=200, truncation=True, padding=True)
# Keras treats every element of a dataset as one batch, so the single
# example must be batched explicitly before it is passed to `predict`.
ds = tf.data.Dataset.from_tensor_slices(dict(encodings)).batch(1)
predictions = model.predict(ds)

# predictions[0] holds the logits; the position of the largest one is looked
# up in `mapping` to get the label name.
print(mapping[np.argmax(predictions[0])])



In [None]:
# Serialize the model weights to the HDF5 file "model.h5".
model.save_weights("model.h5")
print("Saved model to disk")

In [None]:
from tensorflow.keras.models import Model

# Rebuild the architecture before loading the saved weights. The number of
# output labels must match the trained model, so it is taken from `mapping`
# (one entry per label column) instead of a hard-coded value.
new_model = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(mapping),
)

# Restore the fine-tuned weights saved in "model.h5".
new_model.load_weights('model.h5')

# Check its architecture
new_model.summary()


Only the weights are restored, so the new model is not compiled. `predict` works without compiling; to run `evaluate`, first call `compile` with the same arguments as the original model.  
Try running predict with the loaded model:

In [None]:
import numpy as np

# Alternative sample input:
#text = 'Emergency lighting not working or too dim in cabin 3'

# Apply the same cleaning and tokenizer settings that were used for training.
text = clean_text(text)
encodings = tokenizer([text], max_length=200, truncation=True, padding=True)
# Keras treats every element of a dataset as one batch, so the single
# example must be batched explicitly before it is passed to `predict`.
ds = tf.data.Dataset.from_tensor_slices(dict(encodings)).batch(1)
predictions = new_model.predict(ds)

# predictions[0] holds the logits; the position of the largest one is looked
# up in `mapping` to get the label name.
print(mapping[np.argmax(predictions[0])])