### Here, the pre-trained deep neural language model BERT (Bidirectional Encoder Representations from Transformers) is used to classify sequences from the HP corpora.

To pre-process the corpus data for it to be used with the BERT model, the text of each book has been split into shorter sequences of approximately 6 linguistic sentences which have then been labeled according to the book to which they pertain.

The data consisting of a total of 6603 sequences has been
split into a training (80%) and a test (20%) set.

In [None]:
import torch
import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
from google.colab import files

In [None]:
#Pick the compute device once: the GPU when PyTorch can see one, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print('GPU used:', torch.cuda.get_device_name(0))
else:
    print('CPU used.')

**Install Hugging Face 🤗 Transformers**

In [None]:
# Installs the Hugging Face Transformers package used by the tokenizer and model cells below.
# NOTE(review): the version is not pinned, so results may differ between runs over time;
# consider `%pip install transformers==<version>` so the install targets the kernel's environment.
!pip install transformers

**Training data**

In [None]:
# Upload the training data from the local machine; the next cell reads it as "Train.csv".
uploaded = files.upload() #Select and upload the relevant file

In [None]:
# Load the training data; later cells read its 'Sentences' (text) and 'Book' (label) columns.
df = pd.read_csv("Train.csv")
print(f"Total number of sentence sequences: {len(df)}")
df  # bare last expression: rich display of the frame

In [None]:
#Histogram: Number of Sequences per Book
category_count = df['Book'].value_counts()
categories = category_count.index

fig = plt.figure(figsize= (12, 5))
plt.style.use('ggplot')
# NOTE(review): subplot 121 only uses the left half of the figure; kept to preserve the layout.
ax = fig.add_subplot(121)
#Draw explicitly on `ax` (the axes annotated below) instead of relying on the implicit current axes
sns.barplot(x = categories, y = category_count, ax = ax)

#Write the sequence count just below the top edge of each bar
for bar in ax.patches:
    ax.annotate(f'{bar.get_height():.0f}',
                xy = (bar.get_x() + bar.get_width() / 2.0, bar.get_height()),
                xytext = (0,-25), size = 13, color = 'white' , ha = 'center',
                va = 'center', textcoords = 'offset points',
                bbox = dict(boxstyle = 'round', facecolor='none', edgecolor='white', alpha = 0.5))

plt.xlabel('Book', size = 15)
plt.ylabel('Sequences', size= 15)
plt.xticks(size = 12)
plt.title("Number of Sequences per Book" , size = 18)
plt.show()

**BERT Tokenizer**

In [None]:
from transformers import BertTokenizer

#Lower-casing WordPiece tokenizer belonging to the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

sentences = df['Sentences'].values
label = df['Book'].values

#Show one example sequence at each preprocessing stage
example_tokens = tokenizer.tokenize(sentences[21])
print(sentences[21]) #original sentence
print(example_tokens) #sentence split into tokens
print(tokenizer.convert_tokens_to_ids(example_tokens)) #sentence mapped to token ids

In [None]:
#Encode every sequence as a list of word IDs, with the special tokens [CLS],[SEP] added
input_id = [tokenizer.encode(text, add_special_tokens = True) for text in sentences]

print('Max sentence length: ', max(len(ids) for ids in input_id))

In [None]:
#Padding/Truncating
from keras_preprocessing.sequence import pad_sequences

max_len = 250
#Bring every sequence to max_len IDs: pad with 0 at the end, cut overlong sequences at the end
input_id = pad_sequences(
    input_id,
    maxlen=max_len,
    dtype="long",
    value=0,
    truncating="post",
    padding="post",
)

#Attention mask: 1 for a real token (ID > 0), 0 for padding (ID = 0)
attention_mask = [[int(token_id > 0) for token_id in seq] for seq in input_id]

In [None]:
#splitting into train and validation sets
from sklearn.model_selection import train_test_split

#A single call shuffles input IDs, attention masks and labels together, so the rows of the
#three outputs stay aligned by construction (no reliance on two calls sharing a random_state).
(train_input, validation_input,
 train_mask, validation_mask,
 train_label, validation_label) = train_test_split(input_id, attention_mask, label,
                                                   random_state=2023, test_size=0.1) #90% train, 10% val.

#Converting into torch tensors
train_input = torch.tensor(train_input)
validation_input = torch.tensor(validation_input)
train_label = torch.tensor(train_label)
validation_label = torch.tensor(validation_label)
train_mask = torch.tensor(train_mask)
validation_mask = torch.tensor(validation_mask)

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
batch_size = 16

#Each dataset item is an (input IDs, attention mask, label) triple
train_data = TensorDataset(train_input, train_mask, train_label)
val_data = TensorDataset(validation_input, validation_mask, validation_label)

#Training batches are sampled in random order, validation batches in dataset order
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)

train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

**BERT Classification**

BertForSequenceClassification: pretrained BERT model with added single linear classification layer.

In [None]:
#AdamW is deliberately not imported from transformers: this notebook uses torch.optim.AdamW,
#and the transformers copy is deprecated (recent releases no longer export it).
from transformers import BertForSequenceClassification, BertConfig

#Pretrained BERT encoder with a classification head on top
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", #12 layer BERT
    num_labels = 7, #nr of output classes; presumably one per book - verify against the labels
    output_attentions = False,
    output_hidden_states = False,
)

#Move the model to the device selected earlier (GPU if available, otherwise CPU);
#a hard-coded model.cuda() would fail on a CPU-only runtime.
model.to(device)

**Optimizer**

AdamW: the Adam optimizer with decoupled weight decay.

In [None]:
from transformers import get_linear_schedule_with_warmup

epochs = 4 #Training epochs
total_steps = epochs * len(train_dataloader) #nrepochs * nr batches

optimizer = torch.optim.AdamW(model.parameters(), lr = 5e-5, eps = 1e-8)

#Learning rate scheduler: linear schedule without warmup steps, spanning all training steps.
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = total_steps,
)

In [None]:
import time
import datetime

def format_time(elapsed):
    '''
    Renders a duration given in seconds as an h:mm:ss string,
    rounded to the nearest whole second.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

    
def flat_accuracy(prediction, label):
    '''
    Fraction of rows whose highest-scoring class equals the true label.

    prediction: 2-D array of per-class scores, one row per example.
    label: array of true class indices, one per example.
    '''
    predicted_classes = np.argmax(prediction, axis=1).flatten()
    true_classes = label.flatten()
    n_correct = np.sum(predicted_classes == true_classes)
    return n_correct / len(true_classes)

**Training**
Based on [run_glue.py](https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128)

In [None]:
from sklearn import metrics
#Seed every random number generator in use.
#NOTE(review): the model and the dataloaders were created in earlier cells, so these seeds
#only affect randomness from this point on (e.g. batch order).
seed_val = 2023
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

loss_val = [] #average training loss of each epoch, plotted later
model.to(device)

for epoch in range(0, epochs):

    #Training
    print('Epoch {:} / {:} '.format(epoch + 1, epochs))
    print('Training...')
    
    t0 = time.time() #time for training epoch
    total_loss = 0 #reset total loss for epoch

    model.train()

    for step, batch in enumerate(train_dataloader):
        if step % 40 == 0 and not step == 0: #progress update every 40 batches
            elapsed = format_time(time.time() - t0)

            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        #unpacking training batch from dataloader: (input IDs, attention mask, labels)
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad() #remove previously calculated gradients        

        #labels are passed, so the first output is the loss
        outputs = model(b_input_ids, 
                    token_type_ids=None, 
                    attention_mask=b_input_mask, 
                    labels=b_labels)

        loss = outputs[0]
        total_loss += loss.item()
        loss.backward() #backward pass to calculate gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) #preventing exploding gradients

        optimizer.step() #update parameters taking step using gradient
        lr_scheduler.step() #update learning rate

    train_loss = total_loss / len(train_dataloader)
    loss_val.append(train_loss)

    print("Average training loss: {0:.2f}".format(train_loss))
    print("Training epoch time: {:}".format(format_time(time.time() - t0)))
    
    #Validation

    print("Validation...")

    t0 = time.time()
    model.eval()

    eval_accuracy = 0 #running nr of correct predictions (batch accuracy * batch size)
    batches_eval_examples = 0 #nr validation examples seen

    for batch in val_dataloader:

        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():        
            #no computation or storage of gradients to save memory and speed up validation
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        #no labels were passed, so the first output is the logits
        logits = outputs[0] #values prior to applying activation function
        logits = logits.detach().cpu().numpy()
        label_id = b_labels.to('cpu').numpy()
    
        #Weight each batch by its size: the last batch can be smaller, so a plain mean of
        #per-batch accuracies would not equal the accuracy over all validation examples.
        tmp_eval_accuracy = flat_accuracy(logits, label_id)
        eval_accuracy += tmp_eval_accuracy * len(label_id)
        batches_eval_examples += len(label_id)

    print("Accuracy: {0:.2f}".format(eval_accuracy/batches_eval_examples))
    print("Validation time: {:}".format(format_time(time.time() - t0)))

print("Done!")

In [None]:
#Plot training loss
sns.set(style='darkgrid', font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)

#Plot against 1-based epoch numbers so the x-axis matches the "Epoch i / n" lines of the training log
epoch_numbers = range(1, len(loss_val) + 1)
plt.plot(epoch_numbers, loss_val, 'b-o')
plt.xticks(epoch_numbers) #one tick per epoch, no fractional epochs
plt.title("Training loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")

plt.show()

**Performance On Test Set**

In [None]:
# Upload test set
# Select the test file from the local machine; the next cell reads it as "Test.csv".
uploaded = files.upload()

In [None]:
# Inspect data
# Load the held-out test data; the following cell reads its 'Sentences' and 'Book' columns.
df_test = pd.read_csv("Test.csv")
print(f"Total number of sentence sequences: {len(df_test)}")
df_test  # bare last expression: rich display of the frame

In [None]:
print('Number of test sentences: {:,}\n'.format(df_test.shape[0]))
#NOTE: sentences, label, input_id and attention_mask are reused here and from now on hold the
#test data; re-run the training preprocessing cells before using them for training again.
sentences = df_test.Sentences.values
label = df_test.Book.values

#Encode every test sequence as word IDs, with the special tokens [CLS],[SEP] added
input_id = [tokenizer.encode(s, add_special_tokens = True) for s in sentences]

#Same padding/truncation as for the training data (explicit pad value 0, at the end of the sequence)
input_id = pad_sequences(input_id, maxlen=max_len, dtype="long",
                          value=0, truncating="post", padding="post")

#Integer 0/1 attention masks, built the same way as the training masks:
#padding, ID = 0 -> mask = 0. token, ID > 0 -> mask = 1.
attention_mask = [[int(token_id > 0) for token_id in seq] for seq in input_id]

prediction_input = torch.tensor(input_id)
prediction_mask = torch.tensor(attention_mask)
prediction_label = torch.tensor(label)

batch_size = 16

#Sequential sampling keeps predictions in the same order as the rows of the test file
prediction_data = TensorDataset(prediction_input, prediction_mask, prediction_label)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

In [None]:
model.eval() #evaluation mode

#One array of logits and one array of true labels per batch
predictions, true_labels = [], []

for batch in prediction_dataloader:
    b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

    with torch.no_grad():
        #Forward pass without gradient tracking; no labels passed, so the first output is the logits
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask)

    predictions.append(outputs[0].detach().cpu().numpy())
    true_labels.append(b_labels.to('cpu').numpy())

print('Done!')

**Accuracy measured by Matthews correlation coefficient
(MCC) because of class imbalance.**

The MCC is a correlation coefficient between the actual and
the predicted classifications. It ranges between -1 and 1, where
a value of 1 indicates perfect prediction and a value of 0 indicates
that the prediction is no better than a random prediction.

In [None]:
from sklearn.metrics import matthews_corrcoef

#Per-batch MCC, kept for inspection only; the reported score below uses all test sequences at once
matthews_set = [matthews_corrcoef(batch_labels, np.argmax(batch_logits, axis=1).flatten())
                for batch_labels, batch_logits in zip(true_labels, predictions)]

#Combine the per-batch arrays into a single array of predicted classes and one of true labels
flat_predictions = np.argmax(np.concatenate(predictions, axis=0), axis=1).flatten()
flat_true_labels = np.concatenate(true_labels, axis=0)

mcc = matthews_corrcoef(flat_true_labels, flat_predictions)

#'%.3f' prints three decimals ('%3f' only set a minimum field width and printed six)
print('MCC: %.3f' % mcc)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

#Each metric compares the true test labels with the predicted classes
for heading, metric in (
    ("Confusion matrix:", confusion_matrix),
    ("Classification report:", classification_report),
    ("Accuracy:", accuracy_score),
):
    print(heading)
    print(metric(flat_true_labels, flat_predictions))
