# ReqBERT 
by Tooraj Helmi (thelmi@usc.edu)

## Introduction
This model is a fine-tuned BERT used to tag requirements based on the grammar explained in the paper. 

Note: I used Colab for running this model. 


## Configurations

In [None]:
import warnings

# Global notebook configuration, read by the cells below.
_debug_ = False  # when True, helper functions print intermediate values
_root_path_ = '/content/drive/MyDrive/Research/Automata/RetBERT/'  # dataset folder on the mounted Drive
_tag_ = 'TAG7'  # name of the CSV column that holds the labels
_max_len_ = 50  # sentences are padded/truncated to this many BERT tokens
_batch_size_ = 32
_learning_rate_ = 5e-5
_epochs_ = 20
# Applications kept by load_data() when it is called with all_apps=False.
apps = ['Trading', 'TicTacToe', 'WordGuess', 'News', 'Food Delivery', 'Calendar', 'Bank'] #, 'TimeCard', 'Alarm']

# Silence every warning for the rest of the run.
warnings.filterwarnings('ignore')

## 1.1. Select GPU
We need to have a GPU available to run this model

In [None]:
import tensorflow as tf

# Ask TensorFlow for the name of the GPU device it can see.
device_name = tf.test.gpu_device_name()

# Any answer other than the first GPU's name is treated as "no GPU" and
# aborts the run.
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

In [None]:
import torch

# Choose the PyTorch device once; later cells move tensors to `device`.
if not torch.cuda.is_available():
    # No CUDA device is visible: fall back to the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")

    gpu_count = torch.cuda.device_count()
    print('There are %d GPU(s) available.' % gpu_count)

    # Report the name of the first GPU.
    print('We will use the GPU:', torch.cuda.get_device_name(0))

## 1.2. Installing Packages




In [None]:
!pip install transformers
!pip install seqeval

# Loading & Parse


## 2.1. Load

In [None]:
# Mount Google Drive under /content/drive; _root_path_ points inside this
# mount, so the CSV files loaded later are read from Drive.
# NOTE(review): force_remount=True presumably re-mounts even when a mount
# already exists - confirm against the Colab documentation.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

In [None]:
import pandas as pd
import numpy as np

import pandas as pd
import numpy as np

def load_data(path, tag, all_apps):
  """Read a tagged requirements CSV and keep only the usable rows and columns.

  Args:
    path: location of the CSV file; it is decoded as latin1.
    tag: name of the label column. Rows without a value in it are dropped.
    all_apps: when falsy, only rows whose 'APP' value is listed in the
      module-level `apps` are kept.

  Returns:
    A DataFrame with the columns SENT, WORD, POS and `tag`.
  """
  data = pd.read_csv(path, encoding="latin1")
  # `np.nan` instead of `np.NaN`: the capitalised alias was removed in
  # NumPy 2.0 and raises AttributeError there.
  data = data.fillna(value=np.nan)
  # Drop (nearly) empty rows: keep those with at least two non-missing cells.
  data = data.dropna(thresh=2)
  data = data[data[tag].notnull()]
  if not all_apps:
    data = data[data['APP'].isin(apps)]
  data = data[["SENT", "WORD", "POS", tag]]
  print(data.shape)
  print()
  return data

Load Tagged Reqset

## 2.3. Parse Dataset

In [None]:
import math
import seaborn as sns
from matplotlib import pyplot

def parse_dataset(data, show_info=True):
  """Group the word rows of `data` into sentences with aligned labels.

  Consecutive rows with the same 'SENT' value form one sentence. The label
  of a row is read from the column named by the module-level `_tag_`, and
  the module-level `tokenizer` is used to find out into how many pieces
  BERT splits each word.

  Args:
    data: DataFrame with the columns SENT, WORD and `_tag_`.
    show_info: when true, print the label map and draw a bar plot of the
      label frequencies.

  Returns:
    A tuple (sentences, all_tokens, unique_tokens, labels, unique_labels,
    label_map):
      sentences: list of word lists, one per sentence.
      all_tokens: every word in row order.
      unique_tokens: set of distinct words.
      labels: per sentence, one label for each word piece that does not
        start with '##' - the tokens map_to_labels() assigns a label to.
      unique_labels: set of distinct labels.
      label_map: dict from label to integer id.
  """
  from collections import Counter

  sentences = []
  all_tokens = []
  unique_tokens = set()
  labels = []
  unique_labels = set()
  label_map = {}

  # An empty frame has no first row to read the sentence id from.
  if len(data) == 0:
    return sentences, all_tokens, unique_tokens, labels, unique_labels, label_map

  tokens = []
  token_labels = []
  sent_idx = data['SENT'].iloc[0]

  for _, row in data.iterrows():
    # A new sentence id closes the sentence collected so far.
    if row['SENT'] != sent_idx:
      sentences.append(tokens)
      labels.append(token_labels)

      tokens = []
      token_labels = []

    sent_idx = row['SENT']

    tokens.append(row['WORD'])
    unique_labels.add(row[_tag_])

    all_tokens.append(row['WORD'])
    unique_tokens.add(row['WORD'])

    # BERT can tokenize one word into several tokens, e.g.
    # 'non-admin' => non, -, admin. Emit exactly one label per piece that
    # does not start with '##', so the label list lines up one-to-one with
    # the tokens that receive a label later. (Appending the word's label
    # once more on top of this loop left one surplus label per word and
    # shifted every following label.)
    bert_tokens = tokenizer.encode(row['WORD'], add_special_tokens = False)
    for token_id in bert_tokens:
      if not tokenizer.ids_to_tokens[token_id].startswith('##'):
        token_labels.append(row[_tag_])

  # Last sentence
  sentences.append(tokens)
  labels.append(token_labels)

  # Sort before numbering so the ids do not depend on set iteration order,
  # which can change between interpreter runs.
  for i, label in enumerate(sorted(unique_labels, key=str)):
    label_map[label] = i

  if show_info:
    print('Label Map: ', label_map)

    # Plot the label distribution. The frame is built in one go because
    # DataFrame.append no longer exists in pandas 2.x.
    counts = Counter(l for s in labels for l in s)
    freq = pd.DataFrame({
        'label': list(label_map),
        'count': [counts[l] for l in label_map],
    })

    plot = sns.barplot(x=freq['label'], y=freq['count'])
    for item in plot.get_xticklabels():
      item.set_rotation(90)

  return sentences, all_tokens, unique_tokens, labels, unique_labels, label_map

# Data Prep

## 3.1. Max Length

In [None]:
from transformers import BertTokenizer
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def analyze():
  """Print length statistics of the encoded sentences and plot a histogram.

  Reads the module-level `sentences` and `tokenizer`. Every sentence is
  joined with single spaces and encoded with the special tokens included,
  so the reported lengths count those tokens as well.
  """
  lengths = [
      len(tokenizer.encode(' '.join(words), add_special_tokens = True))
      for words in sentences
  ]

  shortest = min(lengths)
  print('Min length: {:,} tokens'.format(shortest))
  longest = max(lengths)
  print('Max length: {:,} tokens'.format(longest))
  median = int(np.median(lengths))
  print('Median length: {:,} tokens'.format(median))
  print()

  # Plot styling.
  sns.set(style='darkgrid')
  sns.set(font_scale=1.5)
  plt.rcParams["figure.figsize"] = (10, 5)

  # Histogram only: no density curve, no rug marks.
  sns.distplot(lengths, kde=False, rug=False)

  plt.title('Sentence Lengths')
  plt.xlabel('Sentence Length')
  plt.ylabel('# of Sentences')

## 3.2. Tokenize

The `tokenizer.encode_plus` applies the following:

1. Split unknown words into subwords.
2. Add the special `[CLS]` and `[SEP]` tokens.
3. Map the tokens to their IDs.
4. Pad or truncate all sentences to the same length (we chose 50).
5. Create the attention masks which explicitly differentiate real tokens from `[PAD]` tokens (so that the self-attention mechanism in BERT ignores the `[PAD]` tokens).


In [None]:
def tokenize(sentences):
  """Encode every sentence into BERT input ids and an attention mask.

  Uses the module-level `tokenizer`; sequences are truncated and padded to
  the module-level `_max_len_`.

  Args:
    sentences: list of sentences, each a list of word strings. The words
      are joined with single spaces before encoding.

  Returns:
    (input_ids, attention_masks): two lists with one tensor per sentence,
    each taken as row 0 of the tokenizer's 'pt' output.
  """
  input_ids = []
  attention_masks = []

  for sent in sentences:
      encoded_dict = tokenizer.encode_plus(
        ' '.join(sent),
        add_special_tokens = True,
        truncation = True,
        max_length = _max_len_,
        # `pad_to_max_length=True` is deprecated; this is the replacement.
        padding = 'max_length',
        return_attention_mask = True,
        return_tensors = 'pt')

      input_ids.append(encoded_dict['input_ids'][0])
      attention_masks.append(encoded_dict['attention_mask'][0])

  # The debug sample is the second sentence, so it needs at least two.
  if _debug_ and len(sentences) > 1:
    print('Original: ', sentences[1])
    print('Token IDs:', input_ids[1])
    print('Masks:', attention_masks[1])

  return input_ids, attention_masks

## 3.3. Map to Labels
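The BERT tokenizer splits words that are not in the WordPiece vocabulary into several pieces. Therefore, we map the original word labels onto the new tokens: each token that does not start with `##` takes the next original label, while continuation pieces (starting with `##`) and the special `[CLS]`, `[SEP]` and `[PAD]` tokens get the null label `-100`.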

In [None]:
def map_to_labels():
  """Build one integer label per token for every encoded sentence.

  Works on the module-level `input_ids`, `labels`, `label_map`, `sentences`
  and `tokenizer`. Special tokens ([PAD], [CLS], [SEP]) and pieces starting
  with '##' get the null id -100; every other token consumes the next entry
  of the sentence's label list. A label missing from `label_map` is added
  to it with the next free id.

  Returns:
    A list with one list of label ids per sentence, each as long as the
    sentence's token sequence.
  """
  null_label_id = -100
  special_ids = (
      tokenizer.pad_token_id,
      tokenizer.cls_token_id,
      tokenizer.sep_token_id,
  )
  new_labels = []

  for sent_idx, (sen, orig_labels) in enumerate(zip(input_ids, labels)):
      if _debug_:
        print('============================DEBUG==============================')
        print(sentences[sent_idx])
        print(sen)
        print(orig_labels)
        for t in sen:
          print(tokenizer.ids_to_tokens[t.numpy().item()])

      padded_labels = []
      next_label = 0  # position of the next unused entry in orig_labels

      for token_tensor in sen:
          token_id = token_tensor.numpy().item()

          # Special tokens and '##' continuation pieces carry no label.
          if token_id in special_ids or tokenizer.ids_to_tokens[token_id].startswith('##'):
              padded_labels.append(null_label_id)
              continue

          if _debug_:
            print(next_label)
          label_str = orig_labels[next_label]

          # This might happen if a test label is not in the training set.
          if label_str not in label_map:
            label_map[label_str] = len(label_map)

          padded_labels.append(label_map[label_str])
          next_label += 1

      assert(len(sen) == len(padded_labels))
      new_labels.append(padded_labels)

  return new_labels

## 3.4. Prediction

In [None]:
def predict(data_loader):
  """Run the module-level `model` over `data_loader` and collect predictions.

  Args:
    data_loader: iterable of (input_ids, attention_mask, labels) batches.

  Returns:
    (real_token_labels, real_token_predictions, mean_loss): the true and
    the predicted label ids of every token whose true label is not -100,
    and the summed batch loss divided by the number of batches.
  """
  model.eval()
  logit_batches = []
  label_batches = []
  total_loss = 0

  for batch in data_loader:
    b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels= b_labels)

    # With labels supplied, item 0 is the loss and item 1 the logits.
    total_loss += outputs[0]
    logit_batches.append(outputs[1].detach().cpu().numpy())
    label_batches.append(b_labels.to('cpu').numpy())

  # Stack the batches, take the best class per token, then flatten.
  flat_predictions = np.argmax(np.concatenate(logit_batches, axis=0), axis=2).reshape(-1)
  flat_labels = np.concatenate(label_batches, axis=0).reshape(-1)

  # Keep only the positions that carry a real label.
  keep = flat_labels != -100
  real_token_predictions = list(flat_predictions[keep])
  real_token_labels = list(flat_labels[keep])
  return real_token_labels, real_token_predictions, total_loss / len(data_loader)

## 3.5. Accuracy

In [None]:
def acc(y_pred, y_true):
  """Compute the rounded percentage of positions predicted correctly.

  Args:
    y_pred: score tensor whose dimension 2 ranges over the classes.
    y_true: tensor of true class ids, compared element-wise with the
      arg-max of `y_pred`.

  Returns:
    (accuracy, predicted_tags, correct_mask): the accuracy in percent
    rounded to a whole number, the predicted class ids, and a float tensor
    holding 1.0 where the prediction matches and 0.0 elsewhere.
  """
  print(y_pred)

  y_pred_tags = torch.max(y_pred, dim = 2).indices
  correct_preds = y_pred_tags.eq(y_true).float()
  fraction_correct = correct_preds.sum() / y_true.numel()
  accuracy_pct = torch.round(fraction_correct * 100)
  return accuracy_pct, y_pred_tags, correct_preds

## 3.6. Process Data

In [None]:
# Tokenizer shared by every helper above (they read it as a global).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the tagged training set; all_apps=True keeps every application.
data = load_data(_root_path_ + 'Reqset-comb-tagged.csv', _tag_, True)
print(data.head(10))

# These names become the module-level state that analyze(), tokenize() and
# map_to_labels() operate on.
sentences, all_tokens, unique_tokens, labels, unique_labels, label_map = parse_dataset(data)

print()
print("Number of training sentences: {:,}".format(len(sentences)))
print("Number of training words: {:,}".format(len(all_tokens)))
print("Number of training uniques words: {:,}".format(len(unique_tokens)))
print()

In [None]:
# Optional peek at one parsed sentence (index 18 must exist when enabled).
if _debug_:
  print("Example sentence:")
  print ("    Tokens:", sentences[18])
  print ("    Labels:", labels[18])

# Length statistics, then encode the sentences and align the labels with
# the resulting tokens. map_to_labels() reads the globals set just above.
analyze()
input_ids, attention_masks = tokenize(sentences)
new_labels = map_to_labels()

## 3.7. Generate Training Data


In [None]:
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Stack the per-sentence tensors / label lists into whole-dataset tensors.
pt_input_ids = torch.stack(input_ids, dim=0)
pt_attention_masks = torch.stack(attention_masks, dim=0)
pt_labels = torch.tensor(new_labels, dtype=torch.long)

dataset = TensorDataset(pt_input_ids, pt_attention_masks, pt_labels)

# 90% of the sentences for training, the remainder for validation.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# The global seeds are only set in the training cell, after this split, so
# give the split its own seeded generator to keep it identical across runs.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(dataset, [train_size, val_size], generator=split_generator)

# Training batches are drawn in random order, validation batches in order.
train_dataloader = DataLoader(train_dataset, sampler = RandomSampler(train_dataset), batch_size = _batch_size_)
validation_dataloader = DataLoader(val_dataset, sampler = SequentialSampler(val_dataset), batch_size = _batch_size_)

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

# Train

## 4.1. Define Model



In [None]:
from transformers import BertForTokenClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
 
# Pre-trained BERT with a token-classification head.
model = BertForTokenClassification.from_pretrained(
    "bert-base-uncased",
    # One class more than the labels known so far.
    # NOTE(review): presumably this leaves room for the label that
    # map_to_labels() may add for unseen test data - confirm.
    num_labels = len(label_map) + 1,
    output_attentions = False,
    output_hidden_states = False,
)

# Use the device selected at the top of the notebook instead of forcing
# CUDA, so the model sits on the same device the batches are moved to.
model.to(device)

optimizer = AdamW(model.parameters(), lr = _learning_rate_)
# One scheduler step per batch: linear decay over the whole run, no warm-up.
total_steps = len(train_dataloader) * _epochs_
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)

## 4.2. Execute Training

In [None]:
import random
from tqdm.notebook import tqdm
from seqeval.metrics import f1_score, accuracy_score

# Seed every random number generator used during training.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Per-epoch history. Only the loss lists and val_stats['acc'] are filled
# by this loop.
train_stats = {
    'acc': [],
    'loss': []
}

val_stats = {
    'acc': [],
    'loss': [],
    'f1': []
}

for e in tqdm(range(1, _epochs_+1)):
    model.train()
    train_epoch_loss = 0

    for batch_idx, train_batch in enumerate(train_dataloader, start=1):
        # Progress marks: a dot every 10 batches, a line break every 1000.
        if batch_idx % 10 == 0:
            print(".", end ="")
        if batch_idx % 1000 == 0:
            print()

        t_input_ids = train_batch[0].to(device)
        t_input_mask = train_batch[1].to(device)
        t_labels = train_batch[2].to(device)

        model.zero_grad()
        t_outputs = model(t_input_ids, token_type_ids=None, attention_mask=t_input_mask, labels=t_labels)

        # With labels supplied, item 0 of the output is the loss.
        train_epoch_loss += t_outputs[0].item()

        t_outputs[0].backward()
        # Clip the gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    # VALIDATION
    with torch.no_grad():
        true_tags, pred_tags, val_loss = predict(validation_dataloader)

        # float() accepts both a plain Python float and a NumPy scalar,
        # whereas `.item()` exists only on the latter.
        val_acc = float(accuracy_score(true_tags, pred_tags))
        avg_train_loss = train_epoch_loss/len(train_dataloader)

        train_stats['loss'].append(avg_train_loss)
        val_stats['loss'].append(val_loss.item())
        val_stats['acc'].append(val_acc)

        print(f'Epoch {e+0:03}: | Train Loss: {avg_train_loss:.5f} | Val Loss: {val_loss:.5f} | Val Acc: {val_acc:.3f}')

In [None]:
import matplotlib.pyplot as plt

# One row per epoch with the validation accuracy recorded during training.
hist = pd.DataFrame({'val-acc': val_stats['acc']}, dtype=float)

plt.style.use("ggplot")
plt.figure(figsize=(12, 12))
# Validation accuracy over the epochs.
plt.plot(hist['val-acc'])

In [None]:
import matplotlib.pyplot as plt
% matplotlib inline

import seaborn as sns

# Use plot styling from seaborn.
sns.set(style='darkgrid')

# Increase the plot size and font size.
sns.set(font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)

# Plot the learning curve: the average training loss of every epoch, as
# recorded in train_stats['loss'] by the training loop (epochs start at 1).
epoch_losses = train_stats['loss']
plt.plot(range(1, len(epoch_losses) + 1), epoch_losses, 'b-o')

# Label the plot.
plt.title("Training loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")

plt.show()

# Test

## 5.1. Generate Testing Data


In [None]:
# Load the tagged test set; all_apps=True keeps every application.
data = load_data(_root_path_ + 'reqset_test_tagged.csv', _tag_, True)
print(data.head(10))

# Re-bind the module-level state to the test set. The label map returned
# here is discarded ("_"), so map_to_labels() keeps using the label_map
# built from the training data.
sentences, all_tokens, unique_tokens, labels, unique_labels, _ = parse_dataset(data)
input_ids, attention_masks = tokenize(sentences)
new_labels = map_to_labels()

# Stack the per-sentence tensors / label lists into whole-dataset tensors.
pt_input_ids = torch.stack(input_ids, dim=0)
pt_attention_masks = torch.stack(attention_masks, dim=0)
pt_labels = torch.tensor(new_labels, dtype=torch.long)

## 5.2. Evaluate 


In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, classification_report

# Run the model over the test set, one sentence per batch, in order.
batch_size = 1
test_data = TensorDataset(pt_input_ids, pt_attention_masks, pt_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)
true_labels, pred_labels, _ = predict(test_dataloader)

# Pin the rows/columns of the matrix to the ids in label_map. Without
# `labels=` the matrix only covers ids that occur in the data, so its size
# and order could disagree with the tick labels taken from label_map.
label_names = np.asarray(list(label_map))
label_ids = list(label_map.values())
cm = confusion_matrix(true_labels, pred_labels, labels=label_ids)
confusion_matrix_df = pd.DataFrame(cm)

fig, ax = plt.subplots(figsize=(15,13))
sns.heatmap(confusion_matrix_df, annot=True, ax=ax, fmt='g', xticklabels=label_names, yticklabels=label_names)
plt.yticks(rotation=0)

In [None]:
%%HTML
<style type="text/css">
table.dataframe td, table.dataframe th {
    border: 1px  black solid !important;
  color: black !important;
}
</style>

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support as score

# Per-label precision / recall / F1 / support, in label_map order.
precision, recall, fscore, support = score(
    true_labels, pred_labels, labels=list(label_map.values()))

# Overall F1 under the three averaging schemes.
f1_micro, f1_macro, f1_weighted = (
    f1_score(true_labels, pred_labels, average=scheme)
    for scheme in ('micro', 'macro', 'weighted')
)

print ("Micro F1 score: {:.2%}".format(f1_micro))
print ("Macro F1 score: {:.2%}".format(f1_macro))
print ("Weighted F1 score: {:.2%}".format(f1_weighted))

# One table row per label; the bare `df` below displays it in the notebook.
df = pd.DataFrame({
    'labels': list(label_map.keys()),
    'precision': precision,
    'recall': recall,
    'fscore': fscore,
    'support': support,
})
df

In [None]:
# Tag one hand-written sentence with the trained model.
words = ["if", "user", "picks", "the", "first", "choice", ",", "he", "should", "be", "shown", "a", "white", "balloon", "below", "the", "text"]
# Single sentence (id 1); "-" is a placeholder for the unknown POS/tag.
sent = [1] * len(words)
tags = ["-"] * len(words)
data = pd.DataFrame(list(zip(sent, words, tags, tags)), columns=["SENT", "WORD", "POS", _tag_])

sentences, all_tokens, unique_tokens, labels, unique_labels, _ = parse_dataset(data, False)
input_ids, attention_masks = tokenize(sentences)
# map_to_labels() is deliberately not called here: its result was unused
# and it would register the placeholder tag "-" in label_map.

pt_input_ids = torch.stack(input_ids, dim=0).to(device)
pt_attention_masks = torch.stack(attention_masks, dim=0).to(device)

# Inference only: switch to evaluation mode and skip gradient tracking.
model.eval()
with torch.no_grad():
  outputs = model(pt_input_ids, token_type_ids=None, attention_mask=pt_attention_masks)
# No labels were passed, so item 0 of the output holds the logits.
logits = outputs[0]
logits = logits.detach().cpu().numpy()
predicted_label_ids = np.argmax(logits, axis=2)[0]

untokenized = [tokenizer.ids_to_tokens[t.numpy().item()] for t in input_ids[0]]

# Reverse lookup from label id to label name.
id_to_label = {label_id: label for label, label_id in label_map.items()}
special_tokens = {tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token}

# Print every real token with its predicted label. Walking the token
# sequence (rather than indexing by word position) stays aligned even when
# a word is split into several word pieces.
for token, label_id in zip(untokenized, predicted_label_ids):
  if token in special_tokens:
    continue
  print('{:>20}: '.format(token), id_to_label.get(int(label_id), ''))