# Finetuning BERT


topics of this notebook:

* learning to predict some label for sentences
* predicting a masked word within a sentence
* modifyinig a pretrained model for domain adaptation: our lexicon induction problem


In [1]:
from google.colab import drive

drive.mount('/content/gdrive')

ModuleNotFoundError: No module named 'google'

In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.0.0-py3-none-any.whl (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 3.7 MB/s eta 0:00:01
Collecting regex!=2019.12.17
  Downloading regex-2020.11.13-cp37-cp37m-macosx_10_9_x86_64.whl (284 kB)
[K     |████████████████████████████████| 284 kB 7.1 MB/s eta 0:00:01
Collecting sacremoses
  Downloading sacremoses-0.0.43.tar.gz (883 kB)
[K     |████████████████████████████████| 883 kB 6.5 MB/s eta 0:00:01
Collecting tokenizers==0.9.4
  Downloading tokenizers-0.9.4-cp37-cp37m-macosx_10_11_x86_64.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 10.2 MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25ldone
[?25h  Created wheel for sacremoses: filename=sacremoses-0.0.43-py3-none-any.whl size=893259 sha256=f0b044263ab6cd407c2ed5a3138a641cfa88b0f3c71497ffa8327d3cc1d12ba3
  Stored in directory: /Users/janoschbaltensperger/Library/Caches/pip/whee

# Transfer learning with BERT

Transfer learing in general:

* a model trained on one task is retrained for a different one

* special case: BERT as a context-aware language model is retrained in a sentiment setting

* learning: adopt the weights 
  * of the classifier head on top of BERT
  * of some/all BERT layers




* we use BertForSequenceClassification
* documentation
  * Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output)
  * we get as a result: the loss and the logits of the batch elements
  * logits: $logit=ln(\frac{z_i}{(1-z_i)})$ where $z_i$ is some value
* we do not need a loss function (but can use), since the Bert model already gives us the loss




In [None]:
a = torch.rand(5)
logit_a=torch.logit(a, eps=1e-6)

a, logit_a,  torch.log(a[0]/(1-a[0]))


In [None]:
from transformers import BertForSequenceClassification, BertTokenizer

MODEL_NAME = "bert-base-german-cased"

model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

model.train() # we are in training model, ie. learn

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# torch.set_grad_enabled(False) # torch vectors without grad

In [None]:
# tokenize some text, provide labels

import torch

text_batch=["Der Minister lügt","Der Minister ist ein netter Mensch"]
encoding = tokenizer(text_batch, return_tensors='pt', padding=True)

input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

optimizer = torch.optim.Adam(model.parameters(), weight_decay=1e-5, lr=1e-4)

labels = torch.tensor([0,1]).unsqueeze(0)  # e.g. first sentence is negative followed by a positive one (arbitrary coding)

In [None]:
# output
outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
outputs[1]  # prediction

In [None]:
# apply the model, learn

outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
loss=outputs[0]

print("prediction sentence 1",outputs[1][0])
print("\nprediction sentence 2",outputs[1][1])
      
print("\npredicted class is  argument with maximum value: sentence 1",torch.argmax(outputs[1][0]),"     real class",labels[0][0])
print("\npredicted class is  argument with maximum value: sentence 2",torch.argmax(outputs[1][1]),"     real class",labels[0][1])

optimizer.zero_grad()       

loss.backward()
optimizer.step()

if we apply the model several times (manually by crtl return of the cell), we quickly learn


In [None]:
# if we do not want to alter the pretrained weights, but only the top layer (the classification head)

for param in model.base_model.parameters():
    param.requires_grad = False

# Predict Masked Words

let's predict a masked word (lets do on a very low level)

https://demo.allennlp.org/reading-comprehension
https://demo.allennlp.org/masked-lm?text=The%20doctor%20ran%20to%20the%20emergency%20room%20to%20see%20%5BMASK%5D%20patient.

demonstrates the gender bias of bert. We now implement our own masked model

In [None]:
import torch
from transformers import BertTokenizer, BertForMaskedLM

MODEL_NAME = "bert-base-german-cased"

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

text = '[CLS] Ich kaufe mir einen [MASK] . [SEP]'

tokenized_text = tokenizer.tokenize(text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Create the segments tensors.
segments_ids = [0] * len(tokenized_text)

# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

# Load pre-trained model (weights)
model = BertForMaskedLM.from_pretrained(MODEL_NAME)
model.eval()


In [None]:
# now we produces on output for each token including the mask

prediction=model(tokens_tensor)
prediction,prediction[0].size()


In [None]:
# we take output at the [MASK] index, which is it?
# not necessarily 6: text = '[CLS] Ich kaufe mir einen [MASK] . [SEP]'

print(tokenized_text)


In [None]:
masked_index=6  # starting with 0, from output of cell above

predicted_index = torch.argmax(prediction[0][0][masked_index]).item()   # gets BERT id of embedding
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] # gets word token
predicted_token

In [None]:
# we'll doit in more compact way now

def bertify(input):

  tokenized = tokenizer(input,return_tensors='pt') 

  input_ids=tokenized['input_ids']
  #segment_ids=tokenized['token_type_ids']
  #input_mask=tokenized['attention_mask']
  return input_ids 

input = '[CLS] Angela Merkel kritisierte Russland wegen dem Terrorismus - sie ist also [MASK] Terrorismus . [SEP]'
#input = '[CLS] Er sagte dass Angela Merkel Russland wegen dem Terrorismus lobte , sie ist also [MASK] Terrorismus . [SEP]'

inputstr=input.split(" ")

masked_index= [i for i in range(0,len(inputstr)) if inputstr[i]=='[MASK]'][0]+1

input_ids=bertify(input)

with torch.no_grad():
  predictions=model(input_ids)

print(predictions[0].size(),masked_index)

In [None]:

predicted_index = torch.argmax(predictions[0][0][masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
predictions[0]
print(predicted_token)


# Lexicon induction with BERT

In [None]:
path="/content/gdrive/My Drive/ml20/"


with open(path+"all_pairs", "r") as file:
    pairs = eval(file.readline())    # read one line with array of triples: [(word1,word2,polarity),...]    
file.close()

In [None]:
pairs[100]  # index 100 a positive pair, because "Mitarbeitende" was tagged as positiv in the underlying text

In [None]:
def datasplit():
    """
    output: X = pairs of tokenizer arrays, each comprising input_ids,attention_mask and sequenze_ids of two words
    output: y = the true labels (0 for neg, 1 for pos, 2 for neut)
    X_test, y_test accordingly
    """
    l=len(pairs)
    split=int(l*0.66)  # 2/3 for training

    train=pairs[:split]
    test=pairs[split+1:]

    # BERT tokenize the pairs, concatenate their input_ids 
    X_test=[tokenizer(a+b,return_tensors='pt', padding="max_length",max_length=9, truncation=True) for (a,b,_) in test]
    y_test =[pol for (_,_,pol) in test]

    X =[tokenizer(a+b,return_tensors='pt', padding="max_length",max_length=9, truncation=True) for (a,b,_) in train]
    y =[pol for (_,_,pol) in train]

    return X,y,X_test,y_test


In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch 
import numpy as np

device = torch.device("cpu")

def training(epochs,X,y):
  for epoch in range(epochs):
      for encoding,labels in  zip(X,y):  # no dataloader used this time
          optimizer.zero_grad()       

          labels = torch.tensor(labels).unsqueeze(0)  

          attention_mask = encoding['attention_mask']
          input_mask = encoding['input_ids']

          outputs = model(input_mask.to(device), attention_mask=attention_mask.to(device), labels=labels.to(device)) 
          loss=outputs[0]
         
          loss.backward()
          optimizer.step()
          
       
# divides a by b, returns 0 if b=0
def evalit(a,b):  
    if b==0:
        return 0
    else:
        return a/b
  

def evaluate(X_test,y_test):

  tp=0
  foundlist,tplist,allist=[0,0,0],[0,0,0],[0,0,0]    # each list manages frequencies of polarities; polarity is the list index
                                                     # e.g. tplist[2]+=1 is used to increment (if a true positive was found of a neutral (2) pair)
    
  for encoding,truelabel in  zip(X,y):

          attention_mask = encoding['attention_mask']
          input_mask = encoding['input_ids']

          outputs = model(input_mask.to(device), attention_mask=attention_mask.to(device))
         
          predict=torch.argmax(outputs[0])
          #predict=predict.cpu()
          predict=predict.detach().numpy() 

          foundlist[predict]+=1     # increase by one the dimension which represents the polarity prediction of the system
          allist[truelabel]+=1      # increase by one the dimension which represents the true polarity (label)      

          if predict == truelabel:
            tp+=1
            tplist[predict]+=1
                  
  reclist=[evalit(tplist[i],allist[i]) for i in range(0,3)]      # determine recall for neg,pos,neut
  preclist=[evalit(tplist[i],foundlist[i]) for i in range(0,3)]
  flist=[evalit(2*reclist[i]*preclist[i],reclist[i]+preclist[i]) for i in range(0,3)]

  print("rec: neg,neut,pos",reclist)
  print("prec: neg,neut,pos",preclist)
  print("f: neg,neut,pos",flist)

  return tp/len(y)

In [None]:
epochs=2
acc=0
folds=3

for i in range(folds):
  print("fold",i+1)
  X,y,X_test,y_test= datasplit()  

  model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)
  
  # do not touch the BERT weights, train only the classifier head 
  for param in model.base_model.parameters():
    param.requires_grad = False

  model.to(device)

  optimizer = torch.optim.Adam(model.parameters(), weight_decay=1e-5, lr=1e-4)

  model.train()
  print("learning")
  training(epochs,X,y)   

  model.eval()     
  print("testing")
 
  fold_acc=evaluate(X_test,y_test)   
  print("acc:",fold_acc)
  acc+=fold_acc

print("mean accuracy=", acc/folds)

# Normalisierung

* batch normalization
  * the batch gets normalized, dimension-wise
* layer normalization
  * the whole layer gets normalized, over all dimentions

both use the **z transformation**: $\frac{x_i-\mu}{\sigma}$ where $\mu$ = mean and $\sigma$ = standard deviation

it normalizes to a distribution with $\mu=0$ and $\sigma=1$



In [3]:
# batch normalization

import numpy as np

m = nn.BatchNorm1d(3)  
input = torch.tensor([[2,4,6],[4,2,1],[3,2,2]],dtype=torch.float)
output = m(input)
col1=output[:,0].detach().numpy()

print(output)

xmean=np.mean(col1)
xvar=np.var(col1)

print("input",input,"\n")
print("first column",col1,"\n")
print("mean, standard deviation", xmean,",",np.sqrt(xvar))


NameError: name 'nn' is not defined

In [None]:
# layer normalization: the mean of the whole layer is 0, ...

m = nn.LayerNorm([3,3])

normalized=m(input)

# the sum (here = mean) for each vector (= all layers)
layer_sum=sum(normalized)

# the 
layer_mean=sum(layer_sum)
print("normalized:",normalized,"\n")
print("layer mean:",layer_sum,"\n")
print("mean of whole layer:", layer_mean)