03/02/23: new BERT architecture

In [None]:
#!pip install transformers

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn import preprocessing
#from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
#from sklearn.utils.class_weight import compute_class_weight
import transformers
from transformers import BertTokenizerFast, BertForSequenceClassification
#import glob, os

## BERT Tokenizer and GPU

In [2]:
# specify GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load the BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
max_seq_len = 256

# import BERT-base pretrained model
###bert = AutoModel.from_pretrained('bert-base-cased')

# Load the BertForSequenceClassification model
model = BertForSequenceClassification.from_pretrained(
    'bert-base-cased',
###    num_labels = 3,
    num_labels = 6,
#    output_attentions = False,
#    output_hidden_states = False,
)

optimizer = torch.optim.AdamW(model.parameters(), 
                              lr = 2e-5,
                              #eps = 1e-08
                              )

model.to(device)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

# Import and process .txt data via path

In [None]:
# JupyterLab directory
path_test= '/data/corpus-webis-editorials-16/annotated-txt/split-for-evaluation-final/test'
path_training= '/data/corpus-webis-editorials-16/annotated-txt/split-for-evaluation-final/training'
path_validation= '/data/corpus-webis-editorials-16/annotated-txt/split-for-evaluation-final/validation'

In [None]:
def extract_df(path):
    main_df = pd.DataFrame(columns=['unit'])

    for filename in glob.glob(os.path.join(path, '*.txt')): ###
        with open(os.path.join(os.getcwd(), filename), 'r') as f: 
            lines = f.readlines()
            #lines.remove('-1\tpar-sep\t\n') ###
            this_lines_df = pd.DataFrame(lines, columns=['unit'])
            main_df = pd.concat([main_df,this_lines_df]) ### ###
        
    main_df[['index','label','text','note']] = main_df['unit'].str.split('\t',3,expand=True)
    main_df = main_df.drop(['index','unit','note'],axis=1).replace('\n','', regex=True)
    main_df = main_df[main_df['label']!='par-sep']
    
    return main_df

In [None]:
def arrange_df(main_df):
    
    main_df = main_df[~main_df['label'].isin(['title','par-sep','no-unit'])]
    result_df = main_df.groupby((~main_df.label.str.match('continued')).shift().cumsum(), as_index=False).sum()
    result_df['label']=result_df['label'].str.replace('continued','')
    
    return result_df

def corse_label(main_df):
    main_df.loc[main_df['label'].str.contains("assumption"),'corse_label'] = '0' # claim
    main_df.loc[main_df['label'].str.contains("other"),'corse_label'] = '1' # others
    main_df.loc[~main_df['label'].str.contains("assumption|other"),'corse_label'] = '2' # premise
    
    return main_df

def fine_label(main_df):
    le = preprocessing.LabelEncoder()
    le.fit(main_df.label)
    main_df['label'] = le.transform(main_df.label)
    
    # to inverse
    #le.inverse_transform(main_df['label'])
    
    return main_df

In [None]:
path_list = [path_training,path_validation,path_test]
df_list = []

for path in path_list:
    main_df = extract_df(path)
    final_df = arrange_df(main_df)
    
    # run either _ or _
    ###final_df = corse_label(final_df)
    final_df = fine_label(final_df)
    
    df_list.append(final_df)

In [None]:
# define datasets
train_df = df_list[0]
val_df = df_list[1]
test_df = df_list[2]
all_data_df = pd.concat([train_df,val_df,test_df])

In [None]:
# run either _ or _

In [None]:
# save datasets
###train_df.to_csv('train.csv',index=False)
###val_df.to_csv('val.csv',index=False)
###test_df.to_csv('test.csv',index=False)
###all_data_df.to_csv('all_data.csv',index=False)
train_df.to_csv('train-6.csv',index=False)
val_df.to_csv('val-6.csv',index=False)
test_df.to_csv('test-6.csv',index=False)
all_data_df.to_csv('all_data-6.csv',index=False)

# Shortcut to .csv data

In [3]:
# loaded prepared data
###train_df = pd.read_csv('train.csv')
###val_df = pd.read_csv('val.csv')
###test_df = pd.read_csv('test.csv')
train_df = pd.read_csv('train-6.csv')
val_df = pd.read_csv('val-6.csv')
test_df = pd.read_csv('test-6.csv')

In [4]:
# concat all data
all_data_df = pd.concat([train_df,val_df,test_df])

# define data
text = all_data_df.text.values
###labels = all_data_df.corse_label.values
labels = all_data_df.label.values

# define train data
train_text = list(train_df.text.values)
###train_labels = list(train_df.corse_label.values)
train_labels = list(train_df.label.values)

# Data Preprocessing

In [5]:
# tokenize and encode sequences in the training set
tokens_train = tokenizer.batch_encode_plus(
    train_text,
    max_length = max_seq_len,
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)

# tokenize and encode sequences in the validation set
tokens_val = tokenizer.batch_encode_plus(
    list(val_df.text.values),
    max_length = max_seq_len,
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)

# tokenize and encode sequences in the test set
tokens_test = tokenizer.batch_encode_plus(
    list(test_df.text.values),
    max_length = max_seq_len,
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)



In [6]:
# for train set
train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels)

# for validation set
val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
###val_y = torch.tensor(list(val_df.corse_label.values))
val_y = torch.tensor(list(val_df.label.values))

# for test set
test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
###test_y = torch.tensor(list(test_df.corse_label.values))
test_y = torch.tensor(list(test_df.label.values))

In [11]:
# DataLoaders

#define a batch size
batch_size = 16#32

# wrap tensors
train_data = TensorDataset(train_seq, train_mask, train_y)

# sampler for sampling the data during training
train_sampler = RandomSampler(train_data)

# dataLoader for train set
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# wrap tensors
val_data = TensorDataset(val_seq, val_mask, val_y)

# sampler for sampling the data during training
val_sampler = SequentialSampler(val_data)

# dataLoader for validation set
val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size=batch_size)

# wrap tensors
test_data = TensorDataset(test_seq, test_mask, test_y)

# sampler for sampling the data during testing
test_sampler = SequentialSampler(test_data)

# dataLoader for validation set
test_dataloader = DataLoader(test_data, sampler = test_sampler, batch_size=batch_size)

In [8]:
#def b_tp(preds, labels):
#    '''Returns True Positives (TP): count of correct predictions of actual class 1'''     
#    return sum([preds == labels and preds == 1 for preds, labels in zip(preds, labels)])

def b_tp(preds, labels):
    preds = np.argmax(preds, axis = 1).flatten()
    labels = labels.flatten()
    b_accuracy = sum([preds == labels for preds, labels in zip(preds, labels)])
    
    return b_accuracy / len(labels)

In [9]:
from tqdm import trange

In [13]:
epochs = 3

for _ in trange(epochs, desc = 'Epoch'):
    
    # ========== Training ==========
    
    # Set model to training mode
    model.train()
    
    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        
        if step % 50 == 0 and not step == 0:
          # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))
        
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        optimizer.zero_grad()
        # Forward pass
        train_output = model(b_input_ids, 
                             token_type_ids = None, 
                             attention_mask = b_input_mask, 
                             labels = b_labels)
        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ========== Validation ==========

    # Set model to evaluation mode
    model.eval()

    # Tracking variables 
    val_accuracy = []
#    val_precision = []
#    val_recall = []
#    val_specificity = []

    for batch in val_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
          # Forward pass
          eval_output = model(b_input_ids, 
                              token_type_ids = None, 
                              attention_mask = b_input_mask)
        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # Calculate validation accuracy
        b_accuracy = b_tp(logits, label_ids)
        val_accuracy.append(b_accuracy)

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))    
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

  Batch    50  of    532.
  Batch   100  of    532.
  Batch   150  of    532.
  Batch   200  of    532.
  Batch   250  of    532.
  Batch   300  of    532.
  Batch   350  of    532.
  Batch   400  of    532.
  Batch   450  of    532.
  Batch   500  of    532.


Epoch:  33%|███▎      | 1/3 [03:32<07:04, 212.50s/it]


	 - Train loss: 0.6255
	 - Validation Accuracy: 0.8336
  Batch    50  of    532.
  Batch   100  of    532.
  Batch   150  of    532.
  Batch   200  of    532.
  Batch   250  of    532.
  Batch   300  of    532.
  Batch   350  of    532.
  Batch   400  of    532.
  Batch   450  of    532.
  Batch   500  of    532.


Epoch:  67%|██████▋   | 2/3 [07:05<03:32, 212.53s/it]


	 - Train loss: 0.3748
	 - Validation Accuracy: 0.8372
  Batch    50  of    532.
  Batch   100  of    532.
  Batch   150  of    532.
  Batch   200  of    532.
  Batch   250  of    532.
  Batch   300  of    532.
  Batch   350  of    532.
  Batch   400  of    532.
  Batch   450  of    532.
  Batch   500  of    532.


Epoch: 100%|██████████| 3/3 [10:37<00:00, 212.56s/it]


	 - Train loss: 0.2100
	 - Validation Accuracy: 0.8065





In [14]:
# Test Data

model.eval()

test_accuracy = []
logits_list = []
labels_list = []

for batch in test_dataloader:
    
    if step % 50 == 0 and not step == 0:
    # Report progress.
        print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))

    
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    with torch.no_grad():
        # Forward pass
        test_output = model(b_input_ids, 
                            token_type_ids = None, 
                            attention_mask = b_input_mask)
    logits = test_output.logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    
    logits_list.extend(logits)
    labels_list.extend(label_ids)
    
    # Calculate test accuracy
    b_accuracy = b_tp(logits, label_ids)
    test_accuracy.append(b_accuracy)

print('\t - Test Accuracy: {:.4f}'.format(sum(test_accuracy)/len(test_accuracy)))

	 - Test Accuracy: 0.7995


In [15]:
preds = list(np.argmax(logits_list,axis=1))

In [16]:
print(classification_report(labels_list, preds))

              precision    recall  f1-score   support

           0       0.55      0.72      0.63       486
           1       0.91      0.84      0.88      2005
           2       0.08      0.03      0.05        29
           3       0.32      0.26      0.29        31
           4       0.71      0.69      0.70        81
           5       0.69      0.80      0.74       214

    accuracy                           0.80      2846
   macro avg       0.54      0.56      0.55      2846
weighted avg       0.81      0.80      0.80      2846



In [17]:
torch.save(model.state_dict(), 'saved_weights_6lab_16batch.pt')

# Load trained model

In [None]:
###path_model = ('saved_weights.pt')
path_model = ('saved_weights-6.pt')
loaded_model = AutoModel.from_pretrained('bert-base-cased')
loaded_model.load_state_dict(torch.load(path_model),strict=False)

loaded_model.eval()
loaded_model.to(device)

In [None]:
loaded_model

# Get prediction

In [None]:
# get predictions for test data
with torch.no_grad():
    preds = loaded_model(test_seq.to(device), test_mask.to(device))
    preds = preds.detach().cpu().numpy()

In [None]:
# test scores
print(classification_report(test_seq, preds))

# Admin work

In [None]:
obj = None
gc.collect()

In [12]:
torch.cuda.empty_cache()