In [None]:
import random

import numpy as np
import pandas as pd

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn as nn

from transformers import BertTokenizerFast, BertForSequenceClassification, AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup

from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score, balanced_accuracy_score, confusion_matrix, classification_report

from tqdm.notebook import tqdm

import seaborn as sn

import matplotlib.pyplot as plt

In [2]:
import random
import time 
import os

import numpy as np
import pandas as pd

from argparse import Namespace

import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from torch.optim import AdamW
from transformers import BertTokenizerFast, BertForSequenceClassification, \
    get_linear_schedule_with_warmup

from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from tqdm.notebook import tqdm

In [3]:
args = Namespace(
    dataset_dir = ".data",
    dataset_prefix = "faq_with_splits_",
    dataset = "lemmatized_filtered",
    models = ["bert-base-multilingual-uncased", "SZTAKI-HLT/hubert-base-cc"],
    model_save_dir = ".model_storage/BERT",
    model_state_file = "model",
    seed = 1234,
    num_epochs = 2,
    learning_rate = 5e-5,
    hidden_size = 100,
    batch_size = 16,
    cuda = True,
    train_column = 'short_question'
)

In [4]:
device = torch.device('cuda' if args.cuda & torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [5]:
args.model_state_file = os.path.join(args.model_save_dir, args.model_state_file)

In [6]:
if not os.path.exists(args.model_save_dir):
    os.makedirs(args.model_save_dir)
    print(f"Created directory {args.model_save_dir}")
else:
    print(f"{args.model_save_dir} directory already exists")

.model_storage/BERT directory already exists


In [7]:
args.dataset = os.path.join(args.dataset_dir, args.dataset_prefix + args.dataset)
print(args.dataset)

.data\faq_with_splits_lemmatized_filtered


In [8]:
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)

In [9]:
tokenizers = {}

for model in args.models:
    tokenizers[model] = BertTokenizerFast.from_pretrained(model)

Downloading:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/266k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420 [00:00<?, ?B/s]

In [10]:
df = pd.read_csv(args.dataset + '.csv')

In [11]:
df.head()

Unnamed: 0,short_question,long_question,answer,main_category,sub_category,split
0,megy hasa kutya mi tehet,megy hasa kutya mi tehet körülbelül hét ismerő...,Kutyának kenyeret? Nem semmi lehet az ismerősö...,Állatok,Kutyák,val
1,ti mot szőrös ruha,ti mot szőrös ruha a gép berakott ruha alap kb...,"A Furminator nevű fésű egy áldás, annyi szőrt ...",Állatok,Kutyák,train
2,részleges körülmetélés nehéz gyógyul begyullad...,részleges körülmetélés nehéz gyógyul begyullad...,Szerintem doki vagy ügyelet... Vagy egy baràt ...,Egészség,Férfiak egészsége,train
3,sos első szemüveg fontos tudnivaló,sos első szemüveg fontos tudnivaló a jobb 0 75...,Semmi köze a szemüveg hordásának vagy nem hord...,Egészség,Szemproblémák,train
4,felpuffadt has megszűnik menstruáció összefüggés,felpuffadt has megszűnik menstruáció összefügg...,"Elsősorban, amikor feleszméltél, h az anorexia...",Egészség,Nők egészsége,train


In [12]:
df = df[['main_category', args.train_column, 'split']]

In [13]:
df.head()

Unnamed: 0,main_category,short_question,split
0,Állatok,megy hasa kutya mi tehet,val
1,Állatok,ti mot szőrös ruha,train
2,Egészség,részleges körülmetélés nehéz gyógyul begyullad...,train
3,Egészség,sos első szemüveg fontos tudnivaló,train
4,Egészség,felpuffadt has megszűnik menstruáció összefüggés,train


In [14]:
target_names = df.main_category.unique().tolist()
target_dict = {k: v for v, k in enumerate(target_names)}

print(target_names)
print(target_dict)

['Állatok', 'Egészség', 'Szórakozás', 'Számítástechnika']
{'Állatok': 0, 'Egészség': 1, 'Szórakozás': 2, 'Számítástechnika': 3}


In [15]:
df.main_category = df.main_category.apply(lambda x: target_dict[x])

In [16]:
max_seq_len = 128

In [17]:
def tokenize(df, tokenizer):
    return tokenizer.batch_encode_plus(
        df[args.train_column].tolist(),
        max_length = max_seq_len,
        padding='max_length',
        truncation=True,
        return_token_type_ids=False
)

In [18]:
valid_df = df[df.split == 'val']
train_df = df[df.split == 'train']
test_df = df[df.split == 'test']

In [19]:
encoded_data = {}

for model in args.models:
    encoded_data[model] = {}
    tokenizer = tokenizers[model]
    encoded_data[model]['train'] = tokenize(train_df, tokenizer)
    encoded_data[model]['valid'] = tokenize(valid_df, tokenizer)
    encoded_data[model]['test'] = tokenize(test_df, tokenizer)

In [20]:
input_ids = {}
attention_masks = {}

for model in args.models:
    input_ids[model] = {}
    attention_masks[model] = {}
    
    for split in ['train', 'valid', 'test']:
        input_ids[model][split] = encoded_data[model][split]['input_ids']
        attention_masks[model][split] = encoded_data[model][split]['attention_mask']

In [21]:
labels = {}

labels['train'] = torch.tensor(train_df.main_category.values)
labels['valid'] = torch.tensor(valid_df.main_category.values)
labels['test'] = torch.tensor(test_df.main_category.values)

In [22]:
datasets = {}

for model in args.models:
    datasets[model] = {}
    
    for split in ['train', 'valid', 'test']:
        datasets[model][split] = TensorDataset(
            torch.tensor(input_ids[model][split]),
            torch.tensor(attention_masks[model][split]),
            labels[split]
        )

In [23]:
dataloaders = {}

for model in args.models:
    dataloaders[model] = {}
    for split in ['train', 'valid', 'test']:
        dataloaders[model][split] = DataLoader(
            datasets[model][split],
            sampler=RandomSampler(datasets[model][split]),
            batch_size=args.batch_size
        )

In [24]:
class_wts = compute_class_weight('balanced', np.unique(train_df.main_category), \
                                 train_df.main_category)

print(class_wts)

[0.82005859 0.7529209  1.05188476 1.99306534]


2         1
3         1
4         1
5         0
         ..
153556    3
153557    2
153558    3
153559    1
153560    0
Name: main_category, Length: 107490, dtype: int64 as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error


In [25]:
weights = torch.tensor(class_wts, dtype=torch.float)
weights = weights.to(device)

In [26]:
def class_accuracy(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return accuracy_score(y_true=labels_flat, y_pred=preds_flat)  

In [27]:
def show_data_to_model(model, optimizer, criterion, bar, train):
    epoch_loss = 0
    epoch_acc = 0
       
    total_predicted = []
    idx = 0

    for batch in bar:
        idx += 1
        if train:
            optimizer.zero_grad()
            model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {
            'input_ids':      batch[0],
            'attention_mask': batch[1],
            'labels':         batch[2],
        }
        
        outputs = model(**inputs)
        
        logits = outputs[1]
        label_ids = inputs['labels'].cpu().numpy()

        loss = criterion(logits, batch[2])
        logits = logits.detach().cpu().numpy()
        acc = class_accuracy(logits, label_ids)

        total_predicted.append(logits)
        
        if train:
            loss.backward()
            optimizer.step()

        del batch
            
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            
        epoch_loss += loss.item()
        epoch_acc += acc.item()

#        _, predicted = torch.max(predictions.data, 1)
#        total_predicted += predicted.tolist()
        
        bar.set_postfix(loss=(epoch_loss / idx), acc=(epoch_acc / idx))

    total_predicted = np.concatenate(total_predicted, axis=0)

    return epoch_loss / idx , epoch_acc / idx, total_predicted

In [28]:
def train_model(model, optimizer, criterion, bar):
    model.train()
    loss, acc, _ = show_data_to_model(model, optimizer, criterion, bar, True)
    
    return loss, acc

In [None]:
def evaluate_model(model, optimizer, criterion, bar):
    model.eval()
    with torch.no_grad():
        loss, acc, pred = show_data_to_model(model, optimizer, criterion, bar, False)
            
    return loss, acc, pred

In [None]:
def evaluate(dataloader):

    model.eval()
    
    loss_val_total = 0
    predictions_val, true_vals = [], []
    
    for batch in dataloader:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        # loss = outputs[0]
        
        logits = outputs[1]
        loss = cross_entropy(logits, batch[2])

        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions_val.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader) 
    
    predictions_val = np.concatenate(predictions_val, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions_val, true_vals

In [None]:
def train(model, model_name):
    true_vals = None
    predictions_val = None
    for epoch in tqdm(range(1, epochs+1)):
    
        model.train()
    
        loss_train_total = 0

        progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
        for batch in progress_bar:

            model.zero_grad()

            batch = tuple(b.to(device) for b in batch)
        
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'labels':         batch[2],
                     }       

            outputs = model(**inputs)

            logits = outputs[1]
            loss = cross_entropy(logits, batch[2])

            loss_train_total += loss.item()
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})


        tqdm.write(f'\nEpoch {epoch}')

        loss_train_avg = loss_train_total/len(dataloader_train)            
        tqdm.write(f'Training loss: {loss_train_avg}')

        val_loss, predictions_val, true_vals = evaluate(dataloader_valid)
        val_f1 = f1_score_func(predictions_val, true_vals)
        val_bacc = accuracy(predictions_val, true_vals)
        tqdm.write(f'Validation loss: {val_loss}')
        tqdm.write(f'F1 Score (Weighted): {val_f1}')
        tqdm.write(f'Balanced Accuracy Score: {val_bacc}')

    torch.save(model.state_dict(), model_name)
    return true_vals, predictions_val

In [None]:
from transformers import logging
logging.set_verbosity_error()

In [None]:
train_losses = {}
train_accuracies = {}

valid_losses = {}
valid_accuracies = {}

for model_ in args.models:
    start_time = int(time.time() * 1000)
    
    best_valid_loss = float('inf')
    
    train_losses[model_] = []
    train_accuracies[model_] = []

    valid_losses[model_] = []
    valid_accuracies[model_] = []
    
    train_dl = dataloaders[model_]['train']
    valid_dl = dataloaders[model_]['valid']    
        
    model = BertForSequenceClassification.from_pretrained(
        model_,
        num_labels = len(target_names),
        output_attentions=False,
        output_hidden_states=False   
    )
        
    optimizer = AdamW(
        model.parameters(),
        args.learning_rate,
        eps=1e-8
    )
    
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=len(train_dl) * args.num_epochs
    )
    
    criterion = CrossEntropyLoss(weight=weights)
    
    model = model.to(device)
    criterion = criterion.to(device)
   
    epoch_bar = tqdm(desc=f"'{model_}' epoch", total=args.num_epochs, position=0, leave=True)
    
    for epoch in range(args.num_epochs):
        train_bar = tqdm(train_dl, desc=f"'{model_}' train", leave=True)
        valid_bar = tqdm(valid_dl, desc=f"'{model_}' valid", leave=True)

        train_loss, train_acc = train_model(model, optimizer, criterion, train_bar)
        valid_loss, valid_acc, _ = evaluate_model(model, optimizer, criterion, valid_bar)

        train_losses[model_].append(train_loss)
        train_accuracies[model_].append(train_acc)

        valid_losses[model_].append(valid_loss)
        valid_accuracies[model_].append(valid_acc)
    
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), args.model_state_file + "_" + model_ + ".pth")
 
        epoch_bar.update()

In [None]:
truth, pred = train(model, 'finetuned_BERT.model')

In [None]:
test_loss, predictions, truth = evaluate(dataloader_test)

In [None]:
cm = confusion_matrix(truth, predictions.argmax(1))
cm_df = pd.DataFrame(cm, index=target_names, columns=target_names)

In [None]:
heatmap = sn.heatmap(cm_df, annot=True, cmap='Reds', fmt='g', annot_kws={"size": 15}, cbar=False)
plt.show()

In [None]:
class_report = classification_report(truth, predictions.argmax(1), target_names=target_names)
print(class_report)