In [None]:
import random

import numpy as np
import pandas as pd

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn as nn

from transformers import BertTokenizerFast, BertForSequenceClassification, AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup

from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score, balanced_accuracy_score, confusion_matrix, classification_report

from tqdm.notebook import tqdm

import seaborn as sn

import matplotlib.pyplot as plt

In [7]:
import random
import time 
import os

import numpy as np
import pandas as pd

from argparse import Namespace

import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

from transformers import BertTokenizerFast, BertForSequenceClassification, AdamW, \
    get_linear_schedule_with_warmup

from sklearn.utils.class_weight import compute_class_weight

from tqdm.notebook import tqdm

In [20]:
args = Namespace(
    dataset_dir = ".data",
    dataset_prefix = "faq_with_splits_",
    dataset = "lemmatized_filtered",
    models = ["bert-base-multilingual-uncased"],
    model_save_dir = ".model_storage/BERT",
    model_state_file = "model",
    seed = 1234,
    num_epochs = 2,
    learning_rate = 5e-5,
    hidden_size = 100,
    batch_size = 16,
    cuda = True,
    train_column = 'short_question'
)

In [21]:
device = torch.device('cuda' if args.cuda & torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [22]:
args.model_state_file = os.path.join(args.model_save_dir, args.model_state_file)

In [23]:
args.dataset = os.path.join(args.dataset_dir, args.dataset_prefix + args.dataset)
print(args.dataset)

.data\faq_with_splits_lemmatized_filtered


In [24]:
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)

In [25]:
tokenizers = {}

for model in args.models:
    tokenizers[model] = BertTokenizerFast.from_pretrained(model)

In [27]:
df = pd.read_csv(args.dataset + '.csv')

In [29]:
df.head()

Unnamed: 0,short_question,long_question,answer,main_category,sub_category,split
0,tudt kutya tanítás foglalkozó személy aki tud ...,tudt kutya tanítás foglalkozó személy aki tud ...,Nem kellene általánosítani egy szar kutyaiskol...,Állatok,Kutyák,val
1,7 hetes francia bulldog kaphat rizs,7 hetes francia bulldog kaphat rizs sziaszt 7 ...,"Orvost kaphat, nem rizst!",Állatok,Kutyák,val
2,anime segitség nézik,anime segitség nézik kifogytam romantikus ross...,Nézd meg az Inuyashát.,Szórakozás,"Filmek, sorozatok",train
3,c input handler feladat megoldás,k patrick nevű felhasználó kérdés c input hand...,Visual Studio-ban például lehet.,Számítástechnika,Programozás,train
4,az állat tényleg érez ember kisugárzás,az állat tényleg érez ember kisugárzás én szer...,A kutyák nagyon jól olvassák az emberi testbes...,Állatok,Egyéb kérdések,train


In [30]:
df = df[['main_category', args.train_column, 'split']]

In [31]:
df.head()

Unnamed: 0,main_category,short_question,split
0,Állatok,tudt kutya tanítás foglalkozó személy aki tud ...,val
1,Állatok,7 hetes francia bulldog kaphat rizs,val
2,Szórakozás,anime segitség nézik,train
3,Számítástechnika,c input handler feladat megoldás,train
4,Állatok,az állat tényleg érez ember kisugárzás,train


In [32]:
target_names = df.main_category.unique().tolist()
target_dict = {k: v for v, k in enumerate(target_names)}

print(target_names)
print(target_dict)

['Állatok', 'Szórakozás', 'Számítástechnika', 'Egészség']
{'Állatok': 0, 'Szórakozás': 1, 'Számítástechnika': 2, 'Egészség': 3}


In [33]:
df.main_category = df.main_category.apply(lambda x: target_dict[x])

In [34]:
max_seq_len = 128

In [35]:
def tokenize(df, tokenizer):
    return tokenizer.batch_encode_plus(
        df[args.train_column].tolist(),
        max_length = max_seq_len,
        padding='max_length',
        truncation=True,
        return_token_type_ids=False
)

In [36]:
valid_df = df[df.split == 'val']
train_df = df[df.split == 'train']
test_df = df[df.split == 'test']

In [37]:
encoded_data = {}

for model in args.models:
    encoded_data[model] = {}
    tokenizer = tokenizers[model]
    encoded_data[model]['train'] = tokenize(train_df, tokenizer)
    encoded_data[model]['valid'] = tokenize(valid_df, tokenizer)
    encoded_data[model]['test'] = tokenize(test_df, tokenizer)

In [38]:
input_ids = {}
attention_masks = {}

for model in args.models:
    input_ids[model] = {}
    attention_masks[model] = {}
    
    for split in ['train', 'valid', 'test']:
        input_ids[model][split] = encoded_data[model][split]['input_ids']
        attention_masks[model][split] = encoded_data[model][split]['attention_mask']

In [39]:
labels = {}

labels['train'] = torch.tensor(train_df.main_category.values)
labels['valid'] = torch.tensor(valid_df.main_category.values)
labels['test'] = torch.tensor(test_df.main_category.values)

In [40]:
datasets = {}

for model in args.models:
    datasets[model] = {}
    
    for split in ['train', 'valid', 'test']:
        datasets[model][split] = TensorDataset(
            torch.tensor(input_ids[model][split]),
            torch.tensor(attention_masks[model][split]),
            labels[split]
        )

In [41]:
dataloaders = {}

for model in args.models:
    dataloaders[model] = {}
    for split in ['train', 'valid', 'test']:
        dataloaders[model][split] = DataLoader(
            datasets[model][split],
            sampler=RandomSampler(datasets[model][split]),
            batch_size=args.batch_size
        )

In [43]:
class_wts = compute_class_weight('balanced', np.unique(train_df.main_category), \
                                 train_df.main_category)


print(class_wts)

[1. 1. 1. 1.]


In [45]:
weights = torch.tensor(class_wts, dtype=torch.float)
weights = weights.to(device)

In [61]:
def show_data_to_model(model, optimizer, criterion, bar, train):
    epoch_loss = 0
    epoch_acc = 0
       
    total_predicted = []
    idx = 0

    for batch in bar:
        idx += 1
        print('Batch in bar ', idx)
        if train:
            optimizer.zero_grad()
            model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {
            'input_ids':      batch[0],
            'attention_mask': batch[1],
            'labels':         batch[2],
        }
        
        outputs = model(**inputs)
        
        logits = outputs[1]
        loss = criterion(logits, batch[2])
            
#        acc = class_accuracy(predictions, batch.main_category)
        
        if train:
            loss.backward()
            optimizer.step()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            
        epoch_loss += loss.item()
#        epoch_acc += acc.item()

#        _, predicted = torch.max(predictions.data, 1)
#        total_predicted += predicted.tolist()
        
        bar.set_postfix(loss=(epoch_loss / idx)) #, acc=(epoch_acc / idx))

    return epoch_loss / idx #, epoch_acc / len(iterator), total_predicted

In [53]:
def train_model(model, optimizer, criterion, bar):
    model.train()
    loss = show_data_to_model(model, optimizer, criterion, bar, True)
    
    return loss

In [54]:
def evaluate_model(model, optimizer, criterion, bar):
    model.eval()
    with torch.no_grad():
        loss = show_data_to_model(model, optimizer, criterion, bar, False)
            
    return loss

In [None]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return balanced_accuracy_score(y_true=labels_flat, y_pred=preds_flat)  

In [None]:
def evaluate(dataloader):

    model.eval()
    
    loss_val_total = 0
    predictions_val, true_vals = [], []
    
    for batch in dataloader:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        # loss = outputs[0]
        
        logits = outputs[1]
        loss = cross_entropy(logits, batch[2])

        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions_val.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader) 
    
    predictions_val = np.concatenate(predictions_val, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions_val, true_vals

In [None]:
def train(model, model_name):
    true_vals = None
    predictions_val = None
    for epoch in tqdm(range(1, epochs+1)):
    
        model.train()
    
        loss_train_total = 0

        progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
        for batch in progress_bar:

            model.zero_grad()

            batch = tuple(b.to(device) for b in batch)
        
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'labels':         batch[2],
                     }       

            outputs = model(**inputs)

            logits = outputs[1]
            loss = cross_entropy(logits, batch[2])

            loss_train_total += loss.item()
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})


        tqdm.write(f'\nEpoch {epoch}')

        loss_train_avg = loss_train_total/len(dataloader_train)            
        tqdm.write(f'Training loss: {loss_train_avg}')

        val_loss, predictions_val, true_vals = evaluate(dataloader_valid)
        val_f1 = f1_score_func(predictions_val, true_vals)
        val_bacc = accuracy(predictions_val, true_vals)
        tqdm.write(f'Validation loss: {val_loss}')
        tqdm.write(f'F1 Score (Weighted): {val_f1}')
        tqdm.write(f'Balanced Accuracy Score: {val_bacc}')

    torch.save(model.state_dict(), model_name)
    return true_vals, predictions_val

In [55]:
from transformers import logging
logging.set_verbosity_error()

In [62]:
train_losses = {}
train_accuracies = {}

valid_losses = {}
valid_accuracies = {}

for model_ in args.models:
    print(model_)
    start_time = int(time.time() * 1000)
    
    best_valid_loss = float('inf')
    
    train_losses[model_] = []
    train_accuracies[model_] = []

    valid_losses[model_] = []
    valid_accuracies[model_] = []
    
    train_dl = dataloaders[model_]['test']
    valid_dl = dataloaders[model_]['valid']
    
    model = BertForSequenceClassification.from_pretrained(
        model_,
        num_labels = len(target_names),
        output_attentions=False,
        output_hidden_states=False   
    )
        
    optimizer = AdamW(
        model.parameters(),
        args.learning_rate,
        eps=1e-8
    )
    
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=len(train_dl) * args.num_epochs
    )
    
    criterion = CrossEntropyLoss(weight=weights)
    
    model = model.to(device)
    criterion = criterion.to(device)

    print(len(train_dl))
    
    epoch_bar = tqdm(desc=f"'{model_}' epoch", total=args.num_epochs, position=0, leave=True)
    train_bar = tqdm(train_dl, desc=f"'{model_}' train", leave=False)
    valid_bar = tqdm(valid_dl, desc=f"'{model_}' valid", leave=False)
    
    for epoch in range(args.num_epochs):
        print(epoch)
        train_bar.n = 0
        valid_bar.n = 0
        
        train_bar.refresh()
        valid_bar.refresh()

        train_loss = train_model(model, optimizer, criterion, train_bar)
        valid_loss = evaluate_model(model, optimizer, criterion, valid_bar)

        train_losses[model_].append(train_loss)
        train_accuracies[model_].append(train_acc)

        valid_losses[model_].append(valid_loss)
        valid_accuracies[model_].append(valid_acc)
    
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), args.model_state_file + "_" + dataset_key + ".pth")
 
        epoch_bar.update()

bert-base-multilingual-uncased
38


'bert-base-multilingual-uncased' epoch:   0%|          | 0/2 [00:00<?, ?it/s]

'bert-base-multilingual-uncased' train:   0%|          | 0/38 [00:00<?, ?it/s]

'bert-base-multilingual-uncased' valid:   0%|          | 0/38 [00:00<?, ?it/s]

0
Batch in bar  1
Batch in bar  2
Batch in bar  3
Batch in bar  4
Batch in bar  5
Batch in bar  6
Batch in bar  7
Batch in bar  8
Batch in bar  9
Batch in bar  10
Batch in bar  11
Batch in bar  12
Batch in bar  13
Batch in bar  14
Batch in bar  15
Batch in bar  16
Batch in bar  17
Batch in bar  18
Batch in bar  19
Batch in bar  20
Batch in bar  21
Batch in bar  22
Batch in bar  23
Batch in bar  24
Batch in bar  25
Batch in bar  26
Batch in bar  27
Batch in bar  28
Batch in bar  29
Batch in bar  30
Batch in bar  31
Batch in bar  32
Batch in bar  33
Batch in bar  34
Batch in bar  35
Batch in bar  36
Batch in bar  37
Batch in bar  38
Batch in bar  1
Batch in bar  2
Batch in bar  3
Batch in bar  4
Batch in bar  5
Batch in bar  6
Batch in bar  7
Batch in bar  8
Batch in bar  9
Batch in bar  10
Batch in bar  11
Batch in bar  12
Batch in bar  13
Batch in bar  14
Batch in bar  15
Batch in bar  16
Batch in bar  17
Batch in bar  18
Batch in bar  19
Batch in bar  20
Batch in bar  21
Batch in bar 

NameError: name 'train_acc' is not defined

In [None]:
truth, pred = train(model, 'finetuned_BERT.model')

In [None]:
test_loss, predictions, truth = evaluate(dataloader_test)

In [None]:
cm = confusion_matrix(truth, predictions.argmax(1))
cm_df = pd.DataFrame(cm, index=target_names, columns=target_names)

In [None]:
heatmap = sn.heatmap(cm_df, annot=True, cmap='Reds', fmt='g', annot_kws={"size": 15}, cbar=False)
plt.show()

In [None]:
class_report = classification_report(truth, predictions.argmax(1), target_names=target_names)
print(class_report)