In [None]:
import json
import math
import torch
import pickle
import transformers

import numpy as np
import pandas as pd

In [None]:
from pathlib import Path
from itertools import chain
from tqdm import tqdm

In [None]:
from sklearn import metrics
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import BertTokenizer
from transformers import BertModel

In [None]:
from torch import cuda

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

## Model

In [None]:
class BERTClass(torch.nn.Module):
    """BERT encoder followed by dropout and a linear multi-label head.

    The head produces one raw score (logit) per entry of ``mlb.classes_``.
    No sigmoid is applied inside the model.

    Args:
        model_name: Name or path handed to ``BertModel.from_pretrained``.
        mlb: Object exposing ``classes_`` (e.g. a fitted
            ``MultiLabelBinarizer``); only ``len(mlb.classes_)`` is used, to
            size the output layer.
    """

    def __init__(self, model_name, mlb):
        super().__init__()
        self.l1 = transformers.BertModel.from_pretrained(model_name)
        self.l2 = torch.nn.Dropout(0.3)
        # Take the head's input width from the loaded checkpoint rather than
        # assuming 768, so checkpoints with a different hidden size
        # (e.g. "large" or "tiny" variants) work without code changes.
        self.l3 = torch.nn.Linear(self.l1.config.hidden_size, len(mlb.classes_))

    def forward(self, ids, mask, token_type_ids):
        """Compute raw per-class logits for one batch.

        Args:
            ids: Token id tensor passed to the encoder as its first argument.
            mask: Tensor passed to the encoder as ``attention_mask``.
            token_type_ids: Tensor passed to the encoder as ``token_type_ids``.

        Returns:
            The linear head's output on the dropout-regularised
            ``pooler_output`` of the encoder (raw logits, no sigmoid).
        """
        encoded = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        pooled = self.l2(encoded.pooler_output)
        return self.l3(pooled)

In [None]:
def loss_fn(outputs, targets):
    """Binary cross-entropy computed on raw logits, averaged over all elements.

    Args:
        outputs: Raw (pre-sigmoid) scores.
        targets: Float tensor of the same shape as ``outputs``.

    Returns:
        A scalar tensor holding the mean loss.
    """
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)

In [None]:
def validation(model, testing_loader, device, total_batch_test):
    """Run ``model`` over ``testing_loader`` without gradients and collect results.

    The model is switched to eval mode and left in that mode. Scores are
    returned exactly as the model produced them (no sigmoid is applied).

    Args:
        model: Callable as ``model(ids, mask, token_type_ids)``.
        testing_loader: Iterable of batches; each batch is indexed with the
            keys ``'ids'``, ``'mask'``, ``'token_type_ids'`` and ``'targets'``.
        device: Device every batch tensor is moved to.
        total_batch_test: Passed to ``tqdm`` as ``total`` for the progress bar.

    Returns:
        ``(outputs, targets)`` as two Python lists, each extended with the
        rows of every batch in iteration order.
    """
    model.eval()

    collected_scores = []
    collected_labels = []

    with torch.no_grad():
        for _step, batch in tqdm(enumerate(testing_loader), total=total_batch_test):
            # Model inputs
            input_ids = batch['ids'].to(device, dtype=torch.long)
            attention_mask = batch['mask'].to(device, dtype=torch.long)
            segment_ids = batch['token_type_ids'].to(device, dtype=torch.long)

            # Ground truth
            labels = batch['targets'].to(device, dtype=torch.float)

            # Raw model scores
            scores = model(input_ids, attention_mask, segment_ids)

            collected_labels += labels.detach().cpu().tolist()
            collected_scores += scores.detach().cpu().tolist()

    return collected_scores, collected_labels

In [None]:
def train(model, optimizer, training_loader, testing_loader, epoch, device, total_batch_train, total_batch_test):
    """Train ``model`` for one pass over ``training_loader`` and report losses.

    After the pass, the train loss is recomputed with ``loss_fn`` over all
    outputs recorded during the epoch (captured before each optimizer step),
    and the test loss is computed from what ``validation`` returns for
    ``testing_loader``.

    Args:
        model: Callable as ``model(ids, mask, token_type_ids)``.
        optimizer: Optimizer over the model's parameters.
        training_loader: Iterable of batches; each batch is indexed with the
            keys ``'ids'``, ``'mask'``, ``'token_type_ids'`` and ``'targets'``.
        testing_loader: Loader forwarded to ``validation``.
        epoch: Epoch label, used only in the printed summary.
        device: Device every batch tensor is moved to.
        total_batch_train: Passed to ``tqdm`` as ``total`` for the training bar.
        total_batch_test: Forwarded to ``validation``.

    Returns:
        ``(model, (loss_train, loss_test))`` with both losses as Python floats.
    """
    model.train()

    epoch_targets = []
    epoch_outputs = []

    for _, data in tqdm(enumerate(training_loader, 0), total=total_batch_train):
        # X
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)

        # y
        targets = data['targets'].to(device, dtype=torch.float)

        # Clear gradients left over from the previous step before computing new ones.
        optimizer.zero_grad()

        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)

        # Keep detached copies so the epoch-level loss can be computed afterwards.
        epoch_targets.extend(targets.detach().cpu().tolist())
        epoch_outputs.extend(outputs.detach().cpu().tolist())

        loss.backward()
        optimizer.step()

    # loss_fn takes (outputs, targets) in that order; BCE-with-logits is not
    # symmetric, so passing them swapped yields a meaningless number.
    loss_train = loss_fn(torch.tensor(epoch_outputs), torch.tensor(epoch_targets)).item()

    fin_outputs, fin_targets = validation(model, testing_loader, device, total_batch_test)
    loss_test = loss_fn(torch.tensor(fin_outputs), torch.tensor(fin_targets)).item()

    print(f'epoch: {epoch}, loss train:  {loss_train}, loss test: {loss_test}')

    return model, (loss_train, loss_test)

In [None]:
def evaluate(model, testing_loader, device, total_batch_test, mlb, threshold=0.5):
    """Print a per-class classification report for ``model`` on ``testing_loader``.

    ``validation`` returns the model's raw logits, so they are passed through
    a sigmoid before being compared with ``threshold``. Thresholding the
    logits themselves at 0.5 would correspond to a probability cut-off of
    roughly 0.62 rather than 0.5.

    Args:
        model: Model forwarded to ``validation``.
        testing_loader: Loader forwarded to ``validation``.
        device: Device forwarded to ``validation``.
        total_batch_test: Forwarded to ``validation``.
        mlb: Object exposing ``classes_``; used as the report's target names.
        threshold: Probability at or above which a label counts as predicted.

    Returns:
        None. The report is printed.
    """
    logits, targets = validation(model, testing_loader, device, total_batch_test)

    # Convert logits to probabilities, then binarise.
    probabilities = torch.sigmoid(torch.tensor(logits, dtype=torch.float64)).numpy()
    predictions = probabilities >= threshold

    print(classification_report(targets, predictions, target_names=mlb.classes_))

## References

[1] https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html

[2] https://huggingface.co/hfl/chinese-bert-wwm <br>
[3] https://huggingface.co/hfl/chinese-bert-wwm-ext <br>

[4] https://huggingface.co/hfl/chinese-roberta-wwm-ext <br>
[5] https://huggingface.co/hfl/chinese-roberta-wwm-ext-large <br>

[6] https://huggingface.co/hfl/chinese-macbert-base

[7] https://huggingface.co/ckiplab/bert-base-chinese <br>
[8] https://huggingface.co/ckiplab/albert-tiny-chinese