In [None]:
# Show the GPU(s) visible to this runtime before doing any work.
!nvidia-smi

In [None]:
# Mount Google Drive at /content/drive; the data and checkpoint paths in `args` live under it.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

## Environment

In [None]:
# Install the third-party packages used by this notebook.
# NOTE(review): no versions are pinned, so a fresh runtime may resolve releases newer than
# the ones this code was written against — consider pinning once known-good versions are confirmed.
# NOTE(review): pytorch_pretrained_bert is not imported by any visible cell — confirm it is still needed.
!pip3 install transformers
!pip3 install vncorenlp
!pip3 install fairseq
!pip install fastBPE
!pip install pytorch_pretrained_bert

In [None]:
# Download and unpack the PhoBERT base archive; `args` below points at dict.txt, config.json,
# model.bin and bpe.codes inside the extracted PhoBERT_base_transformers/ folder.
!wget https://public.vinai.io/PhoBERT_base_transformers.tar.gz
!tar -xzvf PhoBERT_base_transformers.tar.gz
# Fetch the VnCoreNLP jar and its word-segmenter model files, then arrange them as
# vncorenlp/VnCoreNLP-1.1.1.jar and vncorenlp/models/wordsegmenter/{vi-vocab,wordsegmenter.rdr}.
!mkdir -p vncorenlp/models/wordsegmenter
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/wordsegmenter.rdr
!mv VnCoreNLP-1.1.1.jar vncorenlp/ 
!mv vi-vocab vncorenlp/models/wordsegmenter/
!mv wordsegmenter.rdr vncorenlp/models/wordsegmenter/

## Config

In [None]:

import pandas as pd
from tqdm import tqdm
tqdm.pandas()
from torch import nn
import json
import numpy as np
import pickle
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from transformers import *
import torch
import matplotlib.pyplot as plt
import torch.utils.data
import torch.nn.functional as F
import argparse
from transformers.modeling_utils import *
from fairseq.data.encoders.fastbpe import fastBPE
from fairseq.data import Dictionary
from vncorenlp import VnCoreNLP
from scipy.special import softmax
import time
from torch.utils.data import Dataset

In [None]:
# Central run configuration. Every path and hyper-parameter the later cells read comes from here.
args = argparse.Namespace(
    # CSV read by the data-loading cell; it is filtered on a `split` column into train/val.
    train_path='/content/drive/My Drive/Project2/data/preprocessing/train.csv',
    test_path='/content/drive/My Drive/Project2/data/preprocessing/test.csv',
    # Files from the extracted PhoBERT archive.
    dict_path='/content/PhoBERT_base_transformers/dict.txt',
    config_path='/content/PhoBERT_base_transformers/config.json',
    # NOTE(review): not referenced by any visible cell.
    rdrsegmenter_path='/content/vncorenlp/VnCoreNLP-1.1.1.jar',
    pretrained_path='/content/PhoBERT_base_transformers/model.bin',
    # Every sample is padded or truncated to this many token ids.
    max_sequence_length=350,
    batch_size=16,
    # Number of batches whose gradients are accumulated per optimizer step.
    accumulation_steps=5,
    # NOTE(review): the training cell overwrites this with 6 after the LR schedule was sized from it.
    epochs=5,
    # NOTE(review): `fold` and `seed` are not referenced by visible cells (seeding uses the literal 1337).
    fold=5,
    seed=69,
    lr=3e-5,
    # Consecutive non-improving epochs tolerated before early stopping.
    early_stop_max_epochs=3,
    ckpt_path='/content/drive/My Drive/Project2/models',
    # presumably consumed by fastBPE(args=args) — TODO confirm
    bpe_codes='/content/PhoBERT_base_transformers/bpe.codes',
    # Used only to name the train-state JSON file.
    version=3.2
)

## Extract features

In [None]:
def max_sequence(data_df):
    """Return the largest token count found in the ``text`` column.

    Tokens are obtained by splitting each text on a single space character,
    so consecutive spaces produce empty tokens that are counted too.

    Args:
        data_df: object exposing ``.text.values`` as an iterable of strings.

    Returns:
        int: the maximum ``len(text.split(' '))``, or 0 when there are no rows.
    """
    token_counts = (len(sentence.split(' ')) for sentence in data_df.text.values)
    return max(token_counts, default=0)

In [None]:
from collections import defaultdict
def split_by_label(data_df, train_split=0.8):
    """Split a frame into train/validation parts, label by label.

    Rows are grouped on their ``label`` value. Each group is shuffled in place
    with ``np.random.shuffle`` (so the result depends on NumPy's global random
    state) and the first ``int(train_split * group_size)`` rows go to the
    training part, the remainder to validation. Groups are processed in sorted
    label order.

    Args:
        data_df: DataFrame with a ``label`` column.
        train_split: fraction of every label group assigned to training.

    Returns:
        tuple: ``(train_df, val_df)`` DataFrames rebuilt from the row dicts.
    """
    rows_per_label = defaultdict(list)
    for _, record in data_df.iterrows():
        rows_per_label[record.label].append(record.to_dict())

    train_rows, val_rows = [], []
    for label_value in sorted(rows_per_label):
        rows = rows_per_label[label_value]
        np.random.shuffle(rows)
        cut = int(train_split * len(rows))
        train_rows += rows[:cut]
        val_rows += rows[cut:]

    return pd.DataFrame(train_rows), pd.DataFrame(val_rows)


In [None]:
# Seed NumPy and PyTorch (CPU and current CUDA device) and ask cuDNN for deterministic kernels.
# NOTE(review): the literal 1337 is used here; `args.seed` is not read.


np.random.seed(1337)
torch.manual_seed(1337)
torch.cuda.manual_seed(1337)
torch.backends.cudnn.deterministic = True

# (The BPE dictionary itself is loaded inside TextDataset, not in this cell.)


# Load the training CSV and split it using its own `split` column ('train' / 'val').
data_df = pd.read_csv(args.train_path)
train_df = data_df[data_df.split == 'train']
val_df = data_df[data_df.split == 'val']
# The number of classes is taken from the training rows only.
num_labels = len(set(train_df.label.values))
# The sequence length stays at the configured value; deriving it from the data is disabled:
# args.max_sequence_length = max_sequence(train_df)
print(args.max_sequence_length)
test_df = pd.read_csv(args.test_path)

In [None]:
class LabelEncode(object):
    """Two-way mapping between class labels and contiguous integer ids.

    The labels are de-duplicated and sorted, and ids are assigned in that
    sorted order starting from 0, so the mapping is deterministic for a given
    label set.
    """

    def __init__(self, label=None):
        """Build the mapping.

        Args:
            label: iterable of (hashable, mutually sortable) labels; duplicates
                are ignored. ``None`` or an empty iterable gives an empty mapping.
        """
        # `set(None)` raises TypeError, so the documented default needs its own branch.
        unique_labels = set(label) if label is not None else set()
        self.token_to_idx = {y: i for i, y in enumerate(sorted(unique_labels))}
        self.idx_to_token = {i: w for w, i in self.token_to_idx.items()}

    def lookup_token(self, token):
        """Return the integer id of ``token``; raises KeyError for an unknown label."""
        return self.token_to_idx[token]

    def lookup_index(self, index):
        """Return the label stored under ``index``.

        Raises:
            KeyError: if ``index`` is not a known id.
        """
        if index not in self.idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self.idx_to_token[index]

    def transform(self, y_pred):
        """Map an iterable of integer ids back to their labels, preserving order."""
        return [self.lookup_index(y) for y in y_pred]

    def __len__(self):
        """Number of distinct labels."""
        return len(self.token_to_idx)

In [None]:
class TextDataset(Dataset):
    """Dataset yielding ``(token_ids, label_id)`` pairs from a DataFrame.

    Each text is wrapped in ``<s> ... </s>``, BPE-encoded, mapped to ids with
    the dictionary loaded from ``args.dict_path`` and padded or truncated to
    exactly ``args.max_sequence_length`` ids.
    """

    def __init__(self, data_df, args, labelencode):
        """
        Args:
            data_df: frame whose rows expose ``text`` and ``label``.
            args: namespace handed to ``fastBPE`` and providing ``dict_path``
                and ``max_sequence_length``.
            labelencode: object with ``lookup_token(label) -> int``.
        """
        self.bpe = fastBPE(args=args)
        self.data_df = data_df
        self.vocab = Dictionary()
        self.vocab.add_from_file(args.dict_path)
        self.labelencode = labelencode
        self.max_len = args.max_sequence_length

    def vectorize(self, text):
        """Turn ``text`` into a NumPy array of exactly ``self.max_len`` token ids.

        Sequences longer than the limit are cut and their last id is replaced by
        the end-of-sequence id (2); shorter ones are right-padded with the pad id (1).
        """
        eos_id, pad_id = 2, 1
        limit = self.max_len
        pieces = self.bpe.encode('<s> ' + text + ' </s>')
        ids = self.vocab.encode_line(pieces, append_eos=False, add_if_not_exist=False).long().tolist()
        missing = limit - len(ids)
        if missing < 0:
            # Too long: keep the first `limit` ids and force a closing </s>.
            ids = ids[:limit]
            ids[-1] = eos_id
        else:
            ids = ids + [pad_id] * missing
        return np.array(ids)

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data_df)

    def __getitem__(self, index):
        """Return ``(token_id_array, label_id)`` for the row at position ``index``."""
        sample = self.data_df.iloc[index]
        return self.vectorize(sample.text), self.labelencode.lookup_token(sample.label)

In [None]:
# Build the label <-> id mapping from the training labels only, then wrap every split in a
# TextDataset that shares that mapping.
# NOTE(review): lookup_token indexes the mapping directly, so a validation/test label that never
# occurs in train_df raises KeyError when that row is fetched.
labelencode = LabelEncode(train_df.label.tolist())
train_dataset = TextDataset(train_df, args, labelencode)
valid_dataset = TextDataset(val_df, args, labelencode)
test_dataset = TextDataset(test_df, args, labelencode)


## Train

In [None]:
import json
def make_train_state(args):
    """Create the bookkeeping dict used to track training and early stopping.

    Args:
        args: namespace providing ``early_stop_max_epochs``, ``ckpt_path`` and ``lr``.

    Returns:
        dict: fresh state with empty loss histories, zeroed counters, the
        checkpoint file name under ``args.ckpt_path`` and an initial best
        validation loss of 1e8.
    """
    state = dict(
        stop_early=False,
        early_stop_num_epoch=0,
        early_stop_max_epochs=args.early_stop_max_epochs,
        early_stop_best_val_loss=1e8,
        epoch_index=0,
        model_filename=args.ckpt_path + "/P2_phobert.bin",
        learning_rate=args.lr,
        train_loss=[],
        val_loss=[],
        test_loss=0,
        test_acc=0,
    )
    return state

def update_train_state(model, train_state):
    """Checkpoint the model when validation improves and update early-stopping state.

    * At ``epoch_index == 0`` the model is always saved. If a validation loss has
      already been recorded, it becomes the baseline "best" loss, so that a worse
      later epoch cannot overwrite this checkpoint.
    * At later epochs the newest ``val_loss`` entry is compared with the best one:
      an improvement saves the model and resets the patience counter, otherwise the
      counter is incremented. ``stop_early`` is set once the counter reaches
      ``early_stop_max_epochs``.

    The caller is responsible for keeping ``train_state['epoch_index']`` current.

    Args:
        model: module whose ``state_dict()`` is written to ``train_state['model_filename']``.
        train_state: dict as produced by ``make_train_state``; mutated in place.

    Returns:
        dict: the same ``train_state`` object.
    """
    val_history = train_state['val_loss']

    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        if val_history:
            # Record the first epoch as the baseline to beat.
            train_state['early_stop_best_val_loss'] = val_history[-1]
        return train_state

    loss_t = val_history[-1]
    if loss_t < train_state['early_stop_best_val_loss']:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['early_stop_num_epoch'] = 0
        train_state['early_stop_best_val_loss'] = loss_t
    else:
        train_state['early_stop_num_epoch'] += 1

    if train_state['early_stop_num_epoch'] >= train_state['early_stop_max_epochs']:
        train_state['stop_early'] = True

    return train_state
  
def save_train_state(train_state):
    """Write ``train_state`` as JSON to ``train_state_<args.version>.json``.

    The file is created in the current working directory; the version comes
    from the notebook-level ``args`` namespace.
    """
    target_name = 'train_state_{}.json'.format(args.version)
    with open(target_name, 'w') as handle:
        json.dump(train_state, handle)


In [None]:
def score(y_truth, y_preds):
    """Compute accuracy and macro-averaged F1 / precision / recall.

    Args:
        y_truth: true class ids.
        y_preds: 2-D array of per-class scores; a softmax is applied along
            axis 1 and the arg-max of each row is taken as the prediction.

    Returns:
        tuple: ``(accuracy, f1, precision, recall)``.
    """
    probabilities = softmax(y_preds, axis=1)
    predicted = np.argmax(probabilities, axis=1)
    macro = {'average': 'macro'}
    return (
        accuracy_score(y_truth, predicted),
        f1_score(y_truth, predicted, **macro),
        precision_score(y_truth, predicted, **macro),
        recall_score(y_truth, predicted, **macro),
    )

In [None]:

import torch
from torch import nn
from transformers import *
from transformers import RobertaConfig


class RobertaForClassification(BertPreTrainedModel):
    """RoBERTa encoder with a linear classification head.

    The head consumes the first-position vectors of the last four entries of
    the encoder's third output, concatenated along the feature axis (hence the
    ``4 * hidden_size`` input width).
    NOTE(review): that third output is presumably the tuple of per-layer hidden
    states, which requires ``config.output_hidden_states=True`` — confirm
    against the installed transformers version.
    """
    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.roberta = RobertaModel(config)
        # Attribute name kept as-is: it is part of the state-dict keys of saved checkpoints.
        self.qa_outputs = nn.Linear(4 * config.hidden_size, self.num_labels)

        self.init_weights()

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
                start_positions=None, end_positions=None):
        """Return the classification logits for ``input_ids``.

        ``token_type_ids``, ``start_positions`` and ``end_positions`` are accepted
        but not used; ``token_type_ids`` is deliberately not forwarded to the encoder.
        """
        encoder_out = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
        )
        layer_states = encoder_out[2]
        # First-position vector from each of the last four entries, last entry first.
        first_token_vectors = [layer_states[-depth][:, 0, ...] for depth in (1, 2, 3, 4)]
        return self.qa_outputs(torch.cat(first_token_vectors, -1))

In [None]:
def compute_accuracy(y_pred, y_target):
    """Fraction of rows whose arg-max over axis 1 equals the target id.

    Args:
        y_pred: 2-D array of per-class scores, one row per sample.
        y_target: array of true class ids, one per sample.

    Returns:
        float: number of correct rows divided by the number of rows.
    """
    predicted = np.argmax(y_pred, axis=1)
    hits = np.equal(predicted, y_target)
    return hits.sum().item() / len(predicted)

# Train

In [None]:
# Build the classifier on top of the pretrained weights and move it to the GPU.
# output_hidden_states=True is required because the head reads per-layer outputs.
config = RobertaConfig.from_pretrained(
    args.config_path, output_hidden_states=True, num_labels=num_labels
)
model_bert = RobertaForClassification.from_pretrained(args.pretrained_path, config=config)
model_bert.cuda()

# `tsfm` is a handle on the encoder sub-module, used later to freeze/unfreeze it.
if not torch.cuda.device_count():
    tsfm = model_bert.roberta
else:
    print(f"Training using {torch.cuda.device_count()} gpus")
    model_bert = nn.DataParallel(model_bert)
    tsfm = model_bert.module.roberta

In [None]:
# Create the optimizer and the learning-rate schedulers.
param_optimizer = list(model_bert.named_parameters())
# Biases and LayerNorm parameters are excluded from weight decay.
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
# Total optimizer steps = epochs * batches-per-epoch / accumulation steps.
# NOTE(review): this uses args.epochs as it is when this cell runs; the training cell later sets
# args.epochs = 6, so the linear schedule is sized for fewer epochs than are trained — confirm intent.
num_train_optimization_steps = int(args.epochs*len(train_df)/args.batch_size/args.accumulation_steps)
optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
# `scheduler` (linear warm-up/decay) and `scheduler0` (constant LR) are both stepped by the training cell.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=num_train_optimization_steps)  # PyTorch scheduler
scheduler0 = get_constant_schedule(optimizer)  # PyTorch scheduler
loss_f = nn.CrossEntropyLoss()

# makedirs also creates missing parent folders and does not fail if the folder already exists,
# whereas os.mkdir raised when e.g. '.../Project2' was absent.
os.makedirs(args.ckpt_path, exist_ok=True)


In [None]:
# Training loop: epoch 0 trains only the classification head (encoder frozen, constant LR);
# from epoch 1 on the whole network is trained with the linear warm-up/decay schedule.
# NOTE(review): args.epochs is overridden here after the LR schedule was sized in the optimizer
# cell, so the linear schedule may reach zero before the last epoch — confirm intent.
args.epochs = 6
print(args.max_sequence_length)
train_state = make_train_state(args)

# Id used for right-padding in TextDataset.vectorize. The attention mask must hide exactly these
# positions; the previous `ids > 0` test hid the leading <s> token (id 0) and attended to padding.
pad_id = 1

# Freeze the encoder for the first epoch.
for child in tsfm.children():
    for param in child.parameters():
        if not param.requires_grad:
            print("whoopsies")
        param.requires_grad = False
frozen = True

# The loaders do not change between epochs; shuffle=True still reshuffles on every iteration.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=args.batch_size, shuffle=False)
n_train_batches = len(train_loader)

for epoch in range(args.epochs):
    start_time = time.time()
    # update_train_state branches on this value; without it, early stopping never triggers.
    train_state['epoch_index'] = epoch

    if epoch > 0 and frozen:
        # Unfreeze the encoder once the head has had one epoch of training.
        # (scheduler0 is intentionally not deleted, so this cell can be re-run.)
        for child in tsfm.children():
            for param in child.parameters():
                param.requires_grad = True
        frozen = False
        torch.cuda.empty_cache()

    # ---- train ----
    model_bert.train()
    running_loss = 0
    running_acc = 0
    optimizer.zero_grad()
    for i, (x_batch, y_batch) in enumerate(train_loader):
        input_ids = x_batch.long().cuda()
        targets = y_batch.long().cuda()
        y_pred = model_bert(input_ids, attention_mask=(input_ids != pad_id))
        loss = F.cross_entropy(y_pred, targets)
        # Scale so the accumulated gradient is an average over the accumulated batches.
        (loss / args.accumulation_steps).backward()
        # Step after every `accumulation_steps` batches (and on the final, possibly partial, group).
        if (i + 1) % args.accumulation_steps == 0 or i == n_train_batches - 1:
            optimizer.step()
            optimizer.zero_grad()
            if not frozen:
                scheduler.step()
            else:
                scheduler0.step()
        running_loss += (loss.item() - running_loss) / (i + 1)
        # No .squeeze(): it would drop the batch axis when the last batch holds a single sample.
        acc_t = compute_accuracy(y_pred.detach().cpu().numpy(), y_batch.numpy())
        running_acc += (acc_t - running_acc) / (i + 1)
        if i % 100 == 0:
            print("Train: epoch {} step {} - {}".format(epoch, i, running_acc))
        del input_ids, targets, y_pred, loss
    # Store the mean training loss (previously the running accuracy was stored under this key).
    train_state['train_loss'].append(running_loss)

    # ---- validate ----
    model_bert.eval()
    running_loss = 0
    running_acc = 0
    # A bare `torch.no_grad` expression does nothing; the context manager actually disables autograd.
    with torch.no_grad():
        for i, (x_batch, y_batch) in enumerate(valid_loader):
            input_ids = x_batch.long().cuda()
            targets = y_batch.long().cuda()
            y_pred = model_bert(input_ids, attention_mask=(input_ids != pad_id))
            loss_t = F.cross_entropy(y_pred, targets).item()
            running_loss += (loss_t - running_loss) / (i + 1)
            acc_t = compute_accuracy(y_pred.cpu().numpy(), y_batch.numpy())
            running_acc += (acc_t - running_acc) / (i + 1)
            del input_ids, targets, y_pred
    train_state['val_loss'].append(running_loss)
    train_state = update_train_state(model_bert, train_state)
    print("Val: epoch", epoch, f"    loss={running_loss:.4f}", f"    acc={running_acc:.4f}", "    time({}s)".format(time.time() - start_time))
    torch.cuda.empty_cache()
    if train_state['stop_early']:
        print("Stop early!")
        break

## Save model

In [None]:

# Pickle the entire in-memory model object (not just its state dict) to P2_phobert.pth.
# NOTE(review): this is the model as it stands when this cell runs, which is not necessarily the
# best-validation weights that update_train_state wrote to P2_phobert.bin — confirm which is wanted.
torch.save(model_bert, args.ckpt_path + '/P2_phobert.pth')

# Evaluate

In [None]:
import matplotlib.pyplot as plt
import itertools
import numpy as np
from sklearn.metrics import confusion_matrix
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True,
                          results_path='/content/drive/My Drive/Work/IC/results'):
    """Draw a confusion matrix as an annotated heat-map and save it as a PNG.

    Args:
        cm: square array of counts (rows = true class, columns = predicted class).
        target_names: tick labels for both axes, or ``None`` for no labels.
        title: figure title.
        cmap: matplotlib colormap; defaults to ``'Blues'``.
        normalize: if true, every row is divided by its sum so cells show the
            per-true-class rate; rows without samples stay at 0 instead of NaN.
        results_path: output directory, created (with parents) when missing.

    The overall accuracy shown under the x-axis is always computed from the raw
    counts. The heat-map colours are drawn from the same values that are
    written into the cells, so text contrast matches the background.
    The file is saved as ``normalize_confusion_matrix.png`` or
    ``confusion_matrix.png`` depending on ``normalize``.
    """
    cm = np.asarray(cm)
    accuracy = np.trace(cm) / float(np.sum(cm))
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Divide only where the row has samples; empty rows keep zeros.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=90)
        plt.yticks(tick_marks, target_names)

    # Cells above the threshold get white text so it stays readable on dark cells.
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    cell_format = "{:0.2f}" if normalize else "{:,}"
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.2f}; misclass={:0.2f}'.format(
        accuracy, misclass))

    # os.mkdir fails when a parent folder is missing; makedirs creates the whole chain.
    os.makedirs(results_path, exist_ok=True)
    if normalize:
        plt.savefig(results_path + '/normalize_confusion_matrix.png')
    else:
        plt.savefig(results_path + '/confusion_matrix.png')

In [None]:
# Reload the pickled model for evaluation.
config = RobertaConfig.from_pretrained(
    args.config_path,
    output_hidden_states=True,
    num_labels=num_labels
)
# Same file the "Save model" cell writes; derived from args so save and load cannot drift apart.
model_path = args.ckpt_path + '/P2_phobert.pth'
# Load once: the file holds the full model object, and a second torch.load of it only
# duplicated the weights in memory.
model_bert = torch.load(model_path)
# Inference only: no gradients needed, and eval() disables dropout.
for param in model_bert.parameters():
    param.requires_grad = False
model_bert.eval()
device = torch.device("cuda")
model_bert = model_bert.to(device)

In [None]:
# Run the model over the test set and collect predicted / true class ids.
model_bert.eval()
# Padding id used by TextDataset.vectorize; the attention mask must hide exactly these positions
# (`ids > 0` hid the leading <s> token, id 0, and attended to padding).
# NOTE(review): use the same mask definition here as during training.
pad_id = 1
pred_chunks = []
truth_chunks = []
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False)
# A bare `torch.no_grad` expression does nothing; the context manager disables autograd.
with torch.no_grad():
    for i, (x_batch, y_batch) in enumerate(test_loader):
        input_ids = x_batch.long().cuda()
        logits = model_bert(input_ids, attention_mask=(input_ids != pad_id))
        # argmax over the class axis; no .squeeze(), which would break a final batch of size 1,
        # and no softmax, which does not change the argmax.
        pred_chunks.append(np.argmax(logits.cpu().numpy(), axis=1))
        truth_chunks.append(y_batch.numpy())
# Concatenate once at the end instead of growing arrays inside the loop.
test_preds = np.concatenate(pred_chunks, axis=0) if pred_chunks else np.array([], dtype=np.int64)
test_truth = np.concatenate(truth_chunks, axis=0) if truth_chunks else np.array([], dtype=np.int64)
acc = accuracy_score(test_truth, test_preds)
print("acc: ", acc)

In [None]:
# (Removed a stray, incomplete expression `y_` that raised NameError on Restart & Run All.)

In [None]:
# Distinct labels and ground-truth labels of the test set, in their original (string) form.
classes = set(test_df.label.values)
y_labels = test_df.label.values
# Map predicted class ids back to labels with the encoder used to build the datasets
# (the previously called `label(...)` is not defined anywhere in this notebook).
# Note: this overwrites `test_preds`, so re-run the test loop before re-running this cell.
test_preds = [labelencode.lookup_index(int(class_id)) for class_id in test_preds]

In [None]:
# Fix the class order (sorted) and print predictions, ground truth and class list, one per line.
classes = sorted(classes)
print(test_preds, y_labels, classes, sep='\n')

In [None]:
# Confusion matrix over the sorted class list, saved once row-normalised and once as raw counts.
cm = confusion_matrix(y_labels, test_preds, labels=classes)
plot_confusion_matrix(cm, classes, "Confusion Matrix IC(normalize)", normalize=True)
plot_confusion_matrix(cm, classes, "Confusion Matrix IC", normalize=False)

# Inference

In [None]:
def convert_text(text, vocab, bpe, max_sequence_length):
    """Encode one text as a ``(1, max_sequence_length)`` array of token ids.

    The text is wrapped in ``<s> ... </s>``, BPE-encoded with ``bpe`` and mapped
    to ids with ``vocab``. Longer sequences are truncated and their last id is
    replaced by the end-of-sequence id (2); shorter ones are right-padded with
    the pad id (1).

    Args:
        text: input string.
        vocab: object with ``encode_line(line, append_eos=..., add_if_not_exist=...)``
            returning a tensor of ids.
        bpe: object with ``encode(str) -> str``.
        max_sequence_length: exact output length; must be at least 1.

    Returns:
        numpy.ndarray: integer array of shape ``(1, max_sequence_length)``.

    Raises:
        ValueError: if ``max_sequence_length`` is smaller than 1.
    """
    if max_sequence_length < 1:
        raise ValueError("max_sequence_length must be >= 1, got %r" % (max_sequence_length,))
    eos_id = 2
    pad_id = 1
    subwords = bpe.encode('<s> ' + text + ' </s>')
    input_ids = vocab.encode_line(subwords, append_eos=False, add_if_not_exist=False).long().tolist()
    if len(input_ids) > max_sequence_length:
        input_ids = input_ids[:max_sequence_length]
        input_ids[-1] = eos_id  # keep a closing </s> after truncation
    else:
        input_ids = input_ids + [pad_id] * (max_sequence_length - len(input_ids))
    # Wrapped in an extra list to add the leading batch axis of size 1.
    return np.array([input_ids])


In [None]:
# Sample input string for the inference cells below.
text = "Cho mình xem xe ăn dăm với!"

In [None]:
# x = convert_text(text, vocab, bpe,args.max_sequence_length)
# print(x)

In [None]:
def predict(text):
    """Classify a single text and return its label.

    Uses the notebook-level ``model_bert``, ``args``, ``labelencode`` and the
    BPE encoder / dictionary already loaded by ``train_dataset`` (the
    previously referenced ``vocab``, ``bpe`` and ``encoder`` are not defined
    anywhere in this notebook).

    NOTE(review): the text is encoded exactly as given; if the training CSVs
    were word-segmented beforehand, apply the same preprocessing here — confirm.

    Args:
        text: input string.

    Returns:
        The label whose id has the highest logit.
    """
    pad_id = 1  # id used for right-padding by convert_text
    x = convert_text(text, train_dataset.vocab, train_dataset.bpe, args.max_sequence_length)
    x = torch.tensor(x, dtype=torch.long).cuda()
    model_bert.eval()
    # A bare `torch.no_grad` expression does nothing; the context manager disables autograd.
    with torch.no_grad():
        # Mask out padding only; `x > 0` would also hide the leading <s> token (id 0).
        logits = model_bert(x, attention_mask=(x != pad_id))
    # argmax of the logits equals argmax of their softmax, without the overflow risk of exp().
    class_id = int(logits.squeeze(0).argmax().item())
    return labelencode.lookup_index(class_id)

In [None]:
# Run predict() on a short example and print "input --> predicted label".
text = "inbox cho minh"
print(text, "-->", predict(text))

In [None]:
model = 