In [None]:
!pip install transformers



In [None]:
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.metrics import roc_auc_score, accuracy_score
import torch.optim as optim
import torch.nn.functional as F
import json
import tqdm

In [None]:
# Pick the compute device once for the whole notebook: GPU when CUDA is usable, CPU otherwise.
cuda = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Data Loading

In [None]:
# Load the VNTC corpus; later cells read its 'data', 'target' and 'target_names' entries.
# The encoding is pinned to UTF-8 so the Vietnamese text decodes the same way
# regardless of the runtime's locale default.
with open("/content/drive/MyDrive/Fine-Tune-Bert/TextClassification/Data/datasetVNTC.json", encoding="utf-8") as json_file:
  data = json.load(json_file)

In [None]:
# Top-level keys of the loaded payload. Only the last expression of a cell is
# rendered, so this value is computed but not displayed.
data.keys()
# Class names; the recorded output below lists ten categories.
data['target_names']

['chính trị xã hội',
 'khoa học',
 'kinh doanh',
 'pháp luật',
 'sức khỏe',
 'thế giới',
 'thể thao',
 'vi tính',
 'văn hóa',
 'đời sống']

In [None]:
print(len(data['data']))

33759


In [None]:
class TextDataset(Dataset):
  """Map-style dataset that tokenizes one raw text per item, on the fly.

  Args:
    X: indexable collection of raw text strings.
    y: indexable collection of integer class ids, aligned with ``X``.
    tokenizer: Hugging Face tokenizer, invoked as ``tokenizer(text, ...)``.
    max_length: exact token length every example is padded or truncated to.

  Each item is a dict with the keys ``"ids"`` and ``"mask"`` (1-D tensors of
  length ``max_length``) and ``"target"`` (0-d ``torch.long`` tensor). All
  three tensors are placed on the notebook-level ``cuda`` device.

  NOTE(review): because tensors are created on ``cuda`` inside ``__getitem__``,
  DataLoader worker processes (``num_workers > 0``) are presumably not usable
  with this dataset -- confirm before enabling them.
  """

  def __init__(self, X, y, tokenizer, max_length):
    super().__init__()
    self.tokenizer = tokenizer
    self.max_length = max_length
    self.X = X
    self.y = y

  def __len__(self):
    """Number of examples (driven by the label collection)."""
    return len(self.y)

  def __getitem__(self, index):
    """Tokenize example ``index`` and return its ids, attention mask and label."""
    encoded = self.tokenizer(
        self.X[index],
        add_special_tokens=True,
        # Replaces the deprecated pad_to_max_length=True.
        padding="max_length",
        # Requested explicitly: without truncation a text longer than
        # max_length cannot be reshaped to the fixed size below.
        truncation=True,
        max_length=self.max_length,
        return_attention_mask=True,
        return_tensors="pt",
    )
    # return_tensors="pt" yields a leading batch dimension of 1; drop it.
    # No token_type_ids are returned: the recorded XLM-R tokenizer output in
    # this notebook contains only input_ids and attention_mask.
    ids = encoded["input_ids"].reshape(self.max_length)
    mask = encoded["attention_mask"].reshape(self.max_length)

    return {
        "ids": ids.to(cuda),
        "mask": mask.to(cuda),
        "target": torch.tensor(self.y[index], dtype=torch.long, device=cuda),
    }

In [None]:
% pip install sentencepiece



In [None]:
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

In [None]:
tokenizer.vocab_size

250002

In [None]:
result = tokenizer.encode_plus("Sinh viên trường Đại Học Bách Khoa Hà Nội Sinh viên trường Đại Học Bách Khoa Hà Nội",return_tensors="pt");

In [None]:
result

{'input_ids': tensor([[    0, 69729,  4603,  4373, 18832, 38635,   335,  5687, 67766,  8548,
          9435, 69729,  4603,  4373, 18832, 38635,   335,  5687, 67766,  8548,
          9435,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
result['input_ids']

tensor([[    0, 69729,  4603,  4373, 18832, 38635,   335,  5687, 67766,  8548,
          9435, 69729,  4603,  4373, 18832, 38635,   335,  5687, 67766,  8548,
          9435,     2]])

In [None]:
tokenizer.convert_ids_to_tokens(result['input_ids'].reshape(-1,))

In [None]:
# Positional hold-out split: the first 30,000 documents are used for training and
# everything from index 30,000 onward for testing. Both datasets share the
# tokenizer and a fixed sequence length of 256 tokens.
datatrain = TextDataset(X=data['data'][:30000], y=data['target'][:30000], tokenizer=tokenizer, max_length=256)
datatest = TextDataset(X=data['data'][30000:], y=data['target'][30000:], tokenizer=tokenizer, max_length=256)

In [None]:
# Training batches are reshuffled every epoch so consecutive batches do not follow
# the order of the source file. The test loader keeps its order: accuracy does not
# depend on it.
datatrainloader = DataLoader(dataset = datatrain, batch_size = 32, shuffle = True)
datatestloader = DataLoader(dataset = datatest, batch_size = 32)

# Build Model

In [None]:
# Pretrained XLM-R checkpoint with a sequence-classification head sized for 10 labels
# (the load log below reports the classifier weights as newly initialized).
# NOTE(review): problem_type is multi-label although each document carries a single
# class id; the training cell feeds one-hot float targets to match -- confirm this is intended.
xlm = XLMRobertaForSequenceClassification.from_pretrained(
    'xlm-roberta-base',
    num_labels=10,
    problem_type="multi_label_classification",
)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

In [None]:
# Module.to() moves the parameters and returns the module itself, so `model`
# and `xlm` refer to the same object from here on.
model = xlm.to(cuda)
optimizer = optim.Adam(params=model.parameters(), lr=1e-5)
# NOTE(review): no visible cell calls loss_function; the training cell uses the
# loss returned by the model instead.
loss_function = nn.CrossEntropyLoss()

In [None]:
import gc
gc.collect()

1429

In [None]:
# Fine-tune the classifier for a fixed number of epochs. After each epoch the summed
# batch loss is printed and recorded, and the weights are checkpointed to Drive.
model.train()
num_epochs = 5
training_loss = []  # summed batch loss of every finished epoch
for epoch in range(num_epochs):
  loop = tqdm.tqdm(enumerate(datatrainloader), leave=False, total=len(datatrainloader))
  total_train_loss = 0
  for batch, dl in loop:
    ids = dl['ids']
    mask = dl['mask']
    # The model is built with problem_type="multi_label_classification", so the
    # integer class ids are expanded to one-hot float targets.
    label = torch.nn.functional.one_hot(dl['target'], 10).float().to(cuda)

    optimizer.zero_grad()

    output = model(ids, attention_mask=mask,labels=label)

    # With labels supplied, the first output element is the loss.
    loss = output[0]

    total_train_loss = total_train_loss + loss.item()

    loss.backward()
    # Clip BEFORE the parameter update: clipping after optimizer.step() only touches
    # gradients that have already been applied and are zeroed on the next iteration.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()

    loop.set_description(f'Epoch={epoch + 1}/{num_epochs}')
    loop.set_postfix(loss=loss.item())
  print('Train loss:' ,total_train_loss)
  training_loss.append(total_train_loss)

  # Overwrite the checkpoint after every epoch so an interrupted session keeps the latest weights.
  torch.save(model.state_dict(), '/content/drive/MyDrive/Fine-Tune-Bert/TextClassification/XLM/XLM-RoBERTamodel.pt')
  



Train loss: 184.5717852972448




Train loss: 74.46954880841076




Train loss: 55.178049999289215




Train loss: 43.80275151692331




Train loss: 36.29323753528297


# Assess

In [None]:
# Restore the weights written by the training cell (same checkpoint path).
# map_location remaps the stored tensors onto the current device, so a checkpoint
# saved from a GPU session also loads on a CPU-only runtime.
# strict=False is kept as in the original; load_state_dict returns the missing /
# unexpected keys, which this cell displays as its output.
xlm.load_state_dict(
    torch.load('/content/drive/MyDrive/Fine-Tune-Bert/TextClassification/XLM/XLM-RoBERTamodel.pt', map_location=cuda),
    strict=False,
)

In [None]:
xlm = xlm.to(cuda)

In [None]:
import gc
gc.collect()

1209

In [None]:
def evaluate(xlm, datatestloader):
  """Compute and print the top-1 accuracy of ``xlm`` over ``datatestloader``.

  Args:
    xlm: classification model, called as ``xlm(ids, attention_mask=mask)``. Its
      output must expose the class scores as ``.logits`` or as its first element.
    datatestloader: sized iterable of batches; every batch is a dict holding the
      tensors ``'ids'``, ``'mask'`` and ``'target'`` (integer class ids).

  Returns:
    The value returned by ``accuracy_score`` (it is also printed).

  Raises:
    ValueError: if the loader yields no batches.
  """
  was_training = xlm.training
  # Inference mode: dropout must be off, otherwise predictions are noisy.
  xlm.eval()
  y_true, y_pred = [], []
  try:
    # No gradients are needed for scoring; skipping them saves memory.
    with torch.no_grad():
      loop = tqdm.tqdm(datatestloader, leave=False, total=len(datatestloader))
      for dl in loop:
        output = xlm(dl['ids'], attention_mask=dl['mask'])
        logits = output.logits if hasattr(output, 'logits') else output[0]
        # Reduce to class ids per batch so a smaller final batch cannot produce
        # a ragged nested list (which np.argmax(..., axis=2) cannot handle).
        y_pred.append(logits.argmax(dim=-1).cpu().numpy())
        y_true.append(dl['target'].cpu().numpy())
  finally:
    # Leave the model in whichever mode the caller had it.
    xlm.train(was_training)

  if not y_true:
    raise ValueError("datatestloader yielded no batches; nothing to evaluate")

  acc = accuracy_score(np.concatenate(y_true), np.concatenate(y_pred))
  print("accuracy : ",acc)
  return acc


In [None]:
 evaluate(model, datatestloader)

In [None]:
evaluate(xlm, datatrainloader)

In [None]:
# NOTE(review): in the visible cells `targets_list` and `predict` are assigned only
# inside evaluate(), where they are function locals. On a fresh kernel this cell would
# presumably raise NameError unless they are defined in a cell not shown here.
y_true = targets_list
# Index of the largest score along the third axis; presumably `predict` is nested as
# (batches, batch_size, classes) -- TODO confirm.
y_pred = np.argmax(predict, axis=2)


In [None]:
# Flatten the per-batch lists and print every true label, then every predicted label.
# NOTE(review): the recorded output below contains label ids up to 14, while this
# notebook configures 10 classes -- the stored output presumably comes from a different run.
print([item for sublist in y_true for item in sublist])
print([item for sublist in y_pred for item in sublist])

[8, 1, 9, 9, 14, 3, 5, 9, 6, 1, 4, 3, 6, 5, 11, 0, 12, 14, 2, 3, 9, 2, 7, 12, 2, 4, 1, 0, 4, 0, 1, 8, 0, 6, 4, 1, 3, 3, 8, 12, 10, 8, 7, 12, 1, 7, 7, 0, 3, 3, 12, 11, 1, 10, 0, 8, 3, 8, 11, 13, 6, 8, 2, 9, 12, 8, 5, 3, 2, 3, 7, 0, 13, 1, 0, 13, 3, 0, 2, 2, 12, 0, 4, 14, 11, 5, 10, 13, 2, 14, 4, 5, 11, 6, 14, 3, 5, 14, 3, 12, 10, 0, 0, 7, 12, 13, 7, 0, 0, 8, 5, 8, 7, 11, 14, 7, 14, 4, 2, 9, 2, 12, 3, 2, 13, 8, 8, 5, 6, 3, 5, 7, 12, 4, 3, 13, 7, 0, 12, 2, 12, 9, 8, 14, 13, 10, 7, 8, 6, 13, 13, 5, 0, 11, 7, 4, 11, 14, 6, 9, 6, 11, 8, 3, 3, 9, 8, 0, 12, 11, 9, 12, 9, 1, 8, 10, 6, 4, 3, 7, 3, 9, 4, 8, 6, 10, 14, 9, 0, 3, 0, 12, 8, 5, 2, 10, 7, 12, 2, 12, 2, 8, 3, 5, 14, 9, 13, 7, 0, 3, 2, 11, 10, 13, 5, 6, 5, 4, 5, 14, 0, 4, 2, 0, 11, 4, 5, 10, 3, 2, 7, 1, 13, 12, 13, 11, 13, 12, 8, 8, 0, 3, 3, 13, 4, 3, 4, 14, 9, 3, 8, 8, 9, 3, 0, 13, 6, 14, 0, 0, 5, 14, 11, 13, 7, 1, 12, 12, 7, 13, 4, 8, 14, 7, 13, 7, 6, 0, 13, 6, 9, 11, 8, 11, 12, 11, 1, 2, 3, 5, 0, 3, 4, 7, 0, 5, 12, 10, 11, 3, 3, 3, 5,

In [None]:
acc = accuracy_score([item for sublist in y_true for item in sublist], [item for sublist in y_pred for item in sublist])
print("accuracy : ",acc)

accuracy :  0.616


In [None]:
np.array(targets_list).shape

(250, 10)

In [None]:
y_pred.shape

(250,)

In [None]:
# Tokenize one sample sentence for a manual forward pass. Calling the tokenizer returns a
# dict-like encoding, which is what the following cells need (they index it by key and
# unpack it with **); tokenizer.encode would return only a tensor of ids.
# The encoding is moved to the device the model lives on.
result = tokenizer("Sinh viên trường Đại Học Bách Khoa Hà Nội Sinh viên trường Đại Học Bách Khoa Hà Nội", return_tensors="pt").to(cuda)

AttributeError: ignored

In [None]:
result

In [None]:
xlm(**result)

In [None]:
# Forward pass with explicit keywords; assumes `result` is the dict-like tokenizer encoding.
# The encoded tensors already carry a batch dimension (see the recorded tokenizer output
# earlier in the notebook), so they are passed as-is rather than re-wrapped in torch.tensor([...]).
# That recorded output has only input_ids and attention_mask, so token_type_ids is not passed.
o = xlm(input_ids=result['input_ids'].to(cuda), attention_mask=result['attention_mask'].to(cuda))

In [None]:
o

In [None]:
# Flatten the (1, seq_len) id tensor to a plain list of ints before mapping ids back to
# tokens, mirroring the earlier convert_ids_to_tokens cell that reshapes to 1-D first.
tokenizer.convert_ids_to_tokens(result['input_ids'].reshape(-1).tolist())