In [1]:
# NOTE(review): `imp` is deprecated (removed from the stdlib in Python 3.12) and is not
# referenced in the visible cells; kept so that any cell relying on it keeps working.
import imp
import os
import random
import re

import numpy as np
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import AutoModel, AutoTokenizer
from transformers import DataCollatorWithPadding
from vncorenlp import VnCoreNLP

# Project-local modules. The star imports stay last and in their original relative order,
# because a later star import silently overrides names bound by an earlier one.
from preprocessing import Preprocess
from model import CustomModelSoftmax
from metrics import *
from loss import *
from utils import *

# Reproducibility: feed one seed to every RNG used here (torch CPU/CUDA, NumPy, stdlib random).
seed = 42
for seed_rng in (
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
    np.random.seed,
    random.seed,
):
    seed_rng(seed)
# Disable cuDNN auto-tuning and request deterministic kernels.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Word segmenter: VnCoreNLP restricted to the "wseg" annotator, with a 500 MB JVM heap cap.
rdrsegmenter = VnCoreNLP(
    "vncorenlp/VnCoreNLP-1.1.1.jar",
    annotators="wseg",
    max_heap_size='-Xmx500m',
)

# PhoBERT tokenizer; local_files_only=True means it must already be available locally.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", local_files_only=True)

# Train / test CSV splits.
data_files = dict(
    train="../datasets/data_training/train_datasets.csv",
    test="../datasets/data_training/test_datasets.csv",
)
dataset = load_dataset('csv', data_files=data_files)

# Run the project preprocessing pipeline, which is given both the tokenizer and the segmenter.
preprocess = Preprocess(tokenizer, rdrsegmenter)
dataset = preprocess.run(dataset)


  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Using custom data configuration default-d4a9b86bc10de7c3
Reusing dataset csv (C:\Users\DELL\.cache\huggingface\datasets\csv\default-d4a9b86bc10de7c3\0.0.0\652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)
100%|██████████| 2/2 [00:00<00:00, 333.07it/s]
Loading cached processed dataset at C:\Users\DELL\.cache\huggingface\datasets\csv\default-d4a9b86bc10de7c3\0.0.0\652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a\cache-1c80317fa3b1799d.arrow
Loading cached processed dataset at C:\Users\DELL\.cache\huggingface\datasets\csv\default-d4a9b86bc10de7c3\0.0.0\652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a\cache-bdd640fb06671ad1.arrow
Loading cached processed dataset at C:\Users\DELL\.cache\huggingface\datasets\csv\default-d4a9b86bc10de7c3\0.0.0\652c3096f041ee27b04d2232d41f10547a8fe

In [2]:
# Display the preprocessed DatasetDict to check its splits, columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'Review', 'giai_tri', 'luu_tru', 'nha_hang', 'an_uong', 'di_chuyen', 'mua_sam', 'Segment', 'input_ids', 'token_type_ids', 'attention_mask', 'labels_regressor', 'labels_classifier'],
        num_rows: 4299
    })
    test: Dataset({
        features: ['Unnamed: 0', 'Review', 'giai_tri', 'luu_tru', 'nha_hang', 'an_uong', 'di_chuyen', 'mua_sam', 'Segment', 'input_ids', 'token_type_ids', 'attention_mask', 'labels_regressor', 'labels_classifier'],
        num_rows: 1074
    })
})

In [3]:
# Drop the CSV index, the text columns and the per-aspect columns so that only the
# tokenizer outputs and the two label columns remain.
columns_to_drop = [
    'Unnamed: 0',
    'Review',
    'giai_tri',
    'luu_tru',
    'nha_hang',
    'an_uong',
    'di_chuyen',
    'mua_sam',
    'Segment',
]
tokenized_datasets = dataset.remove_columns(columns_to_drop)
# Switch the output format to torch tensors (set_format modifies the object in place).
tokenized_datasets.set_format("torch")

In [4]:
# Display the pruned DatasetDict to confirm which columns are left after remove_columns.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels_regressor', 'labels_classifier'],
        num_rows: 4299
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels_regressor', 'labels_classifier'],
        num_rows: 1074
    })
})

In [5]:
# Display the training split on its own.
tokenized_datasets["train"]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels_regressor', 'labels_classifier'],
    num_rows: 4299
})

In [6]:
# Peek at the classifier label tensor of the training split.
tokenized_datasets["train"]["labels_classifier"]

tensor([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0],
        [0, 1, 0, 0, 0, 0],
        ...,
        [0, 1, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 1, 0]])

In [7]:



# Collator built from the tokenizer; it pads each batch when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Settings shared by both loaders.
loader_kwargs = dict(batch_size=32, collate_fn=data_collator)

# Only the training loader shuffles; the test loader uses DataLoader's default order.
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, **loader_kwargs)
test_dataloader = DataLoader(tokenized_datasets["test"], **loader_kwargs)

In [8]:
# Build the model from the PhoBERT checkpoint name and move it to the GPU when one is available.
model = CustomModelSoftmax("vinai/phobert-base")
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

from torch.optim import AdamW
from transformers import get_scheduler

# `transformers.AdamW` is deprecated in favour of `torch.optim.AdamW` (and is no longer shipped
# by newer transformers releases). The two differ in defaults, so eps and weight_decay are
# pinned to the values the transformers implementation used (1e-6 and 0.0) to keep the
# optimizer behaviour as close as possible to the original.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

num_epochs = 10
# The scheduler is stepped once per training batch, so the schedule spans all batches of all epochs.
num_training_steps = num_epochs * len(train_dataloader)
# Linear schedule with no warmup steps.
lr_scheduler = get_scheduler(
    'linear',
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)



In [9]:


# Fine-tuning loop: each epoch runs one pass over the training data followed by one
# evaluation pass over the test data. The weights with the best combined score so far
# are written to `checkpoint_path`.
checkpoint_path = "weights/model_softmax_vv1.pt"
# torch.save does not create missing parent directories. Create the folder up front so a
# missing directory cannot make the first checkpoint fail after a full epoch of training.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

num_train_batches = len(train_dataloader)
pb_train = tqdm(range(num_training_steps))
pb_test = tqdm(range(num_epochs * len(test_dataloader)))
best_score = -1

for epoch in range(num_epochs):
    # ---------------------------- training ----------------------------
    train_loss = 0
    model.train()
    for batch in train_dataloader:
        inputs = {'input_ids': batch['input_ids'].to(device),
                  'attention_mask': batch['attention_mask'].to(device)}
        outputs_classifier, outputs_regressor = model(**inputs)
        loss1 = sigmoid_focal_loss(outputs_classifier, batch['labels_classifier'].to(device).float(),
                                   alpha=-1, gamma=1, reduction='mean')
        loss2 = loss_softmax(outputs_regressor, batch['labels_regressor'].to(device).float(), device)
        # The classifier term is up-weighted 10x relative to the regressor term.
        loss = 10*loss1 + loss2
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        pb_train.update(1)
        pb_train.set_postfix(loss_classifier=loss1.item(), loss_regressor=loss2.item(), loss=loss.item())
        # Accumulate the mean loss over the epoch.
        train_loss += loss.item() / num_train_batches
    print("Train Loss:", train_loss)

    # --------------------------- evaluation ---------------------------
    model.eval()
    val_loss = ScalarMetric()
    val_loss_classifier = ScalarMetric()
    val_loss_regressor = ScalarMetric()
    val_acc = AccuracyMetric()
    val_f1_score = F1_score()
    val_r2_score = R2_score()
    result_chunks = []  # rounded predictions per batch, concatenated once after the loop
    for batch in test_dataloader:
        inputs = {'input_ids': batch['input_ids'].to(device),
                  'attention_mask': batch['attention_mask'].to(device)}
        with torch.no_grad():
            outputs_classifier, outputs_regressor = model(**inputs)
            # NOTE(review): evaluation uses `loss_classifier` and an unweighted sum, while training
            # uses `sigmoid_focal_loss` with a 10x weight, so the printed train and test losses
            # are not directly comparable. Confirm this is intended.
            loss1 = loss_classifier(outputs_classifier, batch['labels_classifier'].to(device).float())
            loss2 = loss_softmax(outputs_regressor, batch['labels_regressor'].to(device).float(), device)
            loss = loss1 + loss2
        outputs_classifier = outputs_classifier.cpu().numpy()
        # argmax over the last axis gives a 0-based index; +1 shifts it to start at 1.
        outputs_regressor = outputs_regressor.cpu().numpy().argmax(axis=-1) + 1
        y_true = batch['labels_regressor'].numpy()
        outputs = pred_to_label(outputs_classifier, outputs_regressor)
        y_pred = np.round(outputs)  # computed once and shared by the result buffer and all metrics
        result_chunks.append(y_pred)
        val_loss_classifier.update(loss1.item())
        val_loss_regressor.update(loss2.item())
        val_loss.update(loss.item())
        val_acc.update(y_pred, y_true)
        val_f1_score.update(y_pred, y_true)
        val_r2_score.update(y_pred, y_true)
        pb_test.update(1)
    # All rounded predictions of this epoch; None when the test loader yields no batches.
    result = np.concatenate(result_chunks, axis=0) if result_chunks else None

    f1_score = val_f1_score.compute()
    r2_score = val_r2_score.compute()
    # Sum of the element-wise F1 * R2 products divided by 6 — presumably the six aspect
    # columns of the dataset; TODO confirm against the metric implementations.
    final_score = (f1_score * r2_score).sum()*1/6
    if final_score > best_score:
        best_score = final_score
        torch.save(model.state_dict(), checkpoint_path)
    print("Test Loss:", val_loss.compute(), "Loss Classifier:", val_loss_classifier.compute(), "Loss Regressor:", val_loss_regressor.compute())
    print("Acc", val_acc.compute())
    print("F1_score", f1_score)
    print("R2_score", r2_score)
    print("Final_score", final_score)
    print("Best_score", best_score)

pb_train.close()
pb_test.close()

  0%|          | 1/1350 [00:43<16:16:08, 43.42s/it, loss=5.95, loss_classifier=0.421, loss_regressor=1.74]

KeyboardInterrupt: 