In [None]:
!pip install openpyxl

In [None]:
import pandas as pd

def load_dataset(path):
    """Load an Excel sheet and turn every row into a sample dictionary.

    Each sample stores the row's ``'text'`` cell under ``'document'``. The
    row's columns are then scanned left to right; whenever a cell equals 1
    the sample's ``'label'`` is set to that column's position minus one, so
    the right-most matching column wins. A row in which no cell equals 1
    yields a sample without a ``'label'`` key.

    Args:
        path: Location of the workbook, passed straight to ``pd.read_excel``.

    Returns:
        A list with one dict per row, in row order.
    """
    frame = pd.read_excel(path)
    samples = []
    for row_id in frame.index:
        row = frame.iloc[row_id]
        record = {'document': row['text']}
        for position, column in enumerate(row.keys()):
            # The offset of one presumably accounts for a leading text
            # column -- confirm against the spreadsheet layout.
            if row[column] == 1:
                record['label'] = position - 1
        samples.append(record)
    return samples

In [None]:
# Parse the training workbook first, then the test workbook.
train_data, test_data = map(
    load_dataset,
    ('../input/vntc-xlsx/train.xlsx', '../input/vntc-xlsx/test.xlsx'),
)

In [None]:
from transformers import AutoTokenizer, RobertaForSequenceClassification


# Tokenizer and classifier are loaded from the same checkpoint; the
# classification head is created with 10 output labels.
checkpoint = "vinai/phobert-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = RobertaForSequenceClassification.from_pretrained(checkpoint, num_labels=10)

In [None]:
class VTCollator:
    """Collate a list of samples into tokenized inputs plus a label tensor.

    Args:
        tokenizer: Callable applied to the list of document strings. It is
            invoked with ``padding=True``, ``truncation=True`` and
            ``return_tensors='pt'``.
        max_length: Optional cap forwarded to the tokenizer as ``max_length``.
            ``None`` (the default) leaves the limit up to the tokenizer, which
            matches the behaviour of constructing ``VTCollator(tokenizer)``.
    """

    def __init__(self, tokenizer, max_length=None):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, batch):
        """Build one batch.

        Args:
            batch: Sequence of dicts, each providing a ``'document'`` and a
                ``'label'`` entry.

        Returns:
            Dict holding the tokenizer output under ``'document'`` and a
            tensor built from the labels under ``'label'``.
        """
        documents = [sample['document'] for sample in batch]
        labels = [sample['label'] for sample in batch]
        return {
            'document': self.tokenizer(
                documents,
                padding=True,
                truncation=True,
                max_length=self.max_length,
                return_tensors='pt',
            ),
            'label': torch.tensor(labels),
        }

In [None]:
from torch.utils.data import DataLoader
# A single collator instance serves both loaders; only the training loader
# shuffles its samples.
collator = VTCollator(tokenizer)
shared_loader_args = dict(batch_size=2, collate_fn=collator, num_workers=2)
train_dataloader = DataLoader(train_data, shuffle=True, **shared_loader_args)
test_dataloader = DataLoader(test_data, **shared_loader_args)

In [None]:
import torch
import torch.nn as nn

In [None]:
class Network(nn.Module):
    """Thin ``nn.Module`` wrapper that returns only the wrapped model's logits.

    Args:
        backbone: Module to wrap. When omitted, the notebook-level ``model``
            object is used, so every ``Network()`` built that way wraps that
            same object rather than a copy of it.
    """

    def __init__(self, backbone=None):
        super().__init__()
        # Falling back to the global keeps plain ``Network()`` calls working.
        self.roberta = model if backbone is None else backbone

    def forward(self, x):
        """Run the wrapped module on ``x`` and return its ``logits``.

        Args:
            x: Mapping of keyword arguments, unpacked into the wrapped module.

        Returns:
            The ``logits`` attribute of the wrapped module's output.
        """
        return self.roberta(**x).logits

In [None]:
import numpy as np
# Use CUDA when torch reports it as available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
from sklearn.metrics import accuracy_score, f1_score
def training(num, epochs=10, patience=2):
    """Fine-tune a ``Network`` and checkpoint it whenever test accuracy improves.

    Relies on the notebook-level ``train_dataloader``, ``test_dataloader``
    and ``device``. After every epoch the model is scored on
    ``test_dataloader``; when the accuracy beats the best value seen so far,
    the state dict is written to ``saved_model_{num}.pth``.

    Args:
        num: Identifier embedded in the checkpoint file name.
        epochs: Maximum number of passes over ``train_dataloader``.
        patience: Number of consecutive epochs without an accuracy
            improvement after which training stops early.

    Returns:
        None. Progress is printed and checkpoints are written to disk.
    """
    model = Network().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, eps=1e-8)
    max_acc = -np.inf
    stale_epochs = 0  # epochs elapsed since the last accuracy improvement

    for e in range(epochs):
        # Re-enable training mode every epoch: the eval() call further down
        # would otherwise stay in force, and models returned by
        # from_pretrained start out in eval mode (dropout disabled).
        model.train()
        train_loss = 0.0
        for batch in train_dataloader:
            # Move inputs and targets to the selected device.
            data_q, labels = batch['document'].to(device), batch['label'].to(device)
            optimizer.zero_grad()
            target = model(data_q)
            loss = criterion(target, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        model.eval()
        true_chunks = []
        pred_chunks = []
        # No gradients are needed for scoring; skipping them saves memory.
        with torch.no_grad():
            for batch in test_dataloader:
                data_q = batch['document'].to(device)
                target = model(data_q)
                true_chunks.append(batch['label'].numpy())
                pred_chunks.append(np.argmax(target.cpu().numpy(), axis=1))

        # Concatenate once instead of growing an array batch by batch.
        l = np.concatenate(true_chunks) if true_chunks else []
        t = np.concatenate(pred_chunks) if pred_chunks else []

        acc = accuracy_score(l, t)
        # Note: for single-label multiclass data micro-averaged F1 equals accuracy.
        f1 = f1_score(l, t, average='micro')
        print(f'Epoch {e+1} \t\t Training Loss: {train_loss / len(train_dataloader)} \t\t Test Accuracy: {acc} \t\t F1 Score: {f1}')

        if max_acc < acc:
            print(f'Accuracy Increased({max_acc:.6f}--->{acc:.6f}) \t Saving The Model')
            max_acc = acc
            # Saving State Dict
            torch.save(model.state_dict(), f'saved_model_{num}.pth')
            stale_epochs = 0
        else:
            stale_epochs += 1
            if stale_epochs >= patience:
                break

In [None]:
# Start run 0; improved checkpoints are written to saved_model_0.pth.
training(num=0)