In [1]:
from datasets import load_dataset

ds = load_dataset("ThanhT04/tokenized_arxiv-classification")

In [2]:
ds_train = ds['train']
ds_val = ds['validation']
ds_test = ds['test']

train_text = ds_train['text']
train_label = ds_train['label']

val_text = ds_val['text']
val_label = ds_val['label']

test_text = ds_test['text']
test_label = ds_test['label']

In [7]:
from transformers import BertTokenizer, BertModel
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model = bert_model.to(device)

def embedding(texts, batch_size = 16):
    all_embeddings = []
    for i in range (0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        inputs = tokenizer(batch_texts, return_tensors = 'pt', padding = True, truncation = True, max_length = 128)
        inputs = {key: value.to(device) for key, value in inputs.items()}
        with torch.no_grad():
            outputs = bert_model(**inputs)
            embedding = outputs.last_hidden_state[:,0,:]
        all_embeddings.append(embedding)
        del inputs, outputs, embedding,
        torch.cuda.empty_cache()
    return torch.cat(all_embeddings, dim = 0)

In [8]:
train_embeddings = embedding(train_text, batch_size = 4)

In [9]:
val_embeddings = embedding(val_text, batch_size = 4)

28388

In [26]:
torch.cuda.empty_cache()

In [None]:
# train_embedding_tensor = torch.cat(train_embeddings, dim = 0)

In [10]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

y_validation = torch.tensor(val_label, dtype=torch.long)
val_dataset = TensorDataset(val_embeddings, y_validation)
val_loader = DataLoader(val_dataset, batch_size = 16, shuffle = False)

In [14]:
y_train = torch.tensor(train_label, dtype=torch.long)
train_dataset = TensorDataset(train_embeddings, y_train)
train_loader = DataLoader(train_dataset, batch_size = 16, shuffle = False)

In [29]:
# feedforward neural network
class FNN(nn.Module):
    def __init__(self, input_dim, hidden_dim_1, hidden_dim_2, hidden_dim_3, output_dim):
        super(FNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim_1)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)  # Output probability for labels
        self.dropout = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(hidden_dim_1, hidden_dim_2)
        self.fc3 = nn.Linear(hidden_dim_2, hidden_dim_3)
        self.fc4 = nn.Linear(hidden_dim_3, output_dim)
        

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc3(out)
        out = self.softmax(out)
        # out = self.relu(out)
        # out = self.dropout(out)
        # out = self.fc4(out)
        return out

In [30]:
INPUT_DIM = 768
HIDDEN_DIM_1 = 256
HIDDEN_DIM_2 = 128
HIDDEN_DIM_3 = 64
OUTPUT_DIM = 11

model = FNN(INPUT_DIM, HIDDEN_DIM_1, HIDDEN_DIM_2, HIDDEN_DIM_3, OUTPUT_DIM)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [31]:
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [26]:
def train_model(model, dataloader, loss_fn, optimizer):
    model.train()
    total_loss = 0
    correct_predictions = 0

    for batch in dataloader:
        inputs, labels = batch
        inputs = inputs.to(torch.device('cuda'))
        labels = labels.to(torch.device('cuda'))

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, preds = torch.max(outputs, dim= 1)   
        correct_predictions += torch.sum(preds == labels)

    return correct_predictions.double() / len(dataloader.dataset), total_loss / len(dataloader)

In [27]:
def eval_model(model, dataloader, loss_fn):
    model.eval()
    total_loss = 0
    correct_predictions = 0

    with torch.no_grad():
        for batch in dataloader:
            inputs, labels = batch
           
            # Move data to GPU
            inputs = inputs.to(torch.device('cuda'))
            labels = labels.to(torch.device('cuda'))
            
            outputs = model(inputs)
            loss = loss_fn(outputs, labels)
            
            total_loss += loss.item()
            _, preds = torch.max(outputs, dim=1)
            correct_predictions += torch.sum(preds == labels)
    return correct_predictions.double() / len(dataloader.dataset), total_loss / len(dataloader)
    

In [33]:
# Train in 15 epochs
for epoch in range(15):
    train_acc, train_loss = train_model(model, train_loader, loss_fn, optimizer)
    val_acc, val_loss = eval_model(model, val_loader, loss_fn)

    print(f"Epoch {epoch+1}/10:")
    print(f"Train loss: {train_loss}, accuracy: {train_acc}")
    print(f"Validation loss: {val_loss}, accuracy: {val_acc}")

Epoch 1/10:
Train loss: 3.6192331825847357, accuracy: 0.5663660701704946
Validation loss: 3.6003731830864196, accuracy: 0.5836
Epoch 2/10:
Train loss: 3.619981482868463, accuracy: 0.5656263209806961
Validation loss: 3.5743350603018595, accuracy: 0.6108
Epoch 3/10:
Train loss: 3.6165012723627226, accuracy: 0.5690080315626321
Validation loss: 3.5774324760315523, accuracy: 0.6088
Epoch 4/10:
Train loss: 3.6136586146287515, accuracy: 0.5714738621952938
Validation loss: 3.620732006753326, accuracy: 0.5640000000000001
Epoch 5/10:
Train loss: 3.61448578968854, accuracy: 0.5712272791320276
Validation loss: 3.5764967058874237, accuracy: 0.6072000000000001
Epoch 6/10:
Train loss: 3.6061407506969614, accuracy: 0.5788361279413837
Validation loss: 3.5786221756297314, accuracy: 0.6064
Epoch 7/10:
Train loss: 3.6043755417138756, accuracy: 0.5807735662956179
Validation loss: 3.586380785437906, accuracy: 0.5968
Epoch 8/10:
Train loss: 3.6029733303231253, accuracy: 0.5821121600676342
Validation loss: 3.