# Dataset with no mathematical symbols and newline characters.

In [3]:
from datasets import load_dataset

ds = load_dataset("ThanhT04/arvix-processed-dataset")

Downloading readme:   0%|          | 0.00/536 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/182M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/181M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/181M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/47.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/46.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/28388 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2500 [00:00<?, ? examples/s]

In [4]:
ds_train = ds['train']
ds_val = ds['validation']
ds_test = ds['test']

train_text = ds_train['text']
train_label = ds_train['label']

val_text = ds_val['text']
val_label = ds_val['label']

test_text = ds_test['text']
test_label = ds_test['label']

# Bert tokenization: Get CLS token

In [8]:
from transformers import BertTokenizer, BertModel
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model = bert_model.to(device)

def embedding(texts, batch_size = 16):
    all_embeddings = []
    for i in range (0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        inputs = tokenizer(batch_texts, return_tensors = 'pt', padding = True, truncation = True, max_length = 128)
        inputs = {key: value.to(device) for key, value in inputs.items()}
        with torch.no_grad():
            outputs = bert_model(**inputs)
            embedding = outputs.last_hidden_state[:,0,:]
        all_embeddings.append(embedding)
        del inputs, outputs, embedding,
        torch.cuda.empty_cache()
    return torch.cat(all_embeddings, dim = 0)



In [9]:
train_embeddings = embedding(train_text, batch_size = 8)

  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [10]:
val_embeddings = embedding(val_text, batch_size = 8)

In [114]:
test_embeddings = embedding(test_text, batch_size = 8)

# Test bert outputs on fnn

In [14]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

y_validation = torch.tensor(val_label, dtype=torch.long)
val_dataset = TensorDataset(val_embeddings, y_validation)
val_loader = DataLoader(val_dataset, batch_size = 16, shuffle = False)

In [15]:
y_train = torch.tensor(train_label, dtype=torch.long)
train_dataset = TensorDataset(train_embeddings, y_train)
train_loader = DataLoader(train_dataset, batch_size = 16, shuffle = False)

In [116]:
y_test = torch.tensor(test_label, dtype=torch.long)
test_dataset = TensorDataset(test_embeddings, y_test)
test_loader = DataLoader(test_dataset, batch_size = 16, shuffle = False)

In [110]:
# feedforward neural network
class FNN(nn.Module):
    def __init__(self, input_dim, hidden_dim_1, hidden_dim_2, hidden_dim_3, output_dim):
        super(FNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim_1)
        self.relu = nn.ReLU()
        self.dropout1 = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(hidden_dim_1, hidden_dim_2)
        self.dropout2 = nn.Dropout(p=0.5)
        self.fc3 = nn.Linear(hidden_dim_2, output_dim)
        # self.dropout3 = nn.Dropout(p=0.4)
        # self.fc4 = nn.Linear(hidden_dim_3, output_dim)
        
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.dropout1(out)
        
        out = self.fc2(out)
        out = self.relu(out)
        out = self.dropout2(out)
        out = self.fc3(out)
        # out = self.relu(out)
        # out = self.dropout3(out)
        # out = self.fc4(out)
        return out

In [111]:
INPUT_DIM = 768
HIDDEN_DIM_1 = 256
HIDDEN_DIM_2 = 128
HIDDEN_DIM_3 = 64
OUTPUT_DIM = 11

model = FNN(INPUT_DIM, HIDDEN_DIM_1, HIDDEN_DIM_2, HIDDEN_DIM_3, OUTPUT_DIM)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [112]:
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.00004)

In [29]:
def train_model(model, dataloader, loss_fn, optimizer):
    model.train()
    total_loss = 0
    correct_predictions = 0

    for batch in dataloader:
        inputs, labels = batch
        inputs = inputs.to(torch.device('cuda'))
        labels = labels.to(torch.device('cuda'))

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, preds = torch.max(outputs, dim= 1)   
        correct_predictions += torch.sum(preds == labels)

    return correct_predictions.double() / len(dataloader.dataset), total_loss / len(dataloader)

In [30]:
def eval_model(model, dataloader, loss_fn):
    model.eval()
    total_loss = 0
    correct_predictions = 0

    with torch.no_grad():
        for batch in dataloader:
            inputs, labels = batch
           
            # Move data to GPU
            inputs = inputs.to(torch.device('cuda'))
            labels = labels.to(torch.device('cuda'))
            
            outputs = model(inputs)
            loss = loss_fn(outputs, labels)
            
            total_loss += loss.item()
            _, preds = torch.max(outputs, dim=1)
            correct_predictions += torch.sum(preds == labels)
    return correct_predictions.double() / len(dataloader.dataset), total_loss / len(dataloader)
    

In [113]:
# Train in  epochs
for epoch in range(20):
    train_acc, train_loss = train_model(model, train_loader, loss_fn, optimizer)
    val_acc, val_loss = eval_model(model, val_loader, loss_fn)

    print(f"Epoch {epoch+1}/20:")
    print(f"Train loss: {train_loss}, accuracy: {train_acc}")
    print(f"Validation loss: {val_loss}, accuracy: {val_acc}")

Epoch 1/20:
Train loss: 2.0390853423803623, accuracy: 0.2667324221502043
Validation loss: 1.6247162052020905, accuracy: 0.5092
Epoch 2/20:
Train loss: 1.5557642437370731, accuracy: 0.44208820628434553
Validation loss: 1.3156356428079545, accuracy: 0.5912000000000001
Epoch 3/20:
Train loss: 1.3488436086412887, accuracy: 0.5248696632379879
Validation loss: 1.182415288724717, accuracy: 0.6268
Epoch 4/20:
Train loss: 1.2468738576392053, accuracy: 0.5664717486261801
Validation loss: 1.1070507510452514, accuracy: 0.6476000000000001
Epoch 5/20:
Train loss: 1.180677840105245, accuracy: 0.5968014654079189
Validation loss: 1.0540115339740825, accuracy: 0.6628000000000001
Epoch 6/20:
Train loss: 1.1300181571866426, accuracy: 0.6172678596590109
Validation loss: 1.0157462867202274, accuracy: 0.6756
Epoch 7/20:
Train loss: 1.0930759517575654, accuracy: 0.6335071156826829
Validation loss: 0.9920137942216958, accuracy: 0.6824
Epoch 8/20:
Train loss: 1.066061543142292, accuracy: 0.6431238551500634
Vali

In [117]:
test_acc, test_loss = eval_model(model, test_loader, loss_fn)
print(f"Test loss: {test_loss}, accuracy: {test_acc}")


Test loss: 0.8436112671521059, accuracy: 0.7148
