In [1]:
pip install transformers -q

Note: you may need to restart the kernel to use updated packages.


In [5]:
%load_ext tensorboard
%tensorboard --logdir logs
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import TensorDataset, DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
import torch.nn as nn
import numpy as np
import torch

class Bert(nn.Module):
    """Pretrained BERT encoder followed by one linear classification layer.

    Args:
        num_classes: Number of scores produced per input sequence. Defaults
            to 4, the value the head was previously hard-wired to.
        pretrained_name: Checkpoint identifier passed to
            ``AutoModel.from_pretrained``.
    """

    def __init__(self, num_classes=4, pretrained_name="bert-base-uncased"):
        super().__init__()
        self.bert = AutoModel.from_pretrained(pretrained_name)
        # Take the feature width from the loaded checkpoint instead of
        # assuming 768, so checkpoints of other sizes work unchanged.
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the classifier's raw scores (logits) for a batch.

        The three arguments are forwarded to the encoder by keyword; the
        encoder's ``pooler_output`` is what the linear head consumes.
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # No trailing squeeze: it was a no-op for the 4-class head and would
        # silently drop the class axis for a 1-class head.
        return self.classifier(outputs.pooler_output)

class BertDataset(Dataset):
    """Map-style dataset pairing tokenised inputs with their labels.

    Args:
        data: Mapping that provides "input_ids", "token_type_ids" and
            "attention_mask"; each entry is indexed per example.
        label: Indexable collection of labels aligned with ``data``
            (a list or a tensor both work).
    """

    def __init__(self, data, label):
        super().__init__()
        self.x_input_ids = data["input_ids"]
        self.x_token_type_ids = data["token_type_ids"]
        self.x_attention_mask = data["attention_mask"]
        self.data_length = len(self.x_input_ids)
        self.y = label

    def __len__(self):
        return self.data_length

    @staticmethod
    def _to_tensor(value):
        """Return ``value`` as a fresh tensor.

        ``torch.tensor(existing_tensor)`` emits a copy-construct UserWarning
        on every call, so tensors are copied with ``clone().detach()``.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for example ``idx``."""
        features = {
            "input_ids": self._to_tensor(self.x_input_ids[idx]),
            "token_type_ids": self._to_tensor(self.x_token_type_ids[idx]),
            # The key is "x_attention_mask" (not "attention_mask") because
            # that is the name the training and evaluation loops read.
            "x_attention_mask": self._to_tensor(self.x_attention_mask[idx]),
        }
        return features, self._to_tensor(self.y[idx])

def calculate_loss_and_accuracy(model, dataset, device, criterion=None, batch_size=256):
    """Evaluate ``model`` on ``dataset`` and return ``(mean_loss, accuracy)``.

    Args:
        model: Module called as ``model(input_ids, attention_mask,
            token_type_ids)`` and returning per-class scores.
        dataset: Yields ``(features, label)`` pairs where ``features`` has the
            keys "input_ids", "x_attention_mask" and "token_type_ids".
        device: Device the model and every batch are moved to.
        criterion: Optional loss module. When omitted the returned loss is 0.0.
        batch_size: Evaluation batch size (default 256, the former constant).

    Returns:
        Tuple of per-example mean loss and the fraction of examples whose
        arg-max prediction equals the label. An empty dataset gives (0.0, 0.0).
    """
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    model = model.to(device)

    # Evaluate in eval mode regardless of the caller's state, then restore it.
    was_training = model.training
    model.eval()

    loss_sum = 0.0
    total = 0
    correct = 0
    try:
        with torch.no_grad():
            for X, Y in dataloader:
                input_ids = X["input_ids"].to(device)
                attention_mask = X["x_attention_mask"].to(device)
                token_type_ids = X["token_type_ids"].to(device)
                Y = Y.to(device)
                Y_pred = model(input_ids, attention_mask, token_type_ids)
                n_batch = len(Y)
                if criterion is not None:
                    batch_loss = criterion(Y_pred, Y).item()
                    # A mean-reduced loss is a per-example average; scale it
                    # back to a batch total so the final division by the
                    # example count is a true mean even when the last batch
                    # is smaller. Sum-reduced losses are already totals.
                    if getattr(criterion, "reduction", "mean") != "sum":
                        batch_loss *= n_batch
                    loss_sum += batch_loss
                pred = torch.argmax(Y_pred, dim=-1)
                total += n_batch
                correct += (pred == Y).sum().item()
    finally:
        model.train(was_training)

    if total == 0:
        # Nothing was evaluated; avoid dividing by zero.
        return 0.0, 0.0
    return loss_sum / total, correct / total


def train_model(X_train, y_train, X_test, y_test, batch_size, model, lr, num_epochs, device, collate_fn=None):
    """Train ``model`` with SGD and log train/test metrics after every epoch.

    Args:
        X_train, X_test: Tokenised inputs accepted by ``BertDataset``.
        y_train, y_test: Labels aligned with the corresponding inputs.
        batch_size: Mini-batch size of the training loader.
        model: Module called as ``model(input_ids, attention_mask,
            token_type_ids)``; it is moved to ``device``.
        lr: Initial learning rate; it is multiplied by 0.1 every 30 epochs.
        num_epochs: Number of passes over the training set.
        device: Device used for the model, the loss and every batch.
        collate_fn: Optional collate function for the training loader
            (``None`` keeps the DataLoader default).

    Returns:
        None. Metrics are printed and written through ``TensorboardWriter``.
    """
    dataset_train = BertDataset(X_train, y_train)
    dataset_test = BertDataset(X_test, y_test)
    dataloader_train = DataLoader(
        dataset_train, batch_size=batch_size, shuffle=True, collate_fn=collate_fn
    )
    model = model.to(device)
    criterion = nn.CrossEntropyLoss().to(device)

    # Build the optimizer once and let a scheduler own the step decay. The
    # previous code rebuilt the optimizer each epoch and shrank ``lr`` already
    # at epoch 0, so every epoch after the first ran at a tenth of the
    # requested rate.
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)

    for ep in range(num_epochs):
        model.train()
        for X, Y in dataloader_train:
            input_ids = X["input_ids"].to(device)
            attention_mask = X["x_attention_mask"].to(device)
            token_type_ids = X["token_type_ids"].to(device)
            Y = Y.to(device)

            optimizer.zero_grad()
            Y_pred = model(input_ids, attention_mask, token_type_ids)
            loss = criterion(Y_pred, Y)
            loss.backward()
            optimizer.step()
        # One scheduler step per epoch, after that epoch's optimizer steps.
        scheduler.step()

        model.eval()
        loss_train, acc_train = calculate_loss_and_accuracy(model, dataset_train, device, criterion=criterion)
        loss_test, acc_test = calculate_loss_and_accuracy(model, dataset_test, device, criterion=criterion)

        print(f'epoch: {ep + 1}, loss_train: {loss_train:.4f}, accuracy_train: {acc_train:.4f}, loss_Test: {loss_test:.4f}, accuracy_Test: {acc_test:.4f}')
        TensorboardWriter(ep, loss_train, acc_train, "Train")
        TensorboardWriter(ep, loss_test, acc_test, "Test")

def TensorboardWriter(epoch, loss, accuracy, name, log_dir="logs"):
    """Write one loss value and one accuracy value to TensorBoard.

    Args:
        epoch: Step index the two scalars are recorded at.
        loss: Value stored under the tag ``Loss/<name>_Loss``.
        accuracy: Value stored under the tag ``Accuracy/<name>_Accuracy``.
        name: Split label embedded in both tags (e.g. "Train" or "Test").
        log_dir: Directory for the event files (default "logs").
    """
    writer = SummaryWriter(log_dir=log_dir)
    try:
        writer.add_scalar(f"Loss/{name}_Loss", loss, epoch)
        writer.add_scalar(f"Accuracy/{name}_Accuracy", accuracy, epoch)
    finally:
        # Close (and thereby flush) the writer even if logging fails.
        writer.close()

def CountVocab(name):
    """Return the largest integer id found in ``<name>_code.txt``, plus one.

    Each non-blank line is split on tabs; the third field is read as a
    whitespace-separated list of integers.

    Args:
        name: File prefix; the file opened is ``"<name>_code.txt"``.

    Returns:
        ``max(id) + 1`` over all ids in the file, or 0 when there are none.
    """
    largest = -1
    # ``with`` closes the file even if a line fails to parse.
    with open("{}_code.txt".format(name), "r") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines such as a trailing empty line.
                continue
            # split() without an argument also copes with repeated spaces
            # and the trailing newline.
            for token in line.split("\t")[2].split():
                largest = max(largest, int(token))
    return largest + 1

def GetStrLow(name, data_dir="RNN_CNN"):
    """Read sentences and integer labels from ``<data_dir>/<name>_code.txt``.

    Each non-blank line is split on tabs: the first field is parsed as the
    integer label and the second field is kept as the sentence text.

    Args:
        name: File prefix, e.g. "train" or "test".
        data_dir: Directory containing the file. Defaults to "RNN_CNN", the
            location that used to be hard-coded.

    Returns:
        Tuple ``(sent_list, code_list)``: a list of sentence strings and a
        tensor holding the matching labels.

    Raises:
        FileNotFoundError: If the file does not exist.
    """
    sent_list = []
    code_list = []
    # ``with`` closes the file even if a line fails to parse.
    with open(f"{data_dir}/{name}_code.txt", "r") as f:
        for line in f:
            if not line.strip():
                # Skip blank lines instead of failing on int('').
                continue
            line_s = line.split("\t")
            code_list.append(int(line_s[0]))
            sent_list.append(line_s[1].replace("\n", ""))
    # Pin the dtype so an empty file still yields an integer label tensor.
    return sent_list, torch.tensor(code_list, dtype=torch.long)

# Seed PyTorch so the classifier initialisation and the batch shuffling are
# the same on every Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

# Load the raw sentences and their integer labels for both splits.
X_train, Y_train = GetStrLow("train")
X_test, Y_test = GetStrLow("test")

# Tokenise every sentence to exactly MAX_LENGTH tokens (pad or truncate).
# The tokenizer is called directly; `batch_encode_plus` is deprecated in
# favour of `__call__`, which accepts the same arguments for a list of texts.
MAX_LENGTH = 32
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
X_train_tokenizer = tokenizer(X_train, padding="max_length", max_length=MAX_LENGTH, truncation=True)
X_test_tokenizer = tokenizer(X_test, padding="max_length", max_length=MAX_LENGTH, truncation=True)

# Training hyper-parameters.
BATCH_SIZE = 8
NUM_EPOCHS = 10
lr = 1e-3
# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = Bert()
train_model(X_train_tokenizer, Y_train, X_test_tokenizer, Y_test, BATCH_SIZE, model, lr, NUM_EPOCHS, device)


The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 84818), started 1:17:30 ago. (Use '!kill 84818' to kill it.)

FileNotFoundError: [Errno 2] No such file or directory: 'RNN_CNN/train_code.txt'