In [119]:
!pip install transformers tokenizers torch torchvision scikit-learn numpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [120]:
from google.colab import drive

# Mount Google Drive; the dataset CSVs read later in this notebook live under it.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [121]:
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
import torch
import torch.nn as nn
import csv
import numpy as np
import typing as t
import csv
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
from sklearn.metrics import precision_recall_fscore_support

In [122]:
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [123]:
# Shared tokenizer used by train() and evaluate(); it matches the
# "bert-base-uncased" checkpoint loaded by the classifier.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [124]:
class WikiData(Dataset):
  """Dataset of ``(text, label)`` pairs loaded eagerly from a CSV file.

  Every non-blank CSV row must have exactly three columns: id, label, text.
  The rows are read into memory once, when the dataset is constructed.

  Args:
    csv_path: Path to the comma-separated input file.
    limit: Maximum number of rows to load; ``0`` (the default) loads all rows.

  Raises:
    ValueError: If a non-blank row does not have exactly three columns.
  """

  def __init__(self, csv_path:str = "", limit: int = 0):
    self.csv_path = csv_path
    self.limit = limit
    self.data = self._load_csv()

  def _load_csv(self):
    """Read up to ``self.limit`` rows as ``(id, label, text)`` tuples."""
    rows = []
    # newline="" is required by the csv module so that line breaks embedded in
    # quoted fields reach the parser untranslated.
    with open(self.csv_path, "r", encoding="utf-8", newline="") as f:
      reader = csv.reader(f, delimiter=",")
      for row in reader:
        if self.limit and len(rows) >= self.limit:
          break
        if not row:
          # csv.reader yields [] for blank lines; they carry no sample.
          continue
        if len(row) != 3:
          raise ValueError(
              f"{self.csv_path}, line {reader.line_num}: expected 3 columns "
              f"(id, label, text) but found {len(row)}"
          )
        _id, label, text = row
        rows.append((_id, label, text))
    return rows

  def __len__(self):
    """Number of rows that were loaded."""
    return len(self.data)

  def __getitem__(self, idx: int):
    """Return ``(text, label)``; label is 0 for "standard" and 1 otherwise."""
    _id, label, text = self.data[idx]
    label = 0 if label == "standard" else 1
    return text, label

In [125]:
# Scratch check: append a per-position flag to each feature vector by giving
# the flag tensor a trailing singleton axis and concatenating on the last axis.
a = torch.tensor([[[5, 5], [6, 6]], [[7, 7], [8, 8]]])
b = torch.tensor([[0, 1], [1, 0]])

b = b.view(a.size(0), -1, 1)
torch.cat([a, b], dim=-1)

tensor([[[5, 5, 0],
         [6, 6, 1]],

        [[7, 7, 1],
         [8, 8, 0]]])

In [126]:
class ReadabilityClassifier(nn.Module):
  """Two-class text classifier: frozen BERT -> LSTM -> linear -> log-softmax.

  The BERT weights are frozen (``requires_grad = False``); only the LSTM and
  the linear layer receive gradients. The output is log-probabilities, so it
  pairs with ``nn.NLLLoss``.

  Args:
    hidden_size: Hidden size of the LSTM.
    n_lstm_layers: Number of stacked LSTM layers.
  """

  def __init__(self,
               hidden_size: int = 126,
               n_lstm_layers: int = 1
               ):
    super().__init__()

    self.bert = BertModel.from_pretrained("bert-base-uncased")

    for param in self.bert.parameters():
      param.requires_grad = False

    self.lstm = nn.LSTM(
        # BERT hidden size plus one extra feature: the attention-mask flag
        # appended in forward(). This is 768 + 1 = 769 for bert-base-uncased.
        input_size = self.bert.config.hidden_size + 1,
        hidden_size = hidden_size,
        num_layers = n_lstm_layers,
        batch_first = True
    )
    self.linear = nn.Linear(hidden_size, 2)
    self.softmax = nn.LogSoftmax(dim=1)

  def forward(self, tokens):
    """Classify a tokenized batch.

    Args:
      tokens: Tokenizer output exposing ``attention_mask`` and usable as
        keyword arguments for the BERT model.

    Returns:
      Tensor of shape (batch, 2) holding log-probabilities per class.
    """
    attention = tokens.attention_mask
    embedded = self.bert(**tokens).last_hidden_state

    # Append the mask as one extra feature per position; cast explicitly so
    # the concatenation does not rely on implicit int -> float promotion.
    mask_feature = attention.reshape(embedded.shape[0], -1, 1).to(embedded.dtype)
    features = torch.cat((embedded, mask_feature), dim=2)
    output, _ = self.lstm(features)

    # Read the LSTM state at the last non-padded position of each sequence,
    # not at the final timestep. The final timestep of a padded sequence is
    # computed after consuming padding, which would make a text's prediction
    # depend on the longest sequence in its batch.
    positions = torch.arange(attention.shape[1], device=attention.device)
    last_valid = (attention * positions).argmax(dim=1)
    batch_index = torch.arange(output.shape[0], device=output.device)
    output = output[batch_index, last_valid]

    output = self.linear(output)
    sm = self.softmax(output)
    return sm


In [127]:
def train(
    model: ReadabilityClassifier,
    criterion: nn.Module,
    dataloader: DataLoader,
    optimizer,
    n_epochs: int = 1,
):
  """Train ``model`` and write one checkpoint per epoch.

  Batches are ``(texts, labels)``; texts are tokenized with the notebook-level
  ``tokenizer`` and moved to the notebook-level ``device``. After each epoch
  the mean loss is printed and a checkpoint (epoch, model state, optimizer
  state, mean loss) is saved to ``checkpoints/epoch_<epoch>.tar``.

  Args:
    model: Classifier to optimize.
    criterion: Loss applied to ``(model output, labels)``.
    dataloader: Yields ``(texts, labels)`` batches.
    optimizer: Optimizer already bound to the model's parameters.
    n_epochs: Number of passes over ``dataloader``.
  """
  import os  # local import so this cell does not depend on the import cell

  # Create the checkpoint directory up front: torch.save does not create
  # missing parent directories, and failing after a full epoch is costly.
  checkpoint_dir = "checkpoints"
  os.makedirs(checkpoint_dir, exist_ok=True)

  model.train()
  for epoch in range(n_epochs):
    loop = tqdm(dataloader)
    losses = []
    for texts, labels in loop:
      optimizer.zero_grad()
      tokens = tokenizer(
          texts,
          return_tensors="pt",
          padding=True,
          truncation=True
      ).to(device)
      labels = labels.to(device)
      output = model(tokens)
      loss = criterion(output, labels)
      losses.append(loss.item())
      loss.backward()
      optimizer.step()

    # An empty dataloader yields no losses; report NaN instead of dividing by 0.
    mean_loss = sum(losses) / len(losses) if losses else float("nan")
    print(f"Loss at epoch {epoch}: {round(mean_loss, 4)}")

    torch.save({
        "epoch": epoch,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "loss": mean_loss
    }, f"{checkpoint_dir}/epoch_{epoch}.tar")

In [128]:
def evaluate(model: nn.Module, dataloader: DataLoader, average: str = "micro"):
  """Print precision, recall and F-score of ``model`` over ``dataloader``.

  Batches are ``(texts, levels)``; texts are tokenized with the notebook-level
  ``tokenizer`` and moved to the notebook-level ``device``. Predictions are
  the argmax over the model output. Metrics are computed over all batches.

  Args:
    model: Model to evaluate. It is switched to eval mode for the duration of
      the call and restored to its previous mode afterwards.
    dataloader: Yields ``(texts, levels)`` batches.
    average: Averaging mode forwarded to ``precision_recall_fscore_support``.
  """
  y_true = []
  y_pred = []

  was_training = model.training
  model.eval()  # disable dropout so predictions are deterministic
  try:
    with torch.no_grad():
      loop = tqdm(dataloader)
      for texts, levels in loop:
        tokens = tokenizer(
            texts,
            return_tensors="pt",
            padding=True,
            truncation=True
        ).to(device)
        output = model(tokens)
        y_pred.extend(output.argmax(dim=1).cpu().tolist())
        y_true.extend(levels.cpu().tolist())
  finally:
    model.train(was_training)

  if not y_true:
    print("No samples to evaluate")
    return

  # Score the accumulated labels and predictions from every batch, not just
  # the tensors left over from the final loop iteration.
  precision, recall, fscore, _ = precision_recall_fscore_support(
      y_true = y_true,
      y_pred = y_pred,
      average = average
  )
  print()
  print("Precision", precision)
  print("Recall", recall)
  print("Fscore", fscore)
      

In [129]:
# Training batches: the full training CSV, reshuffled every epoch.
train_dataloader = DataLoader(
    dataset=WikiData(csv_path="./drive/MyDrive/Colab Notebooks/dataset_train.csv"),
    shuffle=True,
    batch_size=50,
)

In [None]:
# Build the classifier on the selected device. NLLLoss pairs with the
# log-softmax output of the model.
model = ReadabilityClassifier().to(device)
criterion = nn.NLLLoss()
optimizer = Adam(params=model.parameters(), lr=2e-5)

train(
    model=model,
    criterion=criterion,
    dataloader=train_dataloader,
    optimizer=optimizer,
    n_epochs=5,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 50/32058 [00:55<9:59:22,  1.12s/it]

In [None]:
# Evaluate on the first 100 rows of the test CSV.
test_dataloader = DataLoader(
    dataset=WikiData(csv_path="./drive/MyDrive/Colab Notebooks/dataset_test.csv", limit=100),
    shuffle=True,
    batch_size=50,
)

evaluate(model=model, dataloader=test_dataloader)