In [2]:
import pandas as pd 
import torch
from torchmetrics import Accuracy
import transformers
import lightning.pytorch as pl
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os

# Set TOKENIZERS_PARALLELISM to "false" in the process environment before
# any tokenizer object is created further down the notebook.
os.environ.update(TOKENIZERS_PARALLELISM="false")

In [4]:
# A bare expression is only displayed when it is the last line of a cell, so
# the availability check is printed explicitly instead of being discarded.
print("CUDA available:", torch.cuda.is_available())

# JSON file holding the labelled examples consumed by the rest of the notebook.
target_json_path = "../data/json/data_coarse.json"

In [5]:
df = pd.read_json(target_json_path)
print(df.iloc[1])

# Distinct values of the 'type' column, sorted so that the label <-> index
# assignment is the same on every run.
all_labels = sorted(set(df['type']))
n_classes = len(all_labels)

# Forward (label -> index) and reverse (index -> label) lookup tables.
label_idx = dict(zip(all_labels, range(n_classes)))
idx_label = dict(enumerate(all_labels))

buf_str    José Ignacio Hualde
lbuf                 42.050588
rbuf                 58.439608
stk_str                  $ROOT
lstk                       0.0
rstk                     100.0
type               subordinate
Name: 1, dtype: object


In [5]:
from transformers import BertTokenizerFast, BertModel

In [6]:
# Tokenizer for the "bert-base-cased" checkpoint; collate_batch calls it on
# the buffer and stack strings of every batch.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

In [7]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

# Keyword arguments forwarded to every tokenizer call in collate_batch.
tokenizer_args = {
    # Pad every sequence to the tokenizer's maximum length so the buffer and
    # stack encodings of a batch always have the same width.
    'padding': 'max_length',
    # Without truncation, a text longer than the maximum length produces a
    # longer row than the padded ones and cannot be batched or fed to the model.
    'truncation': True,
    'return_tensors': 'pt',
}

class ResumeDataset(Dataset):
    """Map-style dataset backed by a JSON file loaded into a DataFrame.

    Each item is a tuple ``(buf_str, stk_str, pos, label)`` where ``pos`` is a
    long tensor holding the floored ``lbuf``, ``rbuf``, ``lstk`` and ``rstk``
    values of the row, and ``label`` is the row's ``type`` looked up in the
    module-level ``label_idx`` mapping.
    """

    def __init__(self, data_dir):
        """Load the JSON file at ``data_dir`` (passed straight to ``pd.read_json``)."""
        self.df = pd.read_json(data_dir)

    def __len__(self):
        """Number of rows in the underlying frame."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return the example stored at positional row ``idx``."""
        row = self.df.iloc[idx]
        # The four offsets are floored to whole numbers so they can later be
        # used as integer indices.
        offsets = torch.Tensor([row.lbuf, row.rbuf, row.lstk, row.rstk])
        pos = offsets.floor().long()
        return row.buf_str, row.stk_str, pos, label_idx[row.type]


In [8]:
# Pretrained "bert-base-cased" encoder; later handed to ResumeParser as its backend.
model = BertModel.from_pretrained("bert-base-cased")

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
resume_dataset = ResumeDataset(target_json_path)

# 85% train / 10% validation; the test split takes whatever is left so the
# three counts always add up to the dataset size.
total_count = len(resume_dataset)
train_count = int(0.85 * total_count)
valid_count = int(0.1 * total_count)
test_count = total_count - train_count - valid_count

# The seeded generator has to be handed to random_split; creating it without
# passing it leaves the split dependent on the global RNG state.
seed = torch.Generator().manual_seed(42)
train_dataset, valid_dataset, test_dataset = torch.utils.data.random_split(
    resume_dataset, (train_count, valid_count, test_count), generator=seed
)

In [10]:
def tuple_of_tensors_to_tensor(tuple_of_tensors):
    """Stack an iterable of equally shaped tensors along a new leading dimension.

    See https://discuss.pytorch.org/t/convert-a-tuple-into-tensor/82964
    """
    tensors = [tensor for tensor in tuple_of_tensors]
    stacked = torch.stack(tensors, dim=0)
    return stacked

def collate_batch(batch):
    """Collate dataset items into model-ready batch components.

    :param batch: sequence of ``(buf_str, stk_str, pos, label)`` items
    :return: tuple of (tokenized buffer strings, tokenized stack strings,
        stacked position tensor, label tensor)
    """
    # Transpose the list of items into one list per field.
    buffers, stacks, positions, labels = map(list, zip(*batch))
    buf_emb = tokenizer(buffers, **tokenizer_args)
    stk_emb = tokenizer(stacks, **tokenizer_args)
    pos_tensor = tuple_of_tensors_to_tensor(positions)
    label_tensor = torch.tensor(labels)
    return buf_emb, stk_emb, pos_tensor, label_tensor

In [11]:
# Training batches are reshuffled every epoch; validation keeps a fixed order
# and uses a larger batch size.
train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    collate_fn=collate_batch,
)
valid_loader = DataLoader(
    valid_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=collate_batch,
)

In [12]:
# Pull a single collated batch from the training loader (presumably a smoke
# test of collate_batch; `batch` is not used by the later cells shown here).
batch = next(iter(train_loader))

In [13]:
# Hyperparameters handed to ResumeParser.
args = dict(
    positional_dim=32,
    hidden_dim=256,
    classifier_dropout=0.3,
    num_classes=n_classes,
    n_hidden=1,  # total layers: n_hidden + 2
)

In [14]:
# From https://github.com/wzlxjtu/PositionalEncoding2D/blob/master/positionalembedding2d.py
import math
def positionalencoding1d(d_model, length):
    """
    Build a sinusoidal position table.

    Even columns hold the sine and odd columns the cosine of
    ``position * frequency``, with frequencies decaying geometrically
    from 1 towards 1/10000 across the column pairs.

    :param d_model: dimension of the model (must be even)
    :param length: length of positions
    :return: length*d_model position matrix
    :raises ValueError: if ``d_model`` is odd
    """
    if d_model % 2 != 0:
        raise ValueError("Cannot use sin/cos positional encoding with "
                         "odd dim (got dim={:d})".format(d_model))

    # One frequency per (sin, cos) column pair.
    scale = -(math.log(10000.0) / d_model)
    freqs = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float) * scale)

    # Outer product of positions (column vector) and frequencies: length x d_model/2.
    angles = torch.arange(0, length).unsqueeze(1).float() * freqs

    table = torch.zeros(length, d_model)
    table[:, 0::2] = torch.sin(angles)
    table[:, 1::2] = torch.cos(angles)
    return table

In [15]:
import torch.nn as nn
from torch.nn import Linear, Dropout, ReLU, Embedding, CrossEntropyLoss
import math 

class ResumeParser(pl.LightningModule):
    """Classifier over (buffer text, stack text, four positions) examples.

    The buffer and stack strings are each encoded by ``backend``; the two
    pooled vectors are summed, concatenated with the sinusoidal encodings of
    the four positions, and passed through an MLP that produces class logits.
    """

    def __init__(self, backend, args):
        """
        :param backend: encoder exposing ``config.hidden_size`` and returning a
            mapping with a ``pooler_output`` entry when called with tokenizer output
        :param args: dict with ``positional_dim``, ``hidden_dim``,
            ``classifier_dropout`` and ``num_classes``; optional ``n_hidden``
            (number of hidden-to-hidden layers, default 1) and
            ``learning_rate`` (default 1e-3, the AdamW default)
        """
        super().__init__()
        self.backend = backend
        # Keep the values needed later on the module itself so the methods do
        # not depend on a notebook-level `args` / `n_classes` global.
        self.positional_dim = args['positional_dim']
        self.learning_rate = args.get('learning_rate', 1e-3)

        hidden_dim = args['hidden_dim']
        dropout = args['classifier_dropout']

        layers = [
            Linear(in_features = self.backend.config.hidden_size + 4 * self.positional_dim, out_features = hidden_dim),
            Dropout(p = dropout),
            ReLU(),
        ]
        for _ in range(args.get('n_hidden', 1)):
            layers += [
                Linear(in_features = hidden_dim, out_features = hidden_dim),
                Dropout(p = dropout),
                ReLU(),
            ]
        # No dropout after the output layer: zeroing logits at random would
        # corrupt the cross-entropy loss during training.
        layers.append(Linear(in_features = hidden_dim, out_features = args['num_classes']))
        self.classifier = nn.Sequential(*layers)

        # Fixed (non-learned) table with one row per integer position 0..100.
        # Registering it as a non-persistent buffer lets it follow the module
        # across devices without adding a key to the state dict.
        self.register_buffer(
            "pos_embeddings",
            positionalencoding1d(self.positional_dim, 101),
            persistent = False,
        )
        self.metric = Accuracy(task = "multiclass", num_classes = args['num_classes'])
        self.running_loss = None

        self.ce_loss = CrossEntropyLoss()

    def get_logits_and_loss(self, batch):
        """Run the forward pass on a collated batch.

        :param batch: ``(inp_buf, inp_stk, pos, typ)`` as produced by the collate function
        :return: ``(logits, loss)`` where ``loss`` is the cross-entropy of ``logits`` against ``typ``
        """
        inp_buf, inp_stk, pos, typ = batch
        table = self.pos_embeddings
        pos_emb = table[pos.to(table.device)] # B x 4 x D_pos
        pos_emb = pos_emb.flatten(start_dim = 1) # concatenate all positional embeddings: B x (4 * D_pos)
        emb_buf = self.backend(**inp_buf)['pooler_output'] # B x D_backend
        emb_stk = self.backend(**inp_stk)['pooler_output'] # B x D_backend
        classifier_inp = torch.cat((emb_buf + emb_stk, pos_emb), 1) # B x (D_backend + 4 * D_pos)
        logits = self.classifier(classifier_inp)
        loss = self.ce_loss(logits, typ)
        return logits, loss

    def training_step(self, batch, batch_idx):
        """Return the batch loss and update the exponential moving average of it."""
        _, loss = self.get_logits_and_loss(batch)

        # Detach before accumulating: the running average is only reported,
        # and keeping autograd history would chain every step's graph together.
        current = loss.detach()
        if self.running_loss is None:
            self.running_loss = current
        else:
            self.running_loss = 0.95 * self.running_loss + 0.05 * current

        return loss

    def validation_step(self, batch, batch_idx):
        """Log accuracy and loss for one validation batch, plus the running training loss."""
        _, _, _, typ = batch
        logits, loss = self.get_logits_and_loss(batch)
        preds = torch.argmax(logits, dim = 1)

        self.log("Validation Accuracy", self.metric(preds, typ))
        self.log("Validation Loss", loss)
        if self.running_loss is not None:
            self.log("Training Loss", self.running_loss)

    def configure_optimizers(self):
        """AdamW over all parameters (backend included) at ``self.learning_rate``."""
        optimizer = torch.optim.AdamW(self.parameters(), lr = self.learning_rate)
        return optimizer

In [16]:
# Build the LightningModule around the pretrained BERT encoder and the
# hyperparameter dict defined above.
parser = ResumeParser(model, args)

In [17]:
# One torch.cuda.device handle per CUDA device reported by PyTorch.
gpu_count = torch.cuda.device_count()
available_gpus = list(map(torch.cuda.device, range(gpu_count)))
print(available_gpus)

[<torch.cuda.device object at 0x7ff36c226e20>, <torch.cuda.device object at 0x7ff36c226a60>, <torch.cuda.device object at 0x7ff36c226a00>]


In [6]:
# Load the TensorBoard notebook extension and open it on the lightning_logs/ directory.
%load_ext tensorboard
%tensorboard --logdir=lightning_logs/

In [19]:
# Train on the GPU with index 1 only; val_check_interval = 0.5 requests a
# validation pass halfway through each training epoch as well as at its end.
trainer = pl.Trainer(accelerator="gpu", devices=[1], val_check_interval = 0.5)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [20]:
# Start training with the train/validation loaders. No epoch limit was set on
# the Trainer; the recorded run below was stopped by a KeyboardInterrupt.
trainer.fit(parser, train_loader, valid_loader)

  rank_zero_warn(
You are using a CUDA device ('NVIDIA A40') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]

  | Name       | Type               | Params
--------------------------------------------------
0 | backend    | BertModel          | 108 M 
1 | classifier | Sequential         | 296 K 
2 | metric     | MulticlassAccuracy | 0     
3 | ce_loss    | CrossEntropyLoss   | 0     
--------------------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
434.427   Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

  rank_zero_warn(


                                                                           

  rank_zero_warn(


Epoch 7:  67%|██████▋   | 207/311 [03:00<01:30,  1.15it/s, v_num=16]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [None]:
# Observed throughput (it/s) for each n_workers setting (40 not measured yet):
# 0: 1.33
# 16: 1.25
# 40: 