In [1]:
import pandas as pd
import torch
from torchmetrics import Accuracy
import transformers
import lightning.pytorch as pl
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os

# Turn off the tokenizers library's own parallelism via its environment switch.
os.environ.update(TOKENIZERS_PARALLELISM="false")

In [3]:
# Confirm that PyTorch can see a CUDA device before GPU training is configured.
torch.cuda.is_available()


True

In [4]:
# Location of the JSON dataset; read below by both pandas and ResumeDataset.
target_json_path = "../data/json/data_fine.json"

In [5]:

from utils import label_idx, idx_label, all_labels, n_classes

# Load the dataset into a DataFrame and print one record as a sanity check.
df = pd.read_json(target_json_path)
print(df.iloc[1])


# Show the label <-> index mappings imported from utils.
print(label_idx, idx_label, all_labels)

buf_str    (January 31, 2021)
lbuf                43.235294
rbuf                57.215686
hbuf                       15
stk_str    James Hye Suk Yoon
lstk                41.098039
rstk                59.392157
hstk                       16
type                  discard
Name: 1, dtype: object
{'discard': 0, 'merge': 1, 'pop': 2, 'subordinate': 3} {0: 'discard', 1: 'merge', 2: 'pop', 3: 'subordinate'} ['discard', 'merge', 'pop', 'subordinate']


In [6]:
from transformers import BertTokenizerFast, BertModel

In [7]:
# Fast tokenizer for the "bert-base-cased" checkpoint (same name as the model loaded below).
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

In [8]:

from torch.utils.data import DataLoader
from data import ResumeDataset

# Keyword arguments forwarded to every tokenizer call in the collate function.
tokenizer_args = {
    # Pad every sequence to the model's maximum length so rows share one width.
    'padding': 'max_length',
    # Without truncation, a text longer than the model's maximum length would
    # come out longer than the padded rows, so the batch could not be converted
    # into a single rectangular tensor.
    'truncation': True,
    # Return PyTorch tensors rather than Python lists.
    'return_tensors': 'pt',
}

In [9]:
# Pretrained BERT encoder; the checkpoint's pre-training head weights are not loaded (see warning below).
model = BertModel.from_pretrained("bert-base-cased")

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# Split the dataset into train / validation / test subsets (85% / 10% / remainder).
resume_dataset = ResumeDataset(target_json_path)
total_count = len(resume_dataset)
train_count = int(0.85 * total_count)
valid_count = int(0.1 * total_count)
# The test split takes whatever is left, so the three sizes always sum to total_count.
test_count = total_count - train_count - valid_count

# Seeded generator: it must be handed to random_split, otherwise the split uses
# the global RNG and changes from run to run.
seed = torch.Generator().manual_seed(42)
train_dataset, valid_dataset, test_dataset = torch.utils.data.random_split(
    resume_dataset, (train_count, valid_count, test_count), generator=seed
)

In [21]:
def tuple_of_tensors_to_tensor(tuple_of_tensors):
    """Stack an iterable of tensors into one tensor along a new leading axis.

    Reference: https://discuss.pytorch.org/t/convert-a-tuple-into-tensor/82964
    """
    # Materialise first so that any iterable (tuple, list, generator) is accepted.
    tensors = [tensor for tensor in tuple_of_tensors]
    return torch.stack(tensors)

def collate_batch(batch):
    """Collate dataset samples into one batch, tokenizing both text fields.

    Each sample is unpacked as ``(buf_str, stk_str, pos, label_idx)``.

    Returns a 4-tuple: the tokenizer output for the ``buf_str`` texts, the
    tokenizer output for the ``stk_str`` texts, the ``pos`` tensors stacked
    along a new leading axis, and a tensor built from the label indices.
    """
    # Transpose the list of samples into one list per field.
    buf_texts, stk_texts, positions, labels = (list(field) for field in zip(*batch))
    buf_encoded = tokenizer(buf_texts, **tokenizer_args)
    stk_encoded = tokenizer(stk_texts, **tokenizer_args)
    stacked_positions = tuple_of_tensors_to_tensor(positions)
    return buf_encoded, stk_encoded, stacked_positions, torch.tensor(labels)

def collate_batch_no_tokenize(batch):
    """Collate dataset samples into one batch without tokenizing the text fields.

    Each sample is unpacked as ``(buf_str, stk_str, pos, label_idx)``; the two
    text fields are discarded and ``None`` is returned in their places so the
    result has the same 4-tuple layout as ``collate_batch``.
    """
    _, _, positions, labels = zip(*batch)
    stacked_positions = tuple_of_tensors_to_tensor(positions)
    label_tensor = torch.tensor(list(labels))
    return None, None, stacked_positions, label_tensor

In [22]:
# Both loaders use the collate function that skips tokenization.
train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    num_workers=0,
    collate_fn=collate_batch_no_tokenize,
)
valid_loader = DataLoader(
    valid_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=0,
    collate_fn=collate_batch_no_tokenize,
)

In [23]:
# Pull a single batch from the training loader for inspection.
batch = next(iter(train_loader))

In [24]:
# Hyperparameters handed to ResumeParser.
args = dict(
    positional_dim=32,
    hidden_dim=256,
    classifier_dropout=0.3,
    num_classes=n_classes,
    use_llm=False,
    n_hidden=1,  # total layers: n_hidden + 2
)

In [25]:
from model import ResumeParser
# Build the parser from the pretrained BERT model and the hyperparameter dict defined above.
parser = ResumeParser(model, args)

Device:  cpu


In [26]:
# One torch.cuda.device object per CUDA device index that PyTorch reports.
gpu_count = torch.cuda.device_count()
available_gpus = list(map(torch.cuda.device, range(gpu_count)))
print(available_gpus)

[<torch.cuda.device object at 0x7fc68017e610>, <torch.cuda.device object at 0x7fc68017efd0>, <torch.cuda.device object at 0x7fc6801f7d90>]


In [27]:
# Load the TensorBoard notebook extension and open it on the lightning_logs/ directory.
%load_ext tensorboard
%tensorboard --logdir=lightning_logs/

In [28]:
# Train on the GPU with index 2; a fractional val_check_interval of 0.5 runs
# validation twice per training epoch.
trainer = pl.Trainer(
    accelerator="gpu",
    devices=[2],
    val_check_interval=0.5,
)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [29]:
# Fit the parser, validating on valid_loader.
# NOTE(review): the recorded run aborts during the sanity-check validation pass with
# "indices should be either on cpu or on the same device as the indexed tensor (cpu)".
# Presumably a tensor inside ResumeParser stays on the CPU while the batch is moved to
# the GPU -- verify in model.py (TODO confirm).
trainer.fit(parser, train_loader, valid_loader)

  rank_zero_warn(
You are using a CUDA device ('NVIDIA A40') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]

  | Name       | Type               | Params
--------------------------------------------------
0 | backend    | BertModel          | 108 M 
1 | classifier | Sequential         | 99.8 K
2 | metric     | MulticlassAccuracy | 0     
3 | ce_loss    | CrossEntropyLoss   | 0     
--------------------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
433.640   Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

RuntimeError: indices should be either on cpu or on the same device as the indexed tensor (cpu)