# `experiment.ipynb`
This is the Python notebook for the ML experiment. You'll find the model in `models.py`, the `Dataset` in `data.py`, and some utilities (mainly global variables) in `utils.py`. 

In [1]:
import pandas as pd
import torch
from torchmetrics import Accuracy
import transformers
import lightning.pytorch as pl
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
torch.cuda.is_available()

In [4]:
target_json_path = "../data/json/data_fine.json" #you can change this to whatever you have!

In [6]:

from utils import label_idx, idx_label, all_labels, n_classes
from transformers import BertTokenizerFast, BertModel
from torch.utils.data import DataLoader
from data import ResumeDataset

In [None]:
tokenizer_args = {
    'padding': 'max_length',
    'return_tensors': 'pt',
}


In [7]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

In [9]:
model = BertModel.from_pretrained("bert-base-cased")

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
resume_dataset = ResumeDataset(target_json_path)

train_perc = 0.85
val_perc = 0.1

total_count = len(resume_dataset)
train_count = int(train_perc * total_count) 
valid_count = int(val_perc * total_count)
test_count = total_count - train_count - valid_count

seed = torch.Generator().manual_seed(42) # make things (as) deterministic (as possible)
train_dataset, valid_dataset, test_dataset = torch.utils.data.random_split(
    resume_dataset, (train_count, valid_count, test_count)
)

In [20]:
def tuple_of_tensors_to_tensor(tuple_of_tensors):
    # https://discuss.pytorch.org/t/convert-a-tuple-into-tensor/82964
    return  torch.stack(list(tuple_of_tensors), dim=0)

def collate_batch(batch):
    buf_str, stk_str, pos, sty, label_idx = zip(*batch)
    buf_str = list(buf_str)
    stk_str = list(stk_str)
    buf_emb = tokenizer(buf_str, **tokenizer_args) 
    stk_emb = tokenizer(stk_str, **tokenizer_args) 
    return buf_emb, stk_emb, tuple_of_tensors_to_tensor(pos), tuple_of_tensors_to_tensor(sty), torch.tensor(list(label_idx))

def collate_batch_no_tokenize(batch):
    # The same as the above, but without tokenisation and without using string semantic data
    buf_str, stk_str, pos, sty, label_idx = zip(*batch)
    return None, None, tuple_of_tensors_to_tensor(pos), tuple_of_tensors_to_tensor(sty), torch.tensor(list(label_idx))

In [21]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers = 0, collate_fn = collate_batch_no_tokenize)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False, num_workers = 0, collate_fn = collate_batch_no_tokenize)

In [23]:
args = {
    'positional_dim': 32,
    'hidden_dim': 256,
    'classifier_dropout': 0.3,
    'num_classes': n_classes,
    'use_llm': False,
    'n_hidden': 1, # total layers: n_hidden + 2
}

In [24]:
from model import ResumeParser
parser = ResumeParser(model, args)

Device:  cpu


In [25]:
available_gpus = [torch.cuda.device(i) for i in range(torch.cuda.device_count())]
print(available_gpus)

[<torch.cuda.device object at 0x7fd6873559a0>, <torch.cuda.device object at 0x7fd687355ee0>, <torch.cuda.device object at 0x7fd687355f10>]


In [29]:
%load_ext tensorboard
%tensorboard --logdir=lightning_logs/

In [27]:
trainer = pl.Trainer(accelerator="gpu", devices=[2], val_check_interval = 0.5)
 # You should change this! Multi-GPU training doesn't currently work, but feel free to try (by removing the devices parameter)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [28]:
trainer.fit(parser, train_loader, valid_loader)

You are using a CUDA device ('NVIDIA A40') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]

  | Name       | Type               | Params
--------------------------------------------------
0 | backend    | BertModel          | 108 M 
1 | classifier | Sequential         | 100 K 
2 | metric     | MulticlassAccuracy | 0     
3 | ce_loss    | CrossEntropyLoss   | 0     
--------------------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
433.645   Total estimated model params size (MB)


                                                                           

  rank_zero_warn(


Epoch 44:  27%|██▋       | 138/507 [00:00<00:02, 160.49it/s, v_num=16]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
