In [None]:
import pandas as pd 
import torch
from torchmetrics import Accuracy
import transformers
import lightning.pytorch as pl
from tqdm import tqdm

In [3]:
# Check whether PyTorch can see a CUDA device; the Trainer created further down requests accelerator="gpu".
torch.cuda.is_available()

True

In [4]:
# Load the labelled examples and print one row as a quick sanity check.
df = pd.read_json("../data/json/data_fine.json")
print(df.iloc[1])

# Distinct values of the "type" column, in sorted order so the indices are stable across runs.
all_labels = sorted(set(df["type"]))
n_classes = len(all_labels)

# Forward (label -> index) and reverse (index -> label) lookup tables.
label_idx = dict(zip(all_labels, range(n_classes)))
idx_label = dict(enumerate(all_labels))

buf_str    (January 31, 2021)
stk_str    James Hye Suk Yoon
type                  discard
Name: 1, dtype: object


In [5]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class ResumeDataset(Dataset):
    """Map-style dataset over a JSON file with ``buf_str``, ``stk_str`` and ``type`` columns.

    Each item is a ``(buf_str, stk_str, label_index)`` triple.

    Args:
        data_dir: Passed unchanged to ``pd.read_json`` (despite the name it is
            used as the JSON source, not as a directory).
        label_map: Optional mapping from a ``type`` value to its integer class
            index. When omitted, the notebook-level ``label_idx`` mapping is
            used, which is the behaviour this class had before the argument
            existed.
    """

    def __init__(self, data_dir, label_map=None):
        self.df = pd.read_json(data_dir)
        self.label_map = label_map

    def __len__(self):
        """Return the number of rows loaded from the JSON file."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the two text fields and the integer label of row ``idx``."""
        row = self.df.iloc[idx]
        # Resolve the mapping lazily so the global only has to exist when it is
        # actually needed (i.e. when no explicit label_map was supplied).
        mapping = self.label_map if self.label_map is not None else label_idx
        # Use explicit column indexing: attribute access (row.type) would be
        # shadowed if a column name ever collided with a Series attribute.
        return row["buf_str"], row["stk_str"], mapping[row["type"]]


In [18]:
from transformers import BartTokenizerFast
from transformers import BartForSequenceClassification

In [7]:
# Load the fast BART tokenizer for "facebook/bart-base", the same checkpoint the classifier is created from.
tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-base")

In [8]:
# Build the full dataset and split it 70% / 20% / remainder into train / validation / test.
resume_dataset = ResumeDataset("../data/json/data_fine.json")
total_count = len(resume_dataset)
train_count = int(0.7 * total_count)
valid_count = int(0.2 * total_count)
# The test split takes whatever is left so the three sizes always sum to total_count.
test_count = total_count - train_count - valid_count

# The seeded generator has to be handed to random_split; otherwise the split is
# drawn from the global RNG and changes from run to run.
seed = torch.Generator().manual_seed(42)
train_dataset, valid_dataset, test_dataset = torch.utils.data.random_split(
    resume_dataset,
    (train_count, valid_count, test_count),
    generator=seed,
)

In [19]:
# Training batches are reshuffled every epoch. Validation is kept in a fixed
# order: shuffling evaluation data brings no benefit and makes runs harder to compare.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=16, shuffle=False)

In [20]:
class ResumeParser(pl.LightningModule):
    """LightningModule that trains a sequence classifier on ``(buf_str, stk_str)`` text pairs.

    A batch is expected to unpack into ``(buf, stk, typ)``: two sequences of
    strings and the labels handed to the model as ``labels=``.

    Args:
        model: Called as ``model(**inputs, labels=typ)``; its output must expose
            ``.loss`` and ``.logits``.
        tokenizer: Called on a list of ``[buf, stk]`` string pairs.
        learning_rate: Step size passed to Adam. The default of ``1e-3`` matches
            Adam's own default, i.e. the value used before this was configurable.
            NOTE(review): fine-tuning a pretrained transformer usually uses a much
            smaller value -- confirm the intended setting.
    """

    def __init__(self, model, tokenizer, learning_rate=1e-3):
        super().__init__()
        self.model = model
        self.tokenizer = tokenizer
        self.learning_rate = learning_rate
        self.metric = Accuracy(task="multiclass", num_classes=n_classes)

    def _run_batch(self, batch):
        """Tokenize one batch and run the model; return ``(model_output, labels)``."""
        buf, stk, typ = batch
        pairs = [[a, b] for a, b in zip(buf, stk)]
        # truncation=True clips pairs that exceed the tokenizer's maximum length
        # instead of feeding over-long sequences to the model.
        inputs = self.tokenizer(
            pairs, return_tensors="pt", padding=True, truncation=True
        ).to(self.device)
        return self.model(**inputs, labels=typ), typ

    def training_step(self, batch, batch_idx):
        """Return the model's loss for one training batch."""
        output, _ = self._run_batch(batch)
        return output.loss

    def validation_step(self, batch, batch_idx):
        """Log accuracy and loss for one validation batch."""
        output, typ = self._run_batch(batch)
        preds = output.logits.argmax(dim=-1)

        # The batch also contains strings, so state the batch size explicitly
        # rather than relying on Lightning to infer it.
        batch_size = typ.size(0)

        # Update the metric state, then log the Metric object itself so that
        # Lightning computes the epoch-level value and resets it between epochs.
        self.metric(preds, typ)
        self.log("accuracy", self.metric, on_step=False, on_epoch=True, batch_size=batch_size)
        self.log("val_loss", output.loss, batch_size=batch_size)

    def configure_optimizers(self):
        """Return an Adam optimizer over the wrapped model's parameters."""
        return torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)

In [21]:
# Create BART with a sequence-classification head that has one output per distinct label (n_classes).
# The classification head weights are newly initialised (see the warning printed below), so the model must be trained before use.
model = BartForSequenceClassification.from_pretrained("facebook/bart-base", num_labels=n_classes)

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.out_proj.bias', 'classification_head.dense.weight', 'classification_head.out_proj.weight', 'classification_head.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Wrap the model and tokenizer in the LightningModule that the Trainer will fit.
parser = ResumeParser(model, tokenizer)

In [23]:
# Smoke test on CPU: take a single training batch, tokenize it the same way the
# LightningModule does, and run one forward pass through the bare model.
buf, stk, typ = next(iter(train_loader))
strs = [list(pair) for pair in zip(buf, stk)]
inputs = tokenizer(strs, padding=True, return_tensors="pt").to("cpu")

# Show where the model, the token ids and the labels live before the forward pass.
print(
    "MODELDEV:", model.device,
    "INPUTDEV: ", inputs["input_ids"].get_device(),
    "LABELDEV: ", typ.get_device(),
)

output = model(labels=typ, **inputs)

MODELDEV: cpu INPUTDEV:  -1 LABELDEV:  -1


In [24]:
# One torch.cuda.device object per CUDA device index reported by PyTorch.
available_gpus = list(map(torch.cuda.device, range(torch.cuda.device_count())))
print(available_gpus)

[<torch.cuda.device object at 0x7efc507082e0>, <torch.cuda.device object at 0x7efc50708970>, <torch.cuda.device object at 0x7efc5324af10>]


In [25]:
# Train on a single GPU, validating with valid_loader.
# NOTE(review): no max_epochs or early-stopping callback is passed here -- confirm the intended stopping criterion.
trainer = pl.Trainer(devices=1, accelerator="gpu")
trainer.fit(
    model=parser,
    train_dataloaders=train_loader,
    val_dataloaders=valid_loader,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]

  | Name  | Type                          | Params
--------------------------------------------------------
0 | model | BartForSequenceClassification | 140 M 
--------------------------------------------------------
140 M     Trainable params
0         Non-trainable params
140 M     Total params
560.056   Total estimated model params size (MB)


Epoch 6:  55%|█████▍    | 458/835 [00:44<00:36, 10.32it/s, v_num=1]        

In [None]:
# (Re)load the TensorBoard notebook extension and open it on the lightning_logs/ directory.
%reload_ext tensorboard
%tensorboard --logdir=lightning_logs/