In [76]:
import pandas as pd 
import torch
import transformers
from tqdm import tqdm

In [77]:
torch.cuda.is_available()

True

In [78]:
df = pd.read_json("../data/json/data_fine.json")
print(df.iloc[1])

all_labels = sorted(list(set(df['type'].tolist())))
label_idx = {lab: i for i, lab in enumerate(all_labels)}
idx_label = {i: lab for i, lab in enumerate(all_labels)}

buf_str    (January 31, 2021)
stk_str    James Hye Suk Yoon
type                  discard
Name: 1, dtype: object


In [91]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class ResumeDataset(Dataset):
    def __init__(self, data_dir):
        self.df = pd.read_json(data_dir)
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        element = self.df.iloc[idx]
        return element.buf_str, element.stk_str, label_idx[element.type]


In [80]:
from transformers import BartTokenizerFast

In [81]:
tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-base")

In [92]:
resume_dataset = ResumeDataset("../data/json/data_fine.json")
total_count = len(resume_dataset)
train_count = int(0.7 * total_count)
valid_count = int(0.2 * total_count)
test_count = total_count - train_count - valid_count

seed = torch.Generator().manual_seed(42)
train_dataset, valid_dataset, test_dataset = torch.utils.data.random_split(
    resume_dataset, (train_count, valid_count, test_count)
)

In [93]:
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=16, shuffle=True)

In [84]:

from transformers import BartForSequenceClassification

In [85]:
import lightning.pytorch as pl

In [107]:
class ResumeParser(pl.LightningModule):
    def __init__(self, model, tokenizer):
        super().__init__()
        self.model = model
        self.tokenizer = tokenizer
        
    def training_step(self, batch, batch_idx):
        print("MODELDEV:", self.model.device)
        buf, stk, typ = batch 
        strs = [[a, b] for a, b in zip(buf, stk)]
        inputs = self.tokenizer(strs, return_tensors="pt", padding=True).to(self.device)
        output = self.model(**inputs, labels = typ)

        return output.loss

    def validation_step(self, batch, batch_idx):
        
        buf, stk, typ = batch 
        strs = [[a, b] for a, b in zip(buf, stk)]
        inputs = self.tokenizer(strs, return_tensors="pt", padding=True).to(self.device)
        print("MODELDEV:", self.model.device, "INPUTDEV: ", inputs['input_ids'].get_device(), "LABELDEV: ", typ.get_device())
        output = self.model(**inputs, labels = typ)

        self.log("val_loss", output.loss)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.model.parameters())
        return optimizer

In [101]:
model = BartForSequenceClassification.from_pretrained("facebook/bart-base", num_labels=len(all_labels))

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.dense.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [105]:
parser = ResumeParser(model, tokenizer)

In [108]:
buf, stk, typ = next(iter(train_loader)) 
strs = [[a, b] for a, b in zip(buf, stk)]
inputs = tokenizer(strs, return_tensors="pt", padding=True).to('cpu')

print("MODELDEV:", model.device, "INPUTDEV: ", inputs['input_ids'].get_device(), "LABELDEV: ", typ.get_device())

output = model(**inputs, labels = typ)

MODELDEV: cpu INPUTDEV:  -1 LABELDEV:  -1


In [95]:
available_gpus = [torch.cuda.device(i) for i in range(torch.cuda.device_count())]
print(available_gpus)

[<torch.cuda.device object at 0x7fa274f3a640>, <torch.cuda.device object at 0x7fa274f3a850>, <torch.cuda.device object at 0x7fa274f3a940>]


In [106]:
trainer = pl.Trainer(accelerator="gpu", devices=1)
trainer.fit(parser, train_loader, valid_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA A40') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]

  | Name  | Type                          | Params
--------------------------------------------------------
0 | model | BartForSequenceClassification | 140 M 
--------------------------------------------------------
140 M     Trainable params
0         Non-trainable params
140 M     Total params
560.056   Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]MODELDEV: cuda:0 INPUTDEV:  -1 LABELDEV:  0


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)