In [1]:
import pandas as pd 
import torch
from torchmetrics import Accuracy
import transformers
import lightning.pytorch as pl
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.cuda.is_available()

True

In [3]:
df = pd.read_json("../data/json/data_fine.json")
print(df.iloc[1])

all_labels = sorted(list(set(df['type'].tolist())))
n_classes = len(all_labels)
label_idx = {lab: i for i, lab in enumerate(all_labels)}
idx_label = {i: lab for i, lab in enumerate(all_labels)}

buf_str    (January 31, 2021)
stk_str    James Hye Suk Yoon
type                  discard
Name: 1, dtype: object


In [17]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class ResumeDataset(Dataset):
    def __init__(self, data_dir):
        self.df = pd.read_json(data_dir)
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        element = self.df.iloc[idx]
        return element.buf_str, element.stk_str, label_idx[element.type]


In [18]:
from transformers import BertTokenizerFast, BertForSequenceClassification

In [19]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

In [20]:
model = BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=n_classes, classifier_dropout = 0.3)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [31]:
resume_dataset = ResumeDataset("../data/json/data_fine.json")
total_count = len(resume_dataset)
train_count = int(0.7 * total_count)
valid_count = int(0.2 * total_count)
test_count = total_count - train_count - valid_count

seed = torch.Generator().manual_seed(42)
train_dataset, valid_dataset, test_dataset = torch.utils.data.random_split(
    resume_dataset, (train_count, valid_count, test_count)
)

In [32]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=128, shuffle=True)

In [35]:
class ResumeParser(pl.LightningModule):
    def __init__(self, model, tokenizer):
        super().__init__()
        self.model = model
        self.tokenizer = tokenizer
        self.metric = Accuracy(task = "multiclass", num_classes = n_classes)
        self.running_loss = None
        
    def training_step(self, batch, batch_idx):
        buf, stk, typ = batch 
        strs = [[a, b] for a, b in zip(buf, stk)]
        inputs = self.tokenizer(strs, return_tensors="pt", padding=True).to(self.device)
        output = self.model(**inputs, labels = typ)
        
        if self.running_loss == None:
            self.running_loss = output.loss
        self.running_loss = 0.95 * self.running_loss + 0.05 * output.loss

        return output.loss

    def validation_step(self, batch, batch_idx):
        
        buf, stk, typ = batch 
        strs = [[a, b] for a, b in zip(buf, stk)]
        inputs = self.tokenizer(strs, return_tensors="pt", padding=True).to(self.device)
        output = self.model(**inputs, labels = typ)
        logits = output.logits 
        preds = logits.argmax(dim = -1)
        
        self.log("accuracy", self.metric(preds, typ))
        self.log("val_loss", output.loss)
        if self.running_loss is not None:
            self.log("train_loss", self.running_loss)


    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters())
        return optimizer

In [36]:
parser = ResumeParser(model, tokenizer)

In [12]:
buf, stk, typ = next(iter(train_loader)) 
strs = [[a, b] for a, b in zip(buf, stk)]
inputs = tokenizer(strs, return_tensors="pt", padding=True).to('cpu')

print("MODELDEV:", model.device, "INPUTDEV: ", inputs['input_ids'].get_device(), "LABELDEV: ", typ.get_device())

output = model(**inputs, labels = typ)

MODELDEV: cpu INPUTDEV:  -1 LABELDEV:  -1


In [13]:
available_gpus = [torch.cuda.device(i) for i in range(torch.cuda.device_count())]
print(available_gpus)

[<torch.cuda.device object at 0x7f63442417f0>, <torch.cuda.device object at 0x7f6344241160>, <torch.cuda.device object at 0x7f6344241be0>]


In [37]:
%reload_ext tensorboard
%tensorboard --logdir=lightning_logs/

In [39]:
trainer = pl.Trainer(accelerator="gpu", devices=1)
trainer.fit(parser, train_loader, valid_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]

  | Name   | Type                          | Params
---------------------------------------------------------
0 | model  | BertForSequenceClassification | 108 M 
1 | metric | MulticlassAccuracy            | 0     
---------------------------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
433.253   Total estimated model params size (MB)


Epoch 32:  10%|▉         | 20/209 [00:03<00:32,  5.90it/s, v_num=1]        

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
