In [1]:
import os
import sys
# Make the project root importable so the `src.*` / `config` imports below
# resolve when the kernel is started from inside the `notebooks/` directory.
# Everything before the first 'notebooks' in the working directory is taken
# as the project root.
project_dir = os.getcwd().split('notebooks')[0]
# Guard the append so re-running this cell does not stack duplicate entries.
if project_dir not in sys.path:
    sys.path.append(project_dir)

In [2]:
import random
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from torch import nn
from src.data.data_format import UtteranceCollection
from src.model.classifier import SequenceClassifierModel
from src.data.data_for_training import SequenceClassifierModelDataModule
from tqdm import tqdm
from config import config
from datasets import load_dataset
from src.data.utils import convert_transformers_dataset_to_utterances
from src.model.utils import print_array_stats
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [3]:
def _as_collection(dataset, label):
    """Convert one dataset split to utterances, wrap them and report the size.

    Returns the utterance list together with the `UtteranceCollection`
    built from it, so both stay available to later cells.
    """
    utterances = convert_transformers_dataset_to_utterances(dataset)
    collection = UtteranceCollection(utterances=utterances)
    print(label, len(collection.utterances))
    return utterances, collection


print('############ Preparing data for fine_tuning ################')

# Load the three splits first (train, validation, test — in that order),
# then convert them one after another.
train_dataset, val_dataset, test_dataset = [
    load_dataset('silicone', config.DATASET_NAME, split=split_name)
    for split_name in ('train', 'validation', 'test')
]

train_utterances, train_data = _as_collection(train_dataset, 'Training data:')
val_utterances, val_data = _as_collection(val_dataset, 'Validation data:')
test_utterances, test_data = _as_collection(test_dataset, 'Test data:')

############ Preparing data for fine_tuning ################


Found cached dataset silicone (/home/onyxia/.cache/huggingface/datasets/silicone/dyda_da/1.0.0/af617406c94e3f78da85f7ea74ebfbd3f297a9665cb54adbae305b03bc4442a5)
Found cached dataset silicone (/home/onyxia/.cache/huggingface/datasets/silicone/dyda_da/1.0.0/af617406c94e3f78da85f7ea74ebfbd3f297a9665cb54adbae305b03bc4442a5)
Found cached dataset silicone (/home/onyxia/.cache/huggingface/datasets/silicone/dyda_da/1.0.0/af617406c94e3f78da85f7ea74ebfbd3f297a9665cb54adbae305b03bc4442a5)


######## Converting dataset into a list of utterances ########


100%|██████████| 87170/87170 [00:47<00:00, 1826.82it/s]


################ 87170 formated #################
Training data: 87170
######## Converting dataset into a list of utterances ########


100%|██████████| 8069/8069 [00:04<00:00, 1690.73it/s]


################ 8069 formated #################
Validation data: 8069
######## Converting dataset into a list of utterances ########


100%|██████████| 7740/7740 [00:04<00:00, 1871.99it/s]

################ 7740 formated #################
Test data: 7740





In [4]:
# Seed every RNG *before* building the model: any layer that is not loaded
# from the pretrained checkpoint (e.g. the `classifier` Linear head shown in
# the fit summary) is initialised at construction time, so seeding only later
# would leave its starting weights different on every run.
pl.seed_everything(config.SEED)
model = SequenceClassifierModel(config=config)




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# Bundle the three utterance collections with the model's own tokenizer into
# the data module that is later handed to `trainer.fit`.
data = SequenceClassifierModelDataModule(
    train_data,
    val_data,
    test_data,
    model.tokenizer,
    batch_size=config.BATCH_SIZE,
)

In [6]:
# Keep a single checkpoint: the one with the lowest monitored 'val_loss'.
checkpoint_callback = ModelCheckpoint(
    # where and under which name the checkpoint is written
    dirpath=config.DIR_CHECKPOINTS,
    filename=config.FILENAME,
    # selection criterion
    monitor='val_loss',
    mode='min',
    save_top_k=1,
    verbose=True,
)
        
# The logger is given name='logs' separately, so 'logs' is stripped from
# config.LOGS to obtain the directory to save under.
# NOTE(review): str.replace removes *every* occurrence of 'logs' in the path,
# not just a trailing component — confirm config.LOGS contains it only once.
log_root = config.LOGS.replace('logs', '')
logger = TensorBoardLogger(save_dir=log_root, name='logs', version=config.VERSION)

In [7]:
# `gpus=` is deprecated (constructing the Trainer with it emits a deprecation
# warning); select the same single GPU (index 0) via `accelerator`/`devices`.
trainer = pl.Trainer(
    callbacks=[checkpoint_callback],
    max_epochs=config.NUM_EPOCHS,
    accelerator='gpu',
    devices=[0],
    strategy='dp',
    logger=logger,
)

# Fix the global random seed right before training so runs are repeatable.
pl.seed_everything(config.SEED)

  rank_zero_deprecation(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Global seed set to 42


42

In [8]:
# Run fine-tuning of `model` on `data`. The ModelCheckpoint callback attached
# to the trainer monitors 'val_loss', and metrics go to the TensorBoard logger.
trainer.fit(model, data)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type             | Params
------------------------------------------------
0 | classifier | Linear           | 3.1 K 
1 | encoder    | BertModel        | 109 M 
2 | loss_fct   | CrossEntropyLoss | 0     
------------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
437.941   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
