# Import libraries

In [1]:
import warnings
warnings.simplefilter("ignore")

from datasets import load_dataset
import pandas as pd
import transformers as T
from transformers import DataCollatorWithPadding
from sklearn.model_selection import StratifiedKFold
from lightning.pytorch.loggers import CSVLogger
from collections import OrderedDict
import torch

In [2]:
# Shell escape: print the visible NVIDIA GPU(s), driver/CUDA versions and current memory usage.
! nvidia-smi

Sun Aug 27 19:43:32 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.86.10              Driver Version: 535.86.10    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       On  | 00000000:00:04.0 Off |                    0 |
| N/A   50C    P8              11W /  70W |    105MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# Load dataset

In [3]:
# Load the "imdb" dataset; its "train" and "test" splits are converted to pandas below.
imdb = load_dataset("imdb")

# Prepare dataset

In [4]:
# Export each split with the dataset's own pandas converter instead of letting
# pd.DataFrame iterate over the examples one by one.
df_train = imdb["train"].to_pandas()
df_test = imdb["test"].to_pandas()

# The dataset classes below read X["text"], so the features stay DataFrames with
# a single "text" column; the target is the "label" column.
X = df_train[["text"]]
y = df_train["label"]
X_test = df_test[["text"]]

# The model below uses one logit with BCEWithLogitsLoss, which needs 0/1 targets.
assert set(y.unique()) <= {0, 1}, "expected binary labels"

# Define pipeline

## initiate pipeline

In [5]:
from imker import Pipeline, Task, TaskConfig, BaseSplitter, BaseModel
# Create the pipeline. The same repo_dir / exp_name / pipeline_name triple is passed
# to Pipeline.load in the "Reproduce Pipeline" section to restore this pipeline.
pipe = Pipeline(repo_dir="../../cache", exp_name="imdb", pipeline_name="example")

## define validation

In [6]:
class Splitter(BaseSplitter):
    """Stratified 5-fold cross-validation splitter used by the pipeline.

    Wraps scikit-learn's ``StratifiedKFold`` in an imker ``Task``.
    """

    def __init__(self):
        # With shuffle=True and no random_state, StratifiedKFold draws from
        # NumPy's global random state, so the folds can change between calls.
        # Pinning the seed keeps the fold assignment deterministic.
        self.splitter = Task(TaskConfig(task=StratifiedKFold,
                                        init_params={"n_splits": 5,
                                                     "shuffle": True,
                                                     "random_state": 42}))

    def get_n_splits(self):
        """Return the number of folds reported by the wrapped splitter task."""
        return self.splitter.get_n_splits()

    def split(self, X, y=None):
        """Split the data by calling the wrapped splitter task with ``X`` and ``y``."""
        return self.splitter(X, y)

In [7]:
# Register the splitter; note that the class itself is passed, not an instance.
pipe.set_splitter(Splitter)

# define dataset class

In [8]:
class Tokenizer:
    """Encode one text (and optionally its label) into an ordered dict of tensors.

    Parameters
    ----------
    tokenizer : callable
        Called as ``tokenizer(text, truncation=..., return_attention_mask=True,
        return_tensors="pt")``; must return a mapping of tensors.
    truncation : bool, default True
        Forwarded unchanged to ``tokenizer``.
    cast_type : str, default "float"
        Name of the ``torch`` attribute (a dtype) the label is cast to.
    """

    def __init__(self,
                 tokenizer,
                 truncation=True,
                 cast_type="float"):

        self.truncation = truncation
        self.cast_type = getattr(torch, cast_type)
        self.tokenizer = tokenizer

    def tokenize(self, X:str, y=None):
        """Tokenize ``X`` and, when ``y`` is given, attach it under ``"labels"``.

        Returns
        -------
        OrderedDict
            The tokenizer's outputs without their leading batch axis, followed
            by ``"labels"`` (cast to ``self.cast_type``) when ``y`` is not None.
        """
        batch = self.tokenizer(
            X,
            truncation=self.truncation,
            return_attention_mask=True,
            return_tensors="pt",
        )

        # Remove only the leading batch axis. A bare squeeze() would also drop
        # the sequence axis of a one-token encoding and leave a 0-dim tensor.
        encode = OrderedDict((k, v.squeeze(0)) for k, v in batch.items())

        if y is not None:
            # Labels are appended last and keep the full squeeze().
            encode["labels"] = torch.as_tensor(y).to(self.cast_type).squeeze()

        return encode


In [9]:
class TrainDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding tokenized texts together with their labels."""

    def __init__(self,
                 X:pd.DataFrame,
                 y:pd.Series,
                 tokenizer
                 ):
        # Wrap the raw tokenizer in the notebook's Tokenizer helper.
        self.tokenizer = Tokenizer(tokenizer)

        # NumPy arrays are indexed by position in __getitem__.
        texts = X["text"].to_numpy()
        self.data = texts
        self.len_data = len(texts)

        self.labels = y.to_numpy()

    def __len__(self):
        """Number of texts in the dataset."""
        return self.len_data

    def __getitem__(self, index):
        """Return the encoded text at ``index``, including its label, as a plain dict."""
        text, target = self.data[index], self.labels[index]
        return dict(self.tokenizer.tokenize(text, target))

class TestDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding tokenized texts without labels."""

    def __init__(self,
                 X:pd.DataFrame,
                 tokenizer
                 ):
        # Wrap the raw tokenizer in the notebook's Tokenizer helper.
        self.tokenizer = Tokenizer(tokenizer)

        # NumPy array is indexed by position in __getitem__.
        texts = X["text"].to_numpy()
        self.data = texts
        self.len_data = len(texts)

    def __len__(self):
        """Number of texts in the dataset."""
        return self.len_data

    def __getitem__(self, index):
        """Return the encoded text at ``index`` as a plain dict (no labels)."""
        return dict(self.tokenizer.tokenize(self.data[index]))

# define model

In [10]:
from imker.adapter.lightning import BaseLightningModule

class TransformerModelForSequence(BaseLightningModule):
    """Pretrained transformer encoder followed by dropout and a linear head.

    Parameters
    ----------
    model_name : str
        Checkpoint name passed to ``transformers.AutoModel.from_pretrained``.
    num_labels : int
        Output size of the linear head.
    dropout_ratio : float, default 0.1
        Dropout probability applied to the pooled representation.
    **kwargs
        Forwarded to ``BaseLightningModule.__init__``.
    """

    def __init__(self,
                 model_name,
                 num_labels,
                 dropout_ratio=0.1,
                 **kwargs):
        super().__init__(**kwargs)

        self.num_labels = num_labels
        self.model = T.AutoModel.from_pretrained(model_name)
        self.dropout = torch.nn.Dropout(dropout_ratio)
        self.head = torch.nn.Linear(self.model.config.hidden_size, self.num_labels)

    def forward(self, X):
        """Return logits for a batch holding ``input_ids`` and ``attention_mask``.

        The result keeps the batch axis; the trailing label axis is dropped
        when ``num_labels == 1``.
        """
        out = self.model(input_ids=X["input_ids"],
                         attention_mask=X["attention_mask"])
        # First element of the encoder output, restricted to the first token
        # position of every sequence.
        out = out[0][:, 0, :]
        out = self.dropout(out)
        # Squeeze only the label axis. A bare squeeze() would also collapse the
        # batch axis of a single-example batch (e.g. the last, incomplete one),
        # giving a 0-dim tensor that no longer matches the (1,)-shaped labels.
        out = self.head(out).squeeze(-1)
        return out

    def predict_step(self, batch, batch_idx: int, dataloader_idx: int = 0):
        """Return hard 0/1 predictions: 1 where sigmoid(logit) >= 0.5, else 0."""
        out = self(batch)
        return torch.where(torch.sigmoid(out) >= 0.5, 1, 0)

    def compute_loss(self, batch):
        """Compute the loss for ``batch``.

        ``"labels"`` is popped from ``batch`` (the mapping is modified in
        place); the remaining entries are fed to ``forward``.
        """
        y = batch.pop("labels")
        X = batch
        out = self.forward(X)
        # self.loss is not defined in this class; presumably it is created by
        # BaseLightningModule from the ``loss`` init parameter -- TODO confirm.
        loss = self.loss(out, y)
        return loss

In [11]:
from imker.adapter.lightning import LightningTask

class Classifier(BaseModel):
    """Pipeline model: fine-tunes DistilBERT through an imker ``LightningTask``."""

    def __init__(self):

        self.transformer_model = "distilbert-base-uncased"
        self.EPOCHS = 3
        self.BATCH_SIZE = 16
        # 4/5 * 25000 is the size of one training fold (5-fold split of the
        # 25000 training reviews), so this evaluates to 3 * 1250 = 3750.
        # NOTE(review): limit_train_batches=0.1 below runs only a tenth of the
        # batches per epoch, so T_max may exceed the number of scheduler steps
        # actually taken -- confirm how the scheduler is stepped.
        batches_per_epoch = int(4/5 * 25000 // self.BATCH_SIZE)
        self.N_ITERATIONS = self.EPOCHS * batches_per_epoch

        tokenizer = T.AutoTokenizer.from_pretrained(self.transformer_model)
        # Mark the fast-tokenizer padding notice as already shown.
        tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True

        # Arguments used to build TransformerModelForSequence.
        module_params = dict(
            model_name=self.transformer_model,
            num_labels=1,
            loss=torch.nn.BCEWithLogitsLoss,
            optimizer=torch.optim.AdamW,
            optimizer_params=dict(
                lr=1e-5
            ),
            lr_scheduler=torch.optim.lr_scheduler.CosineAnnealingLR,
            lr_scheduler_params=dict(T_max=self.N_ITERATIONS),
        )

        # Arguments of the LightningTask itself (key order kept as before).
        task_params = dict(
            model=TransformerModelForSequence,
            train_dataset=TrainDataset,
            valid_dataset=TrainDataset,
            test_dataset=TestDataset,
            model_init_params=module_params,
            train_dataset_params=dict(tokenizer=tokenizer),
            valid_dataset_params=dict(tokenizer=tokenizer),
            test_dataset_params=dict(tokenizer=tokenizer),
            epochs=self.EPOCHS,
            batch_size=self.BATCH_SIZE,
            early_stopping_round=5,
            collate_fn=DataCollatorWithPadding,
            collate_fn_params=dict(tokenizer=tokenizer),
            limit_train_batches=0.1,  # just for tutorial
            loader_num_workers=0,
            checkpoint_dir="../../checkpoint",
            logger=CSVLogger,
            logger_params=dict(
                save_dir="../../checkpoint"
            ),
        )

        task_config = TaskConfig(task=LightningTask,
                                 init_params=task_params,
                                 cache=True)
        self.model = Task(task_config)

    def forward(self, X, y=None, eval_set=None):
        """Call the wrapped task and return its result under the ``"transformer"`` key."""
        result = self.model(X, y, eval_set=eval_set)
        return {"transformer": result}


In [12]:
# Register the model; as with the splitter, the class itself is passed, not an instance.
pipe.set_model(Classifier)

# train

In [13]:
# Train on the training texts and labels. In the recorded run this performed one
# LightningTask fit per fold (five fits, about 4 minutes each on a T4).
pipe.train(X, y)

StratifiedKFold                :           split process takes 0.0000 [sec]


Global seed set to 42
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type              | Params
----------------------------------------------
0 | loss    | BCEWithLogitsLoss | 0     
1 | model   | DistilBertModel   | 66.4 M
2 | dropout | Dropout           | 0     
3 | head    | Linear            | 769   
----------------------------------------------
66.4 M    Trainable params
0         Non-trainable params
66.4 M    Total params
265.455   Total estimated model params size (MB)


Epoch 2: 100%|██████████| 125/125 [01:17<00:00,  1.61it/s, v_num=0, train_loss_step=0.234, valid_loss_step=0.400] 

`Trainer.fit` stopped: `max_epochs=3` reached.


Epoch 2: 100%|██████████| 125/125 [01:19<00:00,  1.56it/s, v_num=0, train_loss_step=0.234, valid_loss_step=0.400]
LightningTask                  :             fit process takes 238.4449 [sec]


Global seed set to 42
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type              | Params
----------------------------------------------
0 | loss    | BCEWithLogitsLoss | 0     
1 | model   | DistilBertModel   | 66.4 M
2 | dropout | Dropout           | 0     
3 | head    | Linear            | 769   
----------------------------------------------
66.4 M    Trainable params
0         Non-trainable params
66.4 M    Total params
265.455   Total estimated model params size (MB)


Epoch 2: 100%|██████████| 125/125 [01:17<00:00,  1.61it/s, v_num=1, train_loss_step=0.222, valid_loss_step=0.237] 

`Trainer.fit` stopped: `max_epochs=3` reached.


Epoch 2: 100%|██████████| 125/125 [01:19<00:00,  1.56it/s, v_num=1, train_loss_step=0.222, valid_loss_step=0.237]
LightningTask                  :             fit process takes 238.3958 [sec]


Global seed set to 42
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type              | Params
----------------------------------------------
0 | loss    | BCEWithLogitsLoss | 0     
1 | model   | DistilBertModel   | 66.4 M
2 | dropout | Dropout           | 0     
3 | head    | Linear            | 769   
----------------------------------------------
66.4 M    Trainable params
0         Non-trainable params
66.4 M    Total params
265.455   Total estimated model params size (MB)


Epoch 2: 100%|██████████| 125/125 [01:17<00:00,  1.61it/s, v_num=2, train_loss_step=0.281, valid_loss_step=0.0713]

`Trainer.fit` stopped: `max_epochs=3` reached.


Epoch 2: 100%|██████████| 125/125 [01:19<00:00,  1.57it/s, v_num=2, train_loss_step=0.281, valid_loss_step=0.0713]
LightningTask                  :             fit process takes 236.5010 [sec]


Global seed set to 42
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type              | Params
----------------------------------------------
0 | loss    | BCEWithLogitsLoss | 0     
1 | model   | DistilBertModel   | 66.4 M
2 | dropout | Dropout           | 0     
3 | head    | Linear            | 769   
----------------------------------------------
66.4 M    Trainable params
0         Non-trainable params
66.4 M    Total params
265.455   Total estimated model params size (MB)


Epoch 2: 100%|██████████| 125/125 [01:16<00:00,  1.62it/s, v_num=3, train_loss_step=0.290, valid_loss_step=0.375] 

`Trainer.fit` stopped: `max_epochs=3` reached.


Epoch 2: 100%|██████████| 125/125 [01:18<00:00,  1.58it/s, v_num=3, train_loss_step=0.290, valid_loss_step=0.375]
LightningTask                  :             fit process takes 236.3300 [sec]


Global seed set to 42
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type              | Params
----------------------------------------------
0 | loss    | BCEWithLogitsLoss | 0     
1 | model   | DistilBertModel   | 66.4 M
2 | dropout | Dropout           | 0     
3 | head    | Linear            | 769   
----------------------------------------------
66.4 M    Trainable params
0         Non-trainable params
66.4 M    Total params
265.455   Total estimated model params size (MB)


Epoch 2: 100%|██████████| 125/125 [01:17<00:00,  1.62it/s, v_num=4, train_loss_step=0.151, valid_loss_step=0.0371] 

`Trainer.fit` stopped: `max_epochs=3` reached.


Epoch 2: 100%|██████████| 125/125 [01:19<00:00,  1.58it/s, v_num=4, train_loss_step=0.151, valid_loss_step=0.0371]
LightningTask                  :             fit process takes 236.2818 [sec]


<imker.pipeline.pipeline.Pipeline at 0x7faaa6bfe220>

Training config is automatically saved.

In [14]:
from imker import RepositoryViewer
# Open the cache repository and list only the entries whose processor is LightningTask.
viewer = RepositoryViewer(repo_dir="../../cache/")
viewer.search_repo().loc[lambda tasks: tasks["processor"] == "LightningTask"]

Unnamed: 0,taskId,lastUpdatedDate,repo,method,processor,cacheFile,config
16,22,2023-08-27T19:55:38.173250,../../cache,fit,LightningTask,../../cache/task/fit/LightningTask/c455b53e5e1...,../../cache/task/fit/LightningTask/c455b53e5e1...
17,21,2023-08-27T20:03:30.783287,../../cache,fit,LightningTask,../../cache/task/fit/LightningTask/68cfe8dfbee...,../../cache/task/fit/LightningTask/68cfe8dfbee...
18,20,2023-08-27T19:59:34.508272,../../cache,fit,LightningTask,../../cache/task/fit/LightningTask/5148be02c01...,../../cache/task/fit/LightningTask/5148be02c01...
19,19,2023-08-27T19:51:41.662210,../../cache,fit,LightningTask,../../cache/task/fit/LightningTask/0e6864d835b...,../../cache/task/fit/LightningTask/0e6864d835b...
20,18,2023-08-27T19:47:43.234984,../../cache,fit,LightningTask,../../cache/task/fit/LightningTask/0d12f5f221e...,../../cache/task/fit/LightningTask/0d12f5f221e...


In [15]:
# Show the stored config of one cached task. 22 is the taskId of the most recently
# updated LightningTask fit in the recorded listing above.
# NOTE(review): ids presumably depend on the local cache contents -- adjust if they differ.
viewer.load_config(task_id=22)

{'init_params': {'batch_size': 16,
  'checkpoint_dir': '../../checkpoint',
  'collate_fn': 'DataCollatorWithPadding',
  'collate_fn_params': {'tokenizer': 'DistilBertTokenizerFast'},
  'early_stopping_round': 5,
  'epochs': 3,
  'limit_train_batches': 0.1,
  'loader_num_workers': 0,
  'logger': 'CSVLogger',
  'logger_params': {'save_dir': '../../checkpoint'},
  'model': 'TransformerModelForSequence',
  'model_init_params': {'loss': 'BCEWithLogitsLoss',
   'lr_scheduler': 'CosineAnnealingLR',
   'lr_scheduler_params': {'T_max': 3750},
   'model_name': 'distilbert-base-uncased',
   'num_labels': 1,
   'optimizer': 'AdamW',
   'optimizer_params': {'lr': 1e-05}},
  'test_dataset': 'TestDataset',
  'test_dataset_params': {'tokenizer': 'DistilBertTokenizerFast'},
  'train_dataset': 'TrainDataset',
  'train_dataset_params': {'tokenizer': 'DistilBertTokenizerFast'},
  'valid_dataset': 'TrainDataset',
  'valid_dataset_params': {'tokenizer': 'DistilBertTokenizerFast'}},
 'fit_params': {},
 'tran

# validation

## set metrics

In [16]:
from sklearn.metrics import accuracy_score, f1_score
# Register the metrics; pipe.get_scores() later reports them per fold.
pipe.set_metrics([accuracy_score, f1_score])

## run validation

In [17]:
# Run validation; the predictions are exposed under the "transformer" key
# that Classifier.forward uses (see val_preds.transformer below).
val_preds = pipe.validate(X, y)

StratifiedKFold                :           split process takes 0.0000 [sec]


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 313/313 [01:34<00:00,  3.32it/s]
LightningTask                  :         forward process takes 95.7584 [sec]


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 313/313 [01:33<00:00,  3.35it/s]
LightningTask                  :         forward process takes 95.1411 [sec]


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 313/313 [01:33<00:00,  3.36it/s]
LightningTask                  :         forward process takes 94.7321 [sec]


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 313/313 [01:32<00:00,  3.37it/s]
LightningTask                  :         forward process takes 94.4435 [sec]


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 313/313 [01:33<00:00,  3.35it/s]
LightningTask                  :         forward process takes 94.9493 [sec]


## validation results

Here are the validation scores.

In [18]:
# One row per (model output, metric) pair, one column per fold.
pipe.get_scores()

Unnamed: 0,Unnamed: 1,fold0,fold1,fold2,fold3,fold4
transformer,accuracy_score,0.9052,0.9114,0.9048,0.8964,0.8954
transformer,f1_score,0.903107,0.910523,0.905405,0.894026,0.899249


You can see the prediction results for the validation data.

In [19]:
# Predicted labels (0/1) stored under the "transformer" output key.
val_preds.transformer

array([0, 0, 0, ..., 1, 1, 1])

The prediction results were cached, so this time you can access them again almost instantly.

In [20]:
# Same call as before; in the recorded run every forward pass now takes under 0.1 s.
pipe.validate(X, y)

StratifiedKFold                :           split process takes 0.0000 [sec]
LightningTask                  :         forward process takes 0.0963 [sec]
LightningTask                  :         forward process takes 0.0715 [sec]
LightningTask                  :         forward process takes 0.0715 [sec]
LightningTask                  :         forward process takes 0.0737 [sec]
LightningTask                  :         forward process takes 0.0713 [sec]


DataContainer([('transformer', array([0, 0, 0, ..., 1, 1, 1]))])

# Inference

Inference takes a fairly long time on a T4 GPU, so it is omitted here.

In [None]:
# test_preds = pipe.inference(X_test)

# Reproduce Pipeline

In [21]:
# Restore the pipeline saved above: same repo_dir / exp_name / pipeline_name as
# the original `pipe`; the Splitter and Classifier classes are supplied again.
pipe2 = Pipeline.load(
    repo_dir="../../cache", 
    exp_name="imdb", 
    pipeline_name="example", 
    splitter = Splitter, 
    model = Classifier
)

In [22]:
# Validate with the restored pipeline; only the predictions are needed for the
# comparison below, so metric calculation is switched off.
val_preds2 = pipe2.validate(X, y, calc_metrics=False)

StratifiedKFold                :           split process takes 0.0000 [sec]


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 313/313 [01:33<00:00,  3.34it/s]
LightningTask                  :         forward process takes 95.0915 [sec]


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 313/313 [01:33<00:00,  3.36it/s]
LightningTask                  :         forward process takes 94.5513 [sec]


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 313/313 [01:33<00:00,  3.36it/s]
LightningTask                  :         forward process takes 94.7389 [sec]


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 313/313 [01:32<00:00,  3.37it/s]
LightningTask                  :         forward process takes 94.3974 [sec]


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 313/313 [01:33<00:00,  3.36it/s]
LightningTask                  :         forward process takes 94.6299 [sec]


In [23]:
# Compare both prediction arrays with one vectorised reduction instead of letting
# the builtin all() iterate over the NumPy array element by element.
bool((val_preds.transformer == val_preds2.transformer).all())

True