# Introduction

In [None]:
!pip install -r requirements.txt

In [1]:
# Import packages
# DL
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
import evaluate
import torch


# from transformers import TrainingArguments, Trainer
from transformers.trainer_utils import get_last_checkpoint
import bitsandbytes as bnb

# visualization, data utils
import random
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import datasets
from PIL import Image
from datasets import load_dataset
from torch.utils.data import DataLoader
import wandb


# evaluation
import evaluate, sklearn
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# other utils
from tqdm import tqdm
from functools import partial
import os, glob, sys, shutil, datetime
from pathlib import Path
import random, math
from tqdm import tqdm
from dataclasses import dataclass
from typing import Any, Dict, Tuple, Union, List, Optional, Callable, Iterable
from omegaconf import DictConfig, OmegaConf
import gc
from easydict import EasyDict as edict
import IPython

In [2]:
import pandas as pd
import datasets
import underthesea
from pyvi import ViTokenizer
from spacy.lang.vi import STOP_WORDS as VIETNAMESE_STOP_WORDS

from src.data.vietnamese_eda import VietnameseEDATransform
EDA_TRANSFORM = VietnameseEDATransform()


# def text_preprocess(text):
#     # normalize text
#     text = underthesea.text_normalize(text)
#     # word segmentation
#     text = ViTokenizer.tokenize(text.lower())
#     # remove stop words
#     words = text.split()
#     words = [word for word in words if word not in VIETNAMESE_STOP_WORDS]
#     return " ".join(words)


def read_data(file_path, is_train=True):
    # remove header, two columns with name 'Class' and 'Data'
    data = pd.read_csv(file_path, delimiter="\t", header=None, names=["label", "text"])
    data = data[1:]
    data = datasets.Dataset.from_pandas(data)
    if is_train:
        # transform: just get a random EDA transform if exists, else original text
        transform = lambda list_text: [random.choice(EDA_TRANSFORM(x) or [x]) for x in list_text] 
    else:
        transform = lambda x: x
    data = data.map(lambda x: {"label": int(x["label"]) + 1, "text": (x["text"])})
    # data.set_transform(lambda x: {"label": x["label"], "text": transform(x["text"])})
    data = data.cast_column(
        "label",
        datasets.ClassLabel(num_classes=3, names=["negative", "neutral", "positive"]),
    )
    return data

def get_transform(is_train):
    if is_train:
        aug_transform = lambda list_text: [random.choice(EDA_TRANSFORM(x)) for x in list_text] 
    else:
        aug_transform = lambda x: x
    return lambda x: {"label": (x["label"]), "text": aug_transform(x["text"])}

train_table = read_data("vlsp_sentiment_train.csv")
# split train_table to train and validation
train_val_dict = train_table.train_test_split(
    test_size=0.1, stratify_by_column="label", seed=44
)
train_table = train_val_dict["train"]
val_table = train_val_dict["test"]

test_table = read_data("vlsp_sentiment_test.csv")

train_table.set_transform(get_transform(is_train=True))
val_table.set_transform(get_transform(is_train=False))
test_table.set_transform(get_transform(is_train=False))

Map:   0%|          | 0/5100 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5100 [00:00<?, ? examples/s]

Map:   0%|          | 0/1050 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1050 [00:00<?, ? examples/s]

In [3]:
from src.models.phow2vec import PhoW2VecWrapper

w2v_model = PhoW2VecWrapper(max_length=64)

Shape of matrix word embedding: (1587507, 300)
Vocab size: 1587507
Vector dimension: 300


## Method based on CNN

In [4]:
class CNNClassifier(nn.Module):
    def __init__(
        self, word2vec_model, input_dim, num_filters, filter_sizes, output_dim, dropout
    ):
        super(CNNClassifier, self).__init__()
        self.embedding = word2vec_model

        self.convs = nn.ModuleList(
            [
                nn.Conv1d(
                    in_channels=input_dim, out_channels=num_filters, kernel_size=fs
                )
                for fs in filter_sizes
            ]
        )
        self.max_pools = nn.ModuleList(
            nn.AdaptiveMaxPool1d(output_size=1) for _ in filter_sizes
        )  # make (B, C, L) to (B, C, 1), use adaptive for not caring about the length of the input
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(filter_sizes) * num_filters, output_dim)
        self.loss_fct = nn.CrossEntropyLoss(reduction="mean")

        self.init_weights()

    @property
    def device(self):
        return next(self.parameters()).device

    def init_weights(self):
        for name, param in self.named_parameters():
            if "weight" in name:
                nn.init.normal_(param.data, mean=0, std=0.02)
            elif "bias" in name:
                nn.init.constant_(param.data, 0)

    def forward(self, texts, labels=None):
        # x is text or len of text
        # this is weird because I combined model and tokenizer into 1
        x = self.embedding(texts)  # [batch_size, sent_len, emb_dim]
        x = x.transpose(-2, -1)  # [batch_size, emb_dim, sent_len]
        x = x.to(self.device)
        # consider emb_dim as input channel
        conved_output = [
            F.relu(conv(x)) for conv in self.convs
        ]  # list of tensor shaped [batch_size, num_filter, sent_len - filter_sizes[n] + 1]
        pooled_output = [
            pool(conv).squeeze(-1) for conv, pool in zip(conved_output, self.max_pools)
        ]  # list of tensor shaped [batch_size, num_filter]
        cat = torch.cat(
            pooled_output, dim=-1
        )  # [batch_size, num_filter * len(filter_sizes)]
        drop_output = self.dropout(cat)  # [batch_size, num_filter * len(filter_sizes)]
        logits = self.fc(drop_output)  # [batch_size, output_dim]

        # output should be wrapped in edict, for multi-way attribute-accessing
        return_dict = edict({"logits": logits})  # logits will use in inference
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits, labels)
            return_dict.losses = {"ce_loss": loss}  # losses will use in training

        return edict(return_dict)

In [5]:
cnn_model = CNNClassifier(
    word2vec_model=w2v_model,
    input_dim=300,
    num_filters=100,
    filter_sizes=[3, 4, 5],
    output_dim=3,
    dropout=0.1,
)

cnn_model(
    ["Tôi là sinh viên trường đại học bách khoa hà nội"] * 3, torch.tensor([1, 0, 2])
)

{'logits': tensor([[-0.0295, -0.0078,  0.0065],
         [-0.0256, -0.0057,  0.0155],
         [-0.0211, -0.0046,  0.0092]], grad_fn=<AddmmBackward0>),
 'losses': {'ce_loss': tensor(1.0998, grad_fn=<NllLossBackward0>)}}

## Method based on LSTM

In [6]:
from torch import dropout_


class LSTMClassifier(torch.nn.Module):
    def __init__(
        self,
        word2vec_model,
        input_dim=300,
        hidden_dims=[384, 384, 384],
        output_dim=3,
        n_layers=3,
        bidirectional=True,
        dropout=0.2,
    ):
        super(LSTMClassifier, self).__init__()
        self.embedding = word2vec_model

        num_direct = 2 if bidirectional else 1
        # hidden_dims is vector dim of single direction, output_dim of lstm is hidden_dim * num_direct

        list_in_lstm_dims = [input_dim] + [
            hidden_dims[i] * num_direct for i in range(len(hidden_dims) - 1)
        ]
        list_out_lstm_dims = hidden_dims

        self.lstm_chain = nn.ModuleList(
            [
                nn.LSTM(
                    input_size=list_in_lstm_dims[i],
                    hidden_size=list_out_lstm_dims[i],
                    num_layers=1,
                    bidirectional=bidirectional,
                    dropout=dropout,
                    batch_first=True,
                )
                for i in range(n_layers)
            ]
        )
        self.max_pooling = nn.AdaptiveMaxPool1d(output_size=1)
        self.fc = nn.Linear(list_out_lstm_dims[-1] * num_direct, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.loss_fct = nn.CrossEntropyLoss(reduction="mean")
        self.init_weights()

    @property
    def device(self):
        return next(self.parameters()).device

    def init_weights(self):
        for name, param in self.named_parameters():
            if "weight" in name:
                nn.init.normal_(param.data, mean=0, std=0.02)
            elif "bias" in name:
                nn.init.constant_(param.data, 0)

    def forward(self, texts, labels=None):
        x = self.embedding(texts)
        x = x.to(self.device)  # [batch_size, sent_len, input_dim]

        for lstm in self.lstm_chain:
            x, (hidden, cell) = lstm(x)

        # num_direct = 2 if bidirectional else 1
        # x: [batch_size, sent_len, hidden_dim * num_direct] # last layer 's output for whole sequence
        # hidden: [num_layer * num_direct, batch_size, hidden_dim] # last hidden state of all layers
        # cell: [num_layer * num_direct, batch_size, hidden_dim] # last cell state of all layers

        # take the last hidden state of the last layer as global feature
        # the input have been padded to the right, so we can take the last hidden state as global feature
        global_feature = x[:, -1, :]  # [batch_size, hidden_dim * num_direct]

        dropout_output = self.dropout(
            global_feature
        )  # [batch_size, hidden_dim * num_direct]
        logits = self.fc(dropout_output)
        return_dict = edict({"logits": logits})
        if labels is not None:
            loss = self.loss_fct(logits, labels)
            return_dict.losses = {"ce_loss": loss}
        return edict(return_dict)

## Hybrid method combining CNN and LSTM

In [7]:
import torch


class HybridClassifer(torch.nn.Module):
    def __init__(
        self,
        word2vec_model,
        input_dim=300,
        lstm_hidden_dim=384,
        dropout=0.2,
        cnn_num_filters=300,
        cnn_filter_sizes=[3, 4, 5],
    ):
        super(HybridClassifer, self).__init__()
        self.embedding = word2vec_model

        # just one layer as in slide for speed up
        self.lstm = torch.nn.LSTM(
            input_size=input_dim,
            hidden_size=lstm_hidden_dim,
            num_layers=1,
            bidirectional=True,
            dropout=dropout,
            batch_first=True,
        )
        self.batch_norm = nn.BatchNorm1d(lstm_hidden_dim * 2)
        self.convs = nn.ModuleList(
            [
                nn.Conv1d(
                    in_channels=lstm_hidden_dim * 2,
                    out_channels=cnn_num_filters,
                    kernel_size=fs,
                )
                for fs in cnn_filter_sizes
            ]
        )
        self.max_pools = nn.ModuleList(
            nn.AdaptiveMaxPool1d(output_size=1) for _ in cnn_filter_sizes
        )
        self.fc = nn.Linear(cnn_num_filters * len(cnn_filter_sizes), 3)
        self.dropout = nn.Dropout(dropout)

        self.loss_fct = nn.CrossEntropyLoss(reduction="mean")
        self.init_weights()

    def forward(self, texts, labels=None):
        x = self.embedding(texts)
        x = x.to(self.device)

        # LSTM
        x, (hidden, cell) = self.lstm(x)
        # x: [batch_size, sent_len, hidden_dim * num_direct] # last layer 's output for whole sequence
        x = x.transpose(
            1, 2
        )  # Transpose to (batch_size, hidden_dim * num_direct, sent_len)
        x = self.batch_norm(x)
        x = x.transpose(
            1, 2
        )  # Transpose back to (batch_size, sent_len, hidden_dim * num_direct)

        # CNN
        x = x.transpose(-2, -1)
        conved_output = [F.relu(conv(x)) for conv in self.convs]
        pooled_output = [
            pool(conv).squeeze(-1) for conv, pool in zip(conved_output, self.max_pools)
        ]
        cat = torch.cat(pooled_output, dim=-1)
        drop_output = self.dropout(cat)
        logits = self.fc(drop_output)
        return_dict = edict({"logits": logits})
        if labels is not None:
            loss = self.loss_fct(logits, labels)
            return_dict.losses = {"ce_loss": loss}
        return edict(return_dict)

    @property
    def device(self):
        return next(self.parameters()).device

    def init_weights(self):
        for name, param in self.named_parameters():
            if "weight" in name:
                nn.init.normal_(param.data, mean=0, std=0.02)
            elif "bias" in name:
                nn.init.constant_(param.data, 0)


hybrid_model = HybridClassifer(
    word2vec_model=w2v_model,
    input_dim=300,
    lstm_hidden_dim=384,
    dropout=0.2,
    cnn_num_filters=300,
    cnn_filter_sizes=[3, 4, 5, 6, 7],
)


hybrid_model(
    ["Tôi là sinh viên trường đại học bách khoa hà nội"] * 3, torch.tensor([1, 0, 2])
)



{'logits': tensor([[ 0.0522, -0.0197,  0.0052],
         [ 0.0315,  0.0051,  0.0036],
         [ 0.0454, -0.0192, -0.0017]], grad_fn=<AddmmBackward0>),
 'losses': {'ce_loss': tensor(1.1069, grad_fn=<NllLossBackward0>)}}

In [8]:
lstm_model = LSTMClassifier(
    word2vec_model=w2v_model,
    input_dim=300,
    hidden_dims=[384, 512, 512],
    output_dim=3,
    n_layers=3,
    bidirectional=True,
    dropout=0.4,
)

lstm_model(
    ["Tôi là sinh viên trường đại học bách khoa hà nội"] * 3, torch.tensor([1, 0, 2])
)



{'logits': tensor([[0.0003, 0.0003, 0.0003],
         [0.0010, 0.0004, 0.0001],
         [0.0005, 0.0005, 0.0002]], grad_fn=<AddmmBackward0>),
 'losses': {'ce_loss': tensor(1.0985, grad_fn=<NllLossBackward0>)}}

## Method based on Encoder-only Transformer

In [9]:
import underthesea, pyvi


class VietnameseTextPreprocessor(nn.Module):
    def __init__(self, max_length=100):
        super(VietnameseTextPreprocessor, self).__init__()
        self.max_length = max_length
        self.tokenizer = PhoW2VecWrapper(max_length=max_length)

    def forward(self, texts):
        return self.tokenizer(texts)

In [10]:
import transformers
from transformers import AutoTokenizer, AutoBackbone, AutoModelForPreTraining, AutoModel
import torch
import torch.nn as nn
from easydict import EasyDict as edict


class HuggingFaceModelWrapper(torch.nn.Module):
    def __init__(self, model_name, num_classes=3, max_length=100):
        super(HuggingFaceModelWrapper, self).__init__()

        self.model_name = model_name
        self.num_classes = num_classes
        self.max_length = max_length

        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        self.backbone = AutoModel.from_pretrained(model_name)
        self.pooling = lambda x: x[
            :, 0, :
        ]  # just use the first token as global feature

        # my favourite MLP: linear, layer norm, gelu activation, linear
        self.classifier = nn.Sequential(
            nn.Linear(
                self.backbone.config.hidden_size, self.backbone.config.hidden_size
            ),
            nn.LayerNorm(self.backbone.config.hidden_size),
            nn.GELU(),
            nn.Linear(self.backbone.config.hidden_size, num_classes),
        )

        self.loss_fct = nn.CrossEntropyLoss(reduction="mean")

    @property
    def device(self):
        return next(self.parameters()).device

    def forward(self, texts, labels=None):
        inputs = self.tokenizer(
            texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.max_length,
        ).to(self.device)
        outputs = self.backbone(**inputs)
        global_feature = self.pooling(outputs.last_hidden_state)
        logits = self.classifier(global_feature)

        return_dict = edict({"logits": logits})  # will be used in inference
        if labels is not None:
            loss = self.loss_fct(logits, labels)
            return_dict.losses = {"ce_loss": loss}  # will be used in training
        return edict(return_dict)

In [11]:
# AutoModelForPreTraining.from_pretrained("vinai/phobert-base-v2")
wrapper_model = HuggingFaceModelWrapper("vinai/phobert-base-v2", 3)


wrapper_model(
    ["hello kitty", "tôi là sinh viên trường đại học bách khoa hà nội"],
)

Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'logits': tensor([[-0.0735, -0.5013, -0.6271],
         [-0.2695, -0.4358, -0.3927]], grad_fn=<AddmmBackward0>)}

In [12]:
def data_collator_for_cnn(list_examples):
    # list_examples: list of examples, each example is a dict
    # list_examples[0] = {"label": tensor, "text": text}
    # ...
    # list_examples[n] = {"label": tensor, "text": text}
    # return: dict of batched examples
    labels = torch.Tensor([example["label"] for example in list_examples]).long()

    # weird implementation that combine model and tokenizer into 1
    texts = [example["text"] for example in list_examples]
    return {"labels": labels, "texts": texts}

# Data Visualization


# Data Preprocessing

# Model Building

# Model Training

In [13]:
def calculate_accuracy(model, dataloader):
    """Function calculates accuracy of given model on dataloader

    Args:
        model (CLIPClassifier): CLIP classifier model
        dataloader (DataLoader): evaluation dataloader

    Returns:
        float: model's accuracy
    """
    # create metric computer
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")
    precision_metric = evaluate.load("precision")
    recall_metric = evaluate.load("recall")
    list_metrics = [accuracy_metric, f1_metric, precision_metric, recall_metric]
    # evaluate model
    predictions_list = []
    references_list = []
    device = model.device

    for batch in tqdm(
        dataloader, total=len(dataloader), desc="Evaluate model on dataset"
    ):
        batch["labels"] = batch["labels"].to(device)
        predictions = model(**batch)["logits"]
        predictions_list.append(torch.argmax(predictions, dim=1))
        references_list.append(batch["labels"])

    results_dict = {}
    for metric in list_metrics:
        if metric.name == "accuracy":
            result_dict = metric.compute(
                predictions=torch.concat(predictions_list),
                references=torch.concat(references_list),
            )
        else:
            result_dict = metric.compute(
                predictions=torch.concat(predictions_list),
                references=torch.concat(references_list),
                average="macro",
            )
        results_dict.update(result_dict)

    # rename f1 to macro_f1
    results_dict["macro_f1"] = results_dict.pop("f1")
    results_dict["macro_precision"] = results_dict.pop("precision")
    results_dict["macro_recall"] = results_dict.pop("recall")

    return results_dict


@torch.no_grad()
def classification_evaluate(
    model: nn.Module,
    dataset: datasets.Dataset,
    tokenizer: None,
    collate_fn: Callable,
    batch_size: int = 64,
    num_workers: int = 2,
    device: str = "cuda",
):

    classifier = model
    test_dataloader = DataLoader(
        dataset, batch_size=batch_size, num_workers=num_workers, collate_fn=collate_fn
    )
    classifier = classifier.to(device)
    results_dict = calculate_accuracy(classifier, test_dataloader)
    print(f"Evaluate metrics: {results_dict}")
    return results_dict

In [14]:
class MyTrainer:
    def __init__(
        self,
        config: DictConfig,
        model: torch.nn.Module,
        train_dataset: Union[torch.utils.data.Dataset, datasets.Dataset],
        val_dataset: Union[torch.utils.data.Dataset, datasets.Dataset],
        tokenizer: Optional[transformers.PreTrainedTokenizer] = None,
        # labels: list[str] = ["negative", "neutral", "positive"],
        optimizer: Optional[torch.optim.Optimizer] = None,
        scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None,
        logger: Optional[wandb.sdk.wandb_run.Run] = None,
        metrics_to_save_best: Optional[List[str]] = ["val/metrics/accuracy"],
        device: Optional[torch.device] = "cuda",
        collate_fn: Optional[Callable] = None,
    ):

        self.config = config
        self.device = device
        self._logger = logger
        self.setup_output_dir()
        # main components
        self.model = model
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        # initialize some variables for training loop
        self.cur_step: int = -1
        self.cur_epoch: int = -1
        self.exit_by_patience: bool = False
        self.current_patience: int = -1
        self.max_patience: int = config.get("patience", math.inf)
        self.best_metrics_values: Dict[str, Any] = {
            **{key: -1 for key in metrics_to_save_best}
        }
        self.history_metrics: List[Dict[str, Any]] = []

        self.collate_fn = collate_fn
        self.tokenizer = tokenizer
        self.build_dataloader(train_dataset)
        # self.setup_optimizers_before_training(config)
        print(self.config)

    def setup_optimizers_before_training(self, config):
        self.optimizer = torch.optim.AdamW(
            self.model.parameters(),
            lr=config.get("lr", 1e-4),
            betas=config.get("betas", (0.9, 0.995)),
        )
        self.scheduler = transformers.get_cosine_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=config.get("num_warmup_steps", 100),
            num_training_steps=config.get("max_epochs", 10)
            * len(self.train_dataloader),
            num_cycles=config.get("num_cycles", 0.5),
            last_epoch=self.cur_epoch,
        )

    def build_dataloader(self, dataset):
        self.train_dataloader = DataLoader(
            dataset,
            batch_size=self.config.batch_size,
            shuffle=True,
            num_workers=self.config.num_workers,
            collate_fn=self.collate_fn,
        )
        self.val_dataloader = DataLoader(
            self.val_dataset,
            batch_size=self.config.batch_size,
            shuffle=False,
            num_workers=self.config.num_workers,
            collate_fn=self.collate_fn,
        )

    def setup_output_dir(self):
        self.project_name = self._logger.project
        self.group_name = self._logger.group
        self.experiment_name = self._logger.name
        if self._logger is None:
            return
        output_dir = self.config.get("output_dir", "checkpoints")
        prefix = (
            f"{output_dir}/{self.project_name}/{self.group_name}/{self.experiment_name}"
        )
        if not os.path.exists(prefix):
            os.makedirs(prefix)
        with open(f"{prefix}/config.yaml", "w") as f:
            OmegaConf.save(self.config, f)
        self.checkpoint_prefix = prefix

    def extract_loss(self, output, validation=False) -> torch.Tensor:
        if hasattr(output, "losses"):
            losses = output.losses
        elif isinstance(output, torch.Tensor):
            losses = {"total": output}
        else:
            losses = output
        total_loss = 0
        for key in losses:
            if losses[key] is not None:
                total_loss += losses[key]
                loss_reduce = losses[key].detach()
                if validation:
                    mode = "validation"
                else:
                    mode = "train"
                self.log(
                    f"{mode}/losses/{key}",
                    loss_reduce.item(),
                )
                # print(f"{mode}/losses/{key}", loss_reduce.item())
        return total_loss

    def train(self) -> None:
        self.setup_optimizers_before_training(self.config)
        model, optimizer, scheduler = (self.model, self.optimizer, self.scheduler)
        # model.to(self.device)
        pbar_train_dataloader = tqdm(
            self.train_dataloader, total=len(self.train_dataloader), desc="Training"
        )
        # Below training loop use model (not self.model) and optimizer (not self.optimizer)
        while True:  # until meet max epoch or max patience
            self.cur_epoch += 1
            # dataloader = self.train_dataloader
            pbar_train_dataloader.reset()
            if self.cur_epoch % 3 == 0:
                IPython.display.clear_output(wait=True)
            print(f"Current Epoch {self.cur_epoch}")
            for data in pbar_train_dataloader:
                self.cur_step += 1
                # CHECK EXIT CONDITION
                if (
                    self.cur_epoch >= self.config.max_epochs
                    or self.exit_by_patience == True
                ):
                    print("Exit requirement reached, exiting")
                    self.save_checkpoint(for_last=True)
                    return self.get_training_results()
                # FORWARD PASS
                model.train()
                data = self.move_to_device(data, self.device)
                optimizer.zero_grad(set_to_none=True)
                with torch.autocast(device_type=self.device, dtype=torch.bfloat16):
                    if type(data) is dict:
                        output = model(**data)
                    elif type(data) is list:
                        output = model(*data)
                    else:
                        output = model(data)

                    total_loss = self.extract_loss(output)
                # BACKWARD PASS
                total_loss.backward()
                optimizer.step()
                # LEARNING RATE MONITOR
                if self.cur_step % self.config.train_log_interval == 0:
                    for index_group in range(len(optimizer.param_groups)):
                        lr = optimizer.param_groups[index_group]["lr"]
                        self.log(f"train/lr_group_{index_group}", lr)
                    print("Learning rate: ", lr)
                    print("Total loss: ", total_loss.item())
                scheduler.step()
            self.on_validate_start()

    def move_to_device(self, obj, device):
        if isinstance(obj, dict):
            d = {}
            for k, v in obj.items():
                d[k] = self.move_to_device(v, device)
            return d
        if isinstance(obj, list):
            l = []
            for v in obj:
                l.append(self.move_to_device(v, device))
            return l
        if isinstance(obj, str):
            return obj
        return obj.to(device)

    def on_validate_start(self):
        metrics_dict = self.validate()
        self.handle_checkpoint_with_patience(metrics_dict, set_patience=True)
        self.setup_for_patience_callback()
        torch.cuda.empty_cache()

    def validate(self):
        validation_loader = self.val_dataloader
        model = self.model

        model.eval()
        print("Evaluating")
        with torch.no_grad():
            metrics = self.evaluate(validation_loader)
            if metrics is not None:
                history_obj = {}
                for key in metrics:
                    log_key = f"val/metrics/{key}"
                    self.log(
                        log_key,
                        metrics[key],
                    )
                    history_obj[key] = round(metrics[key], 4)
                    print(f"{key}: {metrics[key]}")
                self.history_metrics.append(
                    {**history_obj, "epoch": self.cur_epoch, "step": self.cur_step}
                )
        return metrics

    @torch.no_grad()
    def evaluate(self, dataloader):
        results_dict = classification_evaluate(
            self.model,
            self.val_dataset,
            self.tokenizer,
            collate_fn=self.collate_fn,
            batch_size=self.config.batch_size,
            num_workers=self.config.num_workers,
            device=self.device,
        )
        return results_dict

    def setup_for_patience_callback(self):
        if self.current_patience > self.max_patience:
            print("Early stopping")
            self.exit_by_patience = True

    def handle_checkpoint_with_patience(
        self, metrics: Dict[str, Any], set_patience=True
    ):
        reset_patience_flag = False
        if len(self.best_metrics_values) == 0:
            self.best_metrics_values = metrics
            return
        for key in self.best_metrics_values:
            short_key = key.split("/")[-1]
            if metrics.get(short_key, -100) > self.best_metrics_values[key]:
                self.best_metrics_values[key] = metrics[short_key]
                self.save_checkpoint(name=f"best_{short_key}")
                reset_patience_flag = True
        if set_patience:
            if reset_patience_flag:
                self.current_patience = 0
            else:
                self.current_patience += 1

    def save_checkpoint(self, name="last", for_last=False):
        model_no_ddp = self.model
        check_point_file_path = (
            f"{self.checkpoint_prefix}/{name}.pt"
            if not for_last
            else f"{self.checkpoint_prefix}/last.pt"
        )
        model_state_dict = model_no_ddp.state_dict()
        save_obj = {
            "model": model_state_dict,
            "optimizer": self.optimizer.state_dict(),
            "scheduler": self.scheduler.state_dict(),
            "config": self.config,
            "epoch": self.cur_epoch,
            "history": self.history_metrics,
            "best_metrics": self.best_metrics_values,
            "patience": self.current_patience,
        }
        torch.save(
            save_obj,
            check_point_file_path,
        )
        torch.cuda.empty_cache()

    def log(
        self,
        name: str,
        value: Union[torch.Tensor, float, int],
    ):
        if type(self._logger) is not None:
            self._logger.log({name: value, "epoch": self.cur_epoch}, step=self.cur_step)

    def get_training_results(self):
        history_metrics = pd.DataFrame(self.history_metrics).round(4)
        print(history_metrics)
        return {
            "best_metrics": self.best_metrics_values,
            "patience": self.current_patience,
            "history": history_metrics,
        }

In [15]:
@dataclass
class TraininingConfig:
    max_epochs: int = 50
    lr: float = 5e-4
    betas: Tuple[float, float] = (0.9, 0.995)
    num_warmup_steps: int = 10
    train_log_interval: int = 10
    val_log_interval: int = 10
    max_patience: int = 5
    output_dir: str = "checkpoints"
    batch_size: int = 512
    num_workers: int = 4

    def get(self, key, default=None):
        return getattr(self, key, default)

In [16]:
# model = HuggingFaceModelWrapper("vinai/phobert-base-v2", 3).to("cuda")

model = CNNClassifier(
    word2vec_model=w2v_model,
    input_dim=300,
    num_filters=300,
    filter_sizes=[3, 4, 5, 6, 7, 8],
    output_dim=3,
    dropout=0.2,
).to("cuda")

# model = LSTMClassifier(
#     word2vec_model=w2v_model,
#     input_dim=300,
#     hidden_dims=[384, 384],
#     output_dim=3,
#     n_layers=2,
#     bidirectional=True,
#     dropout=0.2,
# ).to("cuda")

# model = HybridClassifer(
#     word2vec_model=w2v_model,
#     input_dim=300,
#     lstm_hidden_dim=384,
#     dropout=0.2,
#     cnn_num_filters=300,
#     cnn_filter_sizes=[3, 4, 5, 6, 7],
# ).to("cuda")

training_config = TraininingConfig(
    max_epochs=45, train_log_interval=10, val_log_interval=10, max_patience=7
)
logger = wandb.init(
    anonymous="allow",
    project="<finetune><clip><har_dataset>",
    group="finetune_har_dataset",
    name=str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S-")) + "-finetune_clip",
    config=training_config,
    notes="",
    tags=["<finetune>", "<clip>", "<har>"],
)
IPython.display.clear_output(wait=True)
trainer = MyTrainer(
    config=training_config,
    model=model,
    train_dataset=train_table,
    val_dataset=val_table,
    logger=logger,
    device="cuda",
    collate_fn=data_collator_for_cnn,
)

TraininingConfig(max_epochs=45, lr=0.0005, betas=(0.9, 0.995), num_warmup_steps=10, train_log_interval=10, val_log_interval=10, max_patience=7, output_dir='checkpoints', batch_size=512, num_workers=4)


In [17]:
trainer.train()

Current Epoch 45
Exit requirement reached, exiting
    accuracy  macro_f1  macro_precision  macro_recall  epoch  step
0     0.4882    0.4540           0.5261        0.4882      0     8
1     0.4529    0.3990           0.5876        0.4529      1    17
2     0.6000    0.5986           0.6120        0.6000      2    26
3     0.6137    0.6133           0.6196        0.6137      3    35
4     0.6000    0.5913           0.6335        0.6000      4    44
5     0.6392    0.6387           0.6468        0.6392      5    53
6     0.6373    0.6351           0.6490        0.6373      6    62
7     0.6353    0.6343           0.6504        0.6353      7    71
8     0.6373    0.6343           0.6583        0.6373      8    80
9     0.6412    0.6383           0.6661        0.6412      9    89
10    0.6510    0.6484           0.6772        0.6510     10    98
11    0.6725    0.6719           0.6779        0.6725     11   107
12    0.6667    0.6670           0.6792        0.6667     12   116
13    0.684

{'best_metrics': {'val/metrics/accuracy': 0.7019607843137254},
 'patience': 16,
 'history':     accuracy  macro_f1  macro_precision  macro_recall  epoch  step
 0     0.4882    0.4540           0.5261        0.4882      0     8
 1     0.4529    0.3990           0.5876        0.4529      1    17
 2     0.6000    0.5986           0.6120        0.6000      2    26
 3     0.6137    0.6133           0.6196        0.6137      3    35
 4     0.6000    0.5913           0.6335        0.6000      4    44
 5     0.6392    0.6387           0.6468        0.6392      5    53
 6     0.6373    0.6351           0.6490        0.6373      6    62
 7     0.6353    0.6343           0.6504        0.6353      7    71
 8     0.6373    0.6343           0.6583        0.6373      8    80
 9     0.6412    0.6383           0.6661        0.6412      9    89
 10    0.6510    0.6484           0.6772        0.6510     10    98
 11    0.6725    0.6719           0.6779        0.6725     11   107
 12    0.6667    0.6670  

In [18]:
def train_evaluate_pipeline(
    config: DictConfig,
    model: torch.nn.Module,
    train_dataset: Union[torch.utils.data.Dataset, datasets.Dataset],
    val_dataset: Union[torch.utils.data.Dataset, datasets.Dataset],
    test_dataset: Union[torch.utils.data.Dataset, datasets.Dataset],
    collate_fn: Callable,
    tokenizer: Optional[transformers.PreTrainedTokenizer] = None,
    device: str = "cuda",
    num_workers: int = 2,
    logger: Optional[wandb.sdk.wandb_run.Run] = None,
):
    trainer = MyTrainer(
        config=config,
        model=model,
        train_dataset=train_dataset,
        val_dataset=val_dataset,
        logger=logger,
        device=device,
        collate_fn=collate_fn,
    )
    trainer.train()
    trainer.save_checkpoint(for_last=True)
    results = trainer.get_training_results()

    test_results = classification_evaluate(
        model,
        test_dataset,
        tokenizer,
        collate_fn=collate_fn,
        batch_size=config.batch_size,
        num_workers=num_workers,
        device=device,
    )
    print(f"Test results: {test_results}")
    return test_results


evaluation_results = train_evaluate_pipeline(
    config=training_config,
    model=model,
    train_dataset=train_table,
    val_dataset=val_table,
    test_dataset=test_table,
    collate_fn=data_collator_for_cnn,
    tokenizer=None,
    device="cuda",
    num_workers=2,
    logger=logger,
)

Current Epoch 45
Exit requirement reached, exiting
    accuracy  macro_f1  macro_precision  macro_recall  epoch  step
0     0.7078    0.7084           0.7098        0.7078      0     8
1     0.7078    0.7082           0.7091        0.7078      1    17
2     0.6961    0.6953           0.6971        0.6961      2    26
3     0.7000    0.7002           0.7006        0.7000      3    35
4     0.6961    0.6956           0.6957        0.6961      4    44
5     0.7039    0.7041           0.7046        0.7039      5    53
6     0.6980    0.6987           0.7035        0.6980      6    62
7     0.7059    0.7052           0.7066        0.7059      7    71
8     0.6922    0.6925           0.6994        0.6922      8    80
9     0.7098    0.7099           0.7113        0.7098      9    89
10    0.6980    0.6975           0.7005        0.6980     10    98
11    0.7078    0.7075           0.7077        0.7078     11   107
12    0.7059    0.7064           0.7084        0.7059     12   116
13    0.700

Evaluate model on dataset: 100%|█████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  2.18it/s]

Evaluate metrics: {'accuracy': 0.7095238095238096, 'macro_f1': 0.7099185109194494, 'macro_precision': 0.7106348316574992, 'macro_recall': 0.7095238095238096}
Test results: {'accuracy': 0.7095238095238096, 'macro_f1': 0.7099185109194494, 'macro_precision': 0.7106348316574992, 'macro_recall': 0.7095238095238096}





In [20]:
evaluation_results

{'accuracy': 0.7095238095238096,
 'macro_f1': 0.7099185109194494,
 'macro_precision': 0.7106348316574992,
 'macro_recall': 0.7095238095238096}

In [19]:
trainer.get_training_results()

    accuracy  macro_f1  macro_precision  macro_recall  epoch  step
0     0.4882    0.4540           0.5261        0.4882      0     8
1     0.4529    0.3990           0.5876        0.4529      1    17
2     0.6000    0.5986           0.6120        0.6000      2    26
3     0.6137    0.6133           0.6196        0.6137      3    35
4     0.6000    0.5913           0.6335        0.6000      4    44
5     0.6392    0.6387           0.6468        0.6392      5    53
6     0.6373    0.6351           0.6490        0.6373      6    62
7     0.6353    0.6343           0.6504        0.6353      7    71
8     0.6373    0.6343           0.6583        0.6373      8    80
9     0.6412    0.6383           0.6661        0.6412      9    89
10    0.6510    0.6484           0.6772        0.6510     10    98
11    0.6725    0.6719           0.6779        0.6725     11   107
12    0.6667    0.6670           0.6792        0.6667     12   116
13    0.6843    0.6838           0.6884        0.6843     13  

{'best_metrics': {'val/metrics/accuracy': 0.7019607843137254},
 'patience': 16,
 'history':     accuracy  macro_f1  macro_precision  macro_recall  epoch  step
 0     0.4882    0.4540           0.5261        0.4882      0     8
 1     0.4529    0.3990           0.5876        0.4529      1    17
 2     0.6000    0.5986           0.6120        0.6000      2    26
 3     0.6137    0.6133           0.6196        0.6137      3    35
 4     0.6000    0.5913           0.6335        0.6000      4    44
 5     0.6392    0.6387           0.6468        0.6392      5    53
 6     0.6373    0.6351           0.6490        0.6373      6    62
 7     0.6353    0.6343           0.6504        0.6353      7    71
 8     0.6373    0.6343           0.6583        0.6373      8    80
 9     0.6412    0.6383           0.6661        0.6412      9    89
 10    0.6510    0.6484           0.6772        0.6510     10    98
 11    0.6725    0.6719           0.6779        0.6725     11   107
 12    0.6667    0.6670  

# Model Evaluation