<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Config" data-toc-modified-id="Config-1">Config</a></span></li></ul></div>

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

## Config

In [1]:
# Configuration


MODEL_NAME = 'bert-base-uncased'
PROJECT_NAME = 'toxic_comment_detection'
LOG_DIR = "logdir"

PERC_OF_DATA = 80 # percent of data to use
BATCH_SIZE = 20
EPOCHS = 4
NUM_CLASSES = 2
LEARNING_RATE = 3e-5 
NUM_WORKERS = 2
SEED = 42

WANDB = False # if True, install and use wandb logger
COLAB_MODE = False

In [2]:
!pip install transformers catalyst
if WANDB:
    !pip install wandb

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0m

In [1]:
from pathlib import Path
import os
from typing import List, Mapping, Tuple
import logging
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import notebook

from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer
# Transformers 
from transformers import AutoConfig, AutoModel, AutoTokenizer, PreTrainedTokenizerFast

import torch
from torch.utils.data import DataLoader, TensorDataset
from catalyst import dl
import torch.nn as nn
from catalyst.contrib.losses.focal import FocalLossBinary

from catalyst.dl import (
    CheckpointCallback,
    OptimizerCallback,
    SchedulerCallback,
    SupervisedRunner,
    AccuracyCallback,
    PrecisionRecallF1SupportCallback
)

import platform

def choose_device(platform:str) -> str:
    if platform == 'Darwin':
        return 'mps'
    elif platform == 'Linux' and torch.cuda.is_avaliable():
        return 'cuda'
    else:
        return 'cpu'



ModuleNotFoundError: No module named 'catalyst'

In [4]:
if COLAB_MODE:
    from google.colab import drive
    drive.mount('/content/drive')


    COLAB_ROOT = Path('drive/MyDrive/Colab_Notebooks/', PROJECT_NAME)

    #Optional: move to the desired location:
    %cd $COLAB_ROOT

In [5]:
local_path = '../datasets/toxic_comments.csv'
cloud_path = '/datasets/toxic_comments.csv'
colab_path = '../datasets/toxic_comments.csv'

if os.path.exists(local_path):
    df = pd.read_csv(local_path)
elif os.path.exists(cloud_path):
    df = pd.read_csv(cloud_path)
elif os.path.exists(colab_path):
    df = pd.read_csv(colab_path)
else:
    print('something wrong! Check path')

In [6]:
class TextClassificationDataset(Dataset):
    """
    Wrapper around Torch Dataset to perform text classification
    """

    def __init__(
        self,
        texts: List[str],
        labels: np.array = None,
        max_seq_length: int = 256,
        model_name: str = MODEL_NAME,
    ):
        """
        Args:
            texts (List[str]): a list with texts to classify or to train the
                classifier on
            labels List[str]: a list with classification labels (optional)
            label_dict (dict): a dictionary mapping class names to class ids,
                to be passed to the validation data (optional)
            max_seq_length (int): maximal sequence length in tokens,
                texts will be stripped to this length
            model_name (str): transformer model name, needed to perform
                appropriate tokenization
        """

        self.texts = texts
        self.labels = labels
        self.max_seq_length = max_seq_length

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model_name = model_name

        # suppresses tokenizer warnings
        logging.getLogger("transformers.tokenization_utils").setLevel(logging.FATAL)

        # special tokens for transformers
        # in the simplest case a [CLS] token is added in the beginning
        # and [SEP] token is added in the end of a piece of text
        # [CLS] <indexes text tokens> [SEP] .. <[PAD]>

        self.sep_vid = self.tokenizer.vocab["[SEP]"]
        self.cls_vid = self.tokenizer.vocab["[CLS]"]
        self.pad_vid = self.tokenizer.vocab["[PAD]"]

    def __len__(self) -> int:
        """
        Returns:
            int: length of the dataset
        """
        return len(self.texts)

    def __getitem__(self, index) -> Mapping[str, torch.Tensor]:
        """Gets element of the dataset
        Args:
            index (int): index of the element in the dataset
        Returns:
            Single element by index
        """

        # encoding the text
        x = self.texts[index]

        # a dictionary with `input_ids` and `attention_mask` as keys
        output_dict = self.tokenizer.encode_plus(
            x,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.max_seq_length,
            truncation=True,
            return_tensors="pt",
            return_attention_mask=True,
        )

        # for Catalyst, there needs to be a key called features
        output_dict['features'] = output_dict['input_ids'].squeeze()
        del output_dict['input_ids']

        return {
                'attention_mask': output_dict['attention_mask'].squeeze(),
                'features': output_dict['features'].squeeze(),
                'targets': torch.tensor(self.labels[index].squeeze())
                }




In [7]:


sample = df.sample(frac=PERC_OF_DATA / 100)
X_train, X_test, y_train, y_test = train_test_split(
    sample['text'],
    sample['toxic'].values,
    test_size=.3,
    stratify=sample['toxic'].values
)

X_valid, X_test, y_valid, y_test = train_test_split(
    X_test,
    y_test,
    test_size=.5,
    stratify=y_test
)



In [8]:
# Check class balance
sum(y_train == 0) / len(y_train), sum(y_test == 0) / len(y_test)

(0.8983650219899506, 0.8983758943025746)

In [9]:
train_dataset = TextClassificationDataset(
    texts=X_train.values.tolist(),
    labels=y_train,
                               )

valid_dataset = TextClassificationDataset(
    texts=X_valid.values,
    labels=y_valid
)

test_dataset = TextClassificationDataset(
    texts=X_test.values.tolist(),
    labels=y_test
)

In [10]:
# creating PyTorch data loaders and placing them in dictionaries (for Catalyst)
train_val_loaders = {
        "train": DataLoader(
            dataset=train_dataset,
            batch_size=BATCH_SIZE,
            shuffle=True,
            num_workers=NUM_WORKERS
        ),
        "valid": DataLoader(
            dataset=valid_dataset,
            batch_size=BATCH_SIZE,
            shuffle=False,
            num_workers=NUM_WORKERS
        ),
    }

test_loaders = {
        "test": DataLoader(
            dataset=test_dataset,
            batch_size=BATCH_SIZE,
            shuffle=False,
            num_workers=NUM_WORKERS
        )
    }

In [11]:
class BertForSequenceClassification(nn.Module):
    """
    Simplified version of the same class by HuggingFace.
    See transformers/modeling_distilbert.py in the transformers repository.
    """

    def __init__(self, pretrained_model_name: str, num_classes: int = NUM_CLASSES, dropout: float = 0.3):
        """
        Args:
            pretrained_model_name (str): HuggingFace model name.
                See transformers/modeling_auto.py
            num_classes (int): the number of class labels
                in the classification task
        """
        super().__init__()

        config = AutoConfig.from_pretrained(
            pretrained_model_name, num_labels=NUM_CLASSES)

        self.model = AutoModel.from_pretrained(pretrained_model_name,
                                                    config=config)
#         self.pre_classifier = nn.Linear(config.hidden_size, config.hidden_size)
        self.classifier = nn.Linear(config.hidden_size, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, features, attention_mask=None, head_mask=None):
        """Compute class probabilities for the input sequence.

        Args:
            features (torch.Tensor): ids of each token,
                size ([bs, seq_length]
            attention_mask (torch.Tensor): binary tensor, used to select
                tokens which are used to compute attention scores
                in the self-attention heads, size [bs, seq_length]
            head_mask (torch.Tensor): 1.0 in head_mask indicates that
                we keep the head, size: [num_heads]
                or [num_hidden_layers x num_heads]
        Returns:
            PyTorch Tensor with predicted class probabilities
        """
        assert attention_mask is not None, "attention mask is none"
        
        bert_output = self.model(input_ids=features,
                                            attention_mask=attention_mask,
                                            head_mask=head_mask)
        # we only need the hidden state here and don't need
        # transformer output, so index 0
        seq_output = bert_output[0]  # (bs, seq_len, dim)
        # mean pooling, i.e. getting average representation for all tokens
        pooled_output = seq_output.mean(axis=1)  # (bs, dim)
        pooled_output = self.dropout(pooled_output)  # (bs, dim)
        logits = self.classifier(pooled_output)  # (bs, dim)

        return logits

In [None]:

# model = BertForSequenceClassification.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification(pretrained_model_name=MODEL_NAME,
                                      num_classes=NUM_CLASSES)

# specify criterion for the multi-class classification task, optimizer and scheduler
criterion = torch.nn.CrossEntropyLoss()
# criterion = FocalLossBinary()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)

# reproducibility
torch.manual_seed(SEED)
# prepare_cudnn(deterministic=True)

# here we specify that we pass masks to the runner. So model's forward method will be called with
# these arguments passed to it.
# model training
runner = SupervisedRunner(
    input_key=("features", "attention_mask")
    )

# finally, training the model with Catalyst
runner.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    loaders=train_val_loaders,
    callbacks=[
        AccuracyCallback(num_classes=2, input_key="logits", target_key="targets"),
        PrecisionRecallF1SupportCallback(
            input_key="logits", target_key="targets", num_classes=NUM_CLASSES
        ),
        OptimizerCallback(accumulation_steps=4, metric_key="loss"),
        SchedulerCallback(loader_key="valid", metric_key="loss"),
        CheckpointCallback(logdir=LOG_DIR, loader_key="valid", metric_key="loss", minimize=True),
    ],
     loggers={"wandb": dl.WandbLogger(project=PROJECT_NAME, name=f"{MODEL_NAME}_catalyst")} if WANDB else None,
    logdir=LOG_DIR,
    num_epochs=EPOCHS,
    verbose = True
)

# and running inference
torch.cuda.empty_cache()

# getting validation metrics
metrics = runner.evaluate_loader(
    loader=train_val_loaders["valid"],
    callbacks=[PrecisionRecallF1SupportCallback(
            input_key="logits", target_key="targets", num_classes=NUM_CLASSES
        )],
)
print(metrics)

# lastly, saving predicted scores for the test set
# test_pred_scores = np.concatenate(
#     [pred["logits"].detach().cpu().numpy() for pred in runner.predict_loader(loader=test_loaders["test"])]
# )


# np.savetxt(X=test_pred_scores, fname=Path(LOG_DIR / 'pred.txt'))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1/4 * Epoch (train):   0%|          | 0/4468 [00:00<?, ?it/s]

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/usr/local/Cellar/python@3.9/3.9.13_3/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/usr/local/Cellar/python@3.9/3.9.13_3/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'TextClassificationDataset' on <module '__main__' (built-in)>


In [None]:
import tensorboard
from tensorboard import notebook
notebook.list() # View open TensorBoard instances

In [None]:
!ls 

In [None]:
!tensorboard dev upload \
  --logdir logdir/tensorboard/ \
  --name "catalyst_bert_full_dataset" \


In [None]:
!tensorboard --logdir /logdir/tensorboard/

In [None]:
!ls logdir/tensorboard