<a href="https://colab.research.google.com/github/vlordier/colabs/blob/main/Model4(Detoxify)_Notebook4_(Updated).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git clone https://github.com/unitaryai/detoxify.git

fatal: destination path 'detoxify' already exists and is not an empty directory.


# **Installing Necessary Libraries**

In [None]:
!pip install pytorch_lightning
!pip install datasets
!pip install transformers



# **Kaggle Dataset API**

In [None]:
! pip install  kaggle
!pip install --upgrade --force-reinstall --no-deps kaggle
from google.colab import files

files.upload()
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json

Collecting kaggle
  Using cached kaggle-1.5.12-py3-none-any.whl
Installing collected packages: kaggle
  Attempting uninstall: kaggle
    Found existing installation: kaggle 1.5.12
    Uninstalling kaggle-1.5.12:
      Successfully uninstalled kaggle-1.5.12
Successfully installed kaggle-1.5.12


Saving kaggle.json to kaggle (2).json
kaggle.json


# **ZIP File Extraction**

In [None]:
!kaggle competitions download -c jigsaw-toxic-comment-classification-challenge
import zipfile
zip_ref = zipfile.ZipFile("/content/jigsaw-toxic-comment-classification-challenge.zip", 'r')
zip_ref.extractall("/content/")
zip_ref.close()
zip_ref = zipfile.ZipFile("/content/train.csv.zip", 'r')
zip_ref.extractall("/content/")
zip_ref.close()
zip_ref = zipfile.ZipFile("/content/test.csv.zip", 'r')
zip_ref.extractall("/content/")
zip_ref.close()
zip_ref = zipfile.ZipFile("/content/test_labels.csv.zip", 'r')
zip_ref.extractall("/content/")
zip_ref.close()

jigsaw-toxic-comment-classification-challenge.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:

# Put the cloned detoxify checkout at the front of the import path so that the
# `src` package imported further down resolves to it.
import sys

sys.path[:0] = ['/content/detoxify']
# Extra packages installed before the training cells. transformers is already
# installed by the library-install cell near the top, so the second line
# repeats that install.
!pip install sentencepiece
!pip install transformers




# **Training, Validation, and Testing**

# Before running this cell
# Update the train.csv and test.csv paths in the JSON config file:
# /content/detoxify/configs/Toxic_comment_classification_BERT.json

In [None]:
# Train on just the first 2000 samples (200 batches with a batch size of 10)
# so that the run finishes quickly.
import pandas as pd

train_df = pd.read_csv("train.csv")
new_train_df = train_df.iloc[:2000, :]
# index=False keeps the row index out of the file; without it the subset gains
# an extra unnamed column that train.csv does not have.
new_train_df.to_csv("new_train.csv", index=False)

In [None]:
import argparse
import json
import os
import transformers

import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import ModelCheckpoint
from torch.nn import functional as F
from torch.utils.data import DataLoader

import src.data_loaders as module_data
from src.utils import get_model_and_tokenizer
from transformers import AutoTokenizer

def get_model_and_tokenizer222(model_type, model_name, tokenizer_name, num_classes):
    """Load a pretrained model together with a tokenizer for the same checkpoint.

    Args:
        model_type: checkpoint identifier handed to both ``from_pretrained`` calls.
        model_name: name of the model class looked up on the ``transformers`` module.
        tokenizer_name: accepted but not used; the tokenizer always comes
            from ``AutoTokenizer``.
        num_classes: forwarded to the model as ``num_labels``.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    model_cls = getattr(transformers, model_name)
    return (
        model_cls.from_pretrained(model_type, num_labels=num_classes),
        AutoTokenizer.from_pretrained(model_type),
    )

import argparse
import json
import os

import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import ModelCheckpoint
from torch.nn import functional as F
#from torch.utils.data import DataLoader

import src.data_loaders as module_data


class ToxicClassifier(pl.LightningModule):
    """Lightning module for toxic comment classification (Jigsaw challenges).

    Wraps the model/tokenizer pair returned by ``get_model_and_tokenizer``.

    Args:
        config (dict): hyperparameters loaded from a predefined config file.
            ``config["arch"]["args"]`` is forwarded to
            ``get_model_and_tokenizer`` and must contain ``num_classes``;
            ``config["optimizer"]["args"]`` is forwarded to Adam;
            ``loss_weight`` and ``num_main_classes`` are optional keys.
    """

    def __init__(self, config):
        super().__init__()
        self.save_hyperparameters()

        arch_args = config["arch"]["args"]
        self.num_classes = arch_args["num_classes"]
        self.model_args = arch_args
        self.model, self.tokenizer = get_model_and_tokenizer(**arch_args)

        # The presence of "num_main_classes" both switches bias_loss on and
        # overrides the class count used by the loss.
        self.bias_loss = "num_main_classes" in config
        self.num_main_classes = (
            config["num_main_classes"] if self.bias_loss else self.num_classes
        )
        if "loss_weight" in config:
            self.loss_weight = config["loss_weight"]

        self.config = config

    def forward(self, x):
        """Tokenize the texts in ``x`` and return the first element of the model output."""
        encoded = self.tokenizer(
            list(x), return_tensors="pt", truncation=True, padding=True
        )
        # Move the encoded batch to whichever device the model lives on.
        encoded = encoded.to(self.model.device)
        return self.model(**encoded)[0]

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one ``(texts, meta)`` batch."""
        texts, meta = batch
        loss = self.binary_cross_entropy(self.forward(texts), meta)
        self.log("train_loss", loss)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        """Compute and log loss and accuracy for one validation batch."""
        texts, meta = batch
        logits = self.forward(texts)
        metrics = {
            "loss": self.binary_cross_entropy(logits, meta),
            "acc": self.binary_accuracy(logits, meta),
        }
        self.log("val_loss", metrics["loss"])
        self.log("val_acc", metrics["acc"])
        return metrics

    def test_step(self, batch, batch_idx):
        """Compute and log loss and accuracy for one test batch.

        Also prints the raw model output and the meta dict for the batch.
        """
        texts, meta = batch
        logits = self.forward(texts)
        metrics = {
            "loss": self.binary_cross_entropy(logits, meta),
            "acc": self.binary_accuracy(logits, meta),
        }
        print(logits, meta)
        self.log("test_loss", metrics["loss"])
        self.log("test_acc", metrics["acc"])
        return metrics

    def configure_optimizers(self):
        """Return an Adam optimizer built from ``config["optimizer"]["args"]``."""
        optimizer_kwargs = self.config["optimizer"]["args"]
        return torch.optim.Adam(self.parameters(), **optimizer_kwargs)

    def binary_cross_entropy(self, input, meta):
        """Binary cross-entropy on logits, with optional weighting and masking.

        Args:
            input ([torch.tensor]): model predictions (logits)
            meta ([dict]): dict of tensors. ``"weight"`` selects the
                per-element weighted loss on ``"target"``; otherwise
                ``"multi_target"`` selects the masked multi-label loss
                (entries equal to -1 are ignored); otherwise the plain
                loss on ``"target"`` is used.

        Returns:
            [torch.tensor]: model loss
        """
        device = input.device
        bce_with_logits = F.binary_cross_entropy_with_logits

        if "weight" in meta:
            shape = input.shape
            return bce_with_logits(
                input,
                meta["target"].to(device).reshape(shape),
                weight=meta["weight"].to(device).reshape(shape),
            )

        if "multi_target" not in meta:
            return bce_with_logits(input, meta["target"].to(device).float())

        labels = meta["multi_target"].to(device)
        valid = labels != -1  # -1 marks entries excluded from the loss
        per_element = bce_with_logits(input, labels.float(), reduction="none")

        if "class_weights" in meta:
            scale = meta["class_weights"][0].to(device)
        elif "weights1" in meta:
            scale = meta["weights1"].to(device)
        else:
            # No explicit weights: weight uniformly and keep only the first
            # num_main_classes columns.
            scale = torch.tensor(1 / self.num_main_classes).to(device)
            per_element = per_element[:, : self.num_main_classes]
            valid = valid[:, : self.num_main_classes]

        masked = per_element * scale * valid
        # Only columns with at least one valid entry take part, which keeps
        # the per-column division away from zero counts.
        populated = torch.sum(valid, 0) != 0
        column_means = torch.sum(masked[:, populated], 0) / torch.sum(
            valid[:, populated], 0
        )
        return torch.sum(column_means)

    def binary_accuracy(self, output, meta):
        """Fraction of non-masked targets predicted correctly at a 0.5 threshold.

        Args:
            output ([torch.tensor]): model predictions (logits)
            meta ([dict]): dict of tensors; ``"multi_target"`` is used when
                present, otherwise ``"target"``. Entries equal to -1 are ignored.

        Returns:
            [torch.tensor]: model accuracy (0 when every target is masked)
        """
        target_key = "multi_target" if "multi_target" in meta else "target"
        target = meta[target_key].to(output.device)
        with torch.no_grad():
            valid = target != -1
            n_valid = torch.sum(valid).item()
            selected = output[valid]
            predicted = (torch.sigmoid(selected) >= 0.5).to(selected.device)
            hits = torch.sum(predicted == target[valid])
            accuracy = hits.item() / n_valid if n_valid != 0 else 0

        return torch.tensor(accuracy)
# Module-level placeholder. No `global model` declaration is needed here: at
# module level every assignment is already global, so the statement had no effect.
data_loaders = 0
def cli_main(config_path='/content/detoxify/configs/Toxic_comment_classification_BERT.json'):
    """Train a ``ToxicClassifier`` for one epoch from a JSON config file.

    Args:
        config_path (str): path of the JSON configuration file. Defaults to
            the BERT toxic-comment config inside the cloned detoxify repo.

    Returns:
        tuple: ``(trainer, model, valid_data_loader)``.
    """
    pl.seed_everything(1234)

    # Read the config inside a context manager so the file handle is closed.
    with open(config_path) as config_file:
        config = json.load(config_file)

    config["device"] = None

    # data
    def get_instance(module, name, config, *args, **kwargs):
        """Instantiate ``config[name]["type"]`` from ``module`` with ``config[name]["args"]``."""
        return getattr(module, config[name]["type"])(
            *args, **config[name]["args"], **kwargs
        )

    dataset = get_instance(module_data, "dataset", config)
    val_dataset = get_instance(module_data, "dataset", config, train=False)

    # Coerce once so both loaders receive the same integer batch size, even
    # if the config stores it as a string.
    batch_size = int(config["batch_size"])

    data_loader = DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=2,
        shuffle=True,
        drop_last=True,
        pin_memory=True,
    )

    valid_data_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        num_workers=2,
        shuffle=False,
    )

    # model
    model = ToxicClassifier(config)

    # training
    # Checkpoints are ranked by validation loss (lower is better).
    checkpoint_callback = ModelCheckpoint(
        save_top_k=100,
        verbose=True,
        monitor="val_loss",
        mode="min",
    )
    trainer = pl.Trainer(
        gpus=1,  # requests one GPU
        max_epochs=1,
        accumulate_grad_batches=config["accumulate_grad_batches"],
        callbacks=[checkpoint_callback],
        resume_from_checkpoint=None,
        default_root_dir="saved/" + config["name"],
        deterministic=True,
    )
    trainer.fit(model, data_loader, valid_data_loader)
    return trainer, model, valid_data_loader


if __name__ == "__main__":
    # Run training and keep the three returned objects as module-level names.
    trainer, model, valid_data_loader = cli_main()


Global seed set to 1234
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from

Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 1234


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 66: val_loss reached 0.12196 (best 0.12196), saving model to "saved/Jigsaw_BERT/lightning_logs/version_6/checkpoints/epoch=0-step=66.ckpt" as top 100
