In [1]:
import os
import sys
import logging
import argparse
from pathlib import Path
from ast import literal_eval
from collections import Counter
from typing import Any, Dict, Optional

from icecream import ic
from tqdm.auto import tqdm

import torchmetrics
from torchmetrics.functional import accuracy, f1, auroc

import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.core.decorators import auto_move_data
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix


import matplotlib.pyplot as plt
from pylab import rcParams
from matplotlib import rc

import transformers
from transformers import (
    AdamW,
    AutoConfig,
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)
from transformers.optimization import (
    Adafactor,
    get_linear_schedule_with_warmup,
)

In [2]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [3]:
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8

In [4]:
RANDOM_SEED=2021
pl.seed_everything(RANDOM_SEED)

Global seed set to 2021


2021

In [5]:
ic.configureOutput(outputFunction=sys.stdout.write, includeContext=True)

In [6]:
logger = logging.getLogger(__name__)

In [7]:
MAX_LEN = 200
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 8
EPOCHS = 1
LEARNING_RATE = 1e-05
MODEL_NAME = "microsoft/deberta-large"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [8]:
train_dataset = pd.read_csv("data_v0.4.2_train.csv")
val_dataset = pd.read_csv("data_v0.4.2_val.csv")
##
train_dataset["sectors"] = train_dataset["sectors"].apply(literal_eval)
val_dataset["sectors"] = val_dataset["sectors"].apply(literal_eval)
##
secotr_set = set()
for sectors_i in train_dataset["sectors"]:
    secotr_set.update(sectors_i)
sectorname_to_sectorid = {sector:i for i, sector in enumerate(list(sorted(secotr_set)))}

In [9]:
class SectorsDataset(Dataset):
    def __init__(self, dataframe, sectorname_to_sectorid, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.excerpt_text = dataframe["excerpt"].tolist(
        ) if dataframe is not None else None
        self.targets = self.data["sectors"].tolist(
        ) if dataframe is not None else None
        self.sectorname_to_sectorid = sectorname_to_sectorid
        self.sectorid_to_sectorname = list(sectorname_to_sectorid.keys())
        self.max_len = max_len

    def encode_example(self,
                       excerpt_text: str,
                       index=None,
                       as_batch: bool = False):
        #excerpt_text = " ".join(excerpt_text.split())

        inputs = self.tokenizer(excerpt_text,
                                            None,
                                            truncation=True,
                                            add_special_tokens=True,
                                            max_length=self.max_len,
                                            padding="max_length",
                                            return_token_type_ids=True)
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        targets = None
        if self.targets:
            target_indices = [
                self.sectorname_to_sectorid[target]
                for target in self.targets[index]
                if target in self.sectorname_to_sectorid
            ]
            targets = np.zeros(len(self.sectorname_to_sectorid), dtype=np.int)
            targets[target_indices] = 1

        encoded = {
            'ids':
            torch.tensor(ids, dtype=torch.long),
            'mask':
            torch.tensor(mask, dtype=torch.long),
            'token_type_ids':
            torch.tensor(token_type_ids, dtype=torch.long),
            'targets':
            torch.tensor(targets, dtype=torch.float32)
            if targets is not None else None
        }
        if as_batch:
            return {
                "ids": encoded["ids"].unsqueeze(0),
                "mask": encoded["mask"].unsqueeze(0),
                "token_type_ids": encoded["ids"].unsqueeze(0)
            }
        return encoded

    def __len__(self):
        return len(self.excerpt_text)

    def __getitem__(self, index):
        excerpt_text = str(self.excerpt_text[index])
        return self.encode_example(excerpt_text, index)

In [10]:
training_set = SectorsDataset(train_dataset, sectorname_to_sectorid, tokenizer,
                              MAX_LEN)
val_set = SectorsDataset(val_dataset, sectorname_to_sectorid, tokenizer,
                         MAX_LEN)

val_set_frac = SectorsDataset(val_dataset.sample(frac=.005),
                              sectorname_to_sectorid, tokenizer, MAX_LEN)

In [11]:
train_params = {
    'batch_size': TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 4
}

val_params = {
    'batch_size': VALID_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 4
}

training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)
val_loader_frac = DataLoader(val_set_frac, **val_params)

In [12]:
class Model(nn.Module):
    def __init__(self, model_name_or_path: str, num_labels:int):
        super().__init__()
        self.l1 = AutoModel.from_pretrained(model_name_or_path)
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(1024, num_labels)
    def forward(self, inputs):
        output = self.l1(inputs["ids"],
                            attention_mask=inputs["mask"],)
        output = output.last_hidden_state
        output = self.l2(output)
        output = self.l3(output)
        return output[:, 0, :]

In [13]:
class SectorsTransformer(pl.LightningModule):
    def __init__(self,
                 model_name_or_path: str,
                 num_labels: int,
                 empty_dataset: SectorsDataset,
                 pred_threshold: float = .5,
                 learning_rate: float = 2e-5,
                 adam_epsilon: float = 1e-8,
                 warmup_steps: int = 0,
                 weight_decay: float = 0.0,
                 train_batch_size: int = 32,
                 eval_batch_size: int = 32,
                 eval_splits: Optional[list] = None,
                 **kwargs):
        super().__init__()

        self.save_hyperparameters()

        self.model = Model(model_name_or_path, num_labels)
        self.empty_dataset = empty_dataset
        self.pred_threshold = pred_threshold

        self.f1_score_train = torchmetrics.F1(
            num_classes=2,
            threshold=0.5,
            average='macro',
            mdmc_average="samplewise",
            ignore_index=None,
            top_k=None,
            multiclass=True,
            compute_on_step=True,
            dist_sync_on_step=False,
            process_group=None,
            dist_sync_fn=None,
        )

        self.f1_score_val = torchmetrics.F1(
            num_classes=2,
            threshold=0.5,
            average='macro',
            mdmc_average="samplewise",
            ignore_index=None,
            top_k=None,
            multiclass=True,
            compute_on_step=True,
            dist_sync_on_step=False,
            process_group=None,
            dist_sync_fn=None,
        )
    @auto_move_data
    def forward(self, inputs):
        output = self.model(inputs)
        return output

    def training_step(self, batch, batch_idx):
        outputs = self(batch)
        loss = F.binary_cross_entropy_with_logits(outputs, batch["targets"])

        self.f1_score_train(torch.sigmoid(outputs),
                            batch["targets"].to(dtype=torch.long))
        self.log("train_f1", self.f1_score_train, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx, dataloader_idx=0):
        outputs = self(batch)
        val_loss = F.binary_cross_entropy_with_logits(outputs,
                                                      batch["targets"])

        self.f1_score_val(torch.sigmoid(outputs),
                          batch["targets"].to(dtype=torch.long))
        self.log("val_f1",
                 self.f1_score_val,
                 on_step=True,
                 on_epoch=True,
                 prog_bar=True,
                 logger=False)
        
        self.log("val_loss",
                 val_loss,
                 on_step=True,
                 on_epoch=True,
                 prog_bar=True,
                 logger=False)
        return {'val_loss': val_loss}

    def test_step(self, batch, batch_nb):
        logits = self(batch)
        preds = (torch.sigmoid(logits) > .5)
        return {"preds": preds, "targets_i": batch["targets"]}

    def on_test_epoch_end(self, outputs):
        preds = torch.cat([output["preds"] for output in outputs]).cpu()
        targets = torch.cat([output["targets_i"] for output in outputs]).cpu()
        recalls = []
        precisions = []
        f1_scores = []
        for i in range(targets.shape[1]):
            class_roc_auc = auroc(preds[:, i], targets[:, i])
            self.log(
                f"{self.empty_dataset.sectorid_to_sectorname[i]}_roc_auc/Train",
                class_roc_auc)
            class_f1 = metrics.f1_score(targets[:, i], preds[:, i])
            self.log(
                f"{self.empty_dataset.sectorid_to_sectorname[i]}_f1/Train",
                class_f1)

    def predict_step(self, batch, batch_idx, dataloader_idx=None):
        output = self(batch)
        return {"logits": output}

    def on_predict_epoch_end(self, outputs):
        print(outputs)
        logits = torch.cat([output["logits"] for output in outputs[0]])
        preds = torch.sigmoid(logits) >= self.pred_threshold
        pred_classes = []
        for pred in preds:
            pred_classes_i = [
                self.empty_dataset.sectorid_to_sectorname[i]
                for i, p in enumerate(pred) if p
            ]
            pred_classes.append(pred_classes_i)
        self.log({"pred_classes": pred_classes})

    def custom_predict(self, inputs):
        self.eval()
        self.freeze()
        as_batch = False
        if isinstance(inputs, str):
            as_batch = True
        inputs = self.empty_dataset.encode_example(inputs, as_batch)

        with torch.no_grad():
            logits = self(inputs)
        preds = (torch.sigmoid(logits) >= self.pred_threshold)
        pred_classes = []
        for pred in preds:
            pred_classes_i = [
                self.empty_dataset.sectorid_to_sectorname[i]
                for i, p in enumerate(pred) if p
            ]
            pred_classes.append(pred_classes_i)
        return pred_classes

    def total_steps(self) -> int:
        """The number of total training steps that will be run. Used for lr scheduler purposes."""
        self.dataset_size = len(self.train_dataloader().dataset)
        num_devices = max(1, self.hparams.gpus)  # TODO: consider num_tpu_cores
        effective_batch_size = self.hparams.train_batch_size * self.hparams.accumulate_grad_batches * num_devices
        return (self.dataset_size /
                effective_batch_size) * self.hparams.max_epochs

    def configure_optimizers(self):
        "Prepare optimizer and schedule (linear warmup and decay)"
        model = self.model
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                self.hparams.weight_decay,
            },
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=self.hparams.learning_rate,
                          eps=self.hparams.adam_epsilon)

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.total_steps())
        scheduler = {
            'scheduler': scheduler,
            'interval': 'step',
            'frequency': 1
        }
        return [optimizer], [scheduler]

    def train_dataloader(self):
        return training_loader

    def val_dataloader(self):
        return val_loader

In [14]:
# data_module = pl.LightningDataModule.from_datasets(train_dataset=training_set,
#                                                    val_dataset=val_set,
#                                                    test_dataset=None,
#                                                    batch_size=TRAIN_BATCH_SIZE,
#                                                    num_workers=4)

In [15]:
checkpoint_callback = ModelCheckpoint(
  dirpath="checkpoints",
  filename="best-checkpoint",
  save_top_k=1,
  verbose=True,
  monitor="val_loss",
  mode="min"
)

In [16]:
logger = TensorBoardLogger("lightning_logs", name="sector-classifier-deberta-xlarge")

In [17]:
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=2)

In [18]:
trainer = pl.Trainer(
    logger=logger,
    checkpoint_callback=checkpoint_callback,
    callbacks=[early_stopping_callback],
    progress_bar_refresh_rate=30,
    profiler="simple",
    log_gpu_memory=True,
    weights_summary=None,
    gpus=1,
    accumulate_grad_batches=1,
    max_epochs=EPOCHS,
    gradient_clip_val=1,
    gradient_clip_algorithm='norm',
    #overfit_batches=1,
    #limit_predict_batches=2,
    #limit_test_batches=2,
    #fast_dev_run=True,
    #limit_train_batches=1,
    #limit_val_batches=1,
    #limit_test_batches: Union[int, float] = 1.0,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [19]:
empty_dataset = SectorsDataset(None, sectorname_to_sectorid, tokenizer,
                         MAX_LEN)
model = SectorsTransformer(MODEL_NAME,
                           len(sectorname_to_sectorid),
                           empty_dataset,
                           gpus=1,
                           precision=16,
                           plugin='deepspeed_stage_3_offload',
                           accumulate_grad_batches=1,
                           max_epochs=EPOCHS)

Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.bias', 'config', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
trainer.fit(model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 2021


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

FIT Profiler Report

Action                             	|  Mean duration (s)	|Num calls      	|  Total time (s) 	|  Percentage %   	|
--------------------------------------------------------------------------------------------------------------------------------------
Total                              	|  -              	|_              	|  3.9396e+04     	|  100 %          	|
--------------------------------------------------------------------------------------------------------------------------------------
run_training_epoch                 	|  3.9383e+04     	|1              	|  3.9383e+04     	|  99.967         	|
run_training_batch                 	|  1.6798         	|22664          	|  3.8071e+04     	|  96.637         	|
optimizer_step_and_closure_0       	|  1.6792         	|22664          	|  3.8057e+04     	|  96.601         	|
training_step_and_backward         	|  1.4376         	|22664          	|  3.2581e+04     	|  82.701         	|
backward                           

In [None]:
def custom_eval(model, eval_dataloader):
    if model.device.type == "cpu":
        model.to("cuda")
    model.eval()
    model.freeze()
    preds_val_all = []
    y_true = []

    with torch.no_grad():
        for batch in tqdm(eval_dataloader, total=len(eval_dataloader.dataset)//eval_dataloader.batch_size):
            logits = model({"ids": batch["ids"].to("cuda"), "mask": batch["mask"].to("cuda"), "token_type_ids": batch["token_type_ids"].to("cuda")})
            preds_batch = np.zeros(logits.shape, dtype=np.int)
            preds_batch[(torch.sigmoid(logits) > model.pred_threshold).cpu().nonzero(as_tuple=True)] = 1
            preds_val_all.append(preds_batch)
            y_true.append(batch["targets"].numpy().astype(np.int))

    preds_val_all = np.concatenate(preds_val_all)
    y_true = np.concatenate(y_true)

    f1_scores = []
    recalls = []
    precisions = []
    accuracies = []
    supports = []
    
    for sec_name, sec_id in sectorname_to_sectorid.items():
        cls_rprt = classification_report(y_true[:, sec_id], preds_val_all[:, sec_id], output_dict=True)
        precisions.append(cls_rprt["macro avg"]["precision"])
        recalls.append(cls_rprt["macro avg"]["recall"])
        f1_scores.append(cls_rprt["macro avg"]["f1-score"])
        supports.append(cls_rprt["1"]["support"])
        accuracies.append(cls_rprt["accuracy"])
        
    metrics_df = pd.DataFrame({
        "Sector": list(sectorname_to_sectorid.keys()),
        "Precision": precisions,
        "Recall": recalls,
        "F1 Score": f1_scores,
        "Accuracy": accuracies,
        "Support": supports,
    })
    return metrics_df
    #multilabel_confusion_matrix()

In [None]:
df_metrics_val = custom_eval(model, val_loader)

  0%|          | 0/1259 [00:00<?, ?it/s]

In [None]:
df_metrics_train = custom_eval(model, training_loader)

  0%|          | 0/22663 [00:00<?, ?it/s]

In [None]:
df_metrics_val

Unnamed: 0,Sector,Precision,Recall,F1 Score,Accuracy,Support
0,Agriculture,0.808469,0.667625,0.716316,0.984612,180
1,Cross,0.752083,0.699675,0.718703,0.822099,2286
2,Education,0.894701,0.935452,0.913817,0.972401,836
3,Food Security,0.872642,0.88086,0.876688,0.948575,1176
4,Health,0.879001,0.885446,0.882128,0.9041,2822
5,Livelihoods,0.846959,0.770886,0.802213,0.920778,1297
6,Logistics,0.834473,0.574581,0.61854,0.980046,219
7,Nutrition,0.923074,0.860766,0.889209,0.984116,406
8,Protection,0.864582,0.851872,0.85798,0.905391,2178
9,Shelter,0.867774,0.84171,0.854157,0.960886,757


In [None]:
df_metrics_train

Unnamed: 0,Sector,Precision,Recall,F1 Score,Accuracy,Support
0,Agriculture,0.788334,0.635532,0.682722,0.977739,2212
1,Cross,0.773411,0.72407,0.743041,0.835207,20577
2,Education,0.899217,0.944875,0.920492,0.974408,7522
3,Food Security,0.889743,0.896432,0.893047,0.955501,10585
4,Health,0.892365,0.894214,0.893282,0.913704,25400
5,Livelihoods,0.843899,0.76587,0.797731,0.919319,11672
6,Logistics,0.864809,0.632691,0.692505,0.983762,1785
7,Nutrition,0.920455,0.848977,0.881082,0.983145,3657
8,Protection,0.858491,0.844646,0.851268,0.901106,19601
9,Shelter,0.884214,0.833606,0.856752,0.96277,6815


In [None]:
%load_ext tensorboard

In [None]:
#%tensorboard --logdir lightning_logs

In [None]:
# logger: Union[pytorch_lightning.loggers.base.LightningLoggerBase, Iterable[pytorch_lightning.loggers.base.LightningLoggerBase], bool] = True,
# checkpoint_callback: bool = True,
# callbacks: Union[List[pytorch_lightning.callbacks.base.Callback], pytorch_lightning.callbacks.base.Callback, NoneType] = None,
# default_root_dir: Union[str, NoneType] = None,
# gradient_clip_val: float = 0.0,
# gradient_clip_algorithm: str = 'norm',
# process_position: int = 0,
# num_nodes: int = 1,
# num_processes: int = 1,
# gpus: Union[int, str, List[int], NoneType] = None,
# auto_select_gpus: bool = False,
# tpu_cores: Union[int, str, List[int], NoneType] = None,
# log_gpu_memory: Union[str, NoneType] = None,
# progress_bar_refresh_rate: Union[int, NoneType] = None,
# overfit_batches: Union[int, float] = 0.0,
# track_grad_norm: Union[int, float, str] = -1,
# check_val_every_n_epoch: int = 1,
# fast_dev_run: Union[int, bool] = False,
# accumulate_grad_batches: Union[int, Dict[int, int], List[list]] = 1,
# max_epochs: Union[int, NoneType] = None,
# min_epochs: Union[int, NoneType] = None,
# max_steps: Union[int, NoneType] = None,
# min_steps: Union[int, NoneType] = None,
# max_time: Union[str, datetime.timedelta, Dict[str, int], NoneType] = None,
# limit_train_batches: Union[int, float] = 1.0,
# limit_val_batches: Union[int, float] = 1.0,
# limit_test_batches: Union[int, float] = 1.0,
# limit_predict_batches: Union[int, float] = 1.0,
# val_check_interval: Union[int, float] = 1.0,
# flush_logs_every_n_steps: int = 100,
# log_every_n_steps: int = 50,
# accelerator: Union[str, pytorch_lightning.accelerators.accelerator.Accelerator, NoneType] = None,
# sync_batchnorm: bool = False,
# precision: int = 32,
# weights_summary: Union[str, NoneType] = 'top',
# weights_save_path: Union[str, NoneType] = None,
# num_sanity_val_steps: int = 2,
# truncated_bptt_steps: Union[int, NoneType] = None,
# resume_from_checkpoint: Union[str, pathlib.Path, NoneType] = None,
# profiler: Union[pytorch_lightning.profiler.profilers.BaseProfiler, str, NoneType] = None,
# benchmark: bool = False,
# deterministic: bool = False,
# reload_dataloaders_every_epoch: bool = False,
# auto_lr_find: Union[bool, str] = False,
# replace_sampler_ddp: bool = True,
# terminate_on_nan: bool = False,
# auto_scale_batch_size: Union[str, bool] = False,
# prepare_data_per_node: bool = True,
# plugins: Union[List[Union[pytorch_lightning.plugins.base_plugin.Plugin, pytorch_lightning.plugins.environments.cluster_environment.ClusterEnvironment, str]], pytorch_lightning.plugins.base_plugin.Plugin, pytorch_lightning.plugins.environments.cluster_environment.ClusterEnvironment, str, NoneType] = None,
# amp_backend: str = 'native',
# amp_level: str = 'O2',
# distributed_backend: Union[str, NoneType] = None,
# move_metrics_to_cpu: bool = False,
# multiple_trainloader_mode: str = 'max_size_cycle',
# stochastic_weight_avg: bool = False,