In [1]:
# Kernel: conda_pytorch_latest_p36
# One-time setup: uncomment the lines below to install the extra dependencies.
# !pip install icecream
# !pip install tqdm
# !pip install torchmetrics
# !pip install pytorch_lightning
# !pip install transformers

In [2]:
import os
import sys
import logging
import argparse
from pathlib import Path
from ast import literal_eval
from collections import Counter
from typing import Any, Dict, Optional

from icecream import ic
from tqdm.auto import tqdm

import torchmetrics
from torchmetrics.functional import accuracy, f1, auroc

import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.core.decorators import auto_move_data
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, multilabel_confusion_matrix


import matplotlib.pyplot as plt
from pylab import rcParams
from matplotlib import rc

import transformers
from transformers import (
    AdamW,
    AutoConfig,
    AutoModel,
    AutoModelForSequenceClassification,
    T5ForConditionalGeneration,
    T5Tokenizer,
    AutoTokenizer,
)
from transformers.optimization import (
    Adafactor,
    get_linear_schedule_with_warmup,
)

In [3]:
# Render matplotlib figures inline in the notebook, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [4]:
# Plot styling: seaborn whitegrid theme, a custom six-colour palette and a
# 12x8 default figure size.
HAPPY_COLORS_PALETTE = [
    "#01BEFE", "#FFDD00", "#FF7D00",
    "#FF006D", "#ADFF02", "#8F00FF",
]
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = (12, 8)

In [5]:
# Fix the global seed through Lightning so runs are repeatable; the call is
# the cell's last expression, so the seed it returns is displayed.
RANDOM_SEED = 2021
pl.seed_everything(RANDOM_SEED)

Global seed set to 2021


2021

In [6]:
# Send icecream debug output to stdout and include the call-site context.
ic.configureOutput(
    outputFunction=sys.stdout.write,
    includeContext=True,
)

In [7]:
# Module-level `logging` logger.
# NOTE(review): the name `logger` is rebound to a TensorBoardLogger in a later
# cell, after which this object is no longer reachable under that name.
logger = logging.getLogger(__name__)

In [8]:
# --- Run configuration -------------------------------------------------------
# Pre-trained checkpoint; the tokenizer is loaded from the same identifier.
MODEL_NAME = "microsoft/xtremedistil-l6-h384-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Every excerpt is truncated / padded to this many tokens.
MAX_LEN = 200

# Dataloader batch sizes.
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 64

# Optimisation schedule.
EPOCHS = 3
# NOTE(review): LEARNING_RATE is not passed to SectorsTransformer in the cells
# shown, so the model's own default learning rate is what actually gets used.
LEARNING_RATE = 1e-5

In [9]:
# Location of the train / validation splits.
DATA_ROOT_DIR = "/home/ec2-user/SageMaker/deep-experiments/data/frameworks_data/data_v0.4.3"
TRAIN_PATH = os.path.join(DATA_ROOT_DIR, "data_v0.4.3_train.csv")
VAL_PATH = os.path.join(DATA_ROOT_DIR, "data_v0.4.3_val.csv")

# Read each split and parse its `sectors` column from the literal string
# stored in the CSV back into a Python object.
train_dataset = pd.read_csv(TRAIN_PATH)
train_dataset["sectors"] = train_dataset["sectors"].map(literal_eval)

val_dataset = pd.read_csv(VAL_PATH)
val_dataset["sectors"] = val_dataset["sectors"].map(literal_eval)

# Label vocabulary: every sector seen in the training split, numbered in
# alphabetical order. (The variable name keeps its original spelling.)
secotr_set = {
    sector
    for row_sectors in train_dataset["sectors"]
    for sector in row_sectors
}
sectorname_to_sectorid = {
    sector: i for i, sector in enumerate(sorted(secotr_set))
}

In [10]:
class SectorsDataset(Dataset):
    """Dataset that tokenizes excerpts and multi-hot encodes their sector labels.

    Args:
        dataframe: frame with an ``excerpt`` column (text) and a ``sectors``
            column (a collection of sector names per row), or ``None`` to
            build a data-less instance that is only used for
            ``encode_example`` and the label mapping at inference time.
        sectorname_to_sectorid: mapping from sector name to its position in
            the multi-hot target vector.
        tokenizer: callable invoked as ``tokenizer(text, None, **options)``;
            its result must provide ``input_ids``, ``attention_mask`` and
            ``token_type_ids``.
        max_len: length every example is truncated / padded to.
    """

    def __init__(self, dataframe, sectorname_to_sectorid, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        has_data = dataframe is not None
        self.excerpt_text = dataframe["excerpt"].tolist() if has_data else None
        self.targets = dataframe["sectors"].tolist() if has_data else None
        self.sectorname_to_sectorid = sectorname_to_sectorid
        # Position -> name lookup, ordered by id rather than by dict insertion
        # order so it stays aligned with the target vector for any mapping.
        self.sectorid_to_sectorname = sorted(sectorname_to_sectorid,
                                             key=sectorname_to_sectorid.get)
        self.max_len = max_len

    def encode_example(self,
                       excerpt_text: str,
                       index=None,
                       as_batch: bool = False):
        """Tokenize ``excerpt_text`` and, when possible, attach its targets.

        Args:
            excerpt_text: text handed to the tokenizer.
            index: row whose ``sectors`` become the multi-hot target; with
                ``None`` (or on a data-less dataset) ``targets`` is ``None``.
            as_batch: if true, return only ``ids``, ``mask`` and
                ``token_type_ids``, each with a leading batch dimension.

        Returns:
            Dict of tensors with keys ``ids``, ``mask``, ``token_type_ids``
            and (unless ``as_batch``) ``targets``.
        """
        inputs = self.tokenizer(excerpt_text,
                                None,
                                truncation=True,
                                add_special_tokens=True,
                                max_length=self.max_len,
                                padding="max_length",
                                return_token_type_ids=True)
        ids = torch.tensor(inputs["input_ids"], dtype=torch.long)
        mask = torch.tensor(inputs["attention_mask"], dtype=torch.long)
        token_type_ids = torch.tensor(inputs["token_type_ids"],
                                      dtype=torch.long)

        if as_batch:
            return {
                "ids": ids.unsqueeze(0),
                "mask": mask.unsqueeze(0),
                "token_type_ids": token_type_ids.unsqueeze(0),
            }

        targets = None
        # Targets exist only for a row of a dataset that was built with data.
        if self.targets is not None and index is not None:
            # Sector names missing from the mapping are ignored.
            target_indices = [
                self.sectorname_to_sectorid[name]
                for name in self.targets[index]
                if name in self.sectorname_to_sectorid
            ]
            multi_hot = np.zeros(len(self.sectorname_to_sectorid),
                                 dtype=np.float32)
            multi_hot[target_indices] = 1
            targets = torch.tensor(multi_hot, dtype=torch.float32)

        return {
            "ids": ids,
            "mask": mask,
            "token_type_ids": token_type_ids,
            "targets": targets,
        }

    def __len__(self):
        """Number of excerpts; 0 for a dataset built without a dataframe."""
        return 0 if self.excerpt_text is None else len(self.excerpt_text)

    def __getitem__(self, index):
        """Encode the excerpt at ``index`` together with its targets."""
        excerpt_text = str(self.excerpt_text[index])
        return self.encode_example(excerpt_text, index)

In [11]:
# Full training and validation datasets.
training_set = SectorsDataset(train_dataset, sectorname_to_sectorid, tokenizer,
                              MAX_LEN)
val_set = SectorsDataset(val_dataset, sectorname_to_sectorid, tokenizer,
                         MAX_LEN)

# 5% validation subset for quick checks. `random_state` pins the sample so
# re-running this cell always yields the same rows.
val_set_frac = SectorsDataset(
    val_dataset.sample(frac=.05, random_state=RANDOM_SEED),
    sectorname_to_sectorid, tokenizer, MAX_LEN)

In [12]:
# DataLoader settings: training batches are shuffled, validation batches keep
# the dataset order; both use four worker processes.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=4)
val_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=4)

training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)
val_loader_frac = DataLoader(val_set_frac, **val_params)

In [13]:
class Model(nn.Module):
    """Pre-trained encoder followed by a dropout + linear multi-label head.

    The head reads the hidden state at sequence position 0 and produces one
    logit per label.

    Args:
        model_name_or_path: identifier passed to ``AutoModel.from_pretrained``.
        num_labels: number of output logits.
    """

    def __init__(self, model_name_or_path: str, num_labels: int):
        super().__init__()
        self.l1 = AutoModel.from_pretrained(model_name_or_path)
        self.l2 = torch.nn.Dropout(0.3)
        # Size the head from the encoder's config instead of hard-coding 384,
        # so other checkpoints can be plugged in.
        self.l3 = torch.nn.Linear(self.l1.config.hidden_size, num_labels)

    def forward(self, inputs):
        """Return logits of shape (batch, num_labels).

        Args:
            inputs: mapping with ``ids`` (token ids) and ``mask`` (attention
                mask); other keys are ignored.
        """
        encoder_out = self.l1(inputs["ids"], attention_mask=inputs["mask"])
        # Select position 0 first so dropout and the linear layer run on one
        # vector per example rather than on every token.
        first_token = encoder_out.last_hidden_state[:, 0, :]
        return self.l3(self.l2(first_token))

In [14]:
class SectorsTransformer(pl.LightningModule):
    """LightningModule that fine-tunes ``Model`` for multi-label sector tagging.

    Training and validation minimise binary cross-entropy on the logits. The
    class also offers helpers that turn logits into sector names and
    ``custom_eval``, which builds a per-sector metrics table.

    Args:
        model_name_or_path: identifier forwarded to ``Model``.
        num_labels: number of sectors (width of the logit vector).
        empty_dataset: ``SectorsDataset`` used for its label mapping and for
            encoding raw text in ``custom_predict``.
        pred_threshold: probability at or above which a sector is predicted.
        learning_rate: AdamW learning rate.
        adam_epsilon: AdamW epsilon.
        warmup_steps: warm-up steps of the linear schedule.
        weight_decay: decay applied to all parameters except biases and
            LayerNorm weights.
        train_batch_size: batch size assumed when computing ``total_steps``.
        eval_batch_size: stored as a hyper-parameter only.
        eval_splits: stored as a hyper-parameter only.
        **kwargs: extra values stored as hyper-parameters; ``total_steps``
            reads ``gpus``, ``accumulate_grad_batches`` and ``max_epochs``.
    """

    def __init__(self,
                 model_name_or_path: str,
                 num_labels: int,
                 empty_dataset: SectorsDataset,
                 pred_threshold: float = .5,
                 learning_rate: float = 2e-5,
                 adam_epsilon: float = 1e-8,
                 warmup_steps: int = 0,
                 weight_decay: float = 0.0,
                 train_batch_size: int = 32,
                 eval_batch_size: int = 32,
                 eval_splits: Optional[list] = None,
                 **kwargs):
        super().__init__()

        self.save_hyperparameters()

        self.model = Model(model_name_or_path, num_labels)
        self.empty_dataset = empty_dataset
        self.pred_threshold = pred_threshold
        # Filled by `on_predict_epoch_end` with one list of names per example.
        self.predicted_classes = None

        self.f1_score_train = self._build_f1_metric(pred_threshold)
        self.f1_score_val = self._build_f1_metric(pred_threshold)

    @staticmethod
    def _build_f1_metric(threshold: float = 0.5):
        """Create the F1 metric shared by the training and validation loops."""
        return torchmetrics.F1(
            num_classes=2,
            threshold=threshold,
            average='macro',
            mdmc_average="samplewise",
            ignore_index=None,
            top_k=None,
            multiclass=True,
            compute_on_step=True,
            dist_sync_on_step=False,
            process_group=None,
            dist_sync_fn=None,
        )

    def _decode_predictions(self, logits):
        """Map each row of logits to the sector names at/above the threshold."""
        hits = torch.sigmoid(logits) >= self.pred_threshold
        names = self.empty_dataset.sectorid_to_sectorname
        return [[names[i] for i, hit in enumerate(row) if hit]
                for row in hits]

    @auto_move_data
    def forward(self, inputs):
        """Return the logits of the wrapped ``Model`` for a batch dict."""
        return self.model(inputs)

    def training_step(self, batch, batch_idx):
        """Compute the BCE loss and update/log the training F1."""
        outputs = self(batch)
        loss = F.binary_cross_entropy_with_logits(outputs, batch["targets"])

        self.f1_score_train(torch.sigmoid(outputs),
                            batch["targets"].to(dtype=torch.long))
        self.log("train_f1", self.f1_score_train, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx, dataloader_idx=0):
        """Compute the BCE loss and log ``val_loss`` / ``val_f1``.

        Both values are shown in the progress bar and exposed to callbacks but
        are not sent to the experiment logger (``logger=False``).
        """
        outputs = self(batch)
        val_loss = F.binary_cross_entropy_with_logits(outputs,
                                                      batch["targets"])

        self.f1_score_val(torch.sigmoid(outputs),
                          batch["targets"].to(dtype=torch.long))
        self.log("val_f1",
                 self.f1_score_val,
                 on_step=True,
                 on_epoch=True,
                 prog_bar=True,
                 logger=False)

        self.log("val_loss",
                 val_loss,
                 on_step=True,
                 on_epoch=True,
                 prog_bar=True,
                 logger=False)
        return {'val_loss': val_loss}

    def test_step(self, batch, batch_nb):
        """Return thresholded predictions, probabilities and targets."""
        probs = torch.sigmoid(self(batch))
        return {
            "preds": probs >= self.pred_threshold,
            # Probabilities are kept because ROC AUC needs scores, not 0/1.
            "probs": probs,
            "targets_i": batch["targets"],
        }

    def test_epoch_end(self, outputs):
        """Log per-sector ROC AUC and F1 over all ``test_step`` outputs."""
        probs = torch.cat([out["probs"] for out in outputs]).cpu()
        preds = torch.cat([out["preds"] for out in outputs]).cpu()
        targets = torch.cat([out["targets_i"] for out in outputs]).cpu().long()
        names = self.empty_dataset.sectorid_to_sectorname
        for i in range(targets.shape[1]):
            # NOTE(review): the logged keys end in "/Train" although they are
            # produced during testing; kept unchanged so existing dashboards
            # keep working.
            class_roc_auc = auroc(probs[:, i], targets[:, i], pos_label=1)
            self.log(f"{names[i]}_roc_auc/Train", class_roc_auc)
            class_f1 = f1_score(targets[:, i].numpy(),
                                preds[:, i].numpy().astype(int))
            self.log(f"{names[i]}_f1/Train", class_f1)

    def on_test_epoch_end(self, outputs=None):
        """Backward-compatible entry point for the test metrics.

        Lightning invokes this hook without arguments (then it does nothing,
        the work happens in ``test_epoch_end``); callers that pass ``outputs``
        explicitly still get the metrics computed.
        """
        if outputs is not None:
            self.test_epoch_end(outputs)

    def predict_step(self, batch, batch_idx, dataloader_idx=None):
        """Return the raw logits for a batch."""
        return {"logits": self(batch)}

    def on_predict_epoch_end(self, outputs):
        """Decode the first dataloader's logits into ``predicted_classes``.

        Lists of names cannot be passed to ``self.log``, so the result is
        stored on the module instead.
        """
        logits = torch.cat([output["logits"] for output in outputs[0]])
        self.predicted_classes = self._decode_predictions(logits)

    def custom_predict(self, inputs):
        """Predict sector names for a string (or a list of strings).

        Puts the module in eval mode and freezes it, as the original
        implementation did.

        Returns:
            One list of sector names per input text.
        """
        self.eval()
        self.freeze()
        # A single string must get a batch dimension; pass the flag by name so
        # it is not mistaken for the `index` parameter.
        as_batch = isinstance(inputs, str)
        encoded = self.empty_dataset.encode_example(inputs, as_batch=as_batch)

        with torch.no_grad():
            logits = self(encoded)
        return self._decode_predictions(logits)

    def total_steps(self) -> int:
        """The number of total training steps that will be run. Used for lr scheduler purposes."""
        self.dataset_size = len(self.train_dataloader().dataset)

        gpus = self.hparams.get("gpus")  # TODO: consider num_tpu_cores
        if isinstance(gpus, (list, tuple)):
            gpus = len(gpus)
        num_devices = max(1, gpus) if isinstance(gpus, int) else 1

        accumulate = self.hparams.get("accumulate_grad_batches") or 1
        effective_batch_size = (self.hparams.train_batch_size * accumulate *
                                num_devices)
        # Ceiling division: the last, partial batch still triggers a step.
        steps_per_epoch = -(-self.dataset_size // effective_batch_size)
        return steps_per_epoch * self.hparams.max_epochs

    def configure_optimizers(self):
        "Prepare optimizer and schedule (linear warmup and decay)"
        no_decay = ["bias", "LayerNorm.weight"]
        decay_params, no_decay_params = [], []
        for name, param in self.model.named_parameters():
            if any(nd in name for nd in no_decay):
                no_decay_params.append(param)
            else:
                decay_params.append(param)

        optimizer_grouped_parameters = [
            {
                "params": decay_params,
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": no_decay_params,
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=self.hparams.learning_rate,
                          eps=self.hparams.adam_epsilon)

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.total_steps())
        scheduler = {
            'scheduler': scheduler,
            'interval': 'step',
            'frequency': 1
        }
        return [optimizer], [scheduler]

    def train_dataloader(self):
        """Return the notebook-level training dataloader."""
        return training_loader

    def val_dataloader(self):
        """Return the notebook-level validation dataloader."""
        return val_loader

    def custom_eval(self, eval_dataloader):
        """Evaluate on ``eval_dataloader`` and return per-sector metrics.

        Puts the module in eval mode and freezes it. The Precision, Recall
        and F1 columns are the "macro avg" entries of a binary
        ``classification_report`` for each sector, i.e. averaged over the
        negative and the positive class; Support is the number of positive
        examples.

        Returns:
            ``pandas.DataFrame`` with columns Sector, Precision, Recall,
            F1 Score, Accuracy and Support.
        """
        # Use the GPU when there is one; otherwise stay on the current device.
        if self.device.type == "cpu" and torch.cuda.is_available():
            self.to("cuda")
        device = self.device
        self.eval()
        self.freeze()
        preds_val_all = []
        y_true = []

        with torch.no_grad():
            for batch in tqdm(eval_dataloader, total=len(eval_dataloader)):
                logits = self({
                    key: batch[key].to(device)
                    for key in ("ids", "mask", "token_type_ids")
                })
                hits = torch.sigmoid(logits) >= self.pred_threshold
                preds_val_all.append(hits.cpu().numpy().astype(int))
                y_true.append(batch["targets"].numpy().astype(int))

        preds_val_all = np.concatenate(preds_val_all)
        y_true = np.concatenate(y_true)

        rows = []
        label_map = self.empty_dataset.sectorname_to_sectorid
        for sec_name, sec_id in label_map.items():
            cls_rprt = classification_report(y_true[:, sec_id],
                                             preds_val_all[:, sec_id],
                                             output_dict=True)
            macro = cls_rprt["macro avg"]
            rows.append({
                "Sector": sec_name,
                "Precision": macro["precision"],
                "Recall": macro["recall"],
                "F1 Score": macro["f1-score"],
                "Accuracy": cls_rprt["accuracy"],
                # The "1" entry is absent when the sector never occurs in
                # either the targets or the predictions.
                "Support": cls_rprt.get("1", {"support": 0})["support"],
            })

        metrics_df = pd.DataFrame(rows,
                                  columns=[
                                      "Sector", "Precision", "Recall",
                                      "F1 Score", "Accuracy", "Support"
                                  ])
        return metrics_df
        #multilabel_confusion_matrix()

In [15]:
# Model identifier with "/" replaced so it can be used in directory names.
log_dir_name = MODEL_NAME.replace("/", "-")

In [16]:
# Keep only the checkpoint with the lowest validation loss.
dirpath = f"./checkpoints-{log_dir_name}"
# `exist_ok=True` replaces the separate existence check, so the directory is
# created in one step without a check-then-create gap.
os.makedirs(dirpath, exist_ok=True)
checkpoint_callback = ModelCheckpoint(
    dirpath=dirpath,
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min",
)



In [17]:
# TensorBoard logger for the run; from here on `logger` refers to this object.
logger = TensorBoardLogger(
    "lightning_logs",
    name=f"sector-classifier-{log_dir_name}",
)

In [18]:
# Early stopping on the validation loss.
# NOTE(review): with EPOCHS = 3 a patience of 2 leaves little room for this to
# trigger before training ends anyway; confirm the intended values.
early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    patience=2,
)

In [19]:
trainer = pl.Trainer(
    # Hardware and schedule.
    gpus=1,
    max_epochs=EPOCHS,
    accumulate_grad_batches=1,
    # Clip gradients by norm.
    gradient_clip_val=1,
    gradient_clip_algorithm='norm',
    # Logging, callbacks and reporting.
    logger=logger,
    callbacks=[early_stopping_callback, checkpoint_callback],
    progress_bar_refresh_rate=30,
    profiler="simple",
    log_gpu_memory=True,
    weights_summary=None,
    # Debug switches, left disabled:
    #overfit_batches=1,
    #limit_predict_batches=2,
    #limit_test_batches=2,
    #fast_dev_run=True,
    #limit_train_batches=1,
    #limit_val_batches=1,
    #limit_test_batches: Union[int, float] = 1.0,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [20]:
# Data-less dataset: carries the label mapping and `encode_example`, which the
# model uses to decode predictions and to encode raw text.
empty_dataset = SectorsDataset(None, sectorname_to_sectorid, tokenizer,
                               MAX_LEN)

# The batch sizes are passed explicitly because `total_steps` derives the
# scheduler length from the `train_batch_size` hyper-parameter; relying on its
# default would silently drift from the dataloader's batch size.
# NOTE(review): LEARNING_RATE from the config cell is not passed here, so the
# model trains with its default learning_rate; wire it in if that was intended.
# NOTE(review): `precision` and `plugin` are only stored as hyper-parameters
# through **kwargs; they do not configure the Trainer.
model = SectorsTransformer(
    MODEL_NAME,
    len(sectorname_to_sectorid),
    empty_dataset,
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=VALID_BATCH_SIZE,
    gpus=1,
    precision=16,
    plugin='deepspeed_stage_3_offload',
    accumulate_grad_batches=1,
    max_epochs=EPOCHS,
)

In [21]:
# Display the module's repr (its layer tree).
model

SectorsTransformer(
  (model): Model(
    (l1): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 384, padding_idx=0)
        (position_embeddings): Embedding(512, 384)
        (token_type_embeddings): Embedding(2, 384)
        (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=384, out_features=384, bias=True)
                (key): Linear(in_features=384, out_features=384, bias=True)
                (value): Linear(in_features=384, out_features=384, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=384, out_features=384, bias=True)
                (L

In [22]:
# Train; the dataloaders come from the module's train_dataloader() /
# val_dataloader() methods, so none are passed here.
trainer.fit(model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 2021


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 2832: val_loss reached 0.17171 (best 0.17171), saving model to "/home/ec2-user/SageMaker/deep-experiments/notebooks/models/abdullah/modeling_data_v0.4.3/checkpoints-microsoft-xtremedistil-l6-h384-uncased/epoch=0-step=2832.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 5665: val_loss reached 0.16593 (best 0.16593), saving model to "/home/ec2-user/SageMaker/deep-experiments/notebooks/models/abdullah/modeling_data_v0.4.3/checkpoints-microsoft-xtremedistil-l6-h384-uncased/epoch=1-step=5665-v1.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 2, global step 8498: val_loss was not in top 1
FIT Profiler Report

Action                             	|  Mean duration (s)	|Num calls      	|  Total time (s) 	|  Percentage %   	|
--------------------------------------------------------------------------------------------------------------------------------------
Total                              	|  -              	|_              	|  3597.2         	|  100 %          	|
--------------------------------------------------------------------------------------------------------------------------------------
run_training_epoch                 	|  1197.5         	|3              	|  3592.4         	|  99.865         	|
run_training_batch                 	|  0.40115        	|8499           	|  3409.4         	|  94.777         	|
optimizer_step_and_closure_0       	|  0.40056        	|8499           	|  3404.4         	|  94.638         	|
training_step_and_backward         	|  0.38499        	|8499           	|  3272.1         	|  

In [23]:
# Per-sector metrics on the validation split.
df_metrics_val = model.custom_eval(val_loader)
df_metrics_val

  0%|          | 0/157 [00:00<?, ?it/s]

Unnamed: 0,Sector,Precision,Recall,F1 Score,Accuracy,Support
0,Agriculture,0.844241,0.57225,0.615635,0.978854,233
1,Cross,0.751853,0.683237,0.705282,0.819617,2286
2,Education,0.896978,0.90043,0.898695,0.969026,836
3,Food Security,0.88956,0.888122,0.888839,0.954234,1176
4,Health,0.876081,0.871047,0.873507,0.89864,2822
5,Livelihoods,0.81104,0.786754,0.798154,0.912936,1297
6,Logistics,0.988381,0.506329,0.506622,0.97677,237
7,Nutrition,0.924019,0.875112,0.897924,0.985009,409
8,Protection,0.848174,0.820603,0.833159,0.891294,2178
9,Shelter,0.849865,0.81905,0.833585,0.955227,767


In [24]:
# Per-sector metrics on the training split, for comparison with validation.
df_metrics_train = model.custom_eval(training_loader)
df_metrics_train

  0%|          | 0/2832 [00:00<?, ?it/s]

Unnamed: 0,Sector,Precision,Recall,F1 Score,Accuracy,Support
0,Agriculture,0.823879,0.579251,0.623911,0.978688,2101
1,Cross,0.783806,0.711253,0.735891,0.836939,20577
2,Education,0.914114,0.92853,0.921169,0.975588,7522
3,Food Security,0.887455,0.897699,0.892483,0.955059,10585
4,Health,0.900505,0.889314,0.894655,0.916186,25400
5,Livelihoods,0.834957,0.778168,0.802732,0.918866,11672
6,Logistics,0.855036,0.509842,0.51445,0.980773,1765
7,Nutrition,0.9028,0.877035,0.889449,0.983332,3679
8,Protection,0.869997,0.837659,0.852268,0.90414,19601
9,Shelter,0.889048,0.850523,0.868567,0.964833,6904
