# Import

In [1]:
import sys
sys.path.append('../../')

import os
import sys
import logging
import argparse
from pathlib import Path
from ast import literal_eval
from collections import Counter
from typing import Any, Dict, Optional

In [2]:
from icecream import ic
from tqdm.auto import tqdm

import torchmetrics
from torchmetrics.functional import accuracy, f1, auroc

import sagemaker
from sagemaker import get_execution_role
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.core.decorators import auto_move_data
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import MLFlowLogger


import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix


import matplotlib.pyplot as plt
from pylab import rcParams
from matplotlib import rc

import transformers
from transformers import (
    AdamW,
    AutoConfig,
    AutoModel,
    AutoModelForSequenceClassification,
    T5ForConditionalGeneration,
    T5Tokenizer,
    AutoTokenizer,
)
from transformers.optimization import (
    Adafactor,
    get_linear_schedule_with_warmup,
)

Local constants, regarding the data, MLFlow server, paths, etc..: use them

In [3]:
sys.path.append('../../scripts/examples/sector-pl/')
from data import SectorsDataset
from model import SectorsTransformer

In [4]:
from deep.constants import *
from deep.utils import *

# Data

In [5]:

class Model(nn.Module):
    def __init__(self, model_name_or_path: str, num_labels: int):
        super().__init__()
        self.l1 = AutoModel.from_pretrained(model_name_or_path)
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, num_labels)

    def forward(self, inputs):
        output = self.l1(
            inputs["ids"],
            attention_mask=inputs["mask"],
        )
        output = output.last_hidden_state
        output = self.l2(output)
        output = self.l3(output)
        return output[:, 0, :]


class SectorsTransformer(pl.LightningModule):
    def __init__(
        self,
        model_name_or_path: str,
        num_labels: int,
        pred_threshold: float = 0.5,
        learning_rate: float = 2e-5,
        adam_epsilon: float = 1e-8,
        warmup_steps: int = 0,
        weight_decay: float = 0.0,
        train_batch_size: int = 32,
        eval_batch_size: int = 32,
        eval_splits: Optional[list] = None,
        **kwargs,
    ):
        super().__init__()

        self.save_hyperparameters()

        self.model = Model(model_name_or_path, num_labels)
        self.pred_threshold = pred_threshold

        self.f1_score_train = torchmetrics.F1(
            num_classes=2,
            threshold=0.5,
            average="macro",
            mdmc_average="samplewise",
            ignore_index=None,
            top_k=None,
            multiclass=True,
            compute_on_step=True,
            dist_sync_on_step=False,
            process_group=None,
            dist_sync_fn=None,
        )

        self.f1_score_val = torchmetrics.F1(
            num_classes=2,
            threshold=0.5,
            average="macro",
            mdmc_average="samplewise",
            ignore_index=None,
            top_k=None,
            multiclass=True,
            compute_on_step=True,
            dist_sync_on_step=False,
            process_group=None,
            dist_sync_fn=None,
        )

    @auto_move_data
    def forward(self, inputs):
        output = self.model(inputs)
        return output

    def training_step(self, batch, batch_idx):
        outputs = self(batch)
        loss = F.binary_cross_entropy_with_logits(outputs, batch["targets"])

        self.f1_score_train(torch.sigmoid(outputs), batch["targets"].to(dtype=torch.long))
        self.log("train_f1", self.f1_score_train, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx, dataloader_idx=0):
        outputs = self(batch)
        val_loss = F.binary_cross_entropy_with_logits(outputs, batch["targets"])

        self.f1_score_val(torch.sigmoid(outputs), batch["targets"].to(dtype=torch.long))
        self.log(
            "val_f1",
            self.f1_score_val,
            on_step=True,
            on_epoch=True,
            prog_bar=True,
            logger=False,
        )

        self.log("val_loss", val_loss, on_step=True, on_epoch=True, prog_bar=True, logger=False)
        return {"val_loss": val_loss}

    def test_step(self, batch, batch_nb):
        logits = self(batch)
        preds = torch.sigmoid(logits) > 0.5
        return {"preds": preds, "targets_i": batch["targets"]}

    def predict_step(self, batch, batch_idx, dataloader_idx=None):
        output = self(batch)
        return {"logits": output}

    def configure_optimizers(self):
        "Prepare optimizer and schedule (linear warmup and decay)"
        model = self.model
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [
                    p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=self.hparams.learning_rate,
            eps=self.hparams.adam_epsilon,
        )

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=1000,  # CHANGE ME
        )
        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
        return [optimizer], [scheduler]


In [23]:
class SectorsDataset(Dataset):
    def __init__(self, dataframe, tokenizer, class_to_id=None, max_len=200):
        self.tokenizer = tokenizer
        self.excerpt_text = list(dataframe["excerpt"])
        self.targets = list(dataframe["sectors"]) if "sectors" in dataframe.columns else None
        self.class_to_id = class_to_id
        self.max_len = max_len

    def encode_example(self, excerpt_text: str, index=None):
        # excerpt_text = " ".join(excerpt_text.split())

        inputs = self.tokenizer(
            excerpt_text,
            None,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            return_token_type_ids=True,
        )
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]
        encoded = {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
        }

        if self.targets:
            target_indices = [
                self.class_to_id[target]
                for target in self.targets[index]
                if target in self.class_to_id
            ]
            targets = np.zeros(len(self.class_to_id), dtype=np.int)
            targets[target_indices] = 1

            encoded["targets"] = torch.tensor(targets, dtype=torch.float32) if targets is not None else None

        return encoded

    def __len__(self):
        return len(self.excerpt_text)

    def __getitem__(self, index):
        excerpt_text = str(self.excerpt_text[index])
        return self.encode_example(excerpt_text, index)


In [24]:
model_name = 'sentence-transformers/paraphrase-mpnet-base-v2'

In [25]:
train_df = pd.read_csv(LATEST_DATA_PATH / "data_v0.5_train.csv")

In [26]:
class_to_id = {class_: i for i, class_ in enumerate(SECTORS)}

In [27]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
training_set = SectorsDataset(
    dataframe=train_df, tokenizer=tokenizer, class_to_id=class_to_id, max_len=200
)

In [28]:
train_params = {"batch_size": 4, "shuffle": True, "num_workers": 0}
training_loader = DataLoader(training_set, **train_params)

In [29]:
type(training_set[0]['targets'])

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  targets = np.zeros(len(self.class_to_id), dtype=np.int)


torch.Tensor

In [30]:
trainer = pl.Trainer(
    gpus=0,
    max_epochs=1,
)
model = SectorsTransformer(
    model_name,
    len(class_to_id),
)
trainer.fit(model, training_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores

  | Name           | Type  | Params
-----------------------------------------
0 | model          | Model | 109 M 
1 | f1_score_train | F1    | 0     
-----------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
437.980   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  targets = np.zeros(len(self.class_to_id), dtype=np.int)
