kernel `conda_pytorch_latest_p36`

In [249]:
"""!pip install icecream
!pip install tqdm
!pip install torchmetrics
!pip install pytorch_lightning
!pip install transformers"""

'!pip install icecream\n!pip install tqdm\n!pip install torchmetrics\n!pip install pytorch_lightning\n!pip install transformers'

In [237]:
import os
import sys
import logging
import argparse
from pathlib import Path
from ast import literal_eval
from collections import Counter
from typing import Any, Dict, Optional

from icecream import ic
from tqdm.auto import tqdm

import torchmetrics
from torchmetrics.functional import accuracy, f1, auroc

import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.core.decorators import auto_move_data
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix


import matplotlib.pyplot as plt
from pylab import rcParams
from matplotlib import rc

import transformers
from transformers import (
    AdamW,
    AutoConfig,
    AutoModel,
    AutoModelForSequenceClassification,
    T5ForConditionalGeneration,
    T5Tokenizer,
    AutoTokenizer,
)
from transformers.optimization import (
    Adafactor,
    get_linear_schedule_with_warmup,
)

import tensorflow as tf
import re

In [3]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [4]:
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8

In [5]:
RANDOM_SEED=2021
pl.seed_everything(RANDOM_SEED)

Global seed set to 2021


2021

In [6]:
ic.configureOutput(outputFunction=sys.stdout.write, includeContext=True)

In [7]:
logger = logging.getLogger(__name__)

In [11]:
#from google.colab import drive
#drive.mount ('/content/drive')

Mounted at /content/drive


In [13]:
#cd 'drive/My Drive/Colab Notebooks'

/content/drive/My Drive/Colab Notebooks


In [14]:
MAX_LEN = 200
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 64
EPOCHS = 3
LEARNING_RATE = 1e-05
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
DATA_ROOT_DIR = os.path.join("..", "..", "..", "data", "frameworks_data", "data_v0.4.4")
TRAIN_PATH = os.path.join("data_v0.4.4_train.csv")
VAL_PATH = os.path.join("data_v0.4.4_val.csv")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [15]:
def clean_rows (row):
    """
    1) Apply litteral evaluation
    2) Drop values that are repeated multiple times in rows
    """
    return list(set(literal_eval(row)))

In [23]:
train_dataset = pd.read_csv(TRAIN_PATH)
val_dataset = pd.read_csv(VAL_PATH)

all_dataset = pd.concat([train_dataset, val_dataset])

# Keep only unique values in pillars
all_dataset["subpillars"] = all_dataset["subpillars"].apply(lambda x: clean_rows (x))
all_dataset["pillars"] = all_dataset["pillars"].apply(lambda x: clean_rows (x))

# Keep only rows with a not empty pillar
all_dataset = all_dataset[all_dataset.pillars.apply(lambda x: len(x)>0)]

In [24]:
random_state=42
val_size=0.2

def get_subpillar_datasets (subpillar_name:str, dataset=all_dataset, seed=42):
    """
    1) keep rows where the sub-pillar name is contained in the column 'subpillars'
    2) keep only subpillar names in the column 'subpillar' (omit pillar name)
    """
    df = dataset[['entry_id', 'excerpt', 'subpillars']]
    df['subpillars'] = df.subpillars\
                        .apply(lambda x: list(filter(lambda y: subpillar_name in y, x)))\
                        .apply(lambda x: [y.split('->')[1] for y in (x)])

    df = df[df.subpillars.apply(lambda x: len(x)>0)].rename(columns={'pillars':'target'})\
            .rename(columns={'subpillars':'target'})
    return train_test_split(df, test_size=val_size, random_state=random_state)
    
    
    
capacities_response_train_dataset, capacities_response_val_dataset =\
                get_subpillar_datasets ('Capacities & Response')

hum_conditions_train_dataset, hum_conditions_val_dataset =\
                get_subpillar_datasets ('Humanitarian Conditions')


impact_train_dataset, impact_val_dataset = get_subpillar_datasets ('Impact')

people_at_risk_train_dataset, people_at_risk_val_dataset = get_subpillar_datasets ('People At Risk')

priority_interventions_train_dataset, priority_interventions_val_dataset = \
                get_subpillar_datasets ('Priority Interventions')

priority_needs_train_dataset, priority_needs_val_dataset = get_subpillar_datasets ('Priority Needs')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [25]:
print('number of rows for capacities & response :', capacities_response_train_dataset.shape[0])
print('number of rows for Humanitarian Conditions:', hum_conditions_train_dataset.shape[0])
print('number of rows for Impact :', impact_train_dataset.shape[0])
print('number of rows for People At Risk :', people_at_risk_train_dataset.shape[0])
print('number of rows for Priority Interventions :', priority_interventions_train_dataset.shape[0])
print('number of rows for Priority Needs :', priority_needs_train_dataset.shape[0])

number of rows for capacities & response : 7787
number of rows for Humanitarian Conditions: 23636
number of rows for Impact : 17136
number of rows for People At Risk : 5972
number of rows for Priority Interventions : 1344
number of rows for Priority Needs : 1625


In [26]:
all_dataset = all_dataset[['entry_id', 'excerpt', 'pillars']]\
                    .rename(columns={'pillars':'target'})

pillars_train_dataset, pillars_val_dataset = train_test_split(all_dataset, 
                                                              random_state=random_state, 
                                                              test_size=val_size)


In [27]:
#check that entry_id is an id
train_dataset.entry_id.is_unique

True

In [41]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, tagname_to_tagid, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe

        self.excerpt_text = dataframe["excerpt"].tolist(
        ) if dataframe is not None else None

        self.targets = self.data['target'].tolist(
        ) if dataframe is not None else None

        self.entry_ids = self.data['entry_id'].tolist(
        ) if dataframe is not None else None

        self.tagname_to_tagid = tagname_to_tagid
        self.tagid_to_tagname = list(tagname_to_tagid.keys())
        self.max_len = max_len

    def encode_example(self,
                       excerpt_text: str,
                       index=None,
                       as_batch: bool = False):
        
        inputs = self.tokenizer(excerpt_text,
                                            None,
                                            truncation=True,
                                            add_special_tokens=True,
                                            max_length=self.max_len,
                                            padding="max_length",
                                            return_token_type_ids=True)
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        
        targets = None
        if self.targets:
            target_indices = [
                self.tagname_to_tagid[target]
                for target in self.targets[index]
                if target in self.tagname_to_tagid
            ]
            targets = np.zeros(len(self.tagname_to_tagid), dtype=np.int)
            targets[target_indices] = 1

        encoded = {
            'ids':
            torch.tensor(ids, dtype=torch.long),
            'mask':
            torch.tensor(mask, dtype=torch.long),
            'token_type_ids':
            torch.tensor(token_type_ids, dtype=torch.long),
            'targets':
            torch.tensor(targets, dtype=torch.float32)
            if targets is not None else None,
            'entry_id':
            self.entry_ids[index]
        }
        if as_batch:
            return {
                "ids": encoded["ids"].unsqueeze(0),
                "mask": encoded["mask"].unsqueeze(0),
                "token_type_ids": encoded["ids"].unsqueeze(0)
            }
        return encoded

    def __len__(self):
        return len(self.excerpt_text)

    def __getitem__(self, index):
        excerpt_text = str(self.excerpt_text[index])
        return self.encode_example(excerpt_text, index)

In [29]:
train_params = {
    'batch_size': TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 4
}

val_params = {
    'batch_size': VALID_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 4
}


In [30]:
def get_loaders (train_dataset, val_dataset):
    training_set = CustomDataset(train_dataset, tagname_to_tagid, tokenizer, MAX_LEN)
    val_set = CustomDataset(val_dataset, tagname_to_tagid, tokenizer, MAX_LEN)
    val_set_frac = CustomDataset(val_dataset.sample(frac=.01),
                                  tagname_to_tagid, tokenizer, MAX_LEN)
    
    training_loader = DataLoader(training_set, **train_params)
    
    val_loader_frac = DataLoader(val_set_frac, **val_params)
    return training_loader, val_loader, val_loader_frac

In [31]:
class Model(nn.Module):
    def __init__(self, model_name_or_path: str, num_labels:int, dropout_rate=0.3):
        super().__init__()
        self.l1 = AutoModel.from_pretrained(model_name_or_path)
        self.l2 = torch.nn.Dropout(dropout_rate)
        self.l3 = torch.nn.Linear(384, num_labels)
        
    def forward(self, inputs):
        output = self.l1(inputs["ids"],
                            attention_mask=inputs["mask"],)
        output = output.last_hidden_state
        output = self.l2(output)
        output = self.l3(output)
        return output[:, 0, :]

In [32]:
class Transformer(pl.LightningModule):
    def __init__(self,
                 model_name_or_path: str,
                 num_labels: int,
                 empty_dataset: CustomDataset,
                 
                 pred_threshold: float = .5,
                 learning_rate: float = 2e-5,
                 adam_epsilon: float = 1e-8,
                 warmup_steps: int = 0,
                 weight_decay: float = 0.0,
                 train_batch_size: int = 32,
                 eval_batch_size: int = 32,
                 eval_splits: Optional[list] = None,
                 dropout_rate: float = 0.3,

                 **kwargs):
        super().__init__()

        self.save_hyperparameters()

        self.model = Model(model_name_or_path, num_labels, dropout_rate)
        self.empty_dataset = empty_dataset
        self.pred_threshold = pred_threshold

        self.f1_score_train = torchmetrics.F1(
            num_classes=2,
            threshold=0.5,
            average='macro',
            mdmc_average="samplewise",
            ignore_index=None,
            top_k=None,
            multiclass=True,
            compute_on_step=True,
            dist_sync_on_step=False,
            process_group=None,
            dist_sync_fn=None,
        )

        self.f1_score_val = torchmetrics.F1(
            num_classes=2,
            threshold=0.5,
            average='macro',
            mdmc_average="samplewise",
            ignore_index=None,
            top_k=None,
            multiclass=True,
            compute_on_step=True,
            dist_sync_on_step=False,
            process_group=None,
            dist_sync_fn=None,
        )
    @auto_move_data
    def forward(self, inputs):
        output = self.model(inputs)
        return output

    def training_step(self, batch, batch_idx):
        outputs = self(batch)
        loss = F.binary_cross_entropy_with_logits(outputs, batch["targets"])

        self.f1_score_train(torch.sigmoid(outputs),
                            batch["targets"].to(dtype=torch.long))
        self.log("train_f1", self.f1_score_train, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx, dataloader_idx=0):
        outputs = self(batch)
        val_loss = F.binary_cross_entropy_with_logits(outputs,
                                                      batch["targets"])

        self.f1_score_val(torch.sigmoid(outputs),
                          batch["targets"].to(dtype=torch.long))
        self.log("val_f1",
                 self.f1_score_val,
                 on_step=True,
                 on_epoch=True,
                 prog_bar=True,
                 logger=False)
        
        self.log("val_loss",
                 val_loss,
                 on_step=True,
                 on_epoch=True,
                 prog_bar=True,
                 logger=False)
        return {'val_loss': val_loss}

    def test_step(self, batch, batch_nb):
        logits = self(batch)
        preds = (torch.sigmoid(logits) > .5)
        return {"preds": preds, "targets_i": batch["targets"]}

    def on_test_epoch_end(self, outputs):
        preds = torch.cat([output["preds"] for output in outputs]).cpu()
        targets = torch.cat([output["targets_i"] for output in outputs]).cpu()
        recalls = []
        precisions = []
        f1_scores = []
        for i in range(targets.shape[1]):
            class_roc_auc = auroc(preds[:, i], targets[:, i])
            self.log(
                f"{self.empty_dataset.sectorid_to_sectorname[i]}_roc_auc/Train",
                class_roc_auc)
            class_f1 = metrics.f1_score(targets[:, i], preds[:, i])
            self.log(
                f"{self.empty_dataset.sectorid_to_sectorname[i]}_f1/Train",
                class_f1)

    def predict_step(self, batch, batch_idx, dataloader_idx=None):
        output = self(batch)
        return {"logits": output}

    def on_predict_epoch_end(self, outputs):
        logits = torch.cat([output["logits"] for output in outputs[0]])
        preds = torch.sigmoid(logits) >= self.pred_threshold
        pred_classes = []
        for pred in preds:
            pred_classes_i = [
                self.empty_dataset.sectorid_to_sectorname[i]
                for i, p in enumerate(pred) if p
            ]
            pred_classes.append(pred_classes_i)
        self.log({"pred_classes": pred_classes})

    def custom_predict(self, inputs):
        self.eval()
        self.freeze()
        as_batch = False
        if isinstance(inputs, str):
            as_batch = True
        inputs = self.empty_dataset.encode_example(inputs, as_batch)

        with torch.no_grad():
            logits = self(inputs)
        preds = (torch.sigmoid(logits) >= self.pred_threshold)
        pred_classes = []
        for pred in preds:
            pred_classes_i = [
                self.empty_dataset.tagid_to_tagname[i]
                for i, p in enumerate(pred) if p
            ]
            pred_classes.append(pred_classes_i)
        return pred_classes

    def total_steps(self) -> int:
        """The number of total training steps that will be run. Used for lr scheduler purposes."""
        self.dataset_size = len(self.train_dataloader().dataset)
        num_devices = max(1, self.hparams.gpus)  # TODO: consider num_tpu_cores
        effective_batch_size = self.hparams.train_batch_size * self.hparams.accumulate_grad_batches * num_devices
        return (self.dataset_size /
                effective_batch_size) * self.hparams.max_epochs

    def configure_optimizers(self):
        "Prepare optimizer and schedule (linear warmup and decay)"
        model = self.model
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                self.hparams.weight_decay,
            },
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=self.hparams.learning_rate,
                          eps=self.hparams.adam_epsilon)

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.total_steps())
        scheduler = {
            'scheduler': scheduler,
            'interval': 'step',
            'frequency': 1
        }
        return [optimizer], [scheduler]

    def train_dataloader(self):
        return training_loader

    def val_dataloader(self):
        return val_loader
    
    def custom_eval(self, eval_dataloader):
        if self.device.type == "cpu":
            self.to("cuda")
        self.eval()
        self.freeze()
        preds_val_all = []
        y_true = []

        with torch.no_grad():
            for batch in tqdm(eval_dataloader, total=len(eval_dataloader.dataset)//eval_dataloader.batch_size):
                logits = self({"ids": batch["ids"].to("cuda"), "mask": batch["mask"].to("cuda"), "token_type_ids": batch["token_type_ids"].to("cuda")})
                preds_batch = np.zeros(logits.shape, dtype=np.int)
                preds_batch[(torch.sigmoid(logits) > self.pred_threshold).cpu().nonzero(as_tuple=True)] = 1
                preds_val_all.append(preds_batch)
                y_true.append(batch["targets"].numpy().astype(np.int))

        preds_val_all = np.concatenate(preds_val_all)
        y_true = np.concatenate(y_true)

        f1_scores = []
        recalls = []
        precisions = []
        accuracies = []
        supports = []
        tagname_to_tagid = self.empty_dataset.tagname_to_tagid
        for tag_name, tag_id in tagname_to_tagid.items():
            cls_rprt = classification_report(y_true[:, tag_id], preds_val_all[:, tag_id], output_dict=True)
            precisions.append(cls_rprt["macro avg"]["precision"])
            recalls.append(cls_rprt["macro avg"]["recall"])
            f1_scores.append(cls_rprt["macro avg"]["f1-score"])
            accuracies.append(cls_rprt["accuracy"])

        metrics_df = pd.DataFrame({
            "Sector": list(tagname_to_tagid.keys()),
            "Precision": precisions,
            "Recall": recalls,
            "F1 Score": f1_scores,
            "Accuracy": accuracies,
        })
        return metrics_df


  "The `@auto_move_data` decorator is deprecated in v1.3 and will be removed in v1.5."


In [None]:

def train_on_specific_targets (train_dataset, tagname_to_tagid, name_classifier:str, dropout_rate:float):

    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

    logger = TensorBoardLogger("lightning_logs", name=name_classifier)

    empty_dataset = CustomDataset(None, tagname_to_tagid, tokenizer,
                         MAX_LEN)

    trainer = pl.Trainer(
        logger=logger,
        callbacks=[early_stopping_callback, checkpoint_callback],
        progress_bar_refresh_rate=30,
        profiler="simple",
        log_gpu_memory=True,
        weights_summary=None,
        gpus=1,
        accumulate_grad_batches=1,
        max_epochs=EPOCHS,
        gradient_clip_val=1,
        gradient_clip_algorithm='norm'
        #overfit_batches=1,
        #limit_predict_batches=2,
        #limit_test_batches=2,
        #fast_dev_run=True,
        #limit_train_batches=1,
        #limit_val_batches=1,
        #limit_test_batches: Union[int, float] = 1.0,
    )


    model = Transformer(MODEL_NAME,
                            len(tagname_to_tagid),
                            empty_dataset,
                            gpus=1,
                            precision=16,
                            plugin='deepspeed_stage_3_offload',
                            accumulate_grad_batches=1,
                            max_epochs=EPOCHS,
                            dropout_rate=dropout_rate)

    trainer.fit(model)

    return model.custom_eval(val_loader)

### Train model to predict pillars

In [None]:
log_dir_name = "-".join(MODEL_NAME.split("/"))
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=2)


dirpath = f"./checkpoints-pillars-{log_dir_name}"

checkpoint_callback = ModelCheckpoint(
  dirpath=dirpath,
  save_top_k=1,
  verbose=True,
  monitor="val_loss",
  mode="min"
)


tag_set = set()
for tags_i in pillars_train_dataset["target"]:
    tag_set.update(tags_i)
tagname_to_tagid = {tag:i for i, tag in enumerate(list(sorted(tag_set)))}

training_loader, val_loader, val_loader_frac = get_loaders (pillars_train_dataset,
                                                 pillars_val_dataset)

errors_pillars = train_on_specific_targets(pillars_train_dataset,
                                                 tagname_to_tagid,
                                                 f"pillars-classifier-{log_dir_name}",
                                                 dropout_rate=0.5)
errors_pillars

### Train models to predict sub-pillars
- Dropout rate is 0.5 for length of training between 5.000 and 10.000
- Dropout rate is 0.7 for length of training inferior to 5.000
- Dropout rate is 0.3 for length of training superior to 10.000

In [None]:

dirpath = f"./checkpoints-subpillars-capacities-responses-{log_dir_name}"

checkpoint_callback = ModelCheckpoint(
  dirpath=dirpath,
  save_top_k=1,
  verbose=True,
  monitor="val_loss",
  mode="min"
)


tag_set = set()
for tags_i in capacities_response_train_dataset["target"]:
    tag_set.update(tags_i)
tagname_to_tagid = {tag:i for i, tag in enumerate(list(sorted(tag_set)))}

training_loader, val_loader, val_loader_frac = get_loaders (capacities_response_train_dataset,
                                                 capacities_response_val_dataset)

errors_capacities_response = train_on_specific_targets(capacities_response_train_dataset,
                                                 tagname_to_tagid,
                                                 f"capacities-and-response-classifier-{log_dir_name}",
                                                 dropout_rate=0.5)
errors_capacities_response

In [None]:
dirpath = f"./checkpoints-subpillars-people-at-risk-{log_dir_name}"

checkpoint_callback = ModelCheckpoint(
  dirpath=dirpath,
  save_top_k=1,
  verbose=True,
  monitor="val_loss",
  mode="min"
)

tag_set = set()
for tags_i in people_at_risk_train_dataset["target"]:
    tag_set.update(tags_i)
tagname_to_tagid = {tag:i for i, tag in enumerate(list(sorted(tag_set)))}

training_loader, val_loader, val_loader_frac = get_loaders (people_at_risk_train_dataset,
                                                 people_at_risk_val_dataset)

errors_people_at_risk = train_on_specific_targets(people_at_risk_train_dataset,
                                            tagname_to_tagid,
                                            f"people-at-risk-classifier-{log_dir_name}",
                                            dropout_rate=0.5)
errors_people_at_risk



In [None]:
dirpath = f"./checkpoints-subpillars-impact-{log_dir_name}"

checkpoint_callback = ModelCheckpoint(
  dirpath=dirpath,
  save_top_k=1,
  verbose=True,
  monitor="val_loss",
  mode="min"
)

tag_set = set()
for tags_i in impact_train_dataset["target"]:
    tag_set.update(tags_i)
tagname_to_tagid = {tag:i for i, tag in enumerate(list(sorted(tag_set)))}

training_loader, val_loader, val_loader_frac = get_loaders (impact_train_dataset,
                                                 impact_val_dataset)

errors_impact = train_on_specific_targets(impact_train_dataset,
                                    tagname_to_tagid,
                                    f"impact-classifier-{log_dir_name}",
                                    dropout_rate=0.3)
errors_impact

In [None]:
dirpath = f"./checkpoints-subpillars-humanitarian-conditions-{log_dir_name}"

checkpoint_callback = ModelCheckpoint(
  dirpath=dirpath,
  save_top_k=1,
  verbose=True,
  monitor="val_loss",
  mode="min"
)


tag_set = set()
for tags_i in hum_conditions_train_dataset["target"]:
    tag_set.update(tags_i)
tagname_to_tagid = {tag:i for i, tag in enumerate(list(sorted(tag_set)))}

training_loader, val_loader, val_loader_frac = get_loaders (hum_conditions_train_dataset,
                                                 hum_conditions_val_dataset)

errors_hum_conditions = train_on_specific_targets(hum_conditions_train_dataset,
                                                 tagname_to_tagid,
                                                 f"Humanitarian-Conditions-classifier-{log_dir_name}",
                                                 dropout_rate=0.3)
errors_hum_conditions

In [None]:
dirpath = f"./checkpoints-subpillars-priority-interventions-{log_dir_name}"

checkpoint_callback = ModelCheckpoint(
  dirpath=dirpath,
  save_top_k=1,
  verbose=True,
  monitor="val_loss",
  mode="min"
)

tag_set = set()
for tags_i in priority_interventions_train_dataset["target"]:
    tag_set.update(tags_i)
tagname_to_tagid = {tag:i for i, tag in enumerate(list(sorted(tag_set)))}

training_loader, val_loader, val_loader_frac = get_loaders (priority_interventions_train_dataset,
                                                 priority_interventions_val_dataset)

errors_priority_interventions = train_on_specific_targets(priority_interventions_train_dataset,
                                                 tagname_to_tagid,
                                                 f"Priority-Interventions-classifier-{log_dir_name}",
                                                 dropout_rate=0.7)
errors_priority_interventions


In [None]:
dirpath = f"./checkpoints-subpillars-priority-needs-{log_dir_name}"

checkpoint_callback = ModelCheckpoint(
  dirpath=dirpath,
  save_top_k=1,
  verbose=True,
  monitor="val_loss",
  mode="min"
)

tag_set = set()
for tags_i in priority_needs_train_dataset["target"]:
    tag_set.update(tags_i)
tagname_to_tagid = {tag:i for i, tag in enumerate(list(sorted(tag_set)))}

training_loader, val_loader, val_loader_frac = get_loaders (priority_needs_train_dataset,
                                                 priority_needs_val_dataset)


errors_priority_needs = train_on_specific_targets(priority_needs_train_dataset,
                                                 tagname_to_tagid,
                                                 f"priority-needs-classifier-{log_dir_name}",
                                                 dropout_rate=0.6)
errors_priority_needs

# TESTING:


In [36]:
checkpoint_path_pillars = "checkpoints-pillarssentence-transformers-paraphrase-multilingual-MiniLM-L12-v2/epoch=1-step=2749.ckpt"
model_pillars = Transformer.load_from_checkpoint(checkpoint_path_pillars)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=470693617.0, style=ProgressStyle(descri…




  cpuset_checked))


In [None]:
def model_evaluate_(model, dataset, name:str):
    tag_set = set()
    for tags_i in dataset["target"]:
        tag_set.update(tags_i)
    tagname_to_tagid = {tag:i for i, tag in enumerate(list(sorted(tag_set)))}

    val_set = CustomDataset(dataset, tagname_to_tagid, tokenizer, MAX_LEN)
    val_loader = DataLoader(val_set, **val_params)

    model.eval()
    model.freeze()
    predictions_pillar = []
    y_true = []
    logits_tot=[]
    indexes=torch.tensor([])


    with torch.no_grad():
        iter=0
        for batch in tqdm(val_loader, total=len(val_loader.dataset)//val_loader.batch_size):
            
            logits = model({"ids": batch["ids"].to('cuda'),
                            "mask": batch["mask"].to('cuda'),
                            "token_type_ids": batch["token_type_ids"].to('cuda')})
            
            preds_batch = np.zeros(logits.shape, dtype=np.int)
            preds_batch[(torch.sigmoid(logits) >= model.pred_threshold).cpu().nonzero(as_tuple=True)] = 1
            if iter==0:
                predictions = preds_batch
                indexes = batch["entry_id"]
            
            else:
                predictions = np.concatenate([predictions,preds_batch], 0) #.append(preds_batch)
                indexes = tf.concat([indexes, batch["entry_id"]], 0)
            
            """original_sentences.append(batch['original_sentence'])"""
            iter += 1
    
    np.save('predictions_round_1'+name, np.array(predictions_pillar))
    np.save('indexes_round1'+name, np.array(indexes))
    return predictions, indexes

In [46]:

predictions_priority_needs, indexes_priority_needs = model_evaluate_(model_pillars, pillars_val_dataset, name='pillars')

HBox(children=(FloatProgress(value=0.0, max=171.0), HTML(value='')))

  cpuset_checked))





In [48]:
boolean_capacities_responses = predictions_pillar[:,0]==1
capacities_response_after_bool = pillars_val_dataset[boolean_capacities_responses]

boolean_hum_conditions = predictions_pillar[:,1]==1
hum_conditions_after_bool = pillars_val_dataset[boolean_hum_conditions]

boolean_impact = predictions_pillar[:,2]==1
impact_after_bool = pillars_val_dataset[boolean_impact]

boolean_people_at_risk = predictions_pillar[:,3]==1
people_at_risk_after_bool = pillars_val_dataset[boolean_people_at_risk]

boolean_priority_interventions = predictions_pillar[:,4]==1
priority_interventions_after_bool = pillars_val_dataset[boolean_priority_interventions]

boolean_priority_needs = predictions_pillar[:,5]==1
priority_needs_after_bool = pillars_val_dataset[boolean_priority_needs]

In [52]:
checkpoint_path_subpillars_priority_needs =\
         "checkpoints-subpillars-priority-needs-sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2/epoch=0-step=50.ckpt"
model_subpillars_priority_needs = Transformer.load_from_checkpoint(checkpoint_path_subpillars_priority_needs)

In [49]:
checkpoint_path_subpillars_capacities_responses =\
         "checkpoints-subpillars-capacities-responses-sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2/epoch=1-step=487.ckpt"
model_subpillars_capacities_responses = Transformer.load_from_checkpoint(checkpoint_path_subpillars_capacities_responses)



In [60]:
checkpoint_path_subpillars_hum_conditions =\
         "checkpoints-subpillars-humanitarian-conditions-sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2/epoch=0-step=738.ckpt"
model_subpillars_hum_conditions = Transformer.load_from_checkpoint(checkpoint_path_subpillars_hum_conditions)

In [62]:
checkpoint_path_subpillars_impact =\
         "checkpoints-subpillars-impact-sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2/epoch=1-step=1071.ckpt"
model_subpillars_impact = Transformer.load_from_checkpoint(checkpoint_path_subpillars_impact)

In [64]:
checkpoint_path_subpillars_people_at_risk =\
         "checkpoints-subpillars-people-at-risk-sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2/epoch=2-step=560.ckpt"
model_subpillars_people_at_risk = Transformer.load_from_checkpoint(checkpoint_path_subpillars_people_at_risk)

In [66]:
checkpoint_path_subpillars_priority_interventions =\
         "checkpoints-subpillars-priority-interventions-sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2/epoch=0-step=41.ckpt"
model_subpillars_priority_interventions = Transformer.load_from_checkpoint(checkpoint_path_subpillars_priority_interventions)

In [57]:
predictions_priority_needs, indexes_priority_needs = model_evaluate_(model_subpillars_priority_needs, priority_needs_after_bool, name='priority needs')

  cpuset_checked))


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [59]:
predictions_capacities_responses, indexes_capacities_responses = \
        model_evaluate_(model_subpillars_capacities_responses, capacities_response_after_bool, name='capacities responses')

  cpuset_checked))


HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




In [61]:
predictions_hum_conditions, indexes_hum_conditions = \
        model_evaluate_(model_subpillars_hum_conditions, hum_conditions_after_bool, name='humanitarian conditions')

  cpuset_checked))


HBox(children=(FloatProgress(value=0.0, max=92.0), HTML(value='')))




In [63]:
predictions_impact, indexes_impact = \
        model_evaluate_(model_subpillars_impact, impact_after_bool, name='impact')

  cpuset_checked))


HBox(children=(FloatProgress(value=0.0, max=64.0), HTML(value='')))




In [65]:
predictions_people_at_risk, indexes_people_at_risk = \
        model_evaluate_(model_subpillars_people_at_risk, people_at_risk_after_bool, name='people at risk')

  cpuset_checked))


HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))




In [67]:
predictions_priority_interventions, indexes_priority_interventions = \
        model_evaluate_(model_subpillars_priority_interventions, priority_interventions_after_bool, name='priority interventions')

  cpuset_checked))


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [68]:
indexes_list = pillars_val_dataset.entry_id
np.save('indexes total list', np.array(indexes_list))

In [155]:
train_dataset = pd.read_csv(TRAIN_PATH)
val_dataset = pd.read_csv(VAL_PATH)

all_dataset = pd.concat([train_dataset, val_dataset])

final_val_df = all_dataset.loc[all_dataset.entry_id.isin(pillars_val_dataset.entry_id)][['entry_id', 'subpillars']]


def perfectEval(anonstring):
        try:
            ev = literal_eval(anonstring)
            return ev
        except ValueError:
            corrected = "\'" + anonstring + "\'"
            ev = literal_eval(corrected)
            return ev

final_val_df['evall'] = final_val_df.subpillars.apply(lambda x: perfectEval(x))

tag_set = set()
for tags_i in final_val_df["evall"]:
    tag_set.update(tags_i)
tagname_to_tagid = {tag:i for i, tag in enumerate(list(sorted(tag_set)))}
tagname_to_tagid

{'Capacities & Response->International Response': 0,
 'Capacities & Response->National Response': 1,
 'Capacities & Response->Number Of People Reached': 2,
 'Capacities & Response->Response Gaps': 3,
 'Humanitarian Conditions->Coping Mechanisms': 4,
 'Humanitarian Conditions->Living Standards': 5,
 'Humanitarian Conditions->Number Of People In Need': 6,
 'Humanitarian Conditions->Physical And Mental Well Being': 7,
 'Impact->Driver/Aggravating Factors': 8,
 'Impact->Impact On People': 9,
 'Impact->Impact On People Or Impact On Services': 10,
 'Impact->Impact On Services': 11,
 'Impact->Impact On Systems And Services': 12,
 'Impact->Number Of People Affected': 13,
 'People At Risk->Number Of People At Risk': 14,
 'People At Risk->Risk And Vulnerabilities': 15,
 'Priority Interventions->Expressed By Humanitarian Staff': 16,
 'Priority Interventions->Expressed By Population': 17,
 'Priority Needs->Expressed By Humanitarian Staff': 18,
 'Priority Needs->Expressed By Population': 19}

In [153]:
tagname_to_tagid['Capacities & Response->International Response']

0

In [158]:

def fill_column (row, n_labels):
    """
    function to return proper labels (for relevance column and for sectors column)
    """
    values_to_fill = row
    row = [0]*n_labels
    for target_tmp in values_to_fill:
        row[tagname_to_tagid[target_tmp]]=1
    return row

final_val_df.evall = final_val_df.evall.apply(lambda x: fill_column(x, n_labels=20))

In [161]:
final_val_df['y true']=final_val_df.evall
final_val_df = final_val_df[['entry_id', 'y true']]
final_val_df.head()

Unnamed: 0,entry_id,y true
5,162971,"[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7,174404,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
36,163544,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
40,164446,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
45,163613,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [75]:
predicted_dataset = final_val_df[['entry_id']]

In [85]:
predicted_dataset['prediction'] = predicted_dataset['entry_id']
predicted_dataset['prediction capacity response'] = predicted_dataset['prediction'].apply(lambda x: [0]*4)
predicted_dataset['prediction hum conditions'] = predicted_dataset['prediction'].apply(lambda x: [0]*4)
predicted_dataset['prediction impact'] = predicted_dataset['prediction'].apply(lambda x: [0]*6)
predicted_dataset['prediction people at risk'] = predicted_dataset['prediction'].apply(lambda x: [0]*2)
predicted_dataset['prediction Priority Interventions'] = predicted_dataset['prediction'].apply(lambda x: [0]*2)
predicted_dataset['prediction Priority needs'] = predicted_dataset['prediction'].apply(lambda x: [0]*2)



predicted_dataset.head()

Unnamed: 0,entry_id,prediction,prediction capacity response,prediction hum conditions,prediction impact,prediction people at risk,prediction Priority Interventions,prediction Priority needs
5,162971,162971,"[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0]","[0, 0]","[0, 0]","[0, 0]"
7,174404,174404,"[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0]","[0, 0]","[0, 0]","[0, 0]"
36,163544,163544,"[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0]","[0, 0]","[0, 0]","[0, 0]"
40,164446,164446,"[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0]","[0, 0]","[0, 0]","[0, 0]"
45,163613,163613,"[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0]","[0, 0]","[0, 0]","[0, 0]"


In [146]:
for i in range (len(predictions_priority_needs)):
    predicted_dataset.loc[predicted_dataset['entry_id']==indexes_priority_needs[i], 'prediction Priority needs']=\
            predicted_dataset.loc[predicted_dataset['entry_id']==indexes_priority_needs[i], 'prediction Priority needs'].\
            apply(lambda x: predictions_priority_needs[i,:])

In [147]:
for i in range (len(predictions_capacities_responses)):
    predicted_dataset.loc[predicted_dataset['entry_id']==indexes_capacities_responses[i], 'prediction capacity response']=\
            predicted_dataset.loc[predicted_dataset['entry_id']==indexes_capacities_responses[i], 'prediction capacity response'].\
            apply(lambda x: predictions_capacities_responses[i,:])

In [148]:
for i in range (len(predictions_hum_conditions)):
    predicted_dataset.loc[predicted_dataset['entry_id']==indexes_hum_conditions[i], 'prediction hum conditions']=\
            predicted_dataset.loc[predicted_dataset['entry_id']==indexes_hum_conditions[i], 'prediction hum conditions'].\
            apply(lambda x: predictions_hum_conditions[i,:])

In [149]:
for i in range (len(predictions_impact)):
    predicted_dataset.loc[predicted_dataset['entry_id']==indexes_impact[i], 'prediction impact']=\
            predicted_dataset.loc[predicted_dataset['entry_id']==indexes_impact[i], 'prediction impact'].\
            apply(lambda x: predictions_impact[i,:])

In [150]:
for i in range (len(predictions_people_at_risk)):
    predicted_dataset.loc[predicted_dataset['entry_id']==indexes_people_at_risk[i], 'prediction people at risk']=\
            predicted_dataset.loc[predicted_dataset['entry_id']==indexes_people_at_risk[i], 'prediction people at risk'].\
            apply(lambda x: predictions_people_at_risk[i,:])

In [151]:
for i in range (len(predictions_priority_interventions)):
    predicted_dataset.loc[predicted_dataset['entry_id']==indexes_priority_interventions[i], 'prediction Priority Interventions']=\
            predicted_dataset.loc[predicted_dataset['entry_id']==indexes_priority_interventions[i], 'prediction Priority Interventions'].\
            apply(lambda x: predictions_priority_interventions[i,:])

In [248]:
predicted_dataset.head()

Unnamed: 0,entry_id,prediction,prediction capacity response,prediction hum conditions,prediction impact,prediction people at risk,prediction Priority Interventions,prediction Priority needs
5,162971,162971,"[1, 0, 1, 0]","[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0]","[0, 0]","[0, 0]","[0, 0]"
7,174404,174404,"[0, 0, 0, 0]","[0, 0, 0, 1]","[0, 0, 0, 0, 0, 0]","[0, 0]","[0, 0]","[0, 0]"
36,163544,163544,"[0, 0, 0, 0]","[0, 1, 0, 0]","[0, 0, 0, 0, 0, 0]","[0, 0]","[0, 0]","[0, 0]"
40,164446,164446,"[0, 0, 1, 0]","[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0]","[0, 0]","[0, 0]","[0, 0]"
45,163613,163613,"[0, 1, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0]","[0, 0]","[0, 0]","[0, 0]"


In [188]:
predicted_dataset_v2 = predicted_dataset[['entry_id', 
                                          'prediction capacity response',
                                          'prediction hum conditions',
                                          'prediction impact',
                                          'prediction people at risk',
                                          'prediction Priority Interventions',
                                          'prediction Priority needs']]
predicted_dataset_v2['predictions combined'] = \
            predicted_dataset_v2[predicted_dataset_v2.columns[1:]].apply(
                    lambda x: ','.join(x.dropna().astype(str)),
                    axis=1
                )         
             
predicted_dataset_v2.head()

Unnamed: 0,entry_id,prediction capacity response,prediction hum conditions,prediction impact,prediction people at risk,prediction Priority Interventions,prediction Priority needs,predictions combined
5,162971,"[1, 0, 1, 0]","[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0]","[0, 0]","[0, 0]","[0, 0]","[1 0 1 0],[0, 0, 0, 0],[0, 0, 0, 0, 0, 0],[0, ..."
7,174404,"[0, 0, 0, 0]","[0, 0, 0, 1]","[0, 0, 0, 0, 0, 0]","[0, 0]","[0, 0]","[0, 0]","[0, 0, 0, 0],[0 0 0 1],[0, 0, 0, 0, 0, 0],[0, ..."
36,163544,"[0, 0, 0, 0]","[0, 1, 0, 0]","[0, 0, 0, 0, 0, 0]","[0, 0]","[0, 0]","[0, 0]","[0, 0, 0, 0],[0 1 0 0],[0, 0, 0, 0, 0, 0],[0, ..."
40,164446,"[0, 0, 1, 0]","[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0]","[0, 0]","[0, 0]","[0, 0]","[0 0 1 0],[0, 0, 0, 0],[0, 0, 0, 0, 0, 0],[0, ..."
45,163613,"[0, 1, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0]","[0, 0]","[0, 0]","[0, 0]","[0 1 0 0],[0, 0, 0, 0],[0, 0, 0, 0, 0, 0],[0, ..."


In [202]:
def custom_concat (row):
    sample = row[0]
    for array in row[1:]:
        sample = np.concatenate((sample, array), axis=0)
    return sample
    
predicted_dataset_v2['predictions combined'] = \
            predicted_dataset_v2[predicted_dataset_v2.columns[1:]].apply(
                    lambda x: custom_concat(x),
                    axis=1
                )       
predicted_dataset_v2 = predicted_dataset_v2[['entry_id', 'predictions combined']]

In [209]:
final_df = final_val_df.merge(predicted_dataset_v2, on='entry_id', how='inner')

In [210]:
final_df.head()

Unnamed: 0,entry_id,y true,predictions combined
0,162971,"[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,174404,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
2,163544,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,164446,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,163613,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [211]:
final_df.to_csv('finalllll')


In [218]:
true_y = np.array([true_yss for true_yss in final_df['y true']])
pred_y = np.array([true_yss for true_yss in final_df['predictions combined']])


In [221]:
pred_y[:,0]

array([1, 0, 0, ..., 0, 0, 0])

In [230]:
precisions = []
recalls = []
f1_scores = []
accuracies = []

for i in range (20):
    cls_rprt = classification_report(true_y[:, i], pred_y[:, i], output_dict=True)
    precisions.append(cls_rprt["macro avg"]["precision"])
    recalls.append(cls_rprt["macro avg"]["recall"])
    f1_scores.append(cls_rprt["macro avg"]["f1-score"])
    accuracies.append(cls_rprt["accuracy"])

  _warn_prf(average, modifier, msg_start, len(result))


In [247]:
results_df = pd.DataFrame.from_dict(tagname_to_tagid, orient='index')
results_df['accuracy'] = accuracies
results_df['recalls'] = recalls
results_df['precisions'] = precisions
results_df['f1_scores'] = f1_scores
results_df = results_df[['accuracy', 'f1_scores', 'precisions', 'recalls']]
results_df

Unnamed: 0,accuracy,f1_scores,precisions,recalls
Capacities & Response->International Response,0.956249,0.850766,0.833437,0.87054
Capacities & Response->National Response,0.921412,0.755087,0.811081,0.719961
Capacities & Response->Number Of People Reached,0.987084,0.879518,0.927895,0.841506
Capacities & Response->Response Gaps,0.998999,0.49975,0.4995,0.5
Humanitarian Conditions->Coping Mechanisms,0.957431,0.705649,0.871415,0.647512
Humanitarian Conditions->Living Standards,0.788703,0.768127,0.764052,0.77362
Humanitarian Conditions->Number Of People In Need,0.971439,0.792849,0.847261,0.754312
Humanitarian Conditions->Physical And Mental Well Being,0.894124,0.813598,0.83861,0.794305
Impact->Driver/Aggravating Factors,0.84437,0.711894,0.714116,0.70976
Impact->Impact On People,0.913953,0.736992,0.773447,0.711404


In [242]:
time_taken_for_prediction = 65 * 60
time_per_sample = time_taken_for_prediction / pillars_val_dataset.shape[0]
print('time taken for one sample in seconds:', time_per_sample)

time taken for one sample in seconds: 0.3547389485173731
