In [1]:
!pip install icecream
!pip install tqdm
!pip install torchmetrics
!pip install pytorch_lightning
!pip install transformers

Collecting icecream
  Downloading https://files.pythonhosted.org/packages/1f/c0/8e2bc1b5eab95e5155841c826b431692638c19bf04ee4cdc86b379f85150/icecream-2.1.1-py2.py3-none-any.whl
Collecting executing>=0.3.1
  Downloading https://files.pythonhosted.org/packages/17/85/b84ea78f52bcb5513a790e64edc19687d8699ea6b4197f075da28547a370/executing-0.7.0-py2.py3-none-any.whl
Collecting colorama>=0.3.9
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl
Collecting asttokens>=2.0.1
  Downloading https://files.pythonhosted.org/packages/16/d5/b0ad240c22bba2f4591693b0ca43aae94fbd77fb1e2b107d54fff1462b6f/asttokens-2.0.5-py2.py3-none-any.whl
Installing collected packages: executing, colorama, asttokens, icecream
Successfully installed asttokens-2.0.5 colorama-0.4.4 executing-0.7.0 icecream-2.1.1
Collecting torchmetrics
[?25l  Downloading https://files.pythonhosted.org/packages/4d/8b/de8df9044ca2ac5dfc6b

In [1]:
import os
import sys
import logging
import argparse
from pathlib import Path
from ast import literal_eval
from collections import Counter
from typing import Any, Dict, Optional

from icecream import ic
from tqdm.auto import tqdm

import torchmetrics
from torchmetrics.functional import accuracy, f1, auroc

import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.core.decorators import auto_move_data
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix


import matplotlib.pyplot as plt
from pylab import rcParams
from matplotlib import rc

import transformers
from transformers import (
    AdamW,
    AutoConfig,
    AutoModel,
    AutoModelForSequenceClassification,
    T5ForConditionalGeneration,
    T5Tokenizer,
    AutoTokenizer,
    XLMRobertaTokenizer
)
from transformers.optimization import (
    Adafactor,
    get_linear_schedule_with_warmup,
)

import tensorflow as tf
import re

In [2]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

RANDOM_SEED=2021
pl.seed_everything(RANDOM_SEED)

ic.configureOutput(outputFunction=sys.stdout.write, includeContext=True)

logger = logging.getLogger(__name__)

Global seed set to 2021


In [3]:
from google.colab import drive
drive.mount ('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
cd 'drive/My Drive/Colab Notebooks'

/content/drive/My Drive/Colab Notebooks


In [5]:
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 64
EPOCHS = 3
LEARNING_RATE = 1e-05
GPU_NB=1

train_params = {
        'batch_size': TRAIN_BATCH_SIZE,
        'shuffle': True,
        'num_workers': 2
    }

val_params = {
    'batch_size': VALID_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 2
    }

MODEL_NAME = "microsoft/xtremedistil-l6-h256-uncased"
#https://huggingface.co/microsoft/Multilingual-MiniLM-L12-H384/blob/main/README.md
#https://huggingface.co/models?search=XtremeDistil
TOKENIZER_NAME = MODEL_NAME

DATA_ROOT_DIR = os.path.join("..", "..", "..", "data", "frameworks_data", "data_v0.4.4")
TRAIN_PATH = os.path.join("data/data_v0.4.4_train.csv")
VAL_PATH = os.path.join("data_v0.4.4_val.csv")
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

In [6]:
import pandas as pd


In [7]:
from util_functions_classes.utils import *
from util_functions_classes.generate_models import *

  "The `@auto_move_data` decorator is deprecated in v1.3 and will be removed in v1.5."


In [8]:
def import_model (path:str):
    df = pd.read_csv(os.path.join(path), index_col=0)
    df['target'] = df['target'].apply(lambda x: clean_rows(x))
    return df

train_pillars = import_model("original_data/train.csv")
val_pillars = import_model("original_data/test.csv")


In [9]:
train_capacities_reponses = import_model("original_data/capacities_response_train_dataset.csv")
val_capacities_reponses = import_model("original_data/capacities_response_val_dataset.csv")

train_hum_conditions = import_model("original_data/hum_conditions_train_dataset.csv")
val_hum_conditions = import_model("original_data/hum_conditions_val_dataset.csv")

train_impact = import_model("original_data/impact_train_dataset.csv")
val_impact = import_model("original_data/impact_val_dataset.csv")

train_people_at_risk = import_model("original_data/people_at_risk_train_dataset.csv")
val_people_at_risk = import_model("original_data/people_at_risk_val_dataset.csv")

train_priority_interventions = import_model("original_data/priority_interventions_train_dataset.csv")
val_priority_interventions = import_model("original_data/priority_interventions_val_dataset.csv")

train_priority_needs = import_model("original_data/priority_needs_train_dataset.csv")
val_priority_needs = import_model("original_data/priority_needs_val_dataset.csv")

In [10]:
train_priority_interventions.shape[0]

1344

In [11]:
train_pillars.head(2)

Unnamed: 0,entry_id,excerpt,target
18977,63497,Más de 2.600 personas lograron moverse por Col...,"[Capacities & Response, Humanitarian Conditions]"
45407,324376,"(Boucle du Mouhoun, Sahel, Nord) L'un des rapp...",[Capacities & Response]


In [12]:
train_hum_conditions.head(2)

Unnamed: 0,entry_id,excerpt,target
83730,290327,"Fatoumata Haidara, Plan International Director...",[Living Standards]
27368,42935,According to the 2018 Libya Humanitarian Respo...,[Number Of People In Need]


In [25]:
def compute_weights (number_data_classes, n_tot):
    number_classes = len(number_data_classes)
    return [n_tot / (number_classes * number_data_class) for number_data_class in number_data_classes]



number_data_classes = [train_capacities_reponses.shape[0],
                        train_hum_conditions.shape[0],
                        train_impact.shape[0],
                        train_people_at_risk.shape[0],
                        train_priority_interventions.shape[0],
                        train_priority_needs.shape[0]]

pillars_weights = compute_weights (number_data_classes, train_pillars.shape[0])
pillars_weights

[1.230683617995805,
 0.4054549557172674,
 0.5592514783691255,
 1.6047108729627149,
 7.130456349206349,
 5.897435897435898]

In [14]:
ls

 [0m[01;34maugmented_data[0m/
 [01;34mcheckpoints-pillars-balanced-data-model-microsoft-Multilingual-MiniLM-L12-H384[0m/
 [01;34mcheckpoints-pillars-microsoft-Multilingual-MiniLM-L12-H384[0m/
 [01;34mcheckpoints-pillars-original-data-model-microsoft-xtremedistil-l12-h384-uncased[0m/
 [01;34mcheckpoints-pillars-weighted-model-microsoft-Multilingual-MiniLM-L12-H384[0m/
 data_v0.4.4_train.csv
 data_v0.4.4_val.csv
 indexes-pillars.npy
 indexes-pillars-weighted.npy
 indexes-pillars-weighted-xtreme-distil.npy
 [01;34mlightning_logs[0m/
'model_generation(1) (1).ipynb'
'model_generation (1).ipynb'
'model_generation(1).ipynb'
 model_generation.ipynb
 [01;34moriginal_data[0m/
'paraphrase_multilingual_MiniLM_L12_v2_pillars_and_subpillars_(1).ipynb'
 paraphrase_multilingual_MiniLM_L12_v2_pillars_and_subpillars.ipynb
 predictions-pillars.npy
 predictions-pillars-weighted.npy
 predictions-pillars-weighted-xtreme-distil.npy
 [01;34mtesttt-sentence-transformers-paraphrase-multilingual-

In [19]:
PATH_NAME = MODEL_NAME.split('/')[1] + 'no-augmentation-weight-sklearn'
os.mkdir(PATH_NAME)

In [21]:
os.chdir(PATH_NAME)

In [22]:
ls

In [23]:
log_dir_name = "-".join(MODEL_NAME.split("/"))

early_stopping_callback = EarlyStopping(monitor='val_loss', patience=2)

checkpoint_callback_params = {
    'save_top_k': 1,
    'verbose': True,
    'monitor': "val_loss",
    'mode': "min"
}

dirpath_pillars = f"./checkpoints-pillars-{log_dir_name}"
dirpath_subpillars_capacities_responses = f"./checkpoints-subpillars-capacities-responses-{log_dir_name}"
dirpath_subpillars_people_at_risk = f"./checkpoints-subpillars-people-at-risk-{log_dir_name}"
dirpath_subpillars_impact = f"./checkpoints-subpillars-impact-{log_dir_name}"
dirpath_subpillars_hum_conditions = f"./checkpoints-subpillars-humanitarian-conditions-{log_dir_name}"
dirpath_subpillars_priority_interventions = f"./checkpoints-subpillars-priority-interventions-{log_dir_name}"
dirpath_subpillars_priority_needs = f"./checkpoints-subpillars-priority-needs-{log_dir_name}"

checkpoint_callback_pillars = ModelCheckpoint(
  dirpath=dirpath_pillars,
  **checkpoint_callback_params
)
checkpoint_callback_subpillars_capacities_responses = ModelCheckpoint(
  dirpath=dirpath_subpillars_capacities_responses,
  **checkpoint_callback_params
)
checkpoint_callback_subpillars_people_at_risk = ModelCheckpoint(
  dirpath=dirpath_subpillars_people_at_risk,
  **checkpoint_callback_params
)
checkpoint_callback_subpillars_impact = ModelCheckpoint(
  dirpath=dirpath_subpillars_impact,
  **checkpoint_callback_params
)
checkpoint_callback_subpillars_hum_conditions = ModelCheckpoint(
  dirpath=dirpath_subpillars_hum_conditions,
  **checkpoint_callback_params
)
checkpoint_callback_subpillars_priority_interventions = ModelCheckpoint(
  dirpath=dirpath_subpillars_priority_interventions,
  **checkpoint_callback_params
)
checkpoint_callback_subpillars_priority_needs = ModelCheckpoint(
  dirpath=dirpath_subpillars_priority_needs,
  **checkpoint_callback_params
)


In [26]:
model_pillars = train_on_specific_targets(train_pillars,
                                        val_pillars,
                                            f"pillars-{log_dir_name}-",
                                           dirpath_pillars,
                                            MODEL_NAME,
                                            tokenizer,
                                            early_stopping_callback,
                                            checkpoint_callback_pillars,
                                           gpu_nb=GPU_NB,
                                           train_params=train_params,
                                           val_params=val_params,
                                           MAX_EPOCHS=EPOCHS,
                                            dropout_rate=0.3,
                                          weight_classes=pillars_weights)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

RuntimeError: ignored

In [None]:
checkpoint_path = "checkpoints-pillars-balanced-data-model-microsoft-Multilingual-MiniLM-L12-H384/epoch=0-step=3685.ckpt"
model_pillars = Transformer.load_from_checkpoint(checkpoint_path)

In [None]:
tagname_to_tagid = tagname_to_id (train_pillars["target"])
_ , val_loader = get_loaders (train_pillars, val_pillars, train_params, val_params, tagname_to_tagid, tokenizer)
preds, ids = model_pillars.custom_predict(val_loader, 'pillars-weighted-xtreme-distil')

HBox(children=(FloatProgress(value=0.0, max=224.0), HTML(value='')))




In [None]:
preds

array([[1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       ...,
       [0, 1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [None]:
def custom_predict(model, validaion_loader, name:str):
        model.eval()
        model.freeze()
        indexes=torch.tensor([])

        with torch.no_grad():
            iter=0
            for batch in tqdm(validaion_loader, total=len(validaion_loader.dataset)//validaion_loader.batch_size):
                
                logits = model({"ids": batch["ids"].to('cuda'),
                                "mask": batch["mask"].to('cuda'),
                                "token_type_ids": batch["token_type_ids"].to('cuda')})
                
                preds_batch = np.zeros(logits.shape, dtype=np.int)
                preds_batch[(torch.sigmoid(logits) >= model.pred_threshold).cpu().nonzero(as_tuple=True)] = 1
                if iter==0:
                    predictions = preds_batch
                    indexes = batch["entry_id"]
                
                else:
                    predictions = np.concatenate([predictions,preds_batch], 0) #.append(preds_batch)
                    indexes = tf.concat([indexes, batch["entry_id"]], 0)
                
                iter += 1
                
        np.save('predictions-'+name, np.array(predictions))
        np.save('indexes-'+name, np.array(indexes))
        return predictions, indexes

In [None]:
ids

<tf.Tensor: shape=(16470,), dtype=int64, numpy=array([ 51669,  64493,  85912, ..., 220019, 200026, 334746])>

In [None]:
model_subpillars_capacities_responses = train_on_specific_targets(
                                            train_capacities_reponses,
                                           val_capacities_reponses,
                                            f"capacities-response-{log_dir_name}-",
                                           dirpath_subpillars_capacities_responses,
                                            MODEL_NAME,
                                            tokenizer,
                                            early_stopping_callback,
                                            checkpoint_callback_subpillars_capacities_responses,
                                           gpu_nb=GPU_NB,
                                           train_params=train_params,
                                           val_params=val_params,
                                           MAX_EPOCHS=EPOCHS,
                                            dropout_rate=0.4)

TypeError: ignored

In [None]:
model_subpillars_people_at_risk = train_on_specific_targets(train_people_at_risk,
                                           val_people_at_risk,
                                            f"people-at-risk-{log_dir_name}-",
                                           dirpath_subpillars_people_at_risk,
                                            MODEL_NAME,
                                            tokenizer,
                                            early_stopping_callback,
                                            checkpoint_callback_subpillars_people_at_risk,
                                           gpu_nb=GPU_NB,
                                           train_params=train_params,
                                           val_params=val_params,
                                           MAX_EPOCHS=EPOCHS,
                                            dropout_rate=0.4)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

Global seed set to 2021




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 0, global step 489: val_loss reached 0.27747 (best 0.27747), saving model to "/content/drive/My Drive/Colab Notebooks/checkpoints-subpillars-people-at-risk-microsoft-Multilingual-MiniLM-L12-H384/epoch=0-step=489.ckpt" as top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 1, global step 979: val_loss reached 0.19688 (best 0.19688), saving model to "/content/drive/My Drive/Colab Notebooks/checkpoints-subpillars-people-at-risk-microsoft-Multilingual-MiniLM-L12-H384/epoch=1-step=979.ckpt" as top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 2, global step 1469: val_loss reached 0.17652 (best 0.17652), saving model to "/content/drive/My Drive/Colab Notebooks/checkpoints-subpillars-people-at-risk-microsoft-Multilingual-MiniLM-L12-H384/epoch=2-step=1469.ckpt" as top 1
FIT Profiler Report

Action                             	|  Mean duration (s)	|Num calls      	|  Total time (s) 	|  Percentage %   	|
--------------------------------------------------------------------------------------------------------------------------------------
Total                              	|  -              	|_              	|  1548.0         	|  100 %          	|
--------------------------------------------------------------------------------------------------------------------------------------
run_training_epoch                 	|  468.47         	|3              	|  1405.4         	|  90.789         	|
run_training_batch                 	|  0.87816        	|1470           	|  1290.9         	|  83.391         	|
optimizer_step_and_closu




In [None]:
model_subpillars_impact = train_on_specific_targets(train_impact,
                                           val_impact,
                                            f"impact-{log_dir_name}-",
                                           dirpath_subpillars_impact,
                                            MODEL_NAME,
                                            tokenizer,
                                            early_stopping_callback,
                                            checkpoint_callback_subpillars_impact,
                                           gpu_nb=GPU_NB,
                                           train_params=train_params,
                                           val_params=val_params,
                                           MAX_EPOCHS=EPOCHS,
                                            dropout_rate=0.4)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

Global seed set to 2021




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 0, global step 535: val_loss reached 0.46957 (best 0.46957), saving model to "/content/drive/My Drive/Colab Notebooks/checkpoints-subpillars-impact-microsoft-Multilingual-MiniLM-L12-H384/epoch=0-step=535.ckpt" as top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 1, global step 1071: val_loss reached 0.42329 (best 0.42329), saving model to "/content/drive/My Drive/Colab Notebooks/checkpoints-subpillars-impact-microsoft-Multilingual-MiniLM-L12-H384/epoch=1-step=1071.ckpt" as top 1
FIT Profiler Report

Action                             	|  Mean duration (s)	|Num calls      	|  Total time (s) 	|  Percentage %   	|
--------------------------------------------------------------------------------------------------------------------------------------
Total                              	|  -              	|_              	|  1223.5         	|  100 %          	|
--------------------------------------------------------------------------------------------------------------------------------------
run_training_epoch                 	|  529.58         	|2              	|  1059.2         	|  86.566         	|
run_training_batch                 	|  0.87735        	|1072           	|  940.52         	|  76.869         	|
optimizer_step_and_closure_0    




In [None]:
model_subpillars_hum_conditions = train_on_specific_targets(train_hum_conditions,
                                           val_hum_conditions,
                                            f"humanitarian-conditions-{log_dir_name}-",
                                           dirpath_subpillars_hum_conditions,
                                            MODEL_NAME,
                                            tokenizer,
                                            early_stopping_callback,
                                            checkpoint_callback_subpillars_hum_conditions,
                                           gpu_nb=GPU_NB,
                                           train_params=train_params,
                                           val_params=val_params,
                                           MAX_EPOCHS=EPOCHS,
                                            dropout_rate=0.4)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


RuntimeError: ignored

In [None]:
model_subpillars_priority_interventions = train_on_specific_targets(train_priority_interventions,
                                           val_priority_interventions,
                                            f"priority-interventions-{log_dir_name}-",
                                           dirpath_subpillars_priority_interventions,
                                            MODEL_NAME,
                                            tokenizer,
                                            early_stopping_callback,
                                            checkpoint_callback_subpillars_priority_interventions,
                                           gpu_nb=GPU_NB,
                                           train_params=train_params,
                                           val_params=val_params,
                                           MAX_EPOCHS=EPOCHS,
                                            dropout_rate=0.5)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

Global seed set to 2021




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

RuntimeError: ignored

In [None]:
model_subpillars_priority_needs = train_on_specific_targets(train_priority_needs,
                                           val_priority_needs,
                                            f"priority-needs-{log_dir_name}-",
                                           dirpath_subpillars_priority_needs,
                                            MODEL_NAME,
                                            tokenizer,
                                            early_stopping_callback,
                                            checkpoint_callback_subpillars_priority_needs,
                                           gpu_nb=GPU_NB,
                                           train_params=train_params,
                                           val_params=val_params,
                                           MAX_EPOCHS=EPOCHS,
                                            dropout_rate=0.5)