In [None]:
"""!pip install icecream
!pip install tqdm
!pip install torchmetrics
!pip install pytorch_lightning
!pip install transformers"""

In [None]:
import os
import sys
import logging
import argparse
from pathlib import Path
from collections import Counter
from typing import Any, Dict, Optional

from icecream import ic
from tqdm.auto import tqdm

import torchmetrics
from torchmetrics.functional import accuracy, f1, auroc

import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.core.decorators import auto_move_data
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix


from pylab import rcParams
from matplotlib import rc

import transformers
from transformers import (
    AdamW,
    AutoConfig,
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)
from transformers.optimization import (
    Adafactor,
    get_linear_schedule_with_warmup,
)

import tensorflow as tf
import re

In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

RANDOM_SEED=2021
pl.seed_everything(RANDOM_SEED)

ic.configureOutput(outputFunction=sys.stdout.write, includeContext=True)

logger = logging.getLogger(__name__)

In [None]:
# --- Training hyper-parameters -------------------------------------------
LEARNING_RATE = 2e-5
EPOCHS = 3
DROPOUT_RATE = 0.3
GPU_NB = 1
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 16
# METHOD_LANGUAGES: 'keep', 'omit', None

# --- Loader settings for the train / validation splits ----------------------
# (presumably forwarded to torch DataLoader by the training helper -- verify)
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=2)
val_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=2)

# --- Pretrained checkpoints ---------------------------------------------------
# Model cards:
#   https://huggingface.co/microsoft/Multilingual-MiniLM-L12-H384/blob/main/README.md
#   https://huggingface.co/models?search=XtremeDistil

# English: model and tokenizer share the same hub id.
MODEL_NAME_EN = "microsoft/xtremedistil-l6-h384-uncased"
TOKENIZER_NAME_EN = MODEL_NAME_EN

# Non-English: the model is paired with a different tokenizer id.
MODEL_NAME_NOT_EN = 'microsoft/Multilingual-MiniLM-L12-H384'
TOKENIZER_NAME_NOT_EN = 'xlm-roberta-base'

# --- Data location (relative to the notebook's working directory) -------------
DATA_ROOT_DIR = os.path.join(
    os.pardir, os.pardir, os.pardir, "data", "frameworks_data", "data_v0.4.4"
)
# File names only; they are joined onto DATA_ROOT_DIR when the CSVs are read.
TRAIN_PATH = "data_v0.4.4_train.csv"
VAL_PATH = "data_v0.4.4_val.csv"



In [None]:
from util_functions_classes.utils import *
from util_functions_classes.generate_models import *

In [None]:
# Read the train and validation CSV files (in that order) from DATA_ROOT_DIR.
tot_train, tot_test = (
    pd.read_csv(os.path.join(DATA_ROOT_DIR, csv_name))
    for csv_name in (TRAIN_PATH, VAL_PATH)
)

# Stack both splits into one frame; each split keeps its own row labels.
all_dataset = pd.concat([tot_train, tot_test])

# Run every pillars / subpillars cell through the project helper clean_rows
# (presumably it de-duplicates the tags of a row -- verify against its definition).
all_dataset["pillars"] = all_dataset["pillars"].apply(clean_rows)
all_dataset["subpillars"] = all_dataset["subpillars"].apply(clean_rows)

# Discard rows whose cleaned pillars value is empty.
all_dataset = all_dataset[
    all_dataset["pillars"].apply(lambda tags: len(tags) > 0)
]

In [None]:
# Pillars to build datasets for. This single tuple fixes the order of the
# get_subpillar_datasets calls, of the concatenated train/val frames and of the
# per-class sample counts, so the three can no longer drift out of sync.
EN_PILLAR_NAMES = (
    'Capacities & Response',
    'Humanitarian Conditions',
    'Impact',
    'People At Risk',
    'Priority Interventions',
    'Priority Needs',
)


def _split_by_pillar(pillar_names, dataset):
    """Call get_subpillar_datasets once per pillar, in the given order.

    Every call uses perform_augmentation=False and method='keep'.

    Args:
        pillar_names: iterable of pillar names passed as the first argument.
        dataset: frame handed unchanged to get_subpillar_datasets.

    Returns:
        Two dicts keyed by pillar name (insertion order = call order): the
        first and the second element returned by get_subpillar_datasets,
        which this notebook treats as the train and validation frames.
    """
    train_parts, val_parts = {}, {}
    for pillar_name in pillar_names:
        train_parts[pillar_name], val_parts[pillar_name] = get_subpillar_datasets(
            pillar_name,
            dataset,
            perform_augmentation=False,
            method='keep',
        )
    return train_parts, val_parts


def _stack_pillar_frames(frames):
    """Concatenate per-pillar frames and keep entry_id, excerpt and pillars (renamed to target)."""
    stacked = pd.concat(list(frames))
    return stacked[['entry_id', 'excerpt', 'pillars']].rename(columns={'pillars': 'target'})


en_train_by_pillar, en_val_by_pillar = _split_by_pillar(EN_PILLAR_NAMES, all_dataset)

# Individually named frames, kept so that any other cell relying on these
# names keeps working.
en_capacities_response_train_dataset = en_train_by_pillar['Capacities & Response']
en_capacities_response_val_dataset = en_val_by_pillar['Capacities & Response']
en_hum_conditions_train_dataset = en_train_by_pillar['Humanitarian Conditions']
en_hum_conditions_val_dataset = en_val_by_pillar['Humanitarian Conditions']
en_impact_train_dataset = en_train_by_pillar['Impact']
en_impact_val_dataset = en_val_by_pillar['Impact']
en_people_at_risk_train_dataset = en_train_by_pillar['People At Risk']
en_people_at_risk_val_dataset = en_val_by_pillar['People At Risk']
en_priority_interventions_train_dataset = en_train_by_pillar['Priority Interventions']
en_priority_interventions_val_dataset = en_val_by_pillar['Priority Interventions']
en_priority_needs_train_dataset = en_train_by_pillar['Priority Needs']
en_priority_needs_val_dataset = en_val_by_pillar['Priority Needs']

# Training / evaluation frames: all pillars stacked in EN_PILLAR_NAMES order.
en_tot_train = _stack_pillar_frames(en_train_by_pillar.values())
en_tot_test = _stack_pillar_frames(en_val_by_pillar.values())

# Number of training rows per pillar, in EN_PILLAR_NAMES order.
en_number_data_classes = [frame.shape[0] for frame in en_train_by_pillar.values()]

# Class weights from the per-pillar counts and the total number of training
# rows (compute_weights is a project helper; presumably it returns one weight
# per entry of en_number_data_classes -- verify against its definition).
en_pillars_weights = compute_weights(en_number_data_classes, en_tot_train.shape[0])
en_pillars_weights

In [None]:
# Hub model ids contain "/" (e.g. "org/name"); swap it for "-" so the id can be
# used as a single directory-name component.
log_dir_name = MODEL_NAME_EN.replace("/", "-")
PATH_NAME = log_dir_name + '-no-augmentation-sqrt-weight'
# exist_ok=True replaces the separate exists() check: the cell can be re-run
# safely and there is no window between checking for and creating the directory.
os.makedirs(PATH_NAME, exist_ok=True)

In [None]:
# Work from inside the run directory so that relative output paths used later
# (e.g. the "./checkpoints-pillars-..." checkpoint folder) resolve inside it.
# PATH_NAME is relative, so an unconditional chdir fails -- or nests one level
# deeper -- when this cell is executed a second time; skip it if the current
# directory is already the run directory.
# NOTE: after this point other relative paths (such as DATA_ROOT_DIR) resolve
# against the run directory as well.
if Path.cwd().name != PATH_NAME:
    os.chdir(PATH_NAME)

In [None]:
# Load the tokenizer for the English model; TOKENIZER_NAME_EN is the same
# identifier as MODEL_NAME_EN, so tokenizer and model come from one checkpoint.
en_tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME_EN)

In [None]:

# Both callbacks watch the 'val_f1' metric and treat larger values as better.

# Early stopping with a patience of 2 on val_f1.
early_stopping_callback = EarlyStopping(monitor='val_f1', mode='max', patience=2)

# Keyword arguments for ModelCheckpoint: keep only the top-1 checkpoint
# according to val_f1, with verbose output enabled.
checkpoint_callback_params = dict(
    save_top_k=1,
    verbose=True,
    monitor="val_f1",
    mode="max",
)

# Checkpoint folder for the pillars model, relative to the working directory.
dirpath_pillars = f"./checkpoints-pillars-{log_dir_name}"

checkpoint_callback_pillars = ModelCheckpoint(dirpath=dirpath_pillars, **checkpoint_callback_params)




In [None]:
# Train the English pillars model with the project helper
# train_on_specific_targets (star-imported from the util modules).
# NOTE(review): the roles of the positional arguments are inferred from the
# variable names only -- confirm against the helper's signature.
en_model_pillars = train_on_specific_targets(
    en_tot_train,
    en_tot_test,
    f"pillars-{log_dir_name}-",
    dirpath_pillars,
    MODEL_NAME_EN,
    en_tokenizer,
    early_stopping_callback,
    checkpoint_callback_pillars,
    gpu_nb=GPU_NB,
    train_params=train_params,
    val_params=val_params,
    MAX_EPOCHS=EPOCHS,
    dropout_rate=DROPOUT_RATE,
    weight_classes=en_pillars_weights,
)