In [None]:
!pip install -r requirements.txt --quiet

In [8]:
import pandas as pd
from datasets import load_dataset
import os
import sys
sys.path.append('src')
from pipeline import TrainingPipeline
import torch

In [9]:
# Load the 'nlp-thedeep/humset' dataset through `datasets.load_dataset`.
# Later cells read the 'train', 'validation' and 'test' splits from this object.
dataset = load_dataset('nlp-thedeep/humset')

In [None]:
def _preprocess_df(df: pd.DataFrame):
    targets = []
    targets += [f"first_level_tags->sectors->{item}" for item in df['sectors']]
    targets += [f"first_level_tags->pillars_2d->{item}" for item in df['pillars_2d']]
    targets += [f"first_level_tags->pillars_1d->{item}" for item in df['pillars_1d']]
    targets += [f"subpillars_1d->{item}" for item in df['subpillars_1d']]
    targets += [f"subpillars_2d->{item}" for item in df['subpillars_2d']]
    return targets

In [None]:
# Set to True for a quick smoke run on a small random subset of every split.
sample = False
# Fixed seed so that a sub-sampled run picks the same rows on every execution.
sample_seed = 42
# Fraction of each split that is kept when `sample` is True.
sample_fracs = {'train': 0.05, 'validation': 0.1, 'test': 0.1}
# Columns kept in the frames handed to the training pipeline.
relevant_cols = ['entry_id', 'excerpt', 'targets', 'lang']


def _load_split(split_name: str) -> pd.DataFrame:
    """Build the modelling frame for one split of `dataset`.

    Converts the split to a DataFrame, adds a `targets` column by applying
    `_preprocess_df` to every row, keeps only `relevant_cols`, and, when
    `sample` is True, draws a seeded random subset of the rows.

    Args:
        split_name: key of the split in `dataset` ('train', 'validation' or 'test').

    Returns:
        A DataFrame restricted to `relevant_cols`.
    """
    split_df = pd.DataFrame(dataset[split_name])
    split_df['targets'] = split_df.apply(_preprocess_df, axis=1)
    split_df = split_df[relevant_cols]
    if sample:
        # random_state makes the debug subset reproducible across kernel restarts.
        split_df = split_df.sample(
            frac=sample_fracs[split_name], random_state=sample_seed
        )
    return split_df


train_df = _load_split('train')
val_df = _load_split('validation')
test_df = _load_split('test')

In [None]:
# Values of the `tag` column (the "mean->..." rows) that are kept from the
# test-set results table when collecting classification results.
relevant_classification_results_cols = [
    f"mean->{tag_group}"
    for tag_group in (
        "first_level_tags->pillars_1d",
        "first_level_tags->pillars_2d",
        "first_level_tags->sectors",
        "subpillars_1d",
        "subpillars_2d",
    )
]

# Column layout of the collected classification results: first the columns
# identifying the run, then the tag with its reported metrics.
final_cols_classification = (
    ["backbone_name", "training_setup", "architecture_setup"]
    + ["tag", "precision", "f_score"]
)

# Hyperparameters handed to TrainingPipeline as its first argument.
# Boolean-like options are deliberately spelled as the strings "true"/"false".
# NOTE(review): presumably the pipeline parses these strings -- confirm in src/pipeline.
train_hyperparams = dict(
    max_len=200,  # deliberately not the maximum, to keep computation time down
    delete_long_excerpts="false",
    apply_preprocessing="true",
    explainability="false",
    n_epochs=(1 if sample else 3),  # a single epoch is enough for sampled debug runs
    dropout=0.2,
    learning_rate=3e-5,
    weight_decay=1e-2,
    train_batch_size=8,
    val_batch_size=16,
    n_freezed_layers=1,
    n_mid_layers=1,
    output_data_dir='results',
    model_dir="results/models",
)

# Make sure the output directory exists. `exist_ok=True` turns a re-run of this
# cell into a no-op and avoids the race of checking for the directory first and
# creating it afterwards.
output_dir = "outputs"
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Result accumulators, both starting out empty. classification_results_df is
# extended with the rows of every run by the experiment loop.
classification_results_df = pd.DataFrame()
overall_results_df = pd.DataFrame()

# Experiment grid: every backbone is combined with every training setup and
# every architecture setup.
backbone_names = [
    "xlm-roberta-base",
    "nlp-thedeep/humbert",
    "bert-base-multilingual-cased",
    # Distilled model, handy for tests and debugging:
    # "nreimers/mMiniLMv2-L6-H384-distilled-from-XLMR-Large",
]
training_setups = ["counterfactual_debiasing", "no_debiasing"]
# NOTE(review): "multiabel" looks like a typo for "multilabel", but the string
# is passed to TrainingPipeline unchanged -- confirm the spelling the pipeline
# expects before touching it.
architecture_setups = ["base_architecture", "multiabel_architecture"]
bias_measurement_methods = ["probability_discrepency"]

# Run one experiment per (backbone, training setup, architecture setup)
# combination and collect the per-tag classification metrics of each run.
for one_backbone_name in backbone_names:
    for one_training_setup in training_setups:
        for one_architecture_setup in architecture_setups:
            # NOTE(review): `pipeline.model` and `pipeline.test_set_results` are
            # read right after construction, so the constructor presumably trains
            # and evaluates the model -- confirm in src/pipeline.
            pipeline = TrainingPipeline(
                train_hyperparams,
                train_df,
                val_df,
                test_df,
                architecture_setup=one_architecture_setup,
                training_setup=one_training_setup,
                backbone_name=one_backbone_name,
                results_dir="results",
                humbias_set_dir="humbias_set",
            )
            # Save the trained model under model_dir, using the pipeline's
            # `results_name` as the file name. The directory is created on demand.
            os.makedirs(train_hyperparams["model_dir"], exist_ok=True)

            # torch.save is given the model object itself, not a state_dict.
            torch.save(
                pipeline.model,
                os.path.join(train_hyperparams["model_dir"], pipeline.results_name),
            )

            # f_score of the first row tagged "mean->first_level_tags".
            # NOTE(review): f1score_result is not referenced by the statements
            # that follow, up to the counterfactual call -- confirm where it is
            # consumed.
            test_set_results = pipeline.test_set_results
            f1score_result = test_set_results[
                test_set_results.tag == "mean->first_level_tags"
            ].f_score.values[0]

            # Keep only the rows whose tag is listed in
            # relevant_classification_results_cols; `.copy()` so the columns
            # added below do not write into the pipeline's own results frame.
            relevant_classification_results = pipeline.test_set_results
            relevant_classification_results = relevant_classification_results[
                relevant_classification_results.tag.isin(
                    relevant_classification_results_cols
                )
            ].copy()
            # Label the rows with the configuration of this run.
            relevant_classification_results["backbone_name"] = one_backbone_name
            relevant_classification_results["training_setup"] = one_training_setup
            relevant_classification_results[
                "architecture_setup"
            ] = one_architecture_setup

            # Select and order the report columns, then append to the accumulator.
            relevant_classification_results = relevant_classification_results[
                final_cols_classification
            ]
            classification_results_df = pd.concat(
                [classification_results_df, relevant_classification_results]
            )

            # NOTE(review): the return value (if any) is discarded here;
            # presumably the method stores its results on `pipeline` -- confirm.
            pipeline.get_counterfactual_predictions_discrepency_results()