In [None]:
#!pip install -r requirements.txt --quiet

In [8]:
import pandas as pd
import os
from download_data import download_file_from_google_drive
import sys
sys.path.append('src')
from pipeline import TrainingPipeline
import torch

In [9]:
# Local folder that holds the train / val / test CSV files.
data_folder = 'data'

# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(data_folder, exist_ok=True)

# Google Drive file ids of the three dataset splits.
train_dataset_id = "17QdfpjnQpyA-Cs89PkOCS-aAlk61NEUr"
val_dataset_id = "198bii8_3K362bTC6UIJMsZo4JVF0Luyt"
test_dataset_id = "12KikQgxxo1CCQPdbupXF8aaLop-v59OH"

train_name = "train.csv"
val_name = "val.csv"
test_name = "test.csv"

# Local file name -> Drive id it is downloaded from.
files = {
    train_name: train_dataset_id,
    val_name: val_dataset_id,
    test_name: test_dataset_id
}

# Snapshot of what is already on disk, so existing files are not fetched again.
downloaded_files = os.listdir(data_folder)

for data_name, data_id in files.items():
    if data_name not in downloaded_files:
        print(f"downloading {data_name} file")
        download_file_from_google_drive(
            id=data_id,
            destination=os.path.join(data_folder, data_name),
        )

In [5]:
# Only these columns are loaded from each CSV.
relevant_cols = ['entry_id', 'excerpt', 'targets', 'lang']

# Paths are built from data_folder and the *_name variables set in the download
# cell, so the files read here are always the ones that cell downloaded.
train_df = pd.read_csv(os.path.join(data_folder, train_name), usecols=relevant_cols)
val_df = pd.read_csv(os.path.join(data_folder, val_name), usecols=relevant_cols)
test_df = pd.read_csv(os.path.join(data_folder, test_name), usecols=relevant_cols)

In [7]:
# When True, every split is replaced by a random 10% subset of itself.
# The flag is also read when choosing n_epochs in the hyperparameter cell.
sample = True

# Fixed seed so the subset (and therefore every reported metric) is the same
# on each Restart & Run All.
sample_seed = 42

# NOTE: the frames are overwritten in place, so re-running this cell without
# re-running the loading cell shrinks them by another factor of 10.
if sample:
    train_df = train_df.sample(frac=0.1, random_state=sample_seed)
    val_df = val_df.sample(frac=0.1, random_state=sample_seed)
    test_df = test_df.sample(frac=0.1, random_state=sample_seed)


In [None]:
# Values of the "tag" column that are kept when the per-run test results are
# filtered: first-level means followed by sub-pillar means.
relevant_classification_results_cols = [
    "mean->first_level_tags->pillars_1d",
    "mean->first_level_tags->pillars_2d",
    "mean->first_level_tags->sectors",
] + [
    "mean->subpillars_1d",
    "mean->subpillars_2d",
]

# Column order of the summary table: run identifiers first, then the tag and
# its metrics.
final_cols_classification = [
    "backbone_name",
    "training_setup",
    "architecture_setup",
] + ["tag", "precision", "f_score"]

# Hyperparameters handed to the training pipeline. Boolean switches are kept as
# the strings "true" / "false", exactly as the pipeline receives them.
train_hyperparams = dict(
    max_len=200,  # deliberately below the maximum to reduce computation time
    delete_long_excerpts="false",
    apply_preprocessing="true",
    explainability="false",
    n_epochs=3 if not sample else 1,  # single epoch when working on the sample
    dropout=0.2,
    learning_rate=3e-5,
    weight_decay=1e-2,
    train_batch_size=8,
    val_batch_size=16,
    n_freezed_layers=1,
    n_mid_layers=1,
    output_data_dir='results',
    model_dir="results/models",
)

# Create the outputs folder on first run only.
output_dir = "outputs"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

In [None]:
# Experiment grid: every backbone is run with every training setup and every
# architecture setup.
# NOTE(review): these strings are passed verbatim to TrainingPipeline; the
# spellings ("multiabel", "discrepency") presumably match what the pipeline
# expects — confirm before correcting them.
architecture_setups = ["base_architecture", "multiabel_architecture"]
training_setups = ["counterfactual_debiasing", "no_debiasing", "no_finetuning"]
backbone_names = [
    "xlm-roberta-base",
    "nlp-thedeep/humbert",
    "bert-base-multilingual-cased",
    #"nreimers/mMiniLMv2-L6-H384-distilled-from-XLMR-Large" # distilled model for test and debugging
]
bias_measurement_methods = [
    "probability_discrepency",
]

instance_type = "ml.p3.2xlarge"

# Stringified copy of the whole configuration (not used further in this cell).
hyperparameters = {
    "hyperparameters": str(train_hyperparams),
    "architecture_setups": str(architecture_setups),
    "training_setups": str(training_setups),
    "backbone_names": str(backbone_names),
    "bias_measurement_methods": str(bias_measurement_methods),
}

overall_results_df = pd.DataFrame()

# torch.save below writes into model_dir and does not create missing folders;
# nothing visible in this notebook creates that directory, so make sure it exists.
os.makedirs(train_hyperparams['model_dir'], exist_ok=True)

# One filtered results frame per run. They are concatenated once after the loop
# instead of growing a DataFrame on every iteration; if a run fails part-way,
# the frames collected so far are still available in this list.
classification_result_frames = []

for one_backbone_name in backbone_names:
    for one_training_setup in training_setups:
        for one_architecture_setup in architecture_setups:
            pipeline = TrainingPipeline(
                train_hyperparams,
                train_df,
                val_df,
                test_df,
                architecture_setup=one_architecture_setup,
                training_setup=one_training_setup,
                backbone_name=one_backbone_name,
                results_dir="results",
                humbias_set_dir="humbias_set",
            )
            if one_training_setup == "no_finetuning":
                # For this setup the results table is replaced by placeholders:
                # one row per key of the model's tag mapping, with "-" metrics.
                pipeline.test_set_results = pd.DataFrame(
                    {
                        "tag": list(
                            pipeline.model.tagname_to_tagid_classification.keys()
                        )
                    }
                ).assign(precision="-", f_score="-")
            else:
                # Persist the trained model under model_dir.
                torch.save(
                    pipeline.model,
                    os.path.join(train_hyperparams['model_dir'], pipeline.results_name),
                )

                # f_score of the "mean->first_level_tags" row. Not read again in
                # this cell; kept because later cells may rely on the name.
                test_set_results = pipeline.test_set_results
                f1score_result = test_set_results[
                    test_set_results.tag == "mean->first_level_tags"
                ].f_score.values[0]

            # Keep only the summary rows, tag them with the run configuration
            # and put the columns in their final order. assign() returns a new
            # frame, so pipeline.test_set_results itself is left untouched.
            run_results = pipeline.test_set_results
            relevant_classification_results = run_results[
                run_results.tag.isin(relevant_classification_results_cols)
            ].assign(
                backbone_name=one_backbone_name,
                training_setup=one_training_setup,
                architecture_setup=one_architecture_setup,
            )[final_cols_classification]
            classification_result_frames.append(relevant_classification_results)

            if one_training_setup != "no_finetuning":
                if "probability_discrepency" in bias_measurement_methods:
                    # counterfactual_predictions_discrepency
                    pipeline.get_counterfactual_predictions_discrepency_results()

# pd.concat raises on an empty list, so fall back to an empty frame when no run
# was executed (e.g. one of the setup lists is empty).
classification_results_df = (
    pd.concat(classification_result_frames)
    if classification_result_frames
    else pd.DataFrame()
)
