# Train the 3 models

## 0. Load the annotated dataset

In [None]:
import pandas as pd
import transformers
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import evaluate
import numpy as np

In [None]:
# Load the annotated corpus that was saved to disk as a Hugging Face DatasetDict.
# Its "train" and "test" splits are the ones used for fine-tuning further down.
dataset_dict = DatasetDict.load_from_disk("./data/annotation_generated_from_xlsx/annotation.dataset")
# Bare expression so the notebook displays a summary of the loaded object.
dataset_dict

In [None]:
# Load the second saved DatasetDict, stored as "annotation_chatgpt_augmented".
# NOTE(review): presumably the same annotations extended with ChatGPT-generated
# examples -- confirm against the notebook/script that produced it.
dataset_augmented_dict = DatasetDict.load_from_disk("./data/annotation_generated_from_xlsx/annotation_chatgpt_augmented.dataset")

# Bare expression so the notebook displays a summary of the loaded object.
dataset_augmented_dict

## 1. Fine-tune BERT-like models

In [None]:
# Averaging mode for metrics. Not referenced anywhere else in the visible part
# of this notebook.
metric_average_method = "micro"
# Number of training epochs handed to TrainingArguments.
nb_epoch = 10
# Class index -> label name. Also passed to the model as its id2label mapping
# and used by compute_metrics to turn ids back into tag strings.
label_correspondance = {
    0: "0",
    1: "covariate",
}
# seqeval scorer used by compute_metrics.
metric = evaluate.load("seqeval")

def label_id(desired_value):
    """Return the integer id whose label name equals ``desired_value``.

    This is the reverse lookup of ``label_correspondance`` (id -> name).
    The first matching id (in dict order) is returned; ``None`` is returned
    when no label has that name.
    """
    matching_ids = (
        idx for idx, name in label_correspondance.items() if name == desired_value
    )
    return next(matching_ids, None)

def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per sub-word token.

    Args:
        labels: label ids, one per word of the original sentence.
        word_ids: for every token, the index of the word it came from, or
            ``None`` for tokens that belong to no word (special tokens).

    Returns:
        A list as long as ``word_ids``. Tokens without a word get ``-100``;
        every other token gets the label of its word. Continuation
        sub-tokens simply repeat their word's label -- no B-/I- style
        conversion is applied.
    """
    return [
        -100 if word_index is None else labels[word_index]
        for word_index in word_ids
    ]

def compute_metrics(eval_preds):
    """Compute overall precision, recall and F1 for one evaluation pass.

    Args:
        eval_preds: a ``(logits, labels)`` pair. The predicted class of each
            token is the arg-max of ``logits`` over the last axis; positions
            whose label is ``-100`` are excluded from scoring.

    Returns:
        A dict with the keys ``"precision"``, ``"recall"`` and ``"f1"``,
        taken from the ``overall_*`` entries reported by the module-level
        seqeval ``metric``.

    NOTE(review): seqeval is designed around IOB-style tags, while the tag
    strings produced here are the plain names in ``label_correspondance``
    -- confirm the scores mean what is intended.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Gold tag names, with ignored positions (-100) removed.
    true_labels = []
    for gold_row in labels:
        true_labels.append(
            [label_correspondance[gold] for gold in gold_row if gold != -100]
        )

    # Predicted tag names, kept only where the gold label is not ignored.
    true_predictions = []
    for predicted_row, gold_row in zip(predictions, labels):
        true_predictions.append(
            [
                label_correspondance[predicted]
                for predicted, gold in zip(predicted_row, gold_row)
                if gold != -100
            ]
        )

    scores = metric.compute(predictions=true_predictions, references=true_labels)
    return dict(
        precision=scores["overall_precision"],
        recall=scores["overall_recall"],
        f1=scores["overall_f1"],
    )

In [None]:
import mlflow
from mlflow import pytorch

def _load_tokenizer(pretrained_model):
    """Return a tokenizer for ``pretrained_model`` set up for pre-split words.

    ``RobertaTokenizerFast`` is tried first; if loading it fails for any
    reason, ``AutoTokenizer`` picks the tokenizer class for the checkpoint.
    """
    from transformers import AutoTokenizer, RobertaTokenizerFast

    tokenizer_kwargs = {
        # Required by RoBERTa-style tokenizers when the input is already
        # split into words (is_split_into_words=True at call time).
        "add_prefix_space": True,
        "model_max_length": 512,
    }
    # The former `truncating=True` keyword was dropped: it is not a tokenizer
    # argument. Truncation is requested when the tokenizer is called.
    try:
        return RobertaTokenizerFast.from_pretrained(pretrained_model, **tokenizer_kwargs)
    except Exception:
        # Not a bare `except:` so Ctrl-C / SystemExit still propagate.
        return AutoTokenizer.from_pretrained(pretrained_model, **tokenizer_kwargs)


def train_bert_like_model(pretrained_model, dataset_dict, dataset_name):
    """Fine-tune ``pretrained_model`` for token classification and save it.

    Args:
        pretrained_model: checkpoint name or path given to ``from_pretrained``.
        dataset_dict: DatasetDict with ``"train"`` and ``"test"`` splits whose
            rows carry ``"tokens"`` (words) and ``"ner_tags"`` (one label id
            per word).
        dataset_name: tag appended to the Trainer output directory name.

    Side effects:
        Trains for ``nb_epoch`` epochs inside an MLflow run named after the
        checkpoint, evaluating on the test split after every epoch, then
        writes the final model to
        ``./models/mood_covariate_from_{pretrained_model}``.

    NOTE(review): the save path does not include ``dataset_name``, so
    training the same checkpoint on a second dataset overwrites the model
    saved for the first one. The path is left unchanged because the
    inference cell further down loads from it.
    """
    from transformers import (
        AutoModelForTokenClassification,
        DataCollatorForTokenClassification,
        Trainer,
        TrainingArguments,
    )

    tokenizer = _load_tokenizer(pretrained_model)

    def tokenize_and_align_labels(examples):
        # Batched map function: tokenize the pre-split words, then spread the
        # word-level labels over the resulting sub-word tokens.
        tokenized_inputs = tokenizer(
            examples["tokens"], truncation=True, is_split_into_words=True,
        )
        tokenized_inputs["labels"] = [
            align_labels_with_tokens(labels, tokenized_inputs.word_ids(i))
            for i, labels in enumerate(examples["ner_tags"])
        ]
        return tokenized_inputs

    # Drop the raw columns so only the tokenizer output and "labels" remain.
    tokenized_datasets = dataset_dict.map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=dataset_dict["train"].column_names,
    )

    id2label = label_correspondance
    label2id = {v: k for k, v in id2label.items()}

    model = AutoModelForTokenClassification.from_pretrained(
        pretrained_model,
        id2label=id2label,
        label2id=label2id,
    )

    args = TrainingArguments(
        f"mood_covariate_from_{pretrained_model}_{dataset_name}",
        evaluation_strategy="epoch",
        # No intermediate checkpoints; the final model is saved explicitly below.
        save_strategy="no",
        learning_rate=2e-5,
        num_train_epochs=nb_epoch,
    )

    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
    )
    with mlflow.start_run(run_name=pretrained_model):
        trainer.train()

    trainer.save_model(f"./models/mood_covariate_from_{pretrained_model}")

In [None]:
# Runs started from this cell are recorded under this MLflow experiment.
mlflow.set_experiment("Bert-Like models")

# Fine-tune each checkpoint in turn on the non-augmented annotations.
# `pretrained_model` stays a notebook global and ends up holding the last
# checkpoint of the tuple.
for pretrained_model in (
    "roberta-base",
    "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext",
    "FacebookAI/xlm-roberta-base",
):
    train_bert_like_model(pretrained_model, dataset_dict, "base")

In [None]:
# Runs started from this cell are recorded under this MLflow experiment.
mlflow.set_experiment("Hybride")

# Fine-tune the same checkpoints on the ChatGPT-augmented dataset.
# Fix: these runs previously received `dataset_dict`, so the
# "gpt3-5-augmented" models were trained on the non-augmented data and
# `dataset_augmented_dict` was never used.
#
# NOTE(review): train_bert_like_model saves under a path keyed only by the
# checkpoint name, so these runs overwrite the models saved by the base
# experiments.
#
# `pretrained_model` stays a notebook global; the inference cell below reads
# it, so keep the checkpoint to inspect last in the tuple.
for pretrained_model in (
    "roberta-base",
    "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext",
    "FacebookAI/xlm-roberta-base",
):
    train_bert_like_model(pretrained_model, dataset_augmented_dict, "gpt3-5-augmented")

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import pipeline

# `pretrained_model` comes from the notebook's global state (set by the
# training cells), so this points at the model of whichever checkpoint was
# assigned last.
model_path = f"./models/mood_covariate_from_{pretrained_model}"

# Load the model using the pipeline for Named Entity Recognition (NER)
# aggregation_strategy="simple" makes the pipeline group tokens into entity
# spans instead of returning one result per token.
ner_classifier = pipeline("ner", model=model_path, aggregation_strategy="simple")

In [None]:
# Smoke test: run the fine-tuned pipeline on one sample passage.
# NOTE(review): the text contains mis-encoded characters (ligatures, dashes),
# apparently from PDF extraction -- it is passed to the model exactly as is.
res = ner_classifier(" in Figs. S3 â€“S6. Absolute humidity was found to be signiï¬cantly linked to epidemic onset dates at the spatial scale ( p= 0.029), but not at the other scales. The associated coefï¬cient was negative ( -0.4763). Mobility ï¬‚ows were not found to be signiï¬cantly linked to epidemic onset dates (p= 0.57 with the corrected model, p= 0.73 with the uncorrected model). In the corrected model, the coefï¬cient ")

In [None]:
# Print every item returned by the pipeline on its own line.
for r in res:
    print(r)