# BERT-like models fine-tuning
Fine-tune a BERT-like model on the Text-mine data.

In [None]:
import pandas as pd
import transformers
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

## 0. Global parameters below:
- **Choose your pre-trained model**
- Fraction of the data held out for the test split (e.g. 0.2 = 20 %)
- Averaging method for the evaluation metric. Default: "micro"

In [None]:
# Checkpoint name handed to the tokenizer and model loaders in later cells.
pretrained_model = "xlm-roberta-large"
# Share of the sentences held out for testing (passed as `test_size`).
splited_size = 0.2
# Averaging mode forwarded to `evaluate.load` when the metric is created.
metric_average_method = "micro"

## 1. Importing input data and cleaning

1. Load the data
2. Replace the wrong labels (see the EDA notebook from Sarah)
3. Rebuild the sentences
4. Create the dataset_dict for training
5. Split the data into training and test sets

In [None]:
# Raw token-level training data; later cells read its "Token" and "Label" columns.
df_raw = pd.read_csv("./../data/train.csv")

In [None]:
# Cleaning the data
# Mis-ordered or duplicated label combinations, each mapped to the canonical
# label it collapses to.
_LABEL_FIXES = {
    "geogName name": "name geogName",
    "geogFeat geogName geogName": "geogFeat geogName",
    "name geogName geogName": "name geogName",
    "geogName geogName": "geogName",
    "geogName geogFeat geogName": "geogFeat geogName",
    "geogName geogFeat": "geogFeat geogName",
    "geogName geogName name": "name geogName",
    "geogName name geogName": "name geogName",
    "geogFeat geogName geogName geogName": "geogFeat geogName",
    "geogName geogName name geogName": "name geogName",
}


def cleaning_label(label):
    """Return the canonical form of *label*.

    Known malformed label combinations are replaced by their canonical
    spelling; every other value is returned unchanged.
    """
    if isinstance(label, str):
        return _LABEL_FIXES.get(label, label)
    # Non-string cells can never equal one of the known variants.
    return label

# Overwrite the "Label" column with its normalised values.
df_raw["Label"] = df_raw["Label"].apply(cleaning_label)


In [None]:
# Integer class id -> label string for the six classes used in this notebook.
label_correspondance = {
    0: "aucun",
    1: "geogFeat",
    2: "geogFeat geogName",
    3: "geogName",
    4: "name",
    5: "name geogName",
}


def label_id(desired_value):
    """Return the class id whose label equals *desired_value*.

    The lookup scans ``label_correspondance`` in insertion order and yields
    ``None`` when no label matches.
    """
    matching_ids = (
        idx for idx, name in label_correspondance.items() if name == desired_value
    )
    return next(matching_ids, None)

In [None]:
# Rebuild sentences from the token-level rows. For every sentence three
# parallel lists are collected: its tokens, their string labels and the
# matching integer class ids.
current_sentence = []
current_sentence_label = []
current_sentence_ner_tag = []
list_sentences = []
list_sentences_label = []
list_sentences_ner_tags = []

# Walk the two needed columns in lockstep (avoids building a Series per row).
for raw_token, label in zip(df_raw["Token"], df_raw["Label"]):
    # Double quotes are stripped from the token text.
    token = raw_token.replace('"', '')
    current_sentence.append(token)
    current_sentence_label.append(label)
    current_sentence_ner_tag.append(label_id(label))

    # A token ending with a period closes the current sentence.
    if token.endswith('.'):
        list_sentences.append(current_sentence)
        list_sentences_label.append(current_sentence_label)
        list_sentences_ner_tags.append(current_sentence_ner_tag)

        # Start a fresh sentence
        current_sentence = []
        current_sentence_label = []
        current_sentence_ner_tag = []

# Tokens that follow the last period-terminated token would otherwise be
# dropped silently; keep them as a final sentence.
if current_sentence:
    list_sentences.append(current_sentence)
    list_sentences_label.append(current_sentence_label)
    list_sentences_ner_tags.append(current_sentence_ner_tag)
    current_sentence = []
    current_sentence_label = []
    current_sentence_ner_tag = []

# One row per rebuilt sentence.
df = pd.DataFrame(
    {
        "tokens": list_sentences,
        "labels": list_sentences_label,
        "ner_tags": list_sentences_ner_tags,
    }
)

In [None]:
# Notebook preview of the rebuilt sentence-level frame.
df

In [None]:
# Column-wise view of the sentences: only the tokens and their integer class
# ids are kept; the string "labels" column is not part of the dataset.
data_dict = {
    'tokens': df['tokens'].tolist(),
    'ner_tags': df["ner_tags"].tolist()
}

dataset = Dataset.from_dict(data_dict)

# Fixed seed so the shuffled split is the same on every run; without it the
# evaluation numbers of two runs are computed on different test sentences.
split_seed = 42

# Split the dataset into train and test sets
dataset_dict = dataset.train_test_split(test_size=splited_size, seed=split_seed)

In [None]:
dataset_dict

In [None]:
from transformers import AutoTokenizer
# Tokenizer loaded for the checkpoint chosen in the global parameters.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

In [None]:
# Index of the training sentence used for this tokenisation sanity check.
test_number = 2

# Sentences are stored as lists of words, hence is_split_into_words=True.
inputs = tokenizer(dataset_dict["train"][test_number]["tokens"], is_split_into_words=True)
print(f"sentence: {dataset_dict['train'][test_number]['tokens']}")
# Tokens produced for that sentence (shown as the cell output).
inputs.tokens()

In [None]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level *labels* to one label per tokenizer token.

    ``word_ids`` gives, for each token, the index of the word it belongs to,
    or ``None`` for a special token. Special tokens receive ``-100``; every
    other token receives the label of its word, so all pieces of a split word
    repeat that word's label (no B-/I- conversion is applied).
    """
    return [
        -100 if word_id is None else labels[word_id]
        for word_id in word_ids
    ]

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Reads ``examples["tokens"]`` and ``examples["ner_tags"]``, runs the
    module-level ``tokenizer`` on the word lists and stores the aligned
    labels under the ``"labels"`` key of the tokenizer output, which is
    returned.
    """
    # No truncation is requested from the tokenizer here.
    encoded = tokenizer(examples["tokens"], is_split_into_words=True)
    encoded["labels"] = [
        align_labels_with_tokens(word_labels, encoded.word_ids(batch_idx))
        for batch_idx, word_labels in enumerate(examples["ner_tags"])
    ]
    return encoded

In [None]:
# Tokenize both splits in batches; the original columns are removed so only
# what `tokenize_and_align_labels` returns is kept.
tokenized_datasets = dataset_dict.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset_dict["train"].column_names,
)

In [None]:
# Notebook preview of the tokenized splits.
tokenized_datasets

## 2. Training

In [None]:
import evaluate

# NOTE(review): `average` is passed to `evaluate.load` as an extra keyword;
# confirm that the seqeval module actually takes it into account.
metric = evaluate.load("seqeval", average=metric_average_method)

In [None]:
# Class id -> label name; the same mapping is used as the model's `id2label`.
label_correspondance = {
    0: "aucun",
    1: "geogFeat",
    2: "geogFeat geogName",
    3: "geogName",
    4: "name",
    5: "name geogName",
}

id2label = label_correspondance
# Inverse lookup: label name -> class id.
label2id = dict(zip(id2label.values(), id2label.keys()))

In [None]:
import numpy as np


def compute_metrics(eval_preds):
    """Compute overall and per-label scores for one evaluation pass.

    Args:
        eval_preds: ``(logits, labels)`` pair. Positions whose reference
            label is ``-100`` are ignored.

    Returns:
        A dict with the overall ``"precision"``, ``"recall"``, ``"f1"`` and
        ``"accuracy"``, plus one ``"<label>_f1"`` entry for every label of
        ``label_correspondance`` that the metric reported. Labels with no
        entry in the metric output are left out instead of raising
        ``KeyError``.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [
        [label_correspondance[l] for l in label if l != -100] for label in labels
    ]
    true_predictions = [
        [label_correspondance[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

    results = {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

    # The per-label results are keyed by the label minus its first character
    # ("aucun" -> "ucun", "geogName" -> "eogName", ...). A label that does not
    # occur in this evaluation pass has no entry, so it is skipped rather
    # than looked up with a hard-coded key.
    for label_name in label_correspondance.values():
        per_label = all_metrics.get(label_name[1:])
        if isinstance(per_label, dict) and "f1" in per_label:
            results[f"{label_name}_f1"] = per_label["f1"]

    return results

In [None]:
# Notebook preview of the label-name -> class-id mapping.
label2id

In [None]:
from transformers import AutoModelForTokenClassification

# Token-classification model loaded from the chosen checkpoint; both label
# mappings are handed over at load time.
model = AutoModelForTokenClassification.from_pretrained(
    pretrained_model,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Show the label count recorded in the loaded model's config (cell output).
model.config.num_labels

In [None]:
from transformers import TrainingArguments

args = TrainingArguments(
    # First positional argument, derived from the checkpoint name.
    f"{pretrained_model}_text-mine",
    # Evaluation and checkpoint saving both use the "epoch" strategy.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [None]:
from transformers import DataCollatorForTokenClassification

# Batch collator for token classification, built around the same tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)


In [None]:
from transformers import Trainer

# Wire the model, training arguments, tokenized splits, collator and metric
# function together, then start fine-tuning.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    # The held-out "test" split is the one used for evaluation.
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()