# Classification d'incidents avec des modèles *Transformers*

## 1. Création du jeu de données (*dataset*)

In [4]:
import json
import spacy
import numpy as np
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch import FloatTensor, LongTensor
from typing import List
from poutyne.framework import Experiment
from poutyne import set_seeds
from torch.optim import SGD
import numpy as np
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2Model
from datasets import Dataset, DatasetDict
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, GPT2ForSequenceClassification
import evaluate
from transformers import DataCollatorWithPadding


In [5]:
# Paths to the JSON files holding each data split.
train_json_fn = "./data/incidents_train.json"
# FIXME(review): the validation split points at the very same file as the test
# split below, so any model selection done on "validation" is done on the test
# data. Point this at a dedicated validation file — TODO confirm its filename.
validation_json_fn = "./data/incidents_test.json"
test_json_fn = "./data/incidents_test.json"

In [6]:
def load_incident_dataset(filename):
    """Load the incidents stored in a JSON file.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The deserialized JSON content, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which would garble accented characters on some systems.
    with open(filename, 'r', encoding='utf-8') as fp:
        return json.load(fp)

In [7]:
# Read the three splits in one pass; each path goes through the same loader.
train_list, validation_list, test_list = map(
    load_incident_dataset,
    (train_json_fn, validation_json_fn, test_json_fn),
)

# Report how many incidents each split contains.
print("Nombre d'incidents dans train:", len(train_list))
print("Nombre d'incidents dans validation:", len(validation_list))
print("Nombre d'incidents dans test:", len(test_list))

Nombre d'incidents dans train: 2475
Nombre d'incidents dans validation: 531
Nombre d'incidents dans test: 531


In [8]:
def convert_labels_to_int(dataset):
    """Cast the 'label' entry of every item of ``dataset`` to ``int``.

    The items are modified in place; the function returns nothing.
    """
    for record in dataset:
        raw_label = record['label']
        record['label'] = int(raw_label)

In [9]:
# Convert the labels of each split to int. The conversion happens in place on
# the items of each list, so nothing is reassigned here.
convert_labels_to_int(train_list)
convert_labels_to_int(validation_list)
convert_labels_to_int(test_list)

In [10]:
def _incidents_to_dataset(incidents):
    """Build a `Dataset` with a "text" and a "label" column from incident items."""
    columns = {
        "text": [incident['text'] for incident in incidents],
        "label": [incident['label'] for incident in incidents],
    }
    return Dataset.from_dict(columns)


train_dataset = _incidents_to_dataset(train_list)

# NOTE: this rebinds `validation_list` — from here on the name refers to the
# `Dataset` built from the original items, and later cells rely on that name.
validation_list = _incidents_to_dataset(validation_list)

test_dataset = _incidents_to_dataset(test_list)

In [11]:
# Group the three splits under named keys so they can be processed together.
dataset_dict = DatasetDict({
    "train": train_dataset,
    # Despite its name, `validation_list` holds the validation `Dataset` here.
    "validation": validation_list,
    "test": test_dataset
})

In [12]:
# Load the pretrained GPT-2 tokenizer (not BERT).
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# GPT-2 has no padding token of its own, so register a dedicated '[PAD]' token
# to allow batches to be padded. The previous `pad_token = eos_token` assignment
# was dropped: it was immediately overridden by this call. Adding the token grows
# the vocabulary by one, so the model embeddings must be resized to len(tokenizer).
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

In [13]:
def preprocess_function(examples):
    """Tokenize the "text" field of ``examples`` with the module-level tokenizer.

    Truncation is enabled; the tokenizer's output is returned unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [14]:
# Tokenize every split; batched=True passes batches of examples to
# preprocess_function instead of one example at a time.
tokenized_dataset = dataset_dict.map(preprocess_function, batched=True)

Map: 100%|██████████| 2475/2475 [00:01<00:00, 2240.34 examples/s]
Map: 100%|██████████| 531/531 [00:00<00:00, 2508.23 examples/s]
Map: 100%|██████████| 531/531 [00:00<00:00, 2624.43 examples/s]


`DataCollatorWithPadding` applique automatiquement le *padding* (ajout de zéros ou d'un autre token spécial) pour que tous les exemples d'un même lot aient la même longueur.

In [23]:
# Collator built from the tokenizer; it pads the examples of a batch to a common length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## 2. Création de modèle(s)

On charge notre Transformer GPT-2

In [167]:
# Load pretrained GPT-2 with a sequence-classification head for 9 labels and
# give the model the id of the tokenizer's padding token.
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=9, pad_token_id=tokenizer.pad_token_id)
# The tokenizer gained a '[PAD]' token, so resize the embeddings to len(tokenizer).
# As the last expression of the cell, the resized embedding module is displayed.
model.resize_token_embeddings(len(tokenizer))

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embeding dimension will be 50258. This might induce some performance reduction as *Tensor Cores* will not be available. For more details  about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


Embedding(50258, 768)

## 3. Entraînement de modèle(s)

Ce code définit une fonction `compute_metrics` pour évaluer l'exactitude des prédictions d'un modèle de classification en utilisant la métrique 'accuracy' de la bibliothèque Hugging Face `evaluate`.

In [168]:
# Load the "accuracy" metric through the `evaluate` library.
accuracy = evaluate.load("accuracy")

In [169]:
def compute_metrics(eval_pred):
    """Compute the accuracy for an evaluation pass.

    ``eval_pred`` unpacks into (predictions, labels). The predicted class of
    each row is the argmax of the predictions along axis 1, and the result of
    the module-level ``accuracy`` metric is returned.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(
        predictions=predicted_classes,
        references=references,
    )

Cette section de code configure un entraîneur pour notre modèle GPT-2 en spécifiant des paramètres d'entraînement, comme le taux d'apprentissage et la taille des lots, et lie le modèle, les données d'entraînement et de validation, ainsi que la méthode de calcul des métriques pour l'entraînement et l'évaluation.

In [170]:
# Hyperparameters and bookkeeping options for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="my_awesome_GPT2",  # directory receiving the training outputs
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",  # evaluate once per epoch
    save_strategy="epoch",  # save once per epoch, matching the evaluation schedule
    # NOTE(review): no metric_for_best_model is given, so the "best" model is
    # presumably chosen with the library default criterion — confirm that this
    # is the intended one (e.g. loss vs. accuracy).
    load_best_model_at_end=True,
    use_cpu = True  # force training on CPU
)

# Bind the model, its training/validation data, the tokenizer, the padding
# collator and the metric function into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],  # split used for the per-epoch evaluations
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [171]:
# Launch fine-tuning with the arguments and datasets given to the Trainer.
trainer.train()

  0%|          | 0/1550 [06:49<?, ?it/s]
 10%|█         | 155/1550 [45:28<5:17:24, 13.65s/it] 
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                    
 10%|█         | 155/1550 [47:31<5:17:24, 13.65s/it]
[A

{'eval_loss': 1.247109055519104, 'eval_accuracy': 0.5743879472693032, 'eval_runtime': 122.6716, 'eval_samples_per_second': 4.329, 'eval_steps_per_second': 0.277, 'epoch': 1.0}


 20%|██        | 310/1550 [1:31:48<4:00:24, 11.63s/it] 
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                      
 20%|██        | 310/1550 [1:33:52<4:00:24, 11.63s/it]
[A

{'eval_loss': 0.8651298880577087, 'eval_accuracy': 0.7175141242937854, 'eval_runtime': 123.8492, 'eval_samples_per_second': 4.287, 'eval_steps_per_second': 0.275, 'epoch': 2.0}


 30%|███       | 465/1550 [2:17:36<3:25:50, 11.38s/it] 
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                      
 30%|███       | 465/1550 [2:19:40<3:25:50, 11.38s/it]
[A

{'eval_loss': 0.775743842124939, 'eval_accuracy': 0.7457627118644068, 'eval_runtime': 124.0313, 'eval_samples_per_second': 4.281, 'eval_steps_per_second': 0.274, 'epoch': 3.0}


 32%|███▏      | 500/1550 [2:30:29<7:10:26, 24.60s/it] 
 32%|███▏      | 500/1550 [2:30:29<7:10:26, 24.60s/it]

{'loss': 1.1135, 'learning_rate': 1.3548387096774194e-05, 'epoch': 3.23}


 40%|████      | 620/1550 [3:04:12<3:18:30, 12.81s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                      
 40%|████      | 620/1550 [3:06:16<3:18:30, 12.81s/it]
[A

{'eval_loss': 0.8132333159446716, 'eval_accuracy': 0.7457627118644068, 'eval_runtime': 123.8773, 'eval_samples_per_second': 4.286, 'eval_steps_per_second': 0.274, 'epoch': 4.0}


 50%|█████     | 775/1550 [3:51:32<3:01:20, 14.04s/it] 
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                      
 50%|█████     | 775/1550 [3:53:37<3:01:20, 14.04s/it]
[A

{'eval_loss': 0.8616774678230286, 'eval_accuracy': 0.7382297551789078, 'eval_runtime': 125.1279, 'eval_samples_per_second': 4.244, 'eval_steps_per_second': 0.272, 'epoch': 5.0}


 60%|██████    | 930/1550 [4:38:08<2:27:35, 14.28s/it] 
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                      
 60%|██████    | 930/1550 [4:40:12<2:27:35, 14.28s/it]
[A

{'eval_loss': 0.782459557056427, 'eval_accuracy': 0.7645951035781544, 'eval_runtime': 123.841, 'eval_samples_per_second': 4.288, 'eval_steps_per_second': 0.275, 'epoch': 6.0}


 65%|██████▍   | 1000/1550 [5:00:38<2:35:11, 16.93s/it]
 65%|██████▍   | 1000/1550 [5:00:38<2:35:11, 16.93s/it]

{'loss': 0.4587, 'learning_rate': 7.096774193548388e-06, 'epoch': 6.45}


 70%|███████   | 1085/1550 [5:24:09<1:36:28, 12.45s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                       
 70%|███████   | 1085/1550 [5:26:13<1:36:28, 12.45s/it]
[A

{'eval_loss': 0.8469173908233643, 'eval_accuracy': 0.7702448210922788, 'eval_runtime': 124.4524, 'eval_samples_per_second': 4.267, 'eval_steps_per_second': 0.273, 'epoch': 7.0}


 80%|████████  | 1240/1550 [6:10:16<1:08:28, 13.25s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                       
 80%|████████  | 1240/1550 [6:12:20<1:08:28, 13.25s/it]
[A

{'eval_loss': 0.852374792098999, 'eval_accuracy': 0.7702448210922788, 'eval_runtime': 123.7683, 'eval_samples_per_second': 4.29, 'eval_steps_per_second': 0.275, 'epoch': 8.0}


 90%|█████████ | 1395/1550 [6:57:15<37:14, 14.42s/it]  
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                     
 90%|█████████ | 1395/1550 [6:59:19<37:14, 14.42s/it]
[A

{'eval_loss': 0.8672060966491699, 'eval_accuracy': 0.7815442561205274, 'eval_runtime': 123.5604, 'eval_samples_per_second': 4.297, 'eval_steps_per_second': 0.275, 'epoch': 9.0}


 97%|█████████▋| 1500/1550 [7:30:28<16:34, 19.90s/it]  
 97%|█████████▋| 1500/1550 [7:30:28<16:34, 19.90s/it]

{'loss': 0.2779, 'learning_rate': 6.451612903225807e-07, 'epoch': 9.68}


100%|██████████| 1550/1550 [7:44:16<00:00, 11.63s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                     
100%|██████████| 1550/1550 [8:06:27<00:00, 11.63s/it]
[A

{'eval_loss': 0.8542028665542603, 'eval_accuracy': 0.7796610169491526, 'eval_runtime': 1331.5856, 'eval_samples_per_second': 0.399, 'eval_steps_per_second': 0.026, 'epoch': 10.0}



100%|██████████| 1550/1550 [8:06:29<00:00, 18.83s/it]

{'train_runtime': 29189.9522, 'train_samples_per_second': 0.848, 'train_steps_per_second': 0.053, 'train_loss': 0.6033945212825652, 'epoch': 10.0}





TrainOutput(global_step=1550, training_loss=0.6033945212825652, metrics={'train_runtime': 29189.9522, 'train_samples_per_second': 0.848, 'train_steps_per_second': 0.053, 'train_loss': 0.6033945212825652, 'epoch': 10.0})

In [174]:
# Directory receiving the fine-tuned artifacts.
pt_save_directory = "./pt_save_pretrainedGPT"
# Save the tokenizer as well: it carries the extra '[PAD]' token that the
# resized embeddings of the model depend on, so the saved model alone could not
# be reloaded consistently. The returned file paths are bound to a throwaway
# name so the cell output stays empty.
_saved_tokenizer_files = tokenizer.save_pretrained(pt_save_directory)
model.save_pretrained(pt_save_directory)

## 4. Évaluation et analyse de résultats

In [175]:
# Evaluate the model on the 'test' split with the Trainer's metric function.
# NOTE(review): the loss and accuracy reported below are identical to the
# epoch-3 validation figures of the training log, which is consistent with the
# validation and test paths pointing at the same file — confirm before
# presenting this number as a held-out test score.
trainer.evaluate(tokenized_dataset['test'])

100%|██████████| 34/34 [01:52<00:00,  3.30s/it]


{'eval_loss': 0.775743842124939,
 'eval_accuracy': 0.7457627118644068,
 'eval_runtime': 115.4975,
 'eval_samples_per_second': 4.598,
 'eval_steps_per_second': 0.294,
 'epoch': 10.0}

## Comparaison globale

|  | Accuracy |
|-----------|-----------|
| Logistic Reg.   |  0.72         |
| Naive Bayes  |  0.71         |
| MLP(AVG pool)   |   0.77        |
| RNN uni   |  0.67         |
| RNN bi   |  0.64         |
| Bert           | 0.76 |
| GPT2   |  0.74         |


Les résultats indiquent une variété de performances parmi les modèles de traitement automatique du langage. Les modèles plus simples comme la régression logistique et le Naive Bayes se débrouillent étonnamment bien, tandis que le MLP avec pooling moyen se distingue par la meilleure exactitude (*accuracy*). Les RNN, bien que conçus pour les données séquentielles, ne montrent pas des résultats aussi forts. De façon surprenante, les modèles de langage avancés comme BERT et GPT-2 ne surpassent pas significativement les modèles plus simples, ce qui suggère que, pour cette tâche spécifique, la complexité accrue ne se traduit pas nécessairement par une meilleure performance.