# Classification automatique de descriptions d'incidents

Cette tâche vise à classifier de courts textes décrivant des incidents qui se sont produits sur des chantiers de construction. Pour chaque incident, on retrouve une étiquette qui correspond au type d’incident (0 à 8).

In [None]:
import json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
import spacy
import en_core_web_sm
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score, recall_score, accuracy_score 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Section 1 - Lecture des fichiers de données

In [155]:
# Paths of the training and test splits used throughout the notebook.
train_json_fn = "./data/t3_train.json"
test_json_fn = "./data/t3_test.json"


def load_incident_dataset(filename):
    """Load the incident records stored in a JSON file.

    The file is decoded explicitly as UTF-8 (the standard JSON encoding) so
    that the result does not depend on the platform's locale encoding.

    :param filename: path of the JSON file to read.
    :return: the object decoded by ``json.load``. For the notebook's data
             files this is a list of dicts with ``"text"`` and ``"label"`` keys.
    """
    with open(filename, 'r', encoding='utf-8') as fp:
        return json.load(fp)

In [156]:
# Load the training split; train_list is reused by later cells.
train_list = load_incident_dataset(train_json_fn)
# Report the number of records and show one sample (a dict with text + label).
print("Nombre d'incidents:", len(train_list))
print("\nUn exemple:\n", train_list[7])

Nombre d'incidents: 2475

Un exemple:
 {'text': ' At approximately 2:00 p.m.  on October 28  2008  Employee #1 was digging a  trench for electrical lines. The temperature was 100.7 degrees Fahrenheit and  the relative humidity was 71. Employee #1 felt cramps in his whole body and  then he collapsed. He was hospitalized for two days due to heat exhaustion.     ', 'label': '4'}


In [157]:
# Load the test split and report its size.
test_list = load_incident_dataset(test_json_fn)
print("Nombre d'incidents", len(test_list))
# Show the description and the label of one test record separately.
incident = test_list[10]
print("\nUne description d'incident:", incident["text"])
print("\nSon étiquette:", incident["label"])

Nombre d'incidents 1062

Une description d'incident:  At approximately 2:00 p.m. on March 3  2005  Employee #1  an iron worker  was  performing connecting work at a height of approximately 14 ft above grade.  Employee #1 lost his balance and fell to the ground  landing on his feet. In  order to absorb the impact on his legs from the fall  Employee #1 bent down   striking his head on the spud wrench in the tool belt. He sustained 3 stitches  to his forehead  but he was not hospitalized.                                   

Son étiquette: 5


## Section 2 - Entraînement et évaluation des modèles  

In [158]:
# Lazily-loaded spaCy pipeline shared by every lemmatize_text() call.
_SPACY_PIPELINE = None


def _get_spacy_pipeline():
    """Return the shared ``en_core_web_sm`` pipeline, loading it on first use."""
    global _SPACY_PIPELINE
    if _SPACY_PIPELINE is None:
        _SPACY_PIPELINE = spacy.load("en_core_web_sm")
    return _SPACY_PIPELINE


def lemmatize_text(text):
    """Replace every token of ``text`` by its spaCy lemma.

    The pipeline is loaded once and then reused: this function is called once
    per document, and reloading the model on each call would dominate the
    running time.

    :param text: the raw text to normalize.
    :return: the lemmas of all tokens of ``text``, joined by single spaces.
    """
    doc = _get_spacy_pipeline()(text)
    return " ".join(token.lemma_ for token in doc)

def train_and_test_classifier(train_fn, test_fn, model='NB', normalization='words'):
    """
    Train a bag-of-words incident classifier and evaluate it.

    :param train_fn: JSON file holding the training incidents.
    :param test_fn: JSON file holding the test incidents.
    :param model: the classifier type. 'NB' = Naive Bayes, 'LR' = logistic regression.
    :param normalization: the normalization applied to the description words.
                 - 'lemma': every word is replaced by its lemma (spaCy).
                 - any other value (e.g. 'word', or the default 'words'):
                   the texts are used without normalization.
    :return: a dictionary with the following entries:
                 - 'accuracy_train': mean accuracy of a 5-fold cross-validation
                   on the training data
                 - 'accuracy_test': accuracy on the test set
                 - 'confusion_matrix': scikit-learn confusion matrix computed
                   on the test data
                 - 'classifier', 'vectorizer': the fitted model and vectorizer
                 - 'X_test', 'y_test': vectorized test texts and their labels
    :raises ValueError: if ``model`` is neither 'NB' nor 'LR'.
    """
    # Pick the estimator first so that an unknown model name fails immediately,
    # before the (slow) loading, lemmatization and vectorization steps.
    if model == 'NB':
        classifier = MultinomialNB()
    elif model == 'LR':
        classifier = LogisticRegression()
    else:
        raise ValueError("Modèle introuvable")

    # Load both splits and separate the texts from their labels.
    training_corpus = load_incident_dataset(train_fn)
    test_corpus = load_incident_dataset(test_fn)

    train_texts = [instance["text"] for instance in training_corpus]
    train_labels = [instance["label"] for instance in training_corpus]
    test_texts = [instance["text"] for instance in test_corpus]
    test_labels = [instance["label"] for instance in test_corpus]

    # Optional normalization: both splits must go through the same treatment.
    if normalization == "lemma":
        train_texts = [lemmatize_text(text) for text in train_texts]
        test_texts = [lemmatize_text(text) for text in test_texts]

    # Bag-of-words representation: each document becomes a vector of word
    # counts; the vocabulary is learned on the training texts only.
    vectorizer = CountVectorizer(lowercase=True)
    X_train = vectorizer.fit_transform(train_texts)
    classifier.fit(X_train, train_labels)

    # The test texts are projected onto the training vocabulary for inference.
    X_test = vectorizer.transform(test_texts)
    y_predicted = classifier.predict(X_test)

    # cross_val_score refits clones of the classifier on each fold; the features
    # come from the vectorizer fitted above on the whole training set.
    results = dict()
    results['accuracy_train'] = cross_val_score(classifier, X_train, train_labels, cv=5, scoring='accuracy').mean()
    results['accuracy_test'] = accuracy_score(test_labels, y_predicted)
    results['confusion_matrix'] = confusion_matrix(test_labels, y_predicted)
    results['classifier'] = classifier
    results['vectorizer'] = vectorizer
    results['X_test'] = X_test
    results['y_test'] = test_labels
    return results

In [None]:
# Train and evaluate every (classifier, normalization) combination once;
# the outcome of each run is stored under its (model, norm) pair.
models = ['NB', 'LR']
norms = ["word", "lemma"]
results = {}
for model, norm in [(m, n) for m in models for n in norms]:
    results[(model, norm)] = train_and_test_classifier(
        train_json_fn,
        test_json_fn,
        model=model,
        normalization=norm,
    )

In [160]:
def hold_out_evaluation_test(results):
    """Print accuracy and macro/micro recall and precision on the test data.

    :param results: dictionary holding at least the entries 'classifier'
                    (a fitted model), 'X_test' (vectorized test texts) and
                    'y_test' (the matching true labels).
    """
    y_pred = results["classifier"].predict(results["X_test"])
    y_true = results["y_test"]
    print("\nÉvaluation sur les données de tests")
    print("   Accuracy = ", accuracy_score(y_true, y_pred))
    # zero_division=0 keeps the value scikit-learn already assigns to a class
    # with an undefined score (0) but no longer emits UndefinedMetricWarning
    # when a rare class is never predicted.
    print("   Macro rappel (recall) = ", recall_score(y_true, y_pred, average='macro', zero_division=0))
    print("   Macro précision = ", precision_score(y_true, y_pred, average='macro', zero_division=0))
    print("   Micro rappel (recall) = ", recall_score(y_true, y_pred, average='micro', zero_division=0))
    print("   Micro précision = ", precision_score(y_true, y_pred, average='micro', zero_division=0))

def hold_out_evaluation_train(results):
    """Print the training accuracy stored in a results dictionary.

    :param results: dictionary whose 'accuracy_train' entry is displayed.
    """
    print("\nÉvaluation sur les données de train")
    train_accuracy = results["accuracy_train"]
    print("   Accuracy en train = ", train_accuracy)

In [161]:
# Report train and test metrics of the Naive Bayes runs, one per normalization.
for norm in norms:
    print("--- Evaluation pour le modèle NB", "&", norm, "---")
    nb_run = results[('NB', norm)]
    hold_out_evaluation_train(nb_run)
    hold_out_evaluation_test(nb_run)

--- Evaluation pour le modèle NB & word ---

Évaluation sur les données de train
   Accuracy en train =  0.6921212121212121

Évaluation sur les données de tests
   Accuracy =  0.7090395480225988
   Macro rappel (recall) =  0.4432504883897007
   Macro précision =  0.5781855163547387
   Micro rappel (recall) =  0.7090395480225988
   Micro précision =  0.7090395480225988
--- Evaluation pour le modèle NB & lemma ---

Évaluation sur les données de train
   Accuracy en train =  0.6953535353535354

Évaluation sur les données de tests
   Accuracy =  0.7062146892655368
   Macro rappel (recall) =  0.44960745553660875
   Macro précision =  0.569586864447891
   Micro rappel (recall) =  0.7062146892655368
   Micro précision =  0.7062146892655368


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [162]:
# Report train and test metrics of the logistic-regression runs, one per normalization.
for norm in norms:
    print("--- Evaluation pour le modèle LR", "&", norm, "---")
    lr_run = results[('LR', norm)]
    hold_out_evaluation_train(lr_run)
    hold_out_evaluation_test(lr_run)

--- Evaluation pour le modèle LR & word ---

Évaluation sur les données de train
   Accuracy en train =  0.7187878787878788

Évaluation sur les données de tests
   Accuracy =  0.7146892655367232
   Macro rappel (recall) =  0.5275430827666345
   Macro précision =  0.608105394835984
   Micro rappel (recall) =  0.7146892655367232
   Micro précision =  0.7146892655367232
--- Evaluation pour le modèle LR & lemma ---

Évaluation sur les données de train
   Accuracy en train =  0.7260606060606061

Évaluation sur les données de tests
   Accuracy =  0.7250470809792844
   Macro rappel (recall) =  0.560257287063603
   Macro précision =  0.6904827991665691
   Micro rappel (recall) =  0.7250470809792844
   Micro précision =  0.7250470809792844


Étant donné que le macro-rappel est assez faible pour l'ensemble de nos modèles, on peut conclure que la sensibilité de nos modèles, c'est-à-dire leur capacité à détecter les vrais positifs, est relativement faible en raison des classes moins fréquentes que d'autres. Il en va de même pour la macro-précision.


Accuracy en entraînement :

|           | Naive Bayes | LogisticRegression |
|-----------|-----------|-----------|
| Word   | 0.69   | 0.72   |
| Lemma   | 0.70   | 0.73   |

Accuracy en test :

|           | Naive Bayes | LogisticRegression |
|-----------|-----------|-----------|
| Word   | 0.71   | 0.71   |
| Lemma   | 0.71   | 0.73   |

Il n'y a pas vraiment de différence significative entre les performances des deux modèles. Cependant, si je devais en recommander un, je recommanderais la régression logistique (Logistic Regression) sans normalisation, car c'est elle qui offre le meilleur rapport temps d'exécution/accuracy sur le jeu de test, même si l'accuracy est légèrement meilleure lorsque les données sont lemmatisées.

## Section 3 - À quoi correspondent les classes? Explicabilité du modèle

#### Naive Bayes

In [163]:
# Log prior of every class, as learned by the Naive Bayes model trained on raw words.
nb_word_classifier = results[("NB","word")]['classifier']
class_probs = list(zip(nb_word_classifier.classes_, nb_word_classifier.class_log_prior_))
for label, log_prior in class_probs:
    rounded = round(log_prior, 2)
    print("logprob({}) = {}".format(label, rounded))

logprob(0) = -1.69
logprob(1) = -5.11
logprob(2) = -2.19
logprob(3) = -3.64
logprob(4) = -4.77
logprob(5) = -0.95
logprob(6) = -3.12
logprob(7) = -2.08
logprob(8) = -2.23


La classe avec le logarithme de probabilité a priori le plus élevé est celle qui a la plus forte probabilité a priori d'être la classe correcte pour une observation donnée. On remarque ici que les classes 5 et 0 ont les probabilités a priori les plus élevées (log-probabilités de -0,95 et -1,69), tandis que les classes 1 et 4 ont les plus faibles (-5,11 et -4,77). Les probabilités a priori varient donc nettement d'une classe à l'autre.

In [164]:
def display_confusion_matrix(confusion_matrix, classes):
    """Draw a confusion matrix as an annotated seaborn heatmap.

    :param confusion_matrix: matrix of integer counts; rows are labelled as
                             true labels and columns as predicted labels.
    :param classes: class names used for both the row and column headers.
    """
    print("\n\nVersion graphique de la matrice de confusion")
    labelled_matrix = pd.DataFrame(confusion_matrix, index=classes, columns=classes)
    _, axes = plt.subplots(figsize=(7, 5))
    sns.heatmap(labelled_matrix, annot=True, fmt="d", linewidths=.5, ax=axes)
    plt.xlabel('Étiquette prédite')
    plt.ylabel('Vrai étiquette ')

In [165]:
# Logistic-regression weights (one column per class) of a few hand-picked
# incident-related words, for the model trained on raw words.
vectorizer = results[('LR','word')]['vectorizer']
classifier = results[('LR','word')]['classifier']
# get_feature_names() was removed in scikit-learn 1.2: use its replacement when
# it exists and fall back to the legacy accessor on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
df = pd.DataFrame(list(feature_names), columns =['Mots'])
# coef_ holds one row of per-word weights for each class, in classes_ order.
for label, weights in zip(classifier.classes_, classifier.coef_):
    df[label] = list(weights)
# Only the entries that are actual vocabulary words produce a row below.
clue_words = [
    "accident", "collision", "crash", "injury", "damage", "emergency", "fire",
    "explosion", "disaster", "hazard", "danger", "safety", "rescue", "evacuation",
    "injured", "casualty", "chaos", "trouble", "threat", "chaotic", "alarm",
    "incident", "mishap", "wreck", "catastrophe", "breakdown", "spill", "breach",
    "contamination", "explosion", "harm", "outage", "collapse", "inoperable",
    "leak", "crisis", "emergency", "panic", "turbulence", "violence", "insecure",
    "security breach", "accident report", "medical emergency", "natural disaster",
    "cyberattack", "hazardous material", "terrorist threat", 'surgery'
]
w_weights = df[df['Mots'].isin(clue_words)]
display(w_weights)



Unnamed: 0,Mots,0,1,2,3,4,5,6,7,8
662,accident,-0.243076,-0.045799,0.146103,-0.057845,0.285678,-0.061608,0.002875,0.190313,-0.216641
809,alarm,0.265912,-0.004763,-0.02806,0.036059,-0.004593,-0.083136,-0.016535,-0.134286,-0.030598
1519,breakdown,-0.069128,-0.000132,-0.002002,-0.000756,-0.000145,0.079369,-0.000461,-0.001739,-0.005007
1803,catastrophe,-0.000491,-0.00014,-0.000406,-0.000182,-0.000137,0.002446,-0.000227,-0.000468,-0.000396
2118,collapse,0.454267,-0.020484,-0.064749,-0.007056,-0.023632,-0.101618,-0.079341,-0.205921,0.048533
2133,collision,0.029757,-0.000482,-0.00372,-0.001536,-0.001126,-0.021437,-0.002595,-0.019401,0.020541
2317,contamination,-0.014618,-6.9e-05,-0.017573,-0.001217,-0.000242,-0.003775,-0.0061,0.06009,-0.016496
2466,crash,0.011689,-0.000123,-0.001788,-0.000548,-0.000133,0.002616,-0.000744,-0.007419,-0.003549
2602,damage,0.010959,-0.00617,0.070298,0.008085,-0.013509,-0.095599,0.009109,0.025036,-0.008208
2610,danger,0.009382,-0.000644,-0.005935,-0.003749,-0.008148,-0.010269,0.034468,-0.054196,0.039091


In [166]:
# Naive Bayes log-likelihoods log P(word | class) (one column per class) of a
# few hand-picked incident-related words, for the model trained on raw words.
vectorizer = results[('NB','word')]['vectorizer']
classifier = results[('NB','word')]['classifier']
# get_feature_names() was removed in scikit-learn 1.2: use its replacement when
# it exists and fall back to the legacy accessor on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
df = pd.DataFrame(list(feature_names), columns =['Mots'])
# MultinomialNB.coef_ was removed in scikit-learn 1.1; with more than two
# classes it simply mirrored feature_log_prob_, which exists in every release.
for label, log_probs in zip(classifier.classes_, classifier.feature_log_prob_):
    df[label] = list(log_probs)
# Only the entries that are actual vocabulary words produce a row below.
clue_words = [
    "accident", "collision", "crash", "injury", "damage", "emergency", "fire",
    "explosion", "disaster", "hazard", "danger", "safety", "rescue", "evacuation",
    "injured", "casualty", "chaos", "trouble", "threat", "chaotic", "alarm",
    "incident", "mishap", "wreck", "catastrophe", "breakdown", "spill", "breach",
    "contamination", "explosion", "harm", "outage", "collapse", "inoperable",
    "leak", "crisis", "emergency", "panic", "turbulence", "violence", "insecure",
    "security breach", "accident report", "medical emergency", "natural disaster",
    "cyberattack", "hazardous material", "terrorist threat", 'surgery'
]
w_weights = df[df['Mots'].isin(clue_words)]
display(w_weights)



Unnamed: 0,Mots,0,1,2,3,4,5,6,7,8
662,accident,-6.990359,-8.613412,-6.699736,-7.558343,-8.28341,-6.422001,-7.114092,-6.555148,-6.747898
809,alarm,-8.14613,-9.306559,-10.549884,-9.06242,-9.382022,-10.565135,-10.004463,-10.081508,-9.946571
1519,breakdown,-10.979343,-9.306559,-10.549884,-9.755567,-9.382022,-10.970601,-10.004463,-10.774655,-10.639718
1803,catastrophe,-10.979343,-9.306559,-10.549884,-9.755567,-9.382022,-10.970601,-10.004463,-10.774655,-10.639718
2118,collapse,-7.683506,-9.306559,-10.549884,-9.06242,-9.382022,-8.955698,-10.004463,-10.774655,-8.693808
2133,collision,-9.369905,-9.306559,-10.549884,-9.755567,-9.382022,-11.663748,-10.004463,-10.774655,-9.946571
2317,contamination,-10.979343,-9.306559,-10.549884,-9.755567,-9.382022,-11.663748,-10.004463,-9.676043,-10.639718
2466,crash,-9.880731,-9.306559,-10.549884,-9.755567,-9.382022,-10.970601,-10.004463,-10.774655,-10.639718
2602,damage,-8.899901,-9.306559,-8.603974,-9.06242,-9.382022,-9.717838,-8.618169,-8.695214,-8.693808
2610,danger,-9.880731,-9.306559,-10.549884,-9.755567,-9.382022,-10.970601,-9.311316,-10.774655,-9.541106


In [167]:
# Naive Bayes log-likelihoods log P(word | class) (one column per class) of a
# few hand-picked incident-related words, for the model trained on lemmas.
vectorizer = results[('NB','lemma')]['vectorizer']
classifier = results[('NB','lemma')]['classifier']
# get_feature_names() was removed in scikit-learn 1.2: use its replacement when
# it exists and fall back to the legacy accessor on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
df = pd.DataFrame(list(feature_names), columns =['Mots'])
# MultinomialNB.coef_ was removed in scikit-learn 1.1; with more than two
# classes it simply mirrored feature_log_prob_, which exists in every release.
for label, log_probs in zip(classifier.classes_, classifier.feature_log_prob_):
    df[label] = list(log_probs)
# Only the entries that are actual vocabulary words produce a row below.
clue_words = [
    "accident", "collision", "crash", "injury", "damage", "emergency", "fire",
    "explosion", "disaster", "hazard", "danger", "safety", "rescue", "evacuation",
    "injured", "casualty", "chaos", "trouble", "threat", "chaotic", "alarm",
    "incident", "mishap", "wreck", "catastrophe", "breakdown", "spill", "breach",
    "contamination", "explosion", "harm", "outage", "collapse", "inoperable",
    "leak", "crisis", "emergency", "panic", "turbulence", "violence", "insecure",
    "security breach", "accident report", "medical emergency", "natural disaster",
    "cyberattack", "hazardous material", "terrorist threat", 'surgery'
]
w_weights = df[df['Mots'].isin(clue_words)]
display(w_weights)



Unnamed: 0,Mots,0,1,2,3,4,5,6,7,8
641,accident,-6.957656,-8.425078,-6.649012,-7.442297,-8.110127,-6.405683,-7.024847,-6.514909,-6.701685
761,alarm,-8.056268,-9.118225,-10.49916,-8.946375,-9.208739,-10.548818,-9.915218,-9.348122,-9.900358
1333,breakdown,-10.94664,-9.118225,-10.49916,-9.639522,-9.208739,-10.954283,-9.915218,-10.734416,-10.593505
1561,catastrophe,-10.94664,-9.118225,-10.49916,-9.639522,-9.208739,-10.954283,-9.915218,-10.734416,-10.593505
1808,collapse,-6.175955,-9.118225,-8.889722,-7.336937,-7.41698,-7.427922,-9.915218,-8.336521,-7.548982
1818,collision,-9.337202,-9.118225,-10.49916,-9.639522,-9.208739,-11.64743,-9.915218,-10.734416,-9.900358
1953,contamination,-10.94664,-9.118225,-10.49916,-9.639522,-9.208739,-11.64743,-9.915218,-9.635804,-10.593505
2066,crash,-9.560346,-9.118225,-10.49916,-9.639522,-9.208739,-9.70152,-9.915218,-10.734416,-10.593505
2179,damage,-8.548745,-8.425078,-7.860102,-8.946375,-9.208739,-8.511936,-8.528924,-8.095359,-7.954448
2185,danger,-9.848028,-9.118225,-10.49916,-9.639522,-9.208739,-10.954283,-9.222071,-10.734416,-9.494893


In [168]:
# Logistic-regression weights (one column per class) of a few hand-picked
# incident-related words, for the model trained on raw words.
vectorizer = results[('LR','word')]['vectorizer']
classifier = results[('LR','word')]['classifier']
# get_feature_names() was removed in scikit-learn 1.2: use its replacement when
# it exists and fall back to the legacy accessor on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
df = pd.DataFrame(list(feature_names), columns =['Mots'])
# coef_ holds one row of per-word weights for each class, in classes_ order.
for label, weights in zip(classifier.classes_, classifier.coef_):
    df[label] = list(weights)
# Only the entries that are actual vocabulary words produce a row below.
clue_words = [
    "accident", "collision", "crash", "injury", "damage", "emergency", "fire",
    "explosion", "disaster", "hazard", "danger", "safety", "rescue", "evacuation",
    "injured", "casualty", "chaos", "trouble", "threat", "chaotic", "alarm",
    "incident", "mishap", "wreck", "catastrophe", "breakdown", "spill", "breach",
    "contamination", "explosion", "harm", "outage", "collapse", "inoperable",
    "leak", "crisis", "emergency", "panic", "turbulence", "violence", "insecure",
    "security breach", "accident report", "medical emergency", "natural disaster",
    "cyberattack", "hazardous material", "terrorist threat", 'surgery'
]
w_weights = df[df['Mots'].isin(clue_words)]
display(w_weights)



Unnamed: 0,Mots,0,1,2,3,4,5,6,7,8
662,accident,-0.243076,-0.045799,0.146103,-0.057845,0.285678,-0.061608,0.002875,0.190313,-0.216641
809,alarm,0.265912,-0.004763,-0.02806,0.036059,-0.004593,-0.083136,-0.016535,-0.134286,-0.030598
1519,breakdown,-0.069128,-0.000132,-0.002002,-0.000756,-0.000145,0.079369,-0.000461,-0.001739,-0.005007
1803,catastrophe,-0.000491,-0.00014,-0.000406,-0.000182,-0.000137,0.002446,-0.000227,-0.000468,-0.000396
2118,collapse,0.454267,-0.020484,-0.064749,-0.007056,-0.023632,-0.101618,-0.079341,-0.205921,0.048533
2133,collision,0.029757,-0.000482,-0.00372,-0.001536,-0.001126,-0.021437,-0.002595,-0.019401,0.020541
2317,contamination,-0.014618,-6.9e-05,-0.017573,-0.001217,-0.000242,-0.003775,-0.0061,0.06009,-0.016496
2466,crash,0.011689,-0.000123,-0.001788,-0.000548,-0.000133,0.002616,-0.000744,-0.007419,-0.003549
2602,damage,0.010959,-0.00617,0.070298,0.008085,-0.013509,-0.095599,0.009109,0.025036,-0.008208
2610,danger,0.009382,-0.000644,-0.005935,-0.003749,-0.008148,-0.010269,0.034468,-0.054196,0.039091


In [169]:
# Logistic-regression weights (one column per class) of a few hand-picked
# incident-related words, for the model trained on lemmas.
vectorizer = results[('LR','lemma')]['vectorizer']
classifier = results[('LR','lemma')]['classifier']
# get_feature_names() was removed in scikit-learn 1.2: use its replacement when
# it exists and fall back to the legacy accessor on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
df = pd.DataFrame(list(feature_names), columns =['Mots'])
# coef_ holds one row of per-word weights for each class, in classes_ order.
for label, weights in zip(classifier.classes_, classifier.coef_):
    df[label] = list(weights)
# Only the entries that are actual vocabulary words produce a row below.
clue_words = [
    "accident", "collision", "crash", "injury", "damage", "emergency", "fire",
    "explosion", "disaster", "hazard", "danger", "safety", "rescue", "evacuation",
    "injured", "casualty", "chaos", "trouble", "threat", "chaotic", "alarm",
    "incident", "mishap", "wreck", "catastrophe", "breakdown", "spill", "breach",
    "contamination", "explosion", "harm", "outage", "collapse", "inoperable",
    "leak", "crisis", "emergency", "panic", "turbulence", "violence", "insecure",
    "security breach", "accident report", "medical emergency", "natural disaster",
    "cyberattack", "hazardous material", "terrorist threat", 'surgery'
]
w_weights = df[df['Mots'].isin(clue_words)]
display(w_weights)



Unnamed: 0,Mots,0,1,2,3,4,5,6,7,8
641,accident,-0.143171,-0.045667,0.173511,-0.067025,0.291211,-0.240477,0.01091,0.122232,-0.101524
761,alarm,0.272194,-0.004994,-0.031938,-0.008022,-0.005942,-0.084048,-0.027437,-0.07088,-0.038931
1333,breakdown,-0.09555,-0.000111,-0.001535,-0.000697,-0.000155,0.104185,-0.000487,-0.001394,-0.004256
1561,catastrophe,-0.000577,-0.000157,-0.000445,-0.000204,-0.000146,0.002814,-0.000264,-0.000551,-0.000471
1808,collapse,1.11575,-0.093461,-0.331603,0.074373,0.224402,-0.379882,-0.3415,-0.345022,0.076941
1818,collision,0.024538,-0.000441,-0.004172,-0.001553,-0.000719,-0.030377,-0.002389,-0.015359,0.030472
1953,contamination,-0.007119,-5.7e-05,-0.025815,-0.001408,-0.000342,-0.004067,-0.006955,0.051281,-0.005519
2066,crash,-0.078315,-0.010196,-0.023738,-0.012892,-0.00459,0.263675,-0.010982,-0.044261,-0.078701
2179,damage,-0.099961,0.15008,0.166871,-0.021678,-0.021509,-0.319627,-0.014966,0.117762,0.043028
2185,danger,0.01815,-0.001417,-0.005864,-0.004795,-0.012164,-0.014815,0.038955,-0.059902,0.041852


Remarque : 

Nous pouvons observer que la vraisemblance P(word|c) est nettement plus élevée pour certains mots spécifiques. Par exemple, pour la classe d'incident dont l'étiquette est la plus élevée (classe 8), nous constatons une forte vraisemblance avec les mots « surgery », « injury » et « inoperable ». En revanche, pour la classe 0, les mots « collapse », « alarm » et « injury » présentent une forte vraisemblance. Il est intéressant de noter que le mot « injury » présente une forte vraisemblance avec plusieurs classes, ce qui suggère qu'il peut y avoir plusieurs niveaux de blessures. De même, le mot « accident » reçoit son poids le plus élevé pour la classe 4 dans les modèles de régression logistique (environ 0,29), ce qui pourrait indiquer que les accidents en général sont plus étroitement associés au degré 4 d'incidence.