In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
df_train = pd.read_csv('/content/drive/MyDrive/ML/HAI817_Projet_train.csv')
df_test = pd.read_csv('/content/drive/MyDrive/ML/HAI817_Projet_test.csv')

In [None]:
# Importation des différentes librairies, classes et fonctions utilespour le notebook

#Sickit learn met régulièrement à jour des versions et 
#indique des futurs warnings. 
#ces deux lignes permettent de ne pas les afficher.
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


# librairies générales
import pandas as pd
import re
from tabulate import tabulate
import time
import numpy as np
import pickle
import string
import base64
import sys

# librairie affichage
import matplotlib.pyplot as plt
import seaborn as sns

# librairies scikit learn
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


# librairies des classifiers utilisés
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# librairies NLTK
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer 
from nltk.corpus import stopwords
from nltk import word_tokenize 

 
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english')) 

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
my_local_drive='/content/drive/MyDrive/ML/Prof/ML_FDS'

# Ajout du path pour les librairies, fonctions et données
sys.path.append(my_local_drive)
# Se positionner sur le répertoire associé
#%cd $my_local_drive

#%pwd

# fonctions utilities (affichage, confusion, etc.)
from MyNLPUtilities import *

In [None]:
df_all = pd.concat([df_train,df_test])


In [None]:
# compter les valeurs manquantes dans chaque colonne
num_missing_values = df_all.isna().sum()
print(num_missing_values)

public_id      612
text             0
title           23
our rating       0
ID            1264
dtype: int64


Encodage des classes

In [None]:
import pandas as pd



# Encoder les classes en deux catégories : "VRAI ou FAUX" et "AUTRE"
df_all['classe'] = df_all['our rating'].replace(['mixture', 'other'], 'AUTRE')
df_all['classe'] = df_all['classe'].replace(['true', 'false'], 'TrueFalse')

# Afficher le nombre de valeurs par classe
print(df_all['classe'].value_counts())
df_all['classe'] = df_all['classe'].map({'TrueFalse': 1, 'AUTRE': 2})

TrueFalse    1314
AUTRE         562
Name: classe, dtype: int64


In [None]:
print(df_all["classe"].value_counts())

1    1314
2     562
Name: classe, dtype: int64


Equilibrage des classes

In [None]:
from sklearn.utils import resample
import pandas as pd



# Séparer les classes majoritaires et minoritaires
df_majority = df_all[df_all['classe'] == 1]
df_minority = df_all[df_all['classe'] == 2]

# Sous-échantillonner la classe majoritaire
df_majority_downsampled = resample(df_majority, 
                                   replace=False,    # Échantillonnage sans remplacement
                                   n_samples=len(df_minority), # Nombre d'échantillons égal à la classe minoritaire
                                   random_state=42)  # Pour la reproductibilité

# Combiner les classes majoritaire et minoritaire
df_balanced = pd.concat([df_majority_downsampled, df_minority])

# Afficher la taille du jeu de données équilibré
print("Taille du jeu de données équilibré : ", df_balanced.shape)

df_all = df_balanced
print(df_all['classe'].value_counts())

Taille du jeu de données équilibré :  (1124, 6)
1    562
2    562
Name: classe, dtype: int64


Définition de la fonction  MyCleanText

In [None]:
import re
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk import word_tokenize
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))

def MyCleanText(X,
 lowercase=False, # mettre en minuscule
 removestopwords=False, # supprimer les stopwords
 removedigit=False, # supprimer les nombres
 getstemmer=False, # conserver la racine des termes
 getlemmatisation=False # lematisation des termes
 ):
    
    sentence = str(X)
    
    # suppression des caractères spéciaux
    sentence = re.sub(r'[^\w\s]',' ', sentence)
    
    # suppression de tous les caractères uniques
    sentence = re.sub(r'\s+[a-zA-Z]\s+', ' ', sentence)
    
    # substitution des espaces multiples par un seul espace
    sentence = re.sub(r'\s+', ' ', sentence, flags=re.I)
    
    # decoupage en mots
    tokens = word_tokenize(sentence)
    
    if lowercase:
        tokens = [token.lower() for token in tokens]

    # suppression ponctuation
    table = str.maketrans('', '', string.punctuation)
    words = [token.translate(table) for token in tokens]
    
    # suppression des tokens non alphabetique ou numerique
    words = [word for word in words if word.isalnum()]

    # suppression des tokens numerique
    if removedigit:
        words = [word for word in words if not word.isdigit()]
    
    # suppression des stopwords
    if removestopwords:
        words = [word for word in words if not word in stop_words]
    
    # lemmatisation
    if getlemmatisation:
        lemmatizer=WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word)for word in words]
    
    # racinisation
    if getstemmer:
        ps = PorterStemmer()
        words=[ps.stem(word) for word in words]

    sentence= ' '.join(words)

    return sentence


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Définition de la fonction TextNormalizer

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
class TextNormalizer(BaseEstimator, TransformerMixin):
    def __init__(self,
                 removestopwords=False, # suppression des stopwords
                 lowercase=False,# passage en minuscule
                 removedigit=False, # supprimer les nombres
                 getstemmer=False,# racinisation des termes
                 getlemmatisation=False # lemmatisation des termes
                ):
        self.lowercase=lowercase
        self.getstemmer=getstemmer
        self.removestopwords=removestopwords
        self.getlemmatisation=getlemmatisation
        self.removedigit=removedigit
    
    def transform(self, X, **transform_params):
        # Nettoyage du texte
        X=X.copy() # pour conserver le fichier d'origine
        return [MyCleanText(text, lowercase=self.lowercase,
                            getstemmer=self.getstemmer,
                            removestopwords=self.removestopwords,
                            getlemmatisation=self.getlemmatisation,
                            removedigit=self.removedigit) for text in X]
    
    def fit(self, X, y=None, **fit_params):
        return self
    
    def fit_transform(self, X, y=None, **fit_params):
        return self.fit(X).transform(X)
    
    def get_params(self, deep=True):
        return {
            'lowercase':self.lowercase,
            'getstemmer':self.getstemmer,
            'removestopwords':self.removestopwords,
            'getlemmatisation':self.getlemmatisation,
            'removedigit':self.removedigit
        }
    
    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self


## **Evaluation de différents classifieurs**  


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# creation du tableau des différents classifieur 


models = []
models.append(('MultinomialNB',MultinomialNB()))
models.append(('LR', LogisticRegression(solver='lbfgs')))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('RF', RandomForestClassifier()))
models.append(('SVM', SVC()))

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import time


removestopwords=False, # suppression des stopwords

lowercase=False,# passage en minuscule

removedigit=False, # supprimer les nombres

getstemmer=False,# racinisation des termes

getlemmatisation=False # lemmatisation des termes

Choix du meilleur classifieur avec les meilleurs parametres de prétraitement

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import cross_val_score, KFold, cross_val_predict
from sklearn.metrics import classification_report
import numpy as np
class Result:
    def __init__(self, name, score_accuracy, score_precision, score_rappel, score_mesure, temps, conf_mat):
        self.name = name
        self.score_accuracy = score_accuracy
        self.score_precision = score_precision
        self.score_rappel = score_rappel
        self.score_mesure = score_mesure
        self.temps = temps
        self.conf_mat = conf_mat 
def ma_fonction(removestopwords1, lowercase1, removedigit1, getstemmer1, getlemmatisation1):

    score_accuracy = 'accuracy'
    score_precision = 'precision'
    score_rappel = 'recall'
    score_mesure = 'f1'
    seed = 7        
    allresults = []
    results = []
    names = []

    X = df_all['text']
    y = df_all['classe']

    # Nous appliquons les pré-traitements sur X

    text_normalizer = TextNormalizer(removestopwords=removestopwords1, lowercase=lowercase1, removedigit=removedigit1, getstemmer=getstemmer1, getlemmatisation=getlemmatisation1)  
    # appliquer fit.transform pour réaliser les pré-traitements sur X
    X_cleaned = text_normalizer.fit_transform(X)

    # pour l'enchainer avec un tf-idf et obtenir une matrice
    tfidf = TfidfVectorizer()
    features = tfidf.fit_transform(X_cleaned).toarray()

    # attention ici il faut passer features dans cross_val_score plutôt que X

    for name, model in models:
        # cross validation en 10 fois
        kfold = KFold(n_splits=10, random_state=seed, shuffle=True)

        print("Evaluation de", name)
        start_time = time.time()
        # application de la classification
        cv_results = cross_val_score(model, features, y, cv=kfold, scoring=score_accuracy)
        cv_results_precision = cross_val_score(model, features, y, cv=kfold, scoring=score_precision)
        cv_results_rappel = cross_val_score(model, features, y, cv=kfold, scoring=score_rappel)
        cv_results_mesure = cross_val_score(model, features, y, cv=kfold, scoring=score_mesure)

        # Calculer la matrice de confusion
        y_pred = cross_val_predict(model, features, y, cv=kfold)
        conf_mat = confusion_matrix(y, y_pred)

        # Affichage du rapport de classification
        class_names = np.unique(y)
        print("Classification Report:")
        print(classification_report(y, y_pred, labels=class_names))

        # Affichage de la matrice de confusion
        print("Confusion Matrix:")
        print(conf_mat)

        thetime = time.time() - start_time






        result = Result(name, cv_results.mean(), cv_results_precision.mean(), cv_results_rappel.mean(), cv_results_mesure.mean(),  thetime, conf_mat)
        allresults.append(result)
        # pour affichage
        results.append(cv_results)
        names.append(name)
        print("%s : exactitude=%.3f, précision=%.3f, rappel=%.3f, mesure F1=%.3f in temps d'exécution=%.3f s" % (name, cv_results.mean(), cv_results_precision.mean(), cv_results_rappel.mean(), cv_results_mesure.mean(), thetime))


In [None]:
#removestopwords, lowercase, removedigit, getstemmer,getlemmatisation
ma_fonction(True, True, True, True, True)

Evaluation de MultinomialNB
Classification Report:
              precision    recall  f1-score   support

           1       0.72      0.64      0.68       562
           2       0.67      0.75      0.71       562

    accuracy                           0.69      1124
   macro avg       0.70      0.69      0.69      1124
weighted avg       0.70      0.69      0.69      1124

Confusion Matrix:
[[358 204]
 [140 422]]
MultinomialNB : exactitude=0.694, précision=0.722, rappel=0.636, mesure F1=0.673 in temps d'exécution=14.828 s
Evaluation de LR
Classification Report:
              precision    recall  f1-score   support

           1       0.69      0.68      0.68       562
           2       0.68      0.69      0.69       562

    accuracy                           0.68      1124
   macro avg       0.68      0.68      0.68      1124
weighted avg       0.68      0.68      0.68      1124

Confusion Matrix:
[[380 182]
 [173 389]]
LR : exactitude=0.684, précision=0.691, rappel=0.677, mesure F

In [None]:
ma_fonction(True, True, True, False, False)

Evaluation de MultinomialNB
Classification Report:
              precision    recall  f1-score   support

           1       0.72      0.65      0.68       562
           2       0.68      0.74      0.71       562

    accuracy                           0.70      1124
   macro avg       0.70      0.70      0.70      1124
weighted avg       0.70      0.70      0.70      1124

Confusion Matrix:
[[365 197]
 [145 417]]
MultinomialNB : exactitude=0.696, précision=0.720, rappel=0.650, mesure F1=0.680 in temps d'exécution=11.530 s
Evaluation de LR
Classification Report:
              precision    recall  f1-score   support

           1       0.69      0.68      0.69       562
           2       0.69      0.69      0.69       562

    accuracy                           0.69      1124
   macro avg       0.69      0.69      0.69      1124
weighted avg       0.69      0.69      0.69      1124

Confusion Matrix:
[[384 178]
 [172 390]]
LR : exactitude=0.689, précision=0.695, rappel=0.685, mesure F

In [None]:
ma_fonction(True, True, True, True, False)

Evaluation de MultinomialNB
Classification Report:
              precision    recall  f1-score   support

           1       0.72      0.64      0.67       562
           2       0.67      0.75      0.71       562

    accuracy                           0.69      1124
   macro avg       0.69      0.69      0.69      1124
weighted avg       0.69      0.69      0.69      1124

Confusion Matrix:
[[357 205]
 [142 420]]
MultinomialNB : exactitude=0.691, précision=0.718, rappel=0.634, mesure F1=0.671 in temps d'exécution=9.347 s
Evaluation de LR
Classification Report:
              precision    recall  f1-score   support

           1       0.69      0.68      0.68       562
           2       0.68      0.69      0.69       562

    accuracy                           0.68      1124
   macro avg       0.68      0.68      0.68      1124
weighted avg       0.68      0.68      0.68      1124

Confusion Matrix:
[[381 181]
 [174 388]]
LR : exactitude=0.684, précision=0.690, rappel=0.679, mesure F1

In [None]:
ma_fonction(True, True, False, True, False)

Evaluation de MultinomialNB
Classification Report:
              precision    recall  f1-score   support

           1       0.72      0.64      0.68       562
           2       0.67      0.75      0.71       562

    accuracy                           0.69      1124
   macro avg       0.70      0.69      0.69      1124
weighted avg       0.70      0.69      0.69      1124

Confusion Matrix:
[[359 203]
 [142 420]]
MultinomialNB : exactitude=0.693, précision=0.718, rappel=0.638, mesure F1=0.673 in temps d'exécution=9.484 s
Evaluation de LR
Classification Report:
              precision    recall  f1-score   support

           1       0.69      0.68      0.68       562
           2       0.68      0.69      0.69       562

    accuracy                           0.69      1124
   macro avg       0.69      0.69      0.69      1124
weighted avg       0.69      0.69      0.69      1124

Confusion Matrix:
[[381 181]
 [172 390]]
LR : exactitude=0.686, précision=0.692, rappel=0.679, mesure F1

In [None]:
ma_fonction(True, True, False, True, True)

Evaluation de MultinomialNB
Classification Report:
              precision    recall  f1-score   support

           1       0.72      0.64      0.67       562
           2       0.67      0.75      0.71       562

    accuracy                           0.69      1124
   macro avg       0.69      0.69      0.69      1124
weighted avg       0.69      0.69      0.69      1124

Confusion Matrix:
[[358 204]
 [142 420]]
MultinomialNB : exactitude=0.692, précision=0.718, rappel=0.636, mesure F1=0.672 in temps d'exécution=8.404 s
Evaluation de LR
Classification Report:
              precision    recall  f1-score   support

           1       0.69      0.68      0.68       562
           2       0.68      0.69      0.69       562

    accuracy                           0.69      1124
   macro avg       0.69      0.69      0.69      1124
weighted avg       0.69      0.69      0.69      1124

Confusion Matrix:
[[381 181]
 [172 390]]
LR : exactitude=0.686, précision=0.693, rappel=0.679, mesure F1

In [None]:
ma_fonction(True, True, False, False, True)

Evaluation de MultinomialNB
Classification Report:
              precision    recall  f1-score   support

           1       0.72      0.64      0.68       562
           2       0.68      0.75      0.71       562

    accuracy                           0.70      1124
   macro avg       0.70      0.70      0.70      1124
weighted avg       0.70      0.70      0.70      1124

Confusion Matrix:
[[362 200]
 [139 423]]
MultinomialNB : exactitude=0.698, précision=0.724, rappel=0.643, mesure F1=0.680 in temps d'exécution=11.559 s
Evaluation de LR
Classification Report:
              precision    recall  f1-score   support

           1       0.69      0.67      0.68       562
           2       0.68      0.69      0.69       562

    accuracy                           0.68      1124
   macro avg       0.68      0.68      0.68      1124
weighted avg       0.68      0.68      0.68      1124

Confusion Matrix:
[[378 184]
 [172 390]]
LR : exactitude=0.683, précision=0.692, rappel=0.674, mesure F

In [None]:
ma_fonction(True, True, True, False, True)

Evaluation de MultinomialNB
Classification Report:
              precision    recall  f1-score   support

           1       0.72      0.64      0.68       562
           2       0.68      0.75      0.71       562

    accuracy                           0.69      1124
   macro avg       0.70      0.69      0.69      1124
weighted avg       0.70      0.69      0.69      1124

Confusion Matrix:
[[359 203]
 [140 422]]
MultinomialNB : exactitude=0.695, précision=0.721, rappel=0.638, mesure F1=0.675 in temps d'exécution=11.472 s
Evaluation de LR
Classification Report:
              precision    recall  f1-score   support

           1       0.69      0.68      0.68       562
           2       0.68      0.70      0.69       562

    accuracy                           0.69      1124
   macro avg       0.69      0.69      0.69      1124
weighted avg       0.69      0.69      0.69      1124

Confusion Matrix:
[[380 182]
 [169 393]]
LR : exactitude=0.688, précision=0.697, rappel=0.678, mesure F