In [None]:
# Mount Google Drive at /content/drive; the CSV files read later in the notebook live under that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
# Load the two project CSV files from the mounted Drive folder.
# They are kept as separate frames here and stacked together in a later cell.
df_train = pd.read_csv('/content/drive/MyDrive/ML/HAI817_Projet_train.csv')
df_test = pd.read_csv('/content/drive/MyDrive/ML/HAI817_Projet_test.csv')

In [None]:
# Importation des différentes librairies, classes et fonctions utiles pour le notebook

# Scikit-learn publie régulièrement de nouvelles versions et
# émet des avertissements de type FutureWarning.
# Les deux lignes suivantes permettent de ne pas les afficher.
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


# librairies générales
import pandas as pd
import re
from tabulate import tabulate
import time
import numpy as np
import pickle
import string
import base64
import sys

# librairie affichage
import matplotlib.pyplot as plt
import seaborn as sns

# librairies scikit learn
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


# librairies des classifiers utilisés
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# librairies NLTK
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer 
from nltk.corpus import stopwords
from nltk import word_tokenize 

 
# Fetch the NLTK resources used by the text-cleaning code, then build the
# English stop-word set once so it can be reused for every document.
for _nltk_resource in ('wordnet', 'stopwords', 'punkt'):
    nltk.download(_nltk_resource)
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
my_local_drive='/content/drive/MyDrive/ML/Prof/ML_FDS'

# Add this folder to the module search path so the helper module imported below can be found
sys.path.append(my_local_drive)
# Optionally move the working directory to that folder (left disabled)
#%cd $my_local_drive

#%pwd

# Utility functions (display, confusion matrix, etc.)
# NOTE(review): the star import hides which names are pulled into the notebook and
# could clash with definitions made in later cells — confirm what is actually used.
from MyNLPUtilities import *

In [None]:
# Stack the train and test files into a single frame.
# ignore_index=True renumbers the rows 0..n-1; without it each file keeps its own
# 0-based labels, so the combined frame would carry duplicate index labels and any
# label-based lookup (.loc) could silently return two rows.
df_all = pd.concat([df_train, df_test], ignore_index=True)
print(df_all.shape)


(1876, 5)


In [None]:
# compter les valeurs manquantes dans chaque colonne
num_missing_values = df_all.isna().sum()
print(num_missing_values)

public_id      612
text             0
title           23
our rating       0
ID            1264
dtype: int64


Encodage des classes

In [None]:


# Integer code assigned to each rating label; a label absent from this table maps to NaN.
rating_to_code = {'true': 1, 'false': 2, 'mixture': 3, 'other': 4}
df_all['classe'] = df_all['our rating'].map(rating_to_code)

In [None]:
# Number of rows per encoded class, to check how balanced the labels are
print(df_all["classe"].value_counts())

2    893
1    421
3    414
4    148
Name: classe, dtype: int64


Equilibrage des classes

In [None]:
from sklearn.utils import resample
import pandas as pd

# Bring every class to the same number of rows.
# 'false', 'true' and 'mixture' are drawn WITHOUT replacement, i.e. they are reduced
# to a subset of their rows (the variable names ending in "_upsampled" for 'true' and
# 'mixture' are historical: those two classes are in fact downsampled).
# 'other' is drawn WITH replacement.
# NOTE(review): the class counts printed earlier show 148 rows for 'other', so the 250
# rows drawn here necessarily contain duplicates; a cross-validation run on this frame
# can then place copies of the same article in both the training and the validation
# fold, which flatters the scores of that class — confirm this is acceptable.
def _draw_class(label, with_replacement):
    """Return 250 rows whose 'our rating' equals `label`, using the shared seed."""
    rows_of_label = df_all[df_all['our rating'] == label]
    return resample(rows_of_label, replace=with_replacement, n_samples=250, random_state=42)

false_downsampled = _draw_class('false', False)
true_upsampled = _draw_class('true', False)
mixture_upsampled = _draw_class('mixture', False)
other_upsampled = _draw_class('other', True)

# Stack the four samples, in this fixed order, into one frame
balanced_data = pd.concat([false_downsampled, true_upsampled, mixture_upsampled, other_upsampled])

# From here on df_all refers to the balanced frame; show the new class sizes
df_all = balanced_data
print(df_all['our rating'].value_counts())


false      250
true       250
mixture    250
other      250
Name: our rating, dtype: int64


Définition de la fonction MyCleanText

In [None]:
import re
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk import word_tokenize
# Same resource downloads and stop-word set as in the import cell near the top of the
# notebook; presumably repeated so that this cell can be run on its own.
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))

def MyCleanText(X,
 lowercase=False, # convert every token to lowercase
 removestopwords=False, # drop English stop words
 removedigit=False, # drop tokens made only of digits
 getstemmer=False, # reduce tokens to their Porter stem
 getlemmatisation=False # reduce tokens to their WordNet lemma
 ):
    """Clean one document and return it as a single space-separated string.

    The text is stripped of special characters and isolated single letters,
    tokenised with NLTK's ``word_tokenize``, and then filtered/normalised
    according to the flags. Lemmatisation is applied before stemming when
    both are requested.

    Args:
        X: the document; it is converted with ``str``. ``None`` and float NaN
            (missing values) yield an empty string instead of the words
            "None" / "nan".
        lowercase: lowercase every token.
        removestopwords: remove tokens found in the module-level ``stop_words``
            set. The comparison ignores case, so "The" is removed even when
            ``lowercase`` is False.
        removedigit: remove tokens consisting only of digits.
        getstemmer: apply ``PorterStemmer`` to every token.
        getlemmatisation: apply ``WordNetLemmatizer`` to every token.

    Returns:
        str: the cleaned tokens joined by single spaces.
    """
    # A missing value must not be turned into a spurious token.
    if X is None or (isinstance(X, float) and X != X):
        return ''

    sentence = str(X)

    # Replace every character that is neither a word character nor whitespace by a space
    sentence = re.sub(r'[^\w\s]',' ', sentence)

    # Remove every isolated single letter. Lookarounds are used instead of consuming
    # the surrounding whitespace so that letters at the start or end of the text, and
    # several single letters in a row, are all removed.
    sentence = re.sub(r'(?<!\S)[a-zA-Z](?!\S)', ' ', sentence)

    # Collapse runs of whitespace into one space
    sentence = re.sub(r'\s+', ' ', sentence)

    # Split into tokens
    tokens = word_tokenize(sentence)

    if lowercase:
        tokens = [token.lower() for token in tokens]

    # Strip remaining punctuation characters (this includes '_', which \w let through)
    table = str.maketrans('', '', string.punctuation)
    words = [token.translate(table) for token in tokens]

    # Keep only alphanumeric tokens; this also discards tokens emptied by the step above
    words = [word for word in words if word.isalnum()]

    # Drop purely numeric tokens
    if removedigit:
        words = [word for word in words if not word.isdigit()]

    # Drop stop words; the stop-word list is lowercase, so compare in lowercase
    if removestopwords:
        words = [word for word in words if word.lower() not in stop_words]

    # Lemmatisation
    if getlemmatisation:
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]

    # Stemming
    if getstemmer:
        ps = PorterStemmer()
        words = [ps.stem(word) for word in words]

    return ' '.join(words)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Définition de la classe TextNormalizer

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
class TextNormalizer(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that applies ``MyCleanText`` to every document.

    The constructor arguments are the ``MyCleanText`` flags and are stored
    unchanged under the same names, which is what lets the inherited
    ``BaseEstimator.get_params`` / ``set_params`` work. Those two methods and
    ``fit_transform`` are therefore inherited rather than re-implemented; the
    inherited ``set_params`` rejects unknown parameter names instead of
    silently creating a stray attribute.

    Args:
        removestopwords: remove stop words.
        lowercase: convert tokens to lowercase.
        removedigit: remove purely numeric tokens.
        getstemmer: stem the tokens.
        getlemmatisation: lemmatise the tokens.
    """

    def __init__(self,
                 removestopwords=False, # remove stop words
                 lowercase=False, # convert to lowercase
                 removedigit=False, # remove numeric tokens
                 getstemmer=False, # stem the tokens
                 getlemmatisation=False # lemmatise the tokens
                ):
        self.lowercase=lowercase
        self.getstemmer=getstemmer
        self.removestopwords=removestopwords
        self.getlemmatisation=getlemmatisation
        self.removedigit=removedigit

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return a list with the cleaned version of each document in ``X``.

        ``X`` is only iterated, never modified, so no defensive copy is made;
        any iterable of documents is accepted.
        """
        return [MyCleanText(text, lowercase=self.lowercase,
                            getstemmer=self.getstemmer,
                            removestopwords=self.removestopwords,
                            getlemmatisation=self.getlemmatisation,
                            removedigit=self.removedigit) for text in X]


## **Evaluation de différents classifieurs**  


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# Candidate classifiers, each paired with the short name used in the printed reports.
# The decision tree and the random forest draw random numbers while training, so they
# are given a fixed random_state (the same value, 7, as the K-fold seed used by the
# evaluation function) to make their scores repeatable from one run to the next.
models = [
    ('MultinomialNB', MultinomialNB()),
    ('LR', LogisticRegression(solver='lbfgs')),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier(random_state=7)),
    ('RF', RandomForestClassifier(random_state=7)),
    ('SVM', SVC()),
]

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import time


removestopwords=False, # suppression des stopwords

lowercase=False,# passage en minuscule

removedigit=False, # supprimer les nombres

getstemmer=False,# racinisation des termes

getlemmatisation=False # lemmatisation des termes

**Choix du meilleur classifieur avec les meilleurs paramètres de prétraitement**

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import cross_val_score, KFold, cross_val_predict
from sklearn.metrics import classification_report
import numpy as np

class Result:
    """Plain record of one classifier's evaluation.

    Attributes (all stored exactly as passed in):
        name: label of the classifier.
        score_accuracy, score_precision, score_rappel, score_mesure:
            accuracy, precision, recall and F-measure scores.
        temps: elapsed time.
        conf_mat: confusion matrix.
    """

    def __init__(self, name, score_accuracy, score_precision, score_rappel, score_mesure, temps, conf_mat):
        # Attributes are created in the same order as the constructor arguments.
        fields = (
            ('name', name),
            ('score_accuracy', score_accuracy),
            ('score_precision', score_precision),
            ('score_rappel', score_rappel),
            ('score_mesure', score_mesure),
            ('temps', temps),
            ('conf_mat', conf_mat),
        )
        for attribute, value in fields:
            setattr(self, attribute, value)

def ma_fonction(removestopwords1, lowercase1, removedigit1, getstemmer1, getlemmatisation1):
    """Cross-validate every classifier in ``models`` for one preprocessing setup.

    The 'text' column of ``df_all`` is cleaned with ``TextNormalizer`` using the
    given flags, vectorised with TF-IDF, and each classifier is evaluated with a
    shuffled 10-fold cross-validation (seed 7). For every classifier the function
    prints the classification report, the confusion matrix and a summary line.

    All four metrics of a classifier come from one ``cross_validate`` call, so
    they are computed on the same fitted models and each model is trained once
    per fold for the scores (plus once per fold for the out-of-fold predictions
    used by the report and the confusion matrix).

    Args:
        removestopwords1: remove stop words.
        lowercase1: lowercase the text.
        removedigit1: remove numeric tokens.
        getstemmer1: stem the tokens.
        getlemmatisation1: lemmatise the tokens.

    Returns:
        list: one ``Result`` per classifier, in the order of ``models``, holding
        the mean scores, the elapsed time and the confusion matrix. When the
        call is the last expression of a cell, end it with ``;`` to avoid
        echoing this list.
    """
    # Local import: the notebook's import cells do not bring in cross_validate.
    from sklearn.model_selection import cross_validate

    scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
    seed = 7
    allresults = []

    X = df_all['text']
    y = df_all['classe']

    # Apply the requested preprocessing to X
    text_normalizer = TextNormalizer(removestopwords=removestopwords1, lowercase=lowercase1, removedigit=removedigit1, getstemmer=getstemmer1, getlemmatisation=getlemmatisation1)
    X_cleaned = text_normalizer.fit_transform(X)

    # Turn the cleaned text into a dense TF-IDF matrix; this matrix, not X, is what
    # the cross-validation routines receive.
    # NOTE(review): the vectorizer is fitted on every row before the folds are made,
    # so the vocabulary and IDF weights also see the validation documents — confirm
    # this is acceptable or move the vectorizer into a Pipeline with the model.
    tfidf = TfidfVectorizer()
    features = tfidf.fit_transform(X_cleaned).toarray()

    # Both are identical for every model, so build them once.
    class_names = np.unique(y)
    kfold = KFold(n_splits=10, random_state=seed, shuffle=True)

    for name, model in models:
        print("Evaluation de", name)
        start_time = time.time()

        # One 10-fold run yields all four metrics
        scores = cross_validate(model, features, y, cv=kfold, scoring=scoring)
        cv_results = scores['test_accuracy']
        cv_results_precision = scores['test_precision_macro']
        cv_results_rappel = scores['test_recall_macro']
        cv_results_mesure = scores['test_f1_macro']

        # Out-of-fold predictions for the report and the confusion matrix
        y_pred = cross_val_predict(model, features, y, cv=kfold)
        conf_mat = confusion_matrix(y, y_pred, labels=class_names)

        print("Classification Report:")
        print(classification_report(y, y_pred, labels=class_names))

        print("Confusion Matrix:")
        print(conf_mat)

        thetime = time.time() - start_time

        result = Result(name, cv_results.mean(), cv_results_precision.mean(), cv_results_rappel.mean(), cv_results_mesure.mean(), thetime, conf_mat)
        allresults.append(result)
        print("%s : exactitude=%.3f, précision=%.3f, rappel=%.3f, mesure F1=%.3f in temps d'exécution=%.3f s" % (name, cv_results.mean(), cv_results_precision.mean(), cv_results_rappel.mean(), cv_results_mesure.mean(), thetime))

    return allresults




In [None]:
# Argument order: removestopwords, lowercase, removedigit, getstemmer, getlemmatisation
# Here every preprocessing step is enabled.
ma_fonction(True, True, True, True, True)

Evaluation de MultinomialNB
Classification Report:
              precision    recall  f1-score   support

           1       0.42      0.64      0.51       250
           2       0.66      0.46      0.54       250
           3       0.44      0.36      0.39       250
           4       0.79      0.74      0.77       250

    accuracy                           0.55      1000
   macro avg       0.58      0.55      0.55      1000
weighted avg       0.58      0.55      0.55      1000

Confusion Matrix:
[[161  28  51  10]
 [ 71 116  44  19]
 [118  23  89  20]
 [ 35  10  19 186]]
MultinomialNB : exactitude=0.552, précision=0.591, rappel=0.560, mesure F1=0.552 in temps d'exécution=7.286 s
Evaluation de LR
Classification Report:
              precision    recall  f1-score   support

           1       0.51      0.46      0.49       250
           2       0.58      0.61      0.59       250
           3       0.48      0.51      0.49       250
           4       0.84      0.81      0.82       25

D'après les résultats affichés, le meilleur classifieur est le SVM.
Il a obtenu un score élevé de précision, de rappel et de mesure F1.

In [None]:
# Stop-word removal, lowercasing and digit removal enabled; stemming and lemmatisation disabled.
ma_fonction(True, True, True, False, False)

Evaluation de MultinomialNB
Classification Report:
              precision    recall  f1-score   support

           1       0.43      0.66      0.52       250
           2       0.67      0.49      0.56       250
           3       0.49      0.37      0.42       250
           4       0.81      0.78      0.80       250

    accuracy                           0.57      1000
   macro avg       0.60      0.57      0.57      1000
weighted avg       0.60      0.57      0.57      1000

Confusion Matrix:
[[166  28  45  11]
 [ 72 122  38  18]
 [121  21  92  16]
 [ 29  12  14 195]]
MultinomialNB : exactitude=0.575, précision=0.608, rappel=0.581, mesure F1=0.573 in temps d'exécution=12.158 s
Evaluation de LR
Classification Report:
              precision    recall  f1-score   support

           1       0.53      0.49      0.51       250
           2       0.58      0.62      0.60       250
           3       0.51      0.54      0.52       250
           4       0.86      0.80      0.83       2

In [None]:
# Stop-word removal, lowercasing, digit removal and stemming enabled; lemmatisation disabled.
ma_fonction(True, True, True, True, False)

Evaluation de MultinomialNB
Classification Report:
              precision    recall  f1-score   support

           1       0.42      0.64      0.51       250
           2       0.66      0.48      0.55       250
           3       0.44      0.36      0.40       250
           4       0.79      0.74      0.77       250

    accuracy                           0.56      1000
   macro avg       0.58      0.56      0.56      1000
weighted avg       0.58      0.56      0.56      1000

Confusion Matrix:
[[161  28  51  10]
 [ 70 119  43  18]
 [117  23  90  20]
 [ 35  10  19 186]]
MultinomialNB : exactitude=0.556, précision=0.595, rappel=0.564, mesure F1=0.557 in temps d'exécution=6.375 s
Evaluation de LR
Classification Report:
              precision    recall  f1-score   support

           1       0.51      0.46      0.49       250
           2       0.57      0.62      0.60       250
           3       0.48      0.51      0.50       250
           4       0.84      0.80      0.82       25

In [None]:
# Stop-word removal, lowercasing and stemming enabled; digit removal and lemmatisation disabled.
ma_fonction(True, True, False, True, False)

Evaluation de MultinomialNB
Classification Report:
              precision    recall  f1-score   support

           1       0.42      0.65      0.51       250
           2       0.68      0.48      0.56       250
           3       0.44      0.36      0.40       250
           4       0.80      0.74      0.77       250

    accuracy                           0.56      1000
   macro avg       0.59      0.56      0.56      1000
weighted avg       0.59      0.56      0.56      1000

Confusion Matrix:
[[163  25  51  11]
 [ 70 119  44  17]
 [121  21  90  18]
 [ 36  10  18 186]]
MultinomialNB : exactitude=0.558, précision=0.600, rappel=0.566, mesure F1=0.559 in temps d'exécution=7.055 s
Evaluation de LR
Classification Report:
              precision    recall  f1-score   support

           1       0.52      0.47      0.49       250
           2       0.57      0.62      0.59       250
           3       0.48      0.52      0.50       250
           4       0.84      0.80      0.82       25

In [None]:
# Stop-word removal, lowercasing, stemming and lemmatisation enabled; digit removal disabled.
ma_fonction(True, True, False, True, True)

Evaluation de MultinomialNB
Classification Report:
              precision    recall  f1-score   support

           1       0.42      0.65      0.51       250
           2       0.68      0.47      0.55       250
           3       0.44      0.36      0.39       250
           4       0.80      0.74      0.77       250

    accuracy                           0.56      1000
   macro avg       0.58      0.55      0.56      1000
weighted avg       0.58      0.56      0.56      1000

Confusion Matrix:
[[163  25  51  11]
 [ 71 117  44  18]
 [122  21  89  18]
 [ 36  10  18 186]]
MultinomialNB : exactitude=0.555, précision=0.598, rappel=0.563, mesure F1=0.556 in temps d'exécution=7.609 s
Evaluation de LR
Classification Report:
              precision    recall  f1-score   support

           1       0.50      0.46      0.48       250
           2       0.57      0.60      0.58       250
           3       0.48      0.52      0.50       250
           4       0.84      0.80      0.82       25

In [None]:
# Stop-word removal, lowercasing and lemmatisation enabled; digit removal and stemming disabled.
ma_fonction(True, True, False, False, True)

Evaluation de MultinomialNB
Classification Report:
              precision    recall  f1-score   support

           1       0.41      0.67      0.51       250
           2       0.68      0.47      0.56       250
           3       0.46      0.35      0.40       250
           4       0.81      0.77      0.79       250

    accuracy                           0.56      1000
   macro avg       0.59      0.56      0.56      1000
weighted avg       0.59      0.56      0.56      1000

Confusion Matrix:
[[167  25  49   9]
 [ 77 117  39  17]
 [125  19  88  18]
 [ 34  10  14 192]]
MultinomialNB : exactitude=0.564, précision=0.607, rappel=0.571, mesure F1=0.563 in temps d'exécution=7.028 s
Evaluation de LR
Classification Report:
              precision    recall  f1-score   support

           1       0.52      0.48      0.50       250
           2       0.57      0.60      0.59       250
           3       0.49      0.53      0.51       250
           4       0.86      0.82      0.84       25

In [None]:
# Stop-word removal, lowercasing, digit removal and lemmatisation enabled; stemming disabled.
ma_fonction(True, True, True, False, True)

Evaluation de MultinomialNB
Classification Report:
              precision    recall  f1-score   support

           1       0.42      0.66      0.51       250
           2       0.66      0.47      0.55       250
           3       0.46      0.35      0.39       250
           4       0.80      0.76      0.78       250

    accuracy                           0.56      1000
   macro avg       0.58      0.56      0.56      1000
weighted avg       0.58      0.56      0.56      1000

Confusion Matrix:
[[164  26  51   9]
 [ 74 118  39  19]
 [120  24  87  19]
 [ 35  10  14 191]]
MultinomialNB : exactitude=0.560, précision=0.598, rappel=0.567, mesure F1=0.558 in temps d'exécution=7.230 s
Evaluation de LR
Classification Report:
              precision    recall  f1-score   support

           1       0.53      0.49      0.51       250
           2       0.58      0.61      0.59       250
           3       0.49      0.53      0.51       250
           4       0.86      0.81      0.84       25