In [None]:
import nltk
from nltk.corpus import stopwords
import re
import pandas as pd
import numpy as np
import json

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

from nltk import word_tokenize
import seaborn as sns
import joblib

nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')

import os

import matplotlib.pyplot as plt
from matplotlib import pyplot
from sklearn import metrics
from sklearn import model_selection, naive_bayes, svm
from sklearn.datasets import make_classification
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
# Sklearn regression model evaluation function
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [None]:
# Load the augmented text dataset from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='augmented_text_red.csv')

In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

In [None]:
# Punctuation treated as a token separator: clean_text replaces each match with
# a space. Written as a raw string because '\[', '\]' and '\|' are invalid
# escape sequences in a normal string literal and trigger a warning.
special_character_remover = re.compile(r'[/(){}\[\]\|@,;:]')
# Whitelist of characters that survive cleaning; everything else is deleted.
# French accented letters are whitelisted so that words such as "réseau" are
# not mangled into "rseau" and accented stopwords ("été", "à") can still match.
extra_symbol_remover = re.compile('[^0-9a-zàâäæçéèêëîïôœùûüÿ #+_]')
# NLTK's French stopword list (requires the 'stopwords' corpus to be downloaded).
STOPWORDS = nltk.corpus.stopwords.words('french')

In [None]:
def clean_text(text):
    """Normalise one raw text sample before vectorisation.

    The text is lowercased, every match of ``special_character_remover`` is
    replaced by a space, every match of ``extra_symbol_remover`` is deleted,
    digits are removed, and whitespace-separated tokens found in ``STOPWORDS``
    are dropped.

    Parameters
    ----------
    text : str
        Raw input text. Any non-string value (for example the float NaN that
        pandas uses for a missing CSV cell) is treated as empty text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; may be empty.
    """
    if not isinstance(text, str):
        # The regex substitutions below only accept strings.
        return ''

    # Lowercase first: extra_symbol_remover whitelists lowercase letters only,
    # so uppercase letters would otherwise be deleted instead of normalised.
    text = text.lower()
    text = special_character_remover.sub(' ', text)
    text = extra_symbol_remover.sub('', text)
    text = ''.join(c for c in text if not c.isdigit())
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)

    return text
    
# Run the cleaning function over every entry of the text column.
df['text'] = df['text'].map(clean_text)

In [None]:
# Rows whose cleaned text is identical are collapsed; the last occurrence wins.
df = df.drop_duplicates(subset=['text'], keep='last')

In [None]:
# Preview the first five rows after cleaning and de-duplication.
df.head(n=5)

In [None]:
# Expose the "label2" column under the name "label" used by the cells below.
df = df.rename({"label2": "label"}, axis="columns")

## Split the data into train and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Features are the cleaned text column, targets are the label column.
X, y = df['text'], df['label']

# 20% of the rows are held out for testing; the split is seeded for
# reproducibility and stratified on the labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Shapes of the four splits, in the order train/test features then train/test labels.
tuple(split.shape for split in (X_train, X_test, y_train, y_test))

## Naive Bayes Model

In [None]:
# TF-IDF features feeding a multinomial Naive Bayes classifier.
# TfidfVectorizer already counts tokens and applies TF-IDF weighting, so a
# following TfidfTransformer would weight by IDF a second time; it is omitted.
# Pipeline and MultinomialNB come from the notebook's top import cell so this
# cell also runs on a fresh kernel.
naivebayes = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])
naivebayes.fit(X_train, y_train)

predictions_NB = naivebayes.predict(X_test)

# accuracy_score takes (y_true, y_pred) in that order.
print("Naive Bayes Accuracy Score -> ", accuracy_score(y_test, predictions_NB))

In [None]:
# Per-class precision / recall / F1 of the Naive Bayes predictions on the test split.
print(
    "Naive Bayes Classification Report",
    classification_report(y_true=y_test, y_pred=predictions_NB),
)


## Logistic Regression Model

In [None]:
# TF-IDF features feeding a logistic-regression classifier.
# TfidfVectorizer already counts tokens and applies TF-IDF weighting, so a
# following TfidfTransformer would weight by IDF a second time; it is omitted.
# LogisticRegression and Pipeline come from the notebook's top import cell.
lr = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', LogisticRegression()),
])

lr.fit(X_train, y_train)
predictions_LR = lr.predict(X_test)

# accuracy_score takes (y_true, y_pred) in that order.
print("Logistic regression Accuracy Score -> ", accuracy_score(y_test, predictions_LR))

In [None]:
# Per-class precision / recall / F1 of the logistic-regression predictions on the test split.
print(
    "Logistic Regression Classification Report",
    classification_report(y_true=y_test, y_pred=predictions_LR),
)

## SVM Model

In [None]:
# Classifier - Algorithm - SVM
# TF-IDF features feeding a linear-kernel support vector classifier.
# TfidfVectorizer already counts tokens and applies TF-IDF weighting, so a
# following TfidfTransformer would weight by IDF a second time; it is omitted.
# `degree` and `gamma` are not passed: a linear kernel does not use them.
SVM = Pipeline([
    ('vect', TfidfVectorizer()),
    ('svm', svm.SVC(C=1.0, kernel='linear')),
])
SVM.fit(X_train, y_train)

predictions_SVM = SVM.predict(X_test)

# accuracy_score takes (y_true, y_pred) in that order.
print("SVM Accuracy Score -> ", accuracy_score(y_test, predictions_SVM))

In [None]:
# Per-class precision / recall / F1 of the SVM predictions on the test split.
print(classification_report(y_true=y_test, y_pred=predictions_SVM))

In [None]:
# Render the SVM classification report and save it to disk.
report = classification_report(y_test, predictions_SVM)
# NOTE(review): the report is written as-is; by default classification_report
# returns a text table, not comma-separated values, despite the ".csv"
# extension - confirm the intended format.
report_path = "SVM_report.csv"

# The context manager closes the file even if the write raises. The encoding
# is pinned so the output does not depend on the platform default.
with open(report_path, "w", encoding="utf-8") as text_file:
    text_file.write(report)

## Prediction

In [None]:

# Build the lookup from label index to class name.
# NOTE(review): this cell rebinds X and y (previously the split inputs) and
# shadows the built-in `dict`; the names are kept because other cells read them.
df_original = pd.read_csv('Clean_dataset.csv')
X = df_original["text"].tolist()
# One indicator column per distinct label; the column order defines the index.
y = pd.get_dummies(df_original['label'])
dict = {}
dict.update(enumerate(y.columns))
dict

In [None]:
# Predict the class of one text with each of the three trained models.
def make_prediction(text):
    """Return the class names predicted for ``text`` by the three models.

    The text is cleaned with ``clean_text`` and passed as a one-element batch
    to ``lr``, ``SVM`` and ``naivebayes``. Each model's first prediction is
    translated through the module-level ``dict`` lookup.

    Returns a 3-tuple ordered (logistic regression, SVM, Naive Bayes).
    """
    batch = [clean_text(text)]
    class_names = []
    for model in (lr, SVM, naivebayes):
        predicted = model.predict(batch)
        class_names.append(dict[predicted[0]])
    return tuple(class_names)

In [None]:
# Try the three models on one sample sentence.
text = "il n'y a plus de batterie"
make_prediction(text=text)


In [None]:
#Save the three models
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists before writing (no error if it is already there).
os.makedirs('./model', exist_ok=True)

LR_filename = './model/logistic_regression.sav'
NB_filename = './model/Naive_bayes.sav'
SVM_filename = './model/SVM.sav'
joblib.dump(lr, LR_filename)
joblib.dump(SVM, SVM_filename)
joblib.dump(naivebayes, NB_filename)


## Evaluation of the three machine learning models

In [None]:
# NOTE(review): MAE and R2 are regression metrics; on class labels they are
# only meaningful if the labels are ordinal numbers - confirm this is intended.
print("Evaluation of SVM:")
print("Mean absolute error", mean_absolute_error(y_true=y_test, y_pred=predictions_SVM))
print("R2 Score", r2_score(y_true=y_test, y_pred=predictions_SVM))

In [None]:
# NOTE(review): MAE and R2 are regression metrics; on class labels they are
# only meaningful if the labels are ordinal numbers - confirm this is intended.
print("Evaluation of Logistic Regression:")
print("Mean absolute error", mean_absolute_error(y_true=y_test, y_pred=predictions_LR))
print("R2 Score", r2_score(y_true=y_test, y_pred=predictions_LR))

In [None]:
# NOTE(review): MAE and R2 are regression metrics; on class labels they are
# only meaningful if the labels are ordinal numbers - confirm this is intended.
print("Evaluation of Naive Bayes:")
print("Mean absolute error", mean_absolute_error(y_true=y_test, y_pred=predictions_NB))
print("R2 Score", r2_score(y_true=y_test, y_pred=predictions_NB))

In [None]:
# Confusion matrix of the SVM predictions on the test split.
from sklearn.metrics import confusion_matrix

# Rows and columns follow the class order stored on the fitted SVM pipeline.
cm = confusion_matrix(y_true=y_test, y_pred=predictions_SVM, labels=SVM.classes_)
print('Confusion Matrix\n', cm, sep='\n')

In [None]:
# Wrap the confusion matrix in a DataFrame so the heatmap shows class names.
# The matrix was computed with labels=SVM.classes_, so the same sequence is
# used for both axes. (ndarray.sort() sorts in place and returns None, so
# `df['label'].unique().sort()` would hand None to index/columns and the class
# names would be lost.)
cm_df = pd.DataFrame(cm,
                     index=SVM.classes_,
                     columns=SVM.classes_)

In [None]:
#Plotting the confusion matrix
# `plt` is matplotlib.pyplot, imported in the notebook's top import cell.
fig, ax = plt.subplots(figsize=(15, 10))
# fmt='d' prints the integer counts in full instead of the default short
# float format, which switches to scientific notation for larger counts.
sns.heatmap(cm_df, annot=True, fmt='d', ax=ax)
ax.set_title('Confusion Matrix for SVM Model')
ax.set_ylabel('Actual Class')
ax.set_xlabel('Predicted Class')
plt.show()