In [1]:
!pip install pandas
!pip install numpy
!pip install matplotlib



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore")

In [3]:
# The reviews are stored as a tab-separated file. quoting=3 is
# csv.QUOTE_NONE: quote characters inside a review are kept as literal text.
data = pd.read_csv(
    "Restaurant_Reviews.tsv",
    sep='\t',
    quoting=3,
)
data.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
# Dimensions of the loaded frame as (n_rows, n_columns).
data.shape

(1000, 2)

In [5]:
!pip install nltk



In [6]:
import re
import nltk  # Natural Language Toolkit

# Download the NLTK stop-word lists; they are read later through
# `nltk.corpus.stopwords` when the reviews are cleaned.
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Loop-invariant helpers are built once instead of once per review / per word:
# the stemmer is reusable, and the stop-word list is turned into a set a single
# time so each membership test is a cheap lookup.
ps = PorterStemmer()
english_stopwords = set(stopwords.words('english'))
# NOTE(review): the stop-word list removes negations such as "not"
# ("Crust is not good." becomes 'crust good'), which discards sentiment
# information. Behaviour is kept as-is here; consider whitelisting negations.

# Cleaned, stemmed version of every review, in the same order as `data`.
corpus = []
# Iterate over the whole column rather than a hard-coded range(0, 1000) so the
# cell keeps working if the dataset size changes.
for raw_review in data['Review']:
    # Replace every non-letter character with a space, then lowercase.
    letters_only = re.sub('[^a-zA-Z]', ' ', raw_review).lower()
    # Split on whitespace (this also drops the empty tokens left by the
    # substitution), remove stop words and stem what remains.
    stems = [
        ps.stem(word)
        for word in letters_only.split()
        if word not in english_stopwords
    ]
    # Re-join the stems into a single space-separated string.
    corpus.append(' '.join(stems))

In [8]:
# Cleaned form of the first review (lowercased, stop words removed, stemmed).
corpus[0]

'wow love place'

In [9]:
# Cleaned form of the second review; note that "not" has been removed as a stop word.
corpus[1]

'crust good'

# Using CountVectorizer

CountVectorizer convertit un corpus de textes en une matrice de fréquences de mots. Chaque ligne de cette matrice représente un document (ou un texte) et chaque colonne représente un mot du vocabulaire global extrait du corpus, que l'on a limité aux 1500 mots les plus fréquents (max_features = 1500).

Count(term,document)=Number of times term appears in document

Rq : ici, on a seulement initialisé le cv ; il n'a pas encore été appliqué au corpus.

In [10]:
!pip install scikit-learn
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)




Maintenant, on l'applique au corpus. La méthode fit_transform() apprend le vocabulaire à partir du corpus, puis convertit chaque texte en un vecteur numérique dans lequel chaque valeur correspond à la fréquence d'un mot dans ce texte.

In [11]:
# Learn the vocabulary from `corpus`, build the document-term count matrix,
# and convert the resulting matrix to a NumPy array.
# NOTE(review): the vectorizer is fitted on the full corpus, i.e. before the
# train/test split, so the vocabulary is selected using the test reviews too.
# Consider fitting on the training split only — TODO confirm intent.
X=cv.fit_transform(corpus).toarray()

In [12]:
# Target labels: the second column of `data` ("Liked"), as a NumPy array.
y = data.iloc[:, 1].to_numpy()

In [15]:
# (n_reviews, vocabulary size) of the count matrix.
X.shape

(1000, 1500)

X : Reviews as numerical vector

y : Liked



In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for evaluation; the fixed random_state makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [19]:
!pip install xgboost



In [20]:
from xgboost import XGBClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

# Decision tree trained on the CountVectorizer features.
# random_state is pinned because tree construction is randomised (ties between
# equally good splits are broken at random); without a seed the fitted tree,
# and therefore the reported metrics, can change from one run to the next.
DT = DecisionTreeClassifier(random_state=0)
DT.fit(X_train, y_train)

# Predicted labels for the held-out reviews.
yDT = DT.predict(X_test)

In [21]:
# Per-class precision / recall / F1 of the decision tree on the test split.
print(classification_report(y_test,yDT))

              precision    recall  f1-score   support

           0       0.64      0.74      0.69        97
           1       0.71      0.60      0.65       103

    accuracy                           0.67       200
   macro avg       0.67      0.67      0.67       200
weighted avg       0.68      0.67      0.67       200



Le classificateur naïf bayésien (Naive Bayes Classifier) repose sur le théorème de Bayes et fait l'hypothèse que les caractéristiques (features) sont indépendantes les unes des autres, ce qui peut être une simplification.

In [23]:
# Gaussian naive Bayes on the CountVectorizer features; `fit` returns the
# estimator itself, so `NB` is the fitted model.
NB = GaussianNB().fit(X_train, y_train)

# Predict the held-out reviews and summarise per-class metrics.
yNB = NB.predict(X_test)
print(classification_report(y_test, yNB))

              precision    recall  f1-score   support

           0       0.82      0.57      0.67        97
           1       0.68      0.88      0.77       103

    accuracy                           0.73       200
   macro avg       0.75      0.73      0.72       200
weighted avg       0.75      0.73      0.72       200



XGBoost est un algorithme d'ensemble qui combine plusieurs arbres de décision pour améliorer la précision des prédictions. Contrairement à un modèle d'arbre de décision simple, qui peut être sensible à l'overfitting, XGBoost utilise un processus de boosting pour réduire cette sensibilité et améliorer la performance générale.

In [24]:
# Gradient-boosted trees with default hyper-parameters on the CountVectorizer
# features; `fit` returns the estimator itself, so `XGB` is the fitted model.
XGB = XGBClassifier().fit(X_train, y_train)

# Predict the held-out reviews and summarise per-class metrics.
yXGB = XGB.predict(X_test)
print(classification_report(y_test, yXGB))

              precision    recall  f1-score   support

           0       0.66      0.86      0.75        97
           1       0.81      0.59      0.69       103

    accuracy                           0.72       200
   macro avg       0.74      0.72      0.72       200
weighted avg       0.74      0.72      0.72       200



# Using TF-IDF

TF-IDF(term,document)=TF(term,document)*IDF(term)

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# TF-IDF features over the same cleaned corpus, with the vocabulary capped at
# 1500 terms; the matrix is converted to a NumPy array.
tfidf = TfidfVectorizer(max_features=1500)
X2 = tfidf.fit_transform(corpus).toarray()

# Labels: second column of `data`, as a NumPy array.
y2 = data.iloc[:, 1].to_numpy()

# Same 80/20 split parameters (test_size, random_state) as for the count features.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2,
    y2,
    test_size=0.2,
    random_state=0,
)

In [27]:
# Decision tree trained on the TF-IDF features.
# random_state is pinned because tree construction is randomised; without a
# seed the reported metrics can change from one run to the next.
DT2 = DecisionTreeClassifier(random_state=0)
DT2.fit(X2_train, y2_train)

# Predict the held-out reviews and summarise per-class metrics.
y2DT = DT2.predict(X2_test)
print(classification_report(y2_test, y2DT))

              precision    recall  f1-score   support

           0       0.65      0.76      0.70        97
           1       0.73      0.61      0.67       103

    accuracy                           0.69       200
   macro avg       0.69      0.69      0.68       200
weighted avg       0.69      0.69      0.68       200



In [28]:
# k-nearest-neighbours with default settings on the TF-IDF features; `fit`
# returns the estimator itself, so `KNN2` is the fitted model.
KNN2 = KNeighborsClassifier().fit(X2_train, y2_train)

# Predict the held-out reviews and summarise per-class metrics.
y2KNN = KNN2.predict(X2_test)
print(classification_report(y2_test, y2KNN))

              precision    recall  f1-score   support

           0       0.65      0.81      0.72        97
           1       0.77      0.59      0.67       103

    accuracy                           0.70       200
   macro avg       0.71      0.70      0.70       200
weighted avg       0.71      0.70      0.70       200



In [29]:
# Gaussian naive Bayes on the TF-IDF features; `fit` returns the estimator
# itself, so `NB2` is the fitted model.
NB2 = GaussianNB().fit(X2_train, y2_train)

# Predict the held-out reviews and summarise per-class metrics.
y2NB = NB2.predict(X2_test)
print(classification_report(y2_test, y2NB))

              precision    recall  f1-score   support

           0       0.78      0.59      0.67        97
           1       0.69      0.84      0.76       103

    accuracy                           0.72       200
   macro avg       0.73      0.72      0.71       200
weighted avg       0.73      0.72      0.71       200

