Udemy course Machine Learning A-Z: homework challenge (lecture 242).
Natural Language Processing (NLP) on restaurant reviews: predict whether a review is positive (`Liked` = 1) or negative (`Liked` = 0).

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the tab-separated review file. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside a review are treated as ordinary text.
data = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)

In [3]:
# Peek at the first five rows to check that the columns parsed as expected.
display(data.head(n=5))

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
# Print the column dtypes, non-null counts and memory usage of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [5]:
# Number of rows (reviews) in the dataset.
len(data)

1000

In [6]:
import re
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [7]:
# Start from NLTK's English stop-word list and show its type, size and contents.
check_word = stopwords.words('english')
print(type(check_word), len(check_word), check_word, sep='\n')

<class 'list'>
179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'sam

In [8]:
# Words that NLTK treats as stop words but that carry sentiment in a review
# (negations, intensifiers, ...). They are taken OUT of the stop-word list so
# that they survive the clean-up step and reach the vectorizer.
keep_list = ["not", "no", 'isn', "isn't", "aren't", "doesn't",'didn', "didn't", "should've",
             'hadn', "mustn't", 'mustn',  'wasn', "wasn't", 'won', "won't", "wouldn't", 'wouldn', 'very',
             'against', 'off', 'few', 'more', 'most', 'than', 'too', 'can', 'will', 'just', 'don', "don't",
             'couldn', "couldn't", 'doesn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'mightn', "mightn't",
             "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'weren', "weren't", 'aren', 'once'
             ]

# Filter instead of calling list.remove() in a loop: remove() raises ValueError
# when the word is already gone, so the original cell crashed when re-run and
# would also crash if the installed NLTK list lacked one of these words.
# Filtering is idempotent and tolerant of missing entries.
keep_set = set(keep_list)
check_word = [word for word in check_word if word not in keep_set]

In [9]:
# Show the stop-word list after the keep_list entries have been taken out.
print(check_word)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'over', 'under', 'again', 'further', 'then', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'other', 'some', 'such', 'nor', 'only', 'own', 'same', 'so', 's', 't', 'should', 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain',

### String clean-up: general steps
* replace every run of non-letter characters (punctuation, digits, ...) with a single space
* convert the text to lowercase
* split the string into individual words
* remove the stop words
* reduce each remaining word to its stem
* join the stems back into a single cleaned-up string

In [10]:
# Build the cleaned corpus: one normalized, stemmed string per review.
#
# Loop-invariant work is hoisted: the stop-word set was previously rebuilt for
# every single word and the stemmer re-created for every review.
stop_words = set(check_word)
ps = PorterStemmer()

corpus = []
# Iterate over the column values directly rather than looking rows up by
# integer label, so the loop does not depend on the frame having a 0..n-1 index.
for raw_review in data['Review']:
    # Keep letters only, lowercase, then tokenize on whitespace.
    words = re.sub('[^a-zA-Z]+', ' ', raw_review).lower().split()
    # Drop stop words, stem what is left, and re-join into one string.
    review = ' '.join(ps.stem(word) for word in words if word not in stop_words)
    corpus.append(review)

In [11]:
# Spot-check the first eleven cleaned reviews.
print(corpus[:11])

['wow love place', 'crust not good', 'not tasti textur just nasti', 'stop late may bank holiday off rick steve recommend love', 'select menu great price', 'get angri want damn pho', 'honeslti didn tast fresh', 'potato like rubber could tell made ahead time kept warmer', 'fri great too', 'great touch', 'servic veri prompt']


In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: token counts per review, vocabulary capped at 3000 terms.
# NOTE(review): the vocabulary is fitted on the full corpus before the
# train/test split below, so test-set words influence the feature space.
cv = CountVectorizer(max_features=3000)
bag_of_words = cv.fit_transform(corpus)
X = bag_of_words.toarray()

# Target vector: the last column of the frame.
y = data.iloc[:, -1].values

## Splitting the dataset into the Training set and Test set

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## 1. Training using the Naive Bayes model

In [14]:
from sklearn.naive_bayes import GaussianNB

# fit() returns the estimator itself, so construction and training can be chained.
classifier = GaussianNB().fit(X_train, y_train)
classifier  # last expression: display the fitted estimator

GaussianNB()

In [15]:
y_pred = classifier.predict(X_test)

# Side-by-side view: column 0 is the predicted label, column 1 the true label.
pred_test_arr = np.column_stack((y_pred, y_test))
print(pred_test_arr[:5])

[[1 0]
 [1 0]
 [1 0]
 [0 0]
 [0 0]]


In [16]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Evaluate the Naive Bayes predictions on the held-out test set.
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
# `f1` holds the full per-class text report (precision / recall / f1 / support),
# not a single F1 number.
f1 = classification_report(y_test, y_pred)

print(
    '---',
    'Model 1: Naive Bayes',
    'Accuracy is {}\n'.format(acc),
    'The confusion matrix is \n{}\n'.format(cm),
    'F1 score is {}\n'.format(f1),
    sep='\n',
)

---
Model 1: Naive Bayes
Accuracy is 0.735

The confusion matrix is 
[[56 41]
 [12 91]]

F1 score is               precision    recall  f1-score   support

           0       0.82      0.58      0.68        97
           1       0.69      0.88      0.77       103

    accuracy                           0.73       200
   macro avg       0.76      0.73      0.73       200
weighted avg       0.75      0.73      0.73       200




## 2. Training using the Random Forest model

In [17]:
from sklearn.ensemble import RandomForestClassifier

# 100 entropy-split trees; the fixed seed makes the forest reproducible.
cf_randomf = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)
cf_randomf  # last expression: display the fitted estimator

RandomForestClassifier(criterion='entropy', random_state=0)

In [18]:
# Evaluate the random forest on the held-out test set.
y_pred_cf_randomf = cf_randomf.predict(X_test)
cm = confusion_matrix(y_test, y_pred_cf_randomf)
f1 = classification_report(y_test, y_pred_cf_randomf)  # per-class text report

print(
    '---',
    'Model 2: Random Forest',
    'The confusion matrix is \n{}\n'.format(cm),
    'F1 score is {}\n'.format(f1),
    sep='\n',
)

---
Model 2: Random Forest
The confusion matrix is 
[[85 12]
 [28 75]]

F1 score is               precision    recall  f1-score   support

           0       0.75      0.88      0.81        97
           1       0.86      0.73      0.79       103

    accuracy                           0.80       200
   macro avg       0.81      0.80      0.80       200
weighted avg       0.81      0.80      0.80       200




## 3. Training using the Gradient Boost model

In [20]:
from sklearn.ensemble import GradientBoostingClassifier

# Hand-picked hyperparameters. Alternative seed noted by the author: random_state=10.
cf_gb = GradientBoostingClassifier(
    n_estimators=154,
    max_depth=18,
    random_state=0,
)
cf_gb.fit(X_train, y_train)

# Evaluate on the held-out test set.
y_pred_cf_gb = cf_gb.predict(X_test)
cm = confusion_matrix(y_test, y_pred_cf_gb)
f1 = classification_report(y_test, y_pred_cf_gb)  # per-class text report

print(
    '---',
    'Model 3: Gradient Boost',
    'The confusion matrix is \n{}\n'.format(cm),
    'F1 score is {}\n'.format(f1),
    sep='\n',
)

---
Model 3: Gradient Boost
The confusion matrix is 
[[86 11]
 [19 84]]

F1 score is               precision    recall  f1-score   support

           0       0.82      0.89      0.85        97
           1       0.88      0.82      0.85       103

    accuracy                           0.85       200
   macro avg       0.85      0.85      0.85       200
weighted avg       0.85      0.85      0.85       200




## 3.2 GridSearch to fine tune GB hyperparameters

In [21]:
from sklearn.model_selection import GridSearchCV

# Base estimator for the search. (The `dt` names are kept as-is, but this is a
# gradient-boosting model, not a decision tree.)
dt = GradientBoostingClassifier(random_state=0)

# Narrow grid around the hand-picked values used for Model 3
# (n_estimators=154, max_depth=18).
param_dt = {
            'n_estimators': [153, 154, 155, 156],
            'max_depth': [15, 18, 19, 20],
            }
# 10-fold cross-validated search on the training split only, scored by
# accuracy, parallelized over all available cores.
grid_dt = GridSearchCV(estimator=dt,
                       param_grid=param_dt,
                       scoring='accuracy',
                       cv=10,
                       n_jobs=-1)
grid_dt.fit(X_train, y_train)
best_hyperparams = grid_dt.best_params_
print('Best hyerparameters:\n', best_hyperparams)
best_CV_score = grid_dt.best_score_
# Fixed: the original format string had no `{}` placeholder, so .format()
# silently discarded the score and only the label was printed.
print('Best CV accuracy: {}'.format(best_CV_score))


# predict() on the search object delegates to the best estimator, which
# GridSearchCV refits on the whole training split (refit=True is the default).
y_pred_cvgrid = grid_dt.predict(X_test)
cm = confusion_matrix(y_test, y_pred_cvgrid)
f1 = classification_report(y_test, y_pred_cvgrid)  # per-class text report
print('---')
print('Model 3b: Gradient Boost with best parameters')
print('The confusion matrix is \n{}\n'.format(cm))
print('F1 score is {}\n'.format(f1))

Best hyerparameters:
 {'max_depth': 18, 'n_estimators': 153}
Best CV accuracy
---
Model 3b: Gradient Boost with best parameters
The confusion matrix is 
[[86 11]
 [19 84]]

F1 score is               precision    recall  f1-score   support

           0       0.82      0.89      0.85        97
           1       0.88      0.82      0.85       103

    accuracy                           0.85       200
   macro avg       0.85      0.85      0.85       200
weighted avg       0.85      0.85      0.85       200




## 4. Training using the Decision Tree model

In [32]:
from sklearn.tree import DecisionTreeClassifier

# Single entropy-split tree with a fixed seed for reproducibility.
cf_decisiontree = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)
cf_decisiontree  # last expression: display the fitted estimator

DecisionTreeClassifier(criterion='entropy', random_state=0)

In [33]:
# Evaluate the decision tree on the held-out test set.
y_pred_cf_decisiontree = cf_decisiontree.predict(X_test)
cm = confusion_matrix(y_test, y_pred_cf_decisiontree)
f1 = classification_report(y_test, y_pred_cf_decisiontree)  # per-class text report

print(
    '---',
    'Model 4:  Decision Tree',
    'The confusion matrix is \n{}\n'.format(cm),
    'F1 score is {}\n'.format(f1),
    sep='\n',
)

---
Model 4:  Decision Tree
The confusion matrix is 
[[74 23]
 [22 81]]

F1 score is               precision    recall  f1-score   support

           0       0.77      0.76      0.77        97
           1       0.78      0.79      0.78       103

    accuracy                           0.78       200
   macro avg       0.77      0.77      0.77       200
weighted avg       0.77      0.78      0.77       200




## 5. Training using Logistic Regression

In [34]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with library defaults apart from the fixed seed.
cf_lr = LogisticRegression(random_state=0).fit(X_train, y_train)
cf_lr  # last expression: display the fitted estimator

LogisticRegression(random_state=0)

In [35]:
# Evaluate logistic regression on the held-out test set.
y_pred_cf_lr = cf_lr.predict(X_test)
cm = confusion_matrix(y_test, y_pred_cf_lr)
f1 = classification_report(y_test, y_pred_cf_lr)  # per-class text report

print(
    '---',
    'Model 5:  Logistic Regression',
    'The confusion matrix is \n{}\n'.format(cm),
    'F1 score is {}\n'.format(f1),
    sep='\n',
)

---
Model 5:  Logistic Regression
The confusion matrix is 
[[82 15]
 [22 81]]

F1 score is               precision    recall  f1-score   support

           0       0.79      0.85      0.82        97
           1       0.84      0.79      0.81       103

    accuracy                           0.81       200
   macro avg       0.82      0.82      0.81       200
weighted avg       0.82      0.81      0.81       200




## 6. Training using Support Vector Machine (SVM - linear)

In [36]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel.
cf_svc = SVC(kernel='linear', random_state=0).fit(X_train, y_train)

# Evaluate on the held-out test set.
y_pred_cf_svc = cf_svc.predict(X_test)
cm = confusion_matrix(y_test, y_pred_cf_svc)
f1 = classification_report(y_test, y_pred_cf_svc)  # per-class text report

print(
    '---',
    'Model 6: SVM - linear',
    'The confusion matrix is \n{}\n'.format(cm),
    'F1 score is {}\n'.format(f1),
    sep='\n',
)

---
Model 6: SVM - linear
The confusion matrix is 
[[83 14]
 [22 81]]

F1 score is               precision    recall  f1-score   support

           0       0.79      0.86      0.82        97
           1       0.85      0.79      0.82       103

    accuracy                           0.82       200
   macro avg       0.82      0.82      0.82       200
weighted avg       0.82      0.82      0.82       200




## 7. Training using Support Vector Machine (SVM - nonlinear kernel)

In [37]:
from sklearn.svm import SVC

# Support vector classifier with the non-linear RBF kernel.
cf_svcrbf = SVC(kernel='rbf', random_state=0).fit(X_train, y_train)

# Evaluate on the held-out test set.
y_pred_cf_svcrbf = cf_svcrbf.predict(X_test)
cm = confusion_matrix(y_test, y_pred_cf_svcrbf)
f1 = classification_report(y_test, y_pred_cf_svcrbf)  # per-class text report

print(
    '---',
    'Model 7: SVM - nonlinear kernel',
    'The confusion matrix is \n{}\n'.format(cm),
    'F1 score is {}\n'.format(f1),
    sep='\n',
)

---
Model 7: SVM - nonlinear kernel
The confusion matrix is 
[[89  8]
 [36 67]]

F1 score is               precision    recall  f1-score   support

           0       0.71      0.92      0.80        97
           1       0.89      0.65      0.75       103

    accuracy                           0.78       200
   macro avg       0.80      0.78      0.78       200
weighted avg       0.81      0.78      0.78       200




## 8. Training using K-nearest neighbors (KNN)

In [38]:
from sklearn.neighbors import KNeighborsClassifier

# K-nearest neighbors: majority vote over the 20 nearest training reviews,
# using the Minkowski distance with exponent p=6.
cf_knn = KNeighborsClassifier(n_neighbors = 20, metric = 'minkowski', p = 6)
cf_knn.fit(X_train, y_train)

# Evaluate on the held-out test set.
y_pred_cf_knn = cf_knn.predict(X_test)
cm = confusion_matrix(y_test, y_pred_cf_knn)
f1 = classification_report(y_test, y_pred_cf_knn)  # per-class text report
print('---')
# Fixed: the header was copy-pasted from the decision-tree cell and labelled
# these KNN results as 'Decision Tree'.
print('Model 8: K-nearest neighbors (KNN)')
print('The confusion matrix is \n{}\n'.format(cm))
print('F1 score is {}\n'.format(f1))

---
Model 8:  Decision Tree
The confusion matrix is 
[[54 43]
 [22 81]]

F1 score is               precision    recall  f1-score   support

           0       0.71      0.56      0.62        97
           1       0.65      0.79      0.71       103

    accuracy                           0.68       200
   macro avg       0.68      0.67      0.67       200
weighted avg       0.68      0.68      0.67       200




Summary: Gradient Boosting seems to give the best overall accuracy, precision, recall and F1 score on the test set; SVM (linear) is second.
Fine-tuning the features (the word vectorization step) is important.