In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

In [2]:
dataset = pd.read_csv('data/Restaurant_Reviews.tsv', sep='\t')
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [10]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

corpus = []

for i in range(len(dataset)):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()

    ps = PorterStemmer()

    #remove irrelevant words and get word stem
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    
    corpus.append(review)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tsansom/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
Y = dataset['Liked'].values

In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=0)

In [62]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix

In [69]:
models = {'Decision Tree': DecisionTreeClassifier(criterion='entropy'),
          'Naive Bayes': GaussianNB(),
          'K Nearest Neighbor': KNeighborsClassifier(),
          'SVM': SVC(kernel='linear', random_state=0),
          'Random Forrest': RandomForestClassifier(n_estimators=20, criterion='entropy', random_state=0),
          'Multinominal Naive Bayes': MultinomialNB()}

In [70]:
for i in models:
    model = models[i]
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)
    cm = confusion_matrix(Y_test, y_pred)
    accuracy = (cm[0,0] + cm[1,1]) / cm.sum()
    precision = cm[1,1] / (cm[0,1] + cm[1,1])
    recall = cm[1,1] / (cm[0,1] + cm[1,0])
    f1_score = (2 * precision * recall) / (precision + recall)
    print('{0}\n------------------------'.format(i))
    print('\tAccuracy: {0:.2f}\n\tPrecision: {1:.2f}\n\tRecall: {2:.2f}\n\tF1 Score: {3:.2f}\n'.format(accuracy,
                                                                                     precision,
                                                                                     recall,
                                                                                     f1_score))

Multinominal Naive Bayes
------------------------
	Accuracy: 0.77
	Precision: 0.76
	Recall: 1.72
	F1 Score: 1.06

Random Forrest
------------------------
	Accuracy: 0.72
	Precision: 0.84
	Recall: 1.04
	F1 Score: 0.93

SVM
------------------------
	Accuracy: 0.72
	Precision: 0.75
	Recall: 1.25
	F1 Score: 0.94

K Nearest Neighbor
------------------------
	Accuracy: 0.61
	Precision: 0.68
	Recall: 0.62
	F1 Score: 0.64

Naive Bayes
------------------------
	Accuracy: 0.73
	Precision: 0.68
	Recall: 1.69
	F1 Score: 0.97

Decision Tree
------------------------
	Accuracy: 0.69
	Precision: 0.73
	Recall: 1.00
	F1 Score: 0.85

