In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Tab-separated review dataset; this is an absolute path on the author's machine.
DATA_PATH = r'C:\Users\umut3\PycharmProjects\pythonProject\Restaurant_Reviews.tsv'

# quoting=3 is csv.QUOTE_NONE: quote characters in the text are treated as ordinary characters.
df = pd.read_csv(DATA_PATH, sep='\t', quoting=3)

# Fetch the NLTK stop-word lists (a no-op when they are already installed).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\umut3\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Helper function: print accuracy, precision, recall and F1 for one model
def metrics(y_test, y_pred, model_name):
    """Print a one-row table of classification scores for a single model.

    Parameters
    ----------
    y_test
        True labels of the evaluation samples.
    y_pred
        Labels predicted for the same samples.
    model_name
        Label used as the row index of the printed table.

    Returns
    -------
    None
        The table is only printed.
    """
    # One column per score; dict insertion order fixes the column order.
    scores = {
        "accuracy_score": accuracy_score(y_test, y_pred),
        "precision_score": precision_score(y_test, y_pred),
        "recall_score": recall_score(y_test, y_pred),
        "f1_score": f1_score(y_test, y_pred),
    }
    data = pd.DataFrame(scores, index=[model_name])
    print(data)

In [5]:
# --- Text cleaning -----------------------------------------------------------
# Loop-invariant setup is done once instead of once per review.
ps = PorterStemmer()

# Negation words are kept out of the stop-word list so they survive cleaning.
negations = {'not', "isn't", "don't", "doesn't", "wouldn't", "aren't", "didn't",
             "couldn't", "wasn't", 'no', "won't"}
# Filtering (rather than list.remove) cannot raise if NLTK's list lacks one of these words.
all_stopwords = [w for w in stopwords.words('english') if w not in negations]
# Set for O(1) membership tests inside the loop.
stopword_set = set(all_stopwords)
# NOTE(review): the regex below turns apostrophes into spaces, so "isn't" reaches the
# filter as "isn" and "t"; the apostrophe forms listed above can therefore never match
# a token. Left as-is to keep the features unchanged -- confirm whether this is intended.

corpus = []

# Iterate over every review actually present instead of a hard-coded 1000 rows,
# so the corpus always has the same length as the label vector below.
for text in df['Review']:
    # Keep letters only, lowercase, and split on whitespace.
    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    # Drop stop words, stem what remains, and re-join into one string per review.
    review = ' '.join(ps.stem(word) for word in words if word not in stopword_set)
    corpus.append(review)

# --- Bag-of-words features ---------------------------------------------------
# Vocabulary is capped at the 1530 most frequent terms; the sparse matrix is made dense.
cv = CountVectorizer(max_features=1530)
X = cv.fit_transform(corpus).toarray()
# Labels are taken from the last column of the frame.
y = df.iloc[:, -1].values


# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=0)

In [6]:
# Train a linear-kernel SVC on the training split (fit returns the fitted estimator),
# predict the held-out split, and print its scores.
classifier = SVC(kernel='linear', random_state=0).fit(X_train, y_train.ravel())
y_pred = classifier.predict(X_test)
metrics(y_test=y_test, y_pred=y_pred, model_name="SVM")

     accuracy_score  precision_score  recall_score  f1_score
SVM           0.815          0.84375      0.786408   0.81407


In [7]:
# Train a 10-tree random forest with the entropy split criterion
# (fit returns the fitted estimator), predict the held-out split, and print its scores.
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion="entropy",
    random_state=0,
).fit(X_train, y_train.ravel())
y_pred = classifier.predict(X_test)
metrics(y_test=y_test, y_pred=y_pred, model_name="RandomForestClassifier")

                        accuracy_score  precision_score  recall_score  \
RandomForestClassifier            0.78          0.90411      0.640777   

                        f1_score  
RandomForestClassifier      0.75  


In [None]:
# Train a single decision tree with the entropy split criterion
# (fit returns the fitted estimator), predict the held-out split, and print its scores.
classifier = DecisionTreeClassifier(
    criterion="entropy",
    random_state=0,
).fit(X_train, y_train.ravel())
y_pred = classifier.predict(X_test)
metrics(y_test=y_test, y_pred=y_pred, model_name="DecisionTreeClassifier")