## Load data

In [124]:
from __future__ import absolute_import, print_function, division

import json

In [125]:
# load yelp review data
import io

yelp_datapath = '../dataset/yelp/yelp_academic_dataset_review.json'

# Upper bound on how many lines (one review each) are read from the file.
LINE_LIMIT = 10000
dataset = []

# Each line is parsed on its own as a separate JSON document.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale; io.open accepts `encoding` on Python 2 and 3.
with io.open(yelp_datapath, 'r', encoding='utf-8') as f:
    num_line = 0
    for line in f:
        num_line += 1
        dataset.append(json.loads(line))
        # Stop early once LINE_LIMIT records have been collected.
        if num_line == LINE_LIMIT:
            break

In [126]:
# Show the first loaded review: its text, then its star rating.
first_review = dataset[0]
print(first_review['text'], "\n")
print("stars =", first_review['stars'])

The pizza was okay. Not the best I've had. I prefer Biaggio's on Flamingo / Fort Apache. The chef there can make a MUCH better NY style pizza. The pizzeria @ Cosmo was over priced for the quality and lack of personality in the food. Biaggio's is a much better pick if youre going for italian - family owned, home made recipes, people that actually CARE if you like their food. You dont get that at a pizzeria in a casino. I dont care what you say... 

stars = 2


## Using traditional ML models (Naive Bayes, SVM, ...) with Bag of words

In [127]:
from sklearn.model_selection import StratifiedShuffleSplit

from collections import Counter
import multiprocessing
import time

import nltk
from nltk.tokenize import word_tokenize

# Pair every review's text with its star rating.
raw_data = [(review['text'], review['stars']) for review in dataset]

# NOTE: the tokenizing / stop-word removal step below is disabled.
# If it is re-enabled it needs `stop_words`, which this notebook only
# defines in a later cell.
# # tokenize documents and remove stop words
# def text_processing(d):
#     text = map(lambda x: x.lower(), word_tokenize(d[0]))
#     text = filter(lambda x: x not in stop_words, text)
#     text = ' '.join(text)
#     return (text, d[1])

# start = time.time()
# pool = multiprocessing.Pool(processes=4)
# raw_data = pool.map(text_processing, raw_data)
# pool.close()
# print("Time for tokenizing = ", (time.time() - start))

# Separate the (text, stars) pairs into two parallel sequences.
documents, labels = zip(*raw_data)

# A single stratified shuffle split, 80% train / 20% test, with a fixed seed.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(sss.split(documents, labels))


def _take(items, indices):
    """Return the elements of `items` at the given positions, as a list."""
    return [items[pos] for pos in indices]


X_train, y_train = _take(documents, train_index), _take(labels, train_index)
X_test, y_test = _take(documents, test_index), _take(labels, test_index)

# sanity check: split sizes, then per-class label counts for each split
print(len(X_train), len(X_test))
print(Counter(y_train), Counter(y_test))

8000 2000
Counter({5: 3661, 4: 1559, 1: 1359, 3: 733, 2: 688}) Counter({5: 915, 4: 390, 1: 340, 3: 183, 2: 172})


In [128]:
# build a data pipeline
# CountVectorizer / TfidfTransformer are imported here because no other
# visible cell imports them; without this the cell fails on a fresh kernel.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB

# English stop-word list from NLTK (the 'stopwords' corpus must be
# available locally for this lookup to succeed).
stop_words = nltk.corpus.stopwords.words('english')


def make_text_pipeline(classifier):
    """Build a text-classification pipeline around `classifier`.

    The pipeline turns raw documents into token counts (dropping
    `stop_words`), re-weights the counts with TF-IDF, and feeds the result
    to `classifier`. Every call creates fresh vectorizer/transformer
    instances, so pipelines built here do not share fitted state.

    Args:
        classifier: an unfitted scikit-learn estimator used as the final step.

    Returns:
        A `Pipeline` with steps named 'vect', 'tfidf' and 'clf'.
    """
    return Pipeline([
        ('vect', CountVectorizer(stop_words=stop_words)),
        ('tfidf', TfidfTransformer()),
        ('clf', classifier),
    ])


# SGD with hinge loss and an L2 penalty, i.e. a linear SVM trained by SGD.
svm_clf = make_text_pipeline(
    SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42,
                  max_iter=1000, tol=1e-3)
)

naive_bayes_clf = make_text_pipeline(MultinomialNB())

# fit the data
svm_clf.fit(X_train, y_train)
naive_bayes_clf.fit(X_train, y_train)

# train scores (SVM first, then Naive Bayes)
print("train scores:")
print(svm_clf.score(X_train, y_train))
print(naive_bayes_clf.score(X_train, y_train))

# test scores (same order)
print("test scores:")
print(svm_clf.score(X_test, y_test))
print(naive_bayes_clf.score(X_test, y_test))

train scores:
0.6795
0.51975
test scores:
0.5935
0.4895


In [129]:
from sklearn.metrics import confusion_matrix

# Inspect where the SVM pipeline's test-set predictions land per class.
y_pred = svm_clf.predict(X_test)

# Fix the row/column order to star ratings 1..5.
star_labels = [1, 2, 3, 4, 5]
svm_confusion = confusion_matrix(y_test, y_pred, labels=star_labels)
print(svm_confusion)

[[245   4   1   1  89]
 [ 67  12   4   8  81]
 [ 28   8   7  24 116]
 [ 17   3   2  31 337]
 [ 10   0   3  10 892]]


## Deep learning