In [85]:
import json
import spacy
import pandas as pd
import numpy as np
import collections

In [2]:
# read data file: one JSON object per line
# JSON text is UTF-8, so decode explicitly instead of relying on the
# platform-dependent default encoding of open()
with open('NV_review.json', encoding='utf-8') as f:
    # skip blank lines (e.g. a trailing empty line), which json.loads rejects
    data = [json.loads(line) for line in f if line.strip()]

In [50]:
# show the first parsed review record as a sample of the per-review fields
data[0]

{'review_id': 'kbtscdyz6lvrtGjD1quQTg',
 'user_id': 'FIk4lQQu1eTe2EpzQ4xhBA',
 'business_id': '8mIrX_LrOnAqWsB5JrOojQ',
 'stars': 4.0,
 'useful': 0,
 'funny': 0,
 'cool': 0,
 'text': 'Like walking back in time, every Saturday morning my sister and I was in a bowling league and after we were done, we\'d spend a few quarters playing the pin ball machines until our mother came to pick us up.\n\nMy sister was daring and play the machines hard, she was afraid of that "tilt" showing up and freezing the game.  I, on the other hand was a bit more gentler and wanted to make sure I got my quarter\'s worth.\n\nThis place has rows and rows of machines, some are really old and some are more of a mid 80\'s theme.  There is even a Ms pac man!  It was fun to spend an afternoon playing the machines and remembering all the fun of my early teen years.',
 'date': '2011-11-30 02:11:15'}

In [4]:
# total number of reviews (one parsed record per line of the input file)
num = len(data)

In [5]:
# display the review count
num

1405392

In [6]:
# distinct business IDs across all reviews
# NOTE: `id` shadows the Python builtin; the name is kept because a later cell reads it
id = {review['business_id'] for review in data}

In [7]:
# total number of restaurants (count of distinct business IDs seen in the reviews)
len(id)

7398

In [8]:
# Create a list of common words to remove
# (written as whitespace-separated text and split into a list of strings,
# which keeps the words in the order shown)
remove_words = """
    i me my myself we our ours ourselves you your yours yourself
    yourselves he him his himself she her hers herself it its itself
    they them their theirs themselves what which who whom this that these
    those am is are was were be been being have has had having do
    does did doing a an the and but if or because as until while
    of at by for with about against between into through during before
    after above below to from up down in out on off over under again
    further then once here there when where why how all any both each
    few more most other some such no nor not only own same so than
    too very s t can will just don should now
""".split()

In [9]:
# Load the pre-trained NLP model in spacy.
# This notebook only reads each token's text and part-of-speech tag, so the
# dependency parser and named-entity recognizer are disabled: they are not
# needed for POS tagging and are costly to run over very long combined texts.
nlp = spacy.load("en_core_web_lg", disable=["parser", "ner"])

In [48]:
# get most frequent words
def get_keywords(x, pos = 'NOUN', top_n = 5):
    """Return the most frequent words of one part of speech in a text.

    The text is run through the module-level spaCy pipeline ``nlp``. Tokens
    whose coarse part-of-speech tag equals ``pos`` are lower-cased, words
    listed in the module-level ``remove_words`` are dropped, and the rest
    are counted.

    Parameters
    ----------
    x : str
        Text to analyse.
    pos : str
        Part-of-speech tag to keep, compared against ``token.pos_``
        (e.g. 'NOUN', 'ADJ').
    top_n : int
        Maximum number of words to return (default 5).

    Returns
    -------
    list of str
        Up to ``top_n`` lower-cased words, most frequent first. Words with
        equal counts keep the order in which they first occur in the text.
        An empty list is returned when no token matches.
    """
    stop_words = set(remove_words)  # set gives O(1) membership tests
    counts = collections.Counter()
    for token in nlp(x):
        if token.pos_ != pos:
            continue
        # Lower-case BEFORE the stop-word check: remove_words is all lower
        # case, so a capitalized stop word (e.g. sentence-initial "Most")
        # would otherwise pass the filter and be counted.
        word = token.text.lower()
        if word not in stop_words:
            counts[word] += 1
    return [word for word, _ in counts.most_common(top_n)]

In [39]:
# combine reviews for each restaurant
# Texts are gathered per business and joined once at the end:
#  - joining with a newline keeps the last word of one review from fusing
#    with the first word of the next one;
#  - a single join avoids re-copying the growing string on every review.
texts_by_business = {}
buisiness_ids = []  # business IDs in order of first appearance in `data`
for review in data:
    business_id = review['business_id']
    if business_id not in texts_by_business:
        texts_by_business[business_id] = []
        buisiness_ids.append(business_id)
    texts_by_business[business_id].append(review['text'])

# business_id -> all of its review texts as one newline-separated string
reviews_id = {
    business_id: '\n'.join(texts_by_business[business_id])
    for business_id in buisiness_ids
}
del texts_by_business  # the intermediate lists are no longer needed

In [49]:
# sanity check: most frequent nouns in the combined reviews of the first business seen in the data
get_keywords(reviews_id[buisiness_ids[0]], pos = 'NOUN')

['machines', 'games', 'pinball', 'place', 'machine']

In [53]:
# calculate average star
# step 1: gather every review's star rating, grouped by business
avg_star = {}
for review in data:
    avg_star.setdefault(review['business_id'], []).append(review['stars'])

In [61]:
# step 2: replace each business's collected ratings with their arithmetic mean
avg_star = {business_id: np.mean(stars) for business_id, stars in avg_star.items()}

In [68]:
# final results include average star and keywords (top 5 nouns and top 5 adjs)
# Only the first MAX_CHARS characters of each business's combined text are analysed.
# NOTE(review): presumably chosen to stay within spaCy's maximum text length - confirm.
MAX_CHARS = 1000000
final_results = {}
for business_id in buisiness_ids:
    text = reviews_id[business_id][:MAX_CHARS]
    # get_keywords runs the NLP pipeline on each call, so the text is processed once per POS
    top_nouns = get_keywords(text, pos = 'NOUN')
    top_adjs = get_keywords(text, pos = 'ADJ')
    final_results[business_id] = {
        'star': avg_star[business_id],
        'keywords': top_nouns + top_adjs,
    }

In [95]:
# find out most common keywords in all reviews
# pool the per-business keyword lists into one flat list
keywords = [
    word
    for entry in final_results.values()
    for word in entry['keywords']
]

In [96]:
# 30 most frequent entries across the per-business keyword lists.
# Each count is the number of keyword lists a word appears in; a word can be counted
# twice for one business if it ranks among both its top nouns and its top adjectives.
collections.Counter(keywords).most_common(30)

[('good', 6682),
 ('great', 5813),
 ('food', 5779),
 ('place', 5234),
 ('service', 3294),
 ('nice', 2346),
 ('time', 1997),
 ('best', 1777),
 ('friendly', 1638),
 ('fresh', 1367),
 ('delicious', 1328),
 ('little', 1232),
 ('chicken', 945),
 ('restaurant', 838),
 ('bad', 829),
 ('bar', 822),
 ('amazing', 792),
 ('order', 743),
 ('pizza', 696),
 ('hot', 597),
 ('location', 581),
 ('fast', 539),
 ('fries', 496),
 ('sandwich', 492),
 ('mexican', 482),
 ('new', 420),
 ('better', 419),
 ('drinks', 394),
 ('breakfast', 376),
 ('rice', 364)]

In [None]:
# recommended keywords after filtering
# ['good', 'great','service', 'pizza', 'nice', 'friendly', 'chicken', 'amazing', 'fries', 'sandwich', 'mexican', 'breakfast', 'new']

In [75]:
# save results
# write the business_id -> {'star', 'keywords'} mapping to disk as a single JSON document
with open('results.json', 'w') as out_file:
    json.dump(final_results, out_file)