In [15]:
# Load the Yelp review data and print a short overview of its contents.
import json
from pprint import pprint

# Decode explicitly as UTF-8 rather than relying on the platform's default
# text encoding, which varies between machines and can fail on (or garble)
# non-ASCII review text.
with open('data/yelp.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# show some basic info
print('Number of reviews:', len(data))
print()

# Only the first entry is inspected for the key listing and the sample below;
# this assumes the file holds a non-empty list of dict-like entries that all
# share the same keys -- TODO confirm for other data files.
print('Keys in each entry: ')
print(', '.join(data[0].keys()))
print('')

print('First entry: ')
pprint(data[0])

Number of reviews: 10000

Keys in each entry: 
business_id, date, review_id, stars, text, type, user_id, cool, useful, funny

First entry: 
{'business_id': '9yKzy9PApeiPPOUJEtnvkg',
 'cool': '2',
 'date': '2011-01-26',
 'funny': '0',
 'review_id': 'fWKvX83p0-ka4JS3dc6E5A',
 'stars': '5',
 'text': 'My wife took me here on my birthday for breakfast and it was '
         'excellent.  The weather was perfect which made sitting outside '
         'overlooking their grounds an absolute pleasure.  Our waitress was '
         'excellent and our food arrived quickly on the semi-busy Saturday '
         'morning.  It looked like the place fills up pretty quickly so the '
         'earlier you get here the better.\n'
         '\n'
         'Do yourself a favor and get their Bloody Mary.  It was phenomenal '
         "and simply the best I've ever had.  I'm pretty sure they only use "
         'ingredients from their garden and blend them fresh when you order '
         'it.  It was amazing.\n'
  

In [32]:
# How many distinct businesses do the reviews cover?
# First gather the business id of every entry (duplicates included), then
# collapse that list into a set so each id is kept only once.
business_ids = [record['business_id'] for record in data]
unique_business_ids = set(business_ids)
unique_count = len(unique_business_ids)

print('Number of unique business ids:', unique_count)

Number of unique business ids: 4174


In [17]:
# Split the text of the first review into word-level tokens with NLTK.
# NOTE(review): NLTK tokenizers usually need their model data downloaded
# beforehand -- confirm this is set up in the target environment.
from nltk.tokenize import word_tokenize

first_entry = data[0]
review = first_entry['text']
tokens = word_tokenize(review)

# Show the raw text first, then how many tokens it produced and the tokens
# themselves. `pprint` is not imported here; it must already be in scope.
pprint(review)
print()
print('Number of tokens: ', len(tokens))
pprint(tokens)



('My wife took me here on my birthday for breakfast and it was excellent.  The '
 'weather was perfect which made sitting outside overlooking their grounds an '
 'absolute pleasure.  Our waitress was excellent and our food arrived quickly '
 'on the semi-busy Saturday morning.  It looked like the place fills up pretty '
 'quickly so the earlier you get here the better.\n'
 '\n'
 'Do yourself a favor and get their Bloody Mary.  It was phenomenal and simply '
 "the best I've ever had.  I'm pretty sure they only use ingredients from "
 'their garden and blend them fresh when you order it.  It was amazing.\n'
 '\n'
 'While EVERYTHING on the menu looks excellent, I had the white truffle '
 'scrambled eggs vegetable skillet and it was tasty and delicious.  It came '
 'with 2 pieces of their griddled bread with was amazing and it absolutely '
 'made the meal complete.  It was the best "toast" I\'ve ever had.\n'
 '\n'
 "Anyway, I can't wait to go back!")

Number of tokens:  175
['My',
 'wife',

In [24]:
# Sentence tokenization ("sentencizing"): split the first review into
# sentences, then word-tokenize each sentence, so that `sentences` ends up
# as a list of token lists (one inner list per sentence).
from nltk.tokenize import sent_tokenize

review = data[0]['text']
# `word_tokenize` is not imported in this cell; it must already be in scope.
sentences = list(map(word_tokenize, sent_tokenize(review)))

# Raw text, then the sentence count, then one token list per line.
pprint(review)
print()
print('Number of sentences: ', len(sentences))
for sentence in sentences:
    print(sentence)
print()



('My wife took me here on my birthday for breakfast and it was excellent.  The '
 'weather was perfect which made sitting outside overlooking their grounds an '
 'absolute pleasure.  Our waitress was excellent and our food arrived quickly '
 'on the semi-busy Saturday morning.  It looked like the place fills up pretty '
 'quickly so the earlier you get here the better.\n'
 '\n'
 'Do yourself a favor and get their Bloody Mary.  It was phenomenal and simply '
 "the best I've ever had.  I'm pretty sure they only use ingredients from "
 'their garden and blend them fresh when you order it.  It was amazing.\n'
 '\n'
 'While EVERYTHING on the menu looks excellent, I had the white truffle '
 'scrambled eggs vegetable skillet and it was tasty and delicious.  It came '
 'with 2 pieces of their griddled bread with was amazing and it absolutely '
 'made the meal complete.  It was the best "toast" I\'ve ever had.\n'
 '\n'
 "Anyway, I can't wait to go back!")

Number of sentences:  12
['My', 'wife'

In [31]:
# The previous example went one step further and split every sentence into
# word tokens. When plain sentence strings are all that is needed,
# sent_tokenize on its own is enough:
from nltk.tokenize import sent_tokenize

review = data[0]['text']
sentences = sent_tokenize(review)

# Print the heading, then each sentence string on its own line.
print('Sentences only (no word tokens):')
for sentence in sentences:
    print(sentence)

Sentences only (no word tokens):
My wife took me here on my birthday for breakfast and it was excellent.
The weather was perfect which made sitting outside overlooking their grounds an absolute pleasure.
Our waitress was excellent and our food arrived quickly on the semi-busy Saturday morning.
It looked like the place fills up pretty quickly so the earlier you get here the better.
Do yourself a favor and get their Bloody Mary.
It was phenomenal and simply the best I've ever had.
I'm pretty sure they only use ingredients from their garden and blend them fresh when you order it.
It was amazing.
While EVERYTHING on the menu looks excellent, I had the white truffle scrambled eggs vegetable skillet and it was tasty and delicious.
It came with 2 pieces of their griddled bread with was amazing and it absolutely made the meal complete.
It was the best "toast" I've ever had.
Anyway, I can't wait to go back!


In [29]:
# Tally how often each token occurs in the first review.
from collections import Counter

review = data[0]['text']
# `word_tokenize` is not imported in this cell; it must already be in scope.
tokens = word_tokenize(review)
token_counts = Counter(tokens)

# Rank every (token, count) pair once, most frequent first, and slice both
# ends of that ranking for the two listings below.
ranked_counts = token_counts.most_common()

# Head of the ranking: the ten most frequent tokens
print('Ten most common tokens and counts:')
pprint(ranked_counts[:10])
print()

# Tail of the ranking: the ten least frequent tokens
print('Ten least common tokens and counts:')
pprint(ranked_counts[-10:])


Ten most common tokens and counts:
[('.', 11),
 ('the', 9),
 ('and', 8),
 ('was', 8),
 ('It', 5),
 ('I', 5),
 ('it', 4),
 ('their', 4),
 ('on', 3),
 ('excellent', 3)]

Ten least common tokens and counts:
[('toast', 1),
 ("''", 1),
 ('Anyway', 1),
 ('ca', 1),
 ("n't", 1),
 ('wait', 1),
 ('to', 1),
 ('go', 1),
 ('back', 1),
 ('!', 1)]


In [30]:
# Stem every token of the first review with NLTK's Porter stemmer and print
# each original token next to its stem.
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

review = data[0]['text']
# `word_tokenize` is not imported in this cell; it must already be in scope.
tokens = word_tokenize(review)

# Stem each token exactly once and keep the results, aligned with `tokens`.
stemmed_tokens = [ps.stem(t) for t in tokens]

# Reuse the stems computed above instead of stemming every token a second
# time inside the loop; the printed pairs are the same.
for tok, stemmed in zip(tokens, stemmed_tokens):
    print(tok, '=>', stemmed)

My => my
wife => wife
took => took
me => me
here => here
on => on
my => my
birthday => birthday
for => for
breakfast => breakfast
and => and
it => it
was => wa
excellent => excel
. => .
The => the
weather => weather
was => wa
perfect => perfect
which => which
made => made
sitting => sit
outside => outsid
overlooking => overlook
their => their
grounds => ground
an => an
absolute => absolut
pleasure => pleasur
. => .
Our => our
waitress => waitress
was => wa
excellent => excel
and => and
our => our
food => food
arrived => arriv
quickly => quickli
on => on
the => the
semi-busy => semi-busi
Saturday => saturday
morning => morn
. => .
It => it
looked => look
like => like
the => the
place => place
fills => fill
up => up
pretty => pretti
quickly => quickli
so => so
the => the
earlier => earlier
you => you
get => get
here => here
the => the
better => better
. => .
Do => do
yourself => yourself
a => a
favor => favor
and => and
get => get
their => their
Bloody => bloodi
Mary => mari
. => .
It =>