In [1]:
import json
import datetime
from collections import defaultdict

dayOfWeekReviews = defaultdict(dict)
with open('./data/yelp_academic_dataset_review.json', 'rb') as fread:
    for line in fread:
        jsonData = json.loads(line)
        date = datetime.datetime.strptime(jsonData['date'], '%Y-%m-%d')
        star = jsonData['stars']
        weekDay = date.weekday()
        
        weekDayStars = dayOfWeekReviews[weekDay]
        if star in weekDayStars:
            weekDayStars[star] += 1
        else:
            weekDayStars[star] = 1
            
        dayOfWeekReviews[weekDay] = weekDayStars

for k, v in dayOfWeekReviews.iteritems():
    print k, v


0 {1: 24514, 2: 22501, 3: 36661, 4: 75243, 5: 88103}
1 {1: 22734, 2: 20642, 3: 33634, 4: 70039, 5: 85669}
2 {1: 22282, 2: 20231, 3: 32798, 4: 69135, 5: 87414}
3 {1: 20965, 2: 18322, 3: 29390, 4: 62863, 5: 80326}
4 {1: 21701, 2: 18075, 3: 28055, 4: 59687, 5: 77408}
5 {1: 23251, 2: 19461, 3: 28929, 4: 60933, 5: 79878}
6 {1: 24364, 2: 21376, 3: 33252, 4: 68699, 5: 80729}


In [2]:
for i in xrange(7):
    d = dayOfWeekReviews[i]
    print d[1], d[2], d[3], d[4], d[5], i

24514 22501 36661 75243 88103 0
22734 20642 33634 70039 85669 1
22282 20231 32798 69135 87414 2
20965 18322 29390 62863 80326 3
21701 18075 28055 59687 77408 4
23251 19461 28929 60933 79878 5
24364 21376 33252 68699 80729 6


In [3]:
import json
import datetime
from collections import defaultdict

businessTypes = defaultdict(int)
businessCountByCity = defaultdict(int)
businessTypeCountByCity = defaultdict(lambda : defaultdict(int))

with open('./data/yelp_academic_dataset_business.json', 'rb') as fread:
    for line in fread:
        jsonData = json.loads(line)
        city = jsonData['city']
        state = jsonData['state']
        categories = jsonData['categories']
        for cat in categories:
            businessTypes[cat] += 1
            businessTypeCountByCity[(city, state)][cat] += 1
            
        businessCountByCity[(city, state)] += 1
        

In [10]:
print len(businessCountByCity)

391


In [5]:
#for k, v in businessCountByCity.iteritems():
#    print k[0], '|', k[1], '|', v
import operator

print max(businessCountByCity.iteritems(), key=operator.itemgetter(1))[1]

13600


In [8]:
value = 1
dailyIncrease = .18/100
for i in xrange(36):
    value += value * dailyIncrease

print value

1.92785774278


In [8]:
reviewLengthByStar = defaultdict(list)
numOfStarsByUid = defaultdict(lambda : defaultdict(int))
with open('./data/yelp_academic_dataset_review.json', 'rb') as fread:
    for line in fread:
        jsonData = json.loads(line)
        star = jsonData['stars']
        weekDay = date.weekday()
        text = jsonData['text']
        userId = jsonData['user_id']
        reviewLengthByStar[star].append(len(text))
        numOfStarsByUid[userId][star] += 1


In [13]:
import numpy as np
for star in reviewLengthByStar:
    reviewLengths = reviewLengthByStar[star]
    print star, np.mean(reviewLengths), np.std(reviewLengths), len(reviewLengths)

1 808.677519069 754.153286099 159811
2 822.279656919 700.89973176 140608
3 753.1022589 637.060557939 222719
4 674.873238048 591.743440854 466599
5 574.621950315 553.945944464 579527


In [None]:
for cat, count in businessTypes.iteritems():
    print cat, '|', count

In [1]:
import toyplot.color
toyplot.color.Palette()

In [None]:
import numpy as np

np.random.seed(1234)
observations = np.random.normal(size=(50, 50))

x = np.linspace(0, 1, len(observations))
y1 = np.min(observations, axis=1)
y2 = np.max(observations, axis=1)

canvas = toyplot.Canvas(width=400, height=300)
axes = canvas.axes()
mark = axes.fill(x, y1, y2)

In [2]:
from pymongo import MongoClient
import json
import datetime

In [12]:
client = MongoClient('localhost', 27017)
db = client.yelp_dataset
business = db.business

In [13]:
usBoundingBox = ((-124.848974, 24.396308), (-66.885444, 49.384358))
businessCursor = business.find({'categories':'Restaurants', 
                                'longitude': {'$lte': -66.885444}, 
                                'longitude': {'$gte': -124.848974}})

In [14]:
allBusiness = [b for b in businessCursor]

In [15]:
print len(allBusiness)
print allBusiness[12]

21892
{u'city': u'Homestead', u'review_count': 3, u'name': u"Eat'n Park Hospitality Group", u'neighborhoods': [u'Homestead'], u'open': True, u'business_id': u'sRqB6flj3GtTZIZJQxf_oA', u'full_address': u'285 Waterfront Dr E\nHomestead\nHomestead, PA 15120', u'hours': {}, u'state': u'PA', u'longitude': -79.9123428, u'stars': 2.5, u'latitude': 40.4116918, u'attributes': {u'Noise Level': u'very_loud', u'Parking': {u'garage': False, u'street': False, u'validated': False, u'lot': False, u'valet': False}, u'Alcohol': u'beer_and_wine', u'Attire': u'casual'}, u'_id': ObjectId('55d1dfe9fa8def3669b5fcf5'), u'type': u'business', u'categories': [u'Restaurants']}


In [21]:
reviews = db.reviews
tags = db.tags
tagCursor = tags.find()
tagged_reviews = [t['review_id'] for t in tagCursor]
print len(tagged_reviews), tagged_reviews[0]
businessIds = [b['business_id'] for b in allBusiness]
reviewCur = reviews.find({'business_id':{'$in':businessIds}})
usRestaurantReviews = [r for r in reviewCur]

825520 SEw9aEpjlwLeskowgE88Cg


In [24]:
print len(usRestaurantReviews)
print usRestaurantReviews[0]
allRestReviews = [r['review_id'] for r in usRestaurantReviews]
notTagged = set(allRestReviews) - set(tagged_reviews)

990627
{u'votes': {u'funny': 1, u'useful': 1, u'cool': 1}, u'user_id': u'DBz7nWHg6tLK1JzLFOtU3A', u'review_id': u'RdFot8wgSaCtPUatrAkiqA', u'text': u'Great guys, great food. Plus right next to one of my favorite bars..', u'business_id': u'--5jkZ3-nUPZxUvtcbr8Uw', u'stars': 5, u'date': datetime.datetime(2011, 7, 31, 0, 0), u'_id': ObjectId('55d1de3bfa8def3669b0fb65'), u'type': u'review'}


In [29]:
notTaggedReviews = list(notTagged)
import nltk
from nltk.corpus import stopwords

stopwords = stopwords.words('english')

In [None]:
import multiprocessing

def worker(id, db, skip, limit):
    reviewColls = db.reviews
    tagColls = db.tags
    reviewCur = reviewColls.find({'business_id':{'$in':businessIds}}).skip(skip+limit).limit(limit)
    bulkInsertData = []
    for review in reviewCur:
        words = []
        reviewText = review['text'].lower()
        businessId = review['business_id']
        userId = review['user_id']
        date = review['date']
        reviewId = review['review_id']
        stars = review['stars']
        sentences = nltk.sent_tokenize(reviewText)
        for sentence in sentences:
            tokens = nltk.word_tokenize(sentence)
            text = [word for word in tokens if word not in stopwords]
            tagged_text = nltk.pos_tag(text)

            for word, tag in tagged_text:
                words.append({"word": word, "pos": tag})

            insertData = {'business_id': businessId,
                          'user_id': userId,
                          'date': date,
                          'review_id': reviewId,
                          'stars': stars,
                          'words': words}
        bulkInsertData.append(insertData)
    tagColls.insert_many(bulkInsertData)

numWorkers = 6
reviewCur = reviews.find({'business_id':{'$in':businessIds}})
totalReviews = reviewCur.count()
batchSize = totalReviews/numWorkers
extra = totalReviews % numWorkers

jobs = []
for i in xrange(numWorkers):
    left = 0
    if i == (numWorkers - 1):
        left = extra
    p = multiprocessing.Process(target=worker, args=((i + 1), db, i * batchSize, batchSize + left))
    jobs.append(p)
    p.start()

for j in jobs:
    j.join()

In [30]:
reviewColls = db.reviews
tagColls = db.tags
print notTaggedReviews[0]
reviewCur = reviewColls.find({'review_id':{'$in': notTaggedReviews}})
bulkInsertData = []
for review in reviewCur:
        words = []
        reviewText = review['text'].lower()
        businessId = review['business_id']
        userId = review['user_id']
        date = review['date']
        reviewId = review['review_id']
        stars = review['stars']

        tokens = nltk.sent_tokenize(reviewText)
        text = [word for word in tokens if word not in stopwords]
        tagged_text = nltk.pos_tag(text)

        for word, tag in tagged_text:
            words.append({"word": word, "pos": tag})
            
        insertData = {'business_id': businessId,
                      'user_id': userId,
                      'date': date,
                      'review_id': reviewId,
                      'stars': stars,
                      'words': words}
        bulkInsertData.append(insertData)
tagColls.insert_many(bulkInsertData)

u5_dcj2jVs_Ni9yF2LNMAg


<pymongo.results.InsertManyResult at 0x22651d6e0>