In [1]:
import numpy
import gzip
from collections import defaultdict
import scipy.optimize
import random
from math import exp
from math import log
from sklearn import svm

In [2]:
def parseData(fname):
  for l in open(fname):
    yield eval(l)

In [3]:
print "Reading data..."
reviewdata = list(parseData("yelp_training_set_review.json"))
checkindata = list(parseData("yelp_training_set_checkin.json"))
userdata = list(parseData("yelp_training_set_user.json"))
businessdata = list(parseData("yelp_training_set_business.json"))
print "done"

Reading data...
done


In [4]:
# Divide the review data
train_data = reviewdata[:100000]
valid_data = reviewdata[100000:200000]
test_data = reviewdata[200000:]

In [5]:
def inner(x,y):
  return sum([x[i]*y[i] for i in range(len(x))])

In [6]:
def sigmoid(x):
  return 1.0 / (1 + exp(-x))

In [7]:
# NEGATIVE Log-likelihood
def f(theta, X, y, lam):
  loglikelihood = 0
  for i in range(len(X)):
    logit = inner(X[i], theta)
    loglikelihood -= log(1 + exp(-logit))
    if not y[i]:
      loglikelihood -= logit
  for k in range(len(theta)):
    loglikelihood -= lam * theta[k]*theta[k]
  return -loglikelihood

In [8]:
# NEGATIVE Derivative of log-likelihood
def fprime(theta, X, y, lam):
  dl = [0]*len(theta)
  for i in range(len(X)):
    logit = inner(X[i], theta)
    for k in range(len(theta)):
      dl[k] += X[i][k] * (1 - sigmoid(logit))
      if not y[i]:
        dl[k] -= X[i][k]
  for k in range(len(theta)):
    dl[k] -= lam*2*theta[k]
  return numpy.array([-x for x in dl])

In [9]:
train_ratings=[]
for r in train_data:
    train_ratings.append(r['stars'])

In [10]:
# Obtain the evaluation
y_train=[]
for r in train_ratings:
  if r > 3.5:
    y_train.append(1)
  else:
    y_train.append(0)

In [11]:
trainWeek = []
trainCount = []
trainReview= []
trainAvg = []
trainStar = []
trainCheckin = []
trainBCount = []
userRatings = defaultdict(list)
userCounts = defaultdict(list)
businessCounts = defaultdict(list)
businessCheckin = defaultdict(list)
businessStar = defaultdict(list)
ratingAvg = 0
countAvg = 0
checkinAvg = 0
starAvg = 0
countAvg2 = 0


for u in userdata:
    user = u['user_id']
    userRatings[user] = u['average_stars']
    userCounts[user] = u['review_count']
    ratingAvg += u['average_stars']
    countAvg += u['review_count']

for u in checkindata:
    business = u['business_id']
    nCheckin = 0
    for t in u['checkin_info']:
        nCheckin += u['checkin_info'][t]
    businessCheckin[business] = nCheckin
    checkinAvg += nCheckin

for u in businessdata:
    business = u['business_id']
    businessStar[business] = u['stars']
    businessCounts[business] = u['review_count']
    countAvg2 +=u['review_count']
    starAvg += u['stars']
    
ratingAvg /= len(userdata)
countAvg /= len(userdata)
starAvg /= len(businessdata)
checkinAvg /= len(checkindata)
countAvg2 /= len(businessdata)
    
for r in train_data:
    user, business = r['user_id'], r['business_id']
    trainReview.append(len(r['text'].split()))
    if user in userRatings:
        trainAvg.append(userRatings[user])
        trainCount.append(userCounts[user])
    else:
        trainAvg.append(ratingAvg)
        trainCount.append(countAvg)
    if business in businessCheckin:
        trainCheckin.append(businessCheckin[business])
        trainBCount.append(businessCounts[business])
    else:
        trainCheckin.append(checkinAvg)
        trainBCount.append(countAvg2)
    if business in businessStar:
        trainStar.append(businessStar[business])
    else:
        trainStar.append(starAvg)

In [12]:
def feature(reviewWords, avgRating, Count, nCheckin, star):
    feat = [[1] for row in range(len(Count))]
    feat = numpy.column_stack((feat, reviewWords, avgRating, Count, nCheckin, star))
    return feat

In [13]:
X_train = feature(trainReview, trainAvg, trainCount, trainCheckin, trainStar)

In [14]:
# Use a library function to run gradient descent (or you can implement yourself!)
theta,l,info = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X_train[0]), fprime, args = (X_train, y_train, 1.0))

In [15]:
validReview = []
validAvg = []
validCount = []
validCheckin = []
validStar= []
validBCount = []

for r in valid_data:
    user, business = r['user_id'], r['business_id']
    validReview.append(len(r['text'].split()))
    if user in userRatings:
        validAvg.append(userRatings[user])
        validCount.append(userCounts[user])
    else:
        validAvg.append(ratingAvg)
        validCount.append(countAvg)
    if business in businessCheckin:
        validCheckin.append(businessCheckin[business])
    else:
        validCheckin.append(checkinAvg)
    if business in businessStar:
        validStar.append(businessStar[business])
        validBCount.append(businessCounts[business])
    else:
        validStar.append(starAvg)
        validBCount.append(countAvg2)

In [17]:
y_predictions1 = []
X_valid = feature(validReview, validAvg, validCount, validCheckin, validStar)

In [18]:
for u in range(len(X_valid)):
    y_predictions1.append(inner(X_valid[u],theta))

In [19]:
minimum1 = 0
maximum1 = 0
for u in range(len(y_predictions1)):
    minimum1 = min(y_predictions1[u], minimum1)
    maximum1 = max(y_predictions1[u], maximum1)

In [42]:
predict_ratings1 = []
for u in range(len(y_predictions1)):
    if y_predictions1[u] > 0:
        predict_ratings1.append(3.5 + 1.5 * (y_predictions1[u] / maximum1))
    else:
        predict_ratings1.append(3.5 - 2.5 * (y_predictions1[u] / minimum1))
    #predict_ratings1[u] = round(predict_ratings1[u], 1)

In [43]:
valid_ratings=[]
for r in valid_data:
    valid_ratings.append(r['stars'])

In [44]:
MSE1 = 0
for i in range(len(valid_ratings)):
    MSE1 += (predict_ratings1[i] - valid_ratings[i])**2
MSE1 /= len(valid_ratings)
print MSE1

1.03717992678


In [23]:
testReview = []
testAvg = []
testCount = []
testCheckin = []
testStar= []
testBCount = []

for r in test_data:
    user, business = r['user_id'], r['business_id']
    testReview.append(len(r['text'].split()))
    if user in userRatings:
        testAvg.append(userRatings[user])
        testCount.append(userCounts[user])
    else:
        testAvg.append(ratingAvg)
        testCount.append(countAvg)
    if business in businessCheckin:
        testCheckin.append(businessCheckin[business])
    else:
        testCheckin.append(checkinAvg)
    if business in businessStar:
        testStar.append(businessStar[business])
        testBCount.append(businessCounts[business])
    else:
        testStar.append(starAvg)
        testBCount.append(countAvg2)

In [24]:
y_predictions2 = []
X_test = feature(testReview, testAvg, testCount, testCheckin, testStar)

In [25]:
for u in range(len(X_test)):
    y_predictions2.append(inner(X_test[u],theta))

In [26]:
minimum2 = 0
maximum2 = 0
for u in range(len(y_predictions2)):
    minimum2 = min(y_predictions2[u], minimum2)
    maximum2 = max(y_predictions2[u], maximum2)

In [45]:
predict_ratings2 = []
for u in range(len(y_predictions2)):
    if y_predictions2[u] > 0:
        predict_ratings2.append(3.5 + 1.5 * (y_predictions2[u] / maximum2))
    else:
        predict_ratings2.append(3.5 - 2.5 * (y_predictions2[u] / minimum2))
    #predict_ratings2[u] = round(predict_ratings2[u], 1)

In [46]:
test_ratings=[]
for r in test_data:
    test_ratings.append(r['stars'])

In [47]:
MSE2 = 0
for i in range(len(test_ratings)):
    MSE2 += (predict_ratings2[i] - test_ratings[i])**2
MSE2 /= len(test_ratings)
print MSE2

1.05464099259
