In [5]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
import json

trainingRawData = sqlContext.read.json('../Datasets/Amazon/Baby/reviews_training_Baby_5.json')
trainingData = trainingRawData.select(trainingRawData['reviewerID'],trainingRawData['asin'],trainingRawData['overall']).rdd

overallRatingSum = trainingData.map(lambda l: l[2]).sum()

mu = float(overallRatingSum/trainingData.count())

testRawData = sqlContext.read.json('../Datasets/Amazon/Baby/reviews_test_Baby_5.json')
testData = testRawData.select(testRawData['reviewerID'],testRawData['asin'],testRawData['overall']).rdd

allData = trainingData.union(testData)
orgReviewerIDs = allData.map(lambda l:l[0]).distinct()
orgProductIDs = allData.map(lambda l:l[1]).distinct()

ReviewerIDsMapping = dict(orgReviewerIDs.zipWithUniqueId().collect())
ProductIDsMapping = dict(orgProductIDs.zipWithUniqueId().collect())

training_RDD = trainingData.map(lambda l:(ReviewerIDsMapping[l[0]],ProductIDsMapping[l[1]], l[2]))
validation_RDD = training_RDD.sample(False, 0.2, seed = 23).cache()
test_RDD = testData.map(lambda l:(ReviewerIDsMapping[l[0]],ProductIDsMapping[l[1]], l[2])).cache()

userRatingData = training_RDD.map(lambda l: (l[0],[l[2]])).reduceByKey(lambda x,y : x+y)
userRatingData = userRatingData.map(lambda l: (l[0], sum(l[1])/len(l[1])))
bu = dict(userRatingData.collect())

productRatingData = training_RDD.map(lambda l: (l[1],[l[2]])).reduceByKey(lambda x,y : x+y)
productRatingData = productRatingData.map(lambda l: (l[0], sum(l[1])/len(l[1])))
bi = dict(productRatingData.collect())

training_RDD = training_RDD.map(lambda l: (l[0], l[1], l[2]-mu -(bu[l[0]]-mu)-(bi[l[1]]-mu))).cache()

validation_RDD_for_predict = validation_RDD.map(lambda row: (row[0],row[1]))
test_RDD_for_predict = test_RDD.map(lambda row: (row[0],row[1]))

from pyspark.mllib.recommendation import ALS
import math

seed = 5L
iterations = 10
regularization_parameter = 0.1
ranks = [10, 20]
errors = [0, 0, 0]
err = 0
tolerance = 0.02

min_error = float('inf')
best_rank = -1
best_iteration = -1
for rank in ranks:
    model = ALS.train(training_RDD, rank, seed=seed, iterations=iterations,lambda_=regularization_parameter)
    predictions = model.predictAll(validation_RDD_for_predict).map(lambda r: ((r[0], r[1]), r[2]+mu+(bu[r[0]]-mu)+(bi[r[1]]-mu)))
    
    rates_and_preds = validation_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
    error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    errors[err] = error
    err += 1
    print 'For rank %s the RMSE is %s' % (rank, error)
    if error < min_error:
        min_error = error
        best_rank = rank

print 'The best model was trained with rank %s' % best_rank

model = ALS.train(training_RDD, best_rank, seed=seed, iterations=iterations,
                      lambda_=regularization_parameter)
predictions = model.predictAll(test_RDD_for_predict).map(lambda r: ((r[0], r[1]), r[2]+mu+(bu[r[0]]-mu)+(bi[r[1]]-mu)))
rates_and_preds = test_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())

print 'For testing data the RMSE is %s' % (error)

rates_and_preds.take(3)

For rank 10 the RMSE is 0.209641224328
For rank 20 the RMSE is 0.190510570716
The best model was trained with rank 20
For testing data the RMSE is 1.36460648672


[((57385, 55150), (4.0, 3.4209773758940454)),
 ((369753, 4273), (5.0, 1.8178123602618355)),
 ((311359, 13138), (4.0, 4.330833858769683))]