## Training

In [9]:
import time
from pyspark.sql import SparkSession
import numpy as np
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

In [10]:
spark = SparkSession \
        .builder \
        .master('local[*]') \
        .appName("NBCF_pyspark") \
        .getOrCreate()

In [11]:
# Load training data
train = spark.sparkContext.textFile("ml-100k/ub.base")

In [12]:
# movieID => userID, rating
train = train.map(lambda r: r.split()).map(lambda r: (int(r[1]), (int(r[0]), float(r[2])-1)))
train.take(5)

[(1, (1, 4.0)), (2, (1, 2.0)), (3, (1, 3.0)), (4, (1, 2.0)), (5, (1, 2.0))]

In [13]:
# Determine mean of ratings by user
# movieID => mean
sepOp = lambda x, y: (x[0]+y[1], x[1]+1)
combOp = lambda x, y: (x[0]+y[0], x[1]+y[1])
means = train.aggregateByKey((0, 0), sepOp, combOp).mapValues(lambda v: v[0]/v[1] if v[1] > 0 else 0)
means.take(5)

[(2, 2.216),
 (4, 2.545918367346939),
 (6, 2.5714285714285716),
 (8, 2.9754901960784315),
 (10, 2.792207792207792)]

In [14]:
# userID => (movieID, rating_normalized)
train = train.join(means).map(lambda l: (l[1][0][0], (l[0], l[1][0][1] - l[1][1])))
train.take(5)

[(1, (4, -0.545918367346939)),
 (7, (4, 1.454081632653061)),
 (10, (4, 0.454081632653061)),
 (12, (4, 1.454081632653061)),
 (13, (4, 1.454081632653061))]

In [15]:
# Create entries of coordinate matrix
entries = train.map(lambda r: MatrixEntry(r[0], r[1][0], r[1][1]))
# Convert to CoordinateMatrix => RowMatrix
mat = CoordinateMatrix(entries).toRowMatrix()
# Compute similarity of movie pairs
moviePairSimilarities = mat.columnSimilarities().entries.map(lambda r: ((r.i, r.j), r.value))
print(moviePairSimilarities.take(5))

[((876, 977), 0.09010407028490032), ((220, 977), 0.11821570614648644), ((869, 1044), 0.03939488310827576), ((233, 1265), 0.037119076720572915), ((347, 428), 0.013109362655315765)]


## Evaluate

In [16]:
# Load testing data
test = spark.sparkContext.textFile('ml-100k/ub.test')

# movieY => (ratingX, ratingXY)
test = test.map(lambda l: l.split()).map(lambda l: (int(l[1]), (int(l[0]), float(l[2])-1)))
n_test = test.count()
n_test

9430

In [17]:
# Subtract rating to mean by movie
# userX => (movieY, ratingXY)
test = test.join(means).map(lambda l: (l[1][0][0], (l[0], l[1][0][1] - l[1][1])))
test.take(5)

[(1, (64, 0.5736434108527133)),
 (43, (64, 0.5736434108527133)),
 (53, (64, 0.5736434108527133)),
 (71, (64, -0.4263565891472867)),
 (76, (64, 0.5736434108527133))]

In [18]:
# userX => ((movieY, ratingXY), (movieA, ratingXA)) - A is movieID rated by userX
evaluate = test.join(train)
evaluate.take(5)

[(96, ((64, 0.5736434108527133), (8, 1.0245098039215685))),
 (96, ((64, 0.5736434108527133), (56, 0.9078212290502794))),
 (96, ((64, 0.5736434108527133), (96, 0.007246376811594235))),
 (96, ((64, 0.5736434108527133), (144, 0.1644444444444444))),
 (96, ((64, 0.5736434108527133), (156, 0.02898550724637694)))]

In [19]:
# (movieY, movieA) => (i, userX, ratingXY, ratingXA) - i is index of movieY in key
def map1(row):
    userX, ((movieY, ratingXY), (movieA, ratingXA))= row
    if movieY < movieA:
        return (movieY, movieA), (0, userX, ratingXY, ratingXA)
    return (movieA, movieY), (1, userX, ratingXY, ratingXA)

evaluate = evaluate.map(map1)
evaluate.take(5)

[((8, 64), (1, 96, 0.5736434108527133, 1.0245098039215685)),
 ((56, 64), (1, 96, 0.5736434108527133, 0.9078212290502794)),
 ((64, 96), (0, 96, 0.5736434108527133, 0.007246376811594235)),
 ((64, 144), (0, 96, 0.5736434108527133, 0.1644444444444444)),
 ((64, 156), (0, 96, 0.5736434108527133, 0.02898550724637694))]

In [20]:
# (movieY, movieA) => ((i, userX, ratingXY, ratingXA), simYA)
evaluate = evaluate.join(moviePairSimilarities)

# (userX, movieY, ratingXY) => [(ratingXA, simYA)]
def map2(row):
    (movieY, movieA), ((i, userX, ratingXY, ratingXA), simYA) = row
    if i == 0:
        return (userX, movieY, ratingXY), [(ratingXA, simYA)]
    return (userX, movieA, ratingXY), [(ratingXA, simYA)]

evaluate = evaluate.map(map2)
evaluate.take(5)

[((96, 64, 0.5736434108527133), [(0.7447916666666665, 0.1482283630863993)]),
 ((1, 64, 0.5736434108527133), [(0.7447916666666665, 0.1482283630863993)]),
 ((465, 64, 0.5736434108527133), [(-1.2552083333333335, 0.1482283630863993)]),
 ((705, 64, 0.5736434108527133), [(0.7447916666666665, 0.1482283630863993)]),
 ((737, 64, -0.4263565891472867), [(-2.2552083333333335, 0.1482283630863993)])]

In [21]:
# (userX, movieY, ratingXY) => [(ratingXA, simYA), (ratingXB, simYB), ...] - A,B is movieID rated by userX
evaluate = evaluate.reduceByKey(lambda a, b: a + b)

# Take top k movies greatest similarity
k = 30
evaluate = evaluate.mapValues(lambda l: sorted(l, key=lambda x: x[1], reverse=True)[:k])
evaluate.first()

((737, 64, -0.4263565891472867),
 [(1.1446808510638298, 0.24017966911631647),
  (-0.15298507462686572, 0.2153544782127603),
  (-0.3875000000000002, 0.2047933967898615),
  (-2.2552083333333335, 0.1482283630863993),
  (-0.04347826086956541, 0.13802406856629623),
  (0.7071129707112971, 0.1333814430197715),
  (0.6970509383378016, 0.13024533078065567),
  (1.1662591687041566, 0.12443899228501198),
  (-0.6529968454258674, 0.12197337786846901),
  (0.8649885583524028, 0.11168633738064088),
  (0.20353982300884965, 0.10215531468072248),
  (-2.504424778761062, 0.08872288693884825),
  (-0.6000000000000001, 0.05913225607584117),
  (-1.2849740932642488, 0.059113781897703124),
  (1.1884816753926701, 0.047074354112719145),
  (0.12217194570135748, 0.028553815696978554),
  (0.11475409836065564, 0.015261002780855876),
  (0.875, -0.0020792717596943254),
  (0.22222222222222232, -0.009653901372037352),
  (1.0743243243243241, -0.015462037891107401),
  (0.53125, -0.0414152415356465),
  (-0.5137614678899083, -0

In [22]:
# Predict rating and calculating RMSE
def predict(row):
    rating_ori = row[0][2]
    val = np.array(row[1])
    ratings = val[:,0]
    sims = val[:,1]
    rating_pred = np.dot(ratings, sims) / (np.abs(sims).sum() + 1e-8)
    return (rating_pred - rating_ori)**2

evaluate = evaluate.map(predict).reduce(lambda a, b: a + b)

# Calculate RMSE
RMSE = np.sqrt(evaluate/n_test)
RMSE

0.9507148125003996