## Training

In [40]:
import time
from pyspark.sql import SparkSession
import numpy as np
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

In [41]:
spark = SparkSession \
        .builder \
        .master('local[*]') \
        .appName("NBCF_pyspark") \
        .getOrCreate()

In [42]:
# Load training data
train = spark.sparkContext.textFile("ml-100k/ub.base")

In [43]:
# movieID => userID, rating
train = train.map(lambda r: r.split()).map(lambda r: (int(r[1]), (int(r[0]), float(r[2]))))
train.take(5)

[(1, (1, 5.0)), (1, (2, 3.0)), (1, (3, 4.0)), (1, (4, 3.0)), (1, (5, 3.0))]

In [44]:
# Determine mean of ratings by user
# movieID => mean
sepOp = lambda x, y: (x[0]+y[1], x[1]+1)
combOp = lambda x, y: (x[0]+y[0], x[1]+y[1])
means = train.aggregateByKey((0, 0), sepOp, combOp).mapValues(lambda v: v[0]/v[1] if v[1] > 0 else 0)
means.take(5)

[(2, 3.7115384615384617),
 (4, 4.285714285714286),
 (6, 3.63681592039801),
 (8, 3.7142857142857144),
 (10, 4.212643678160919)]

In [45]:
# userID => (movieID, rating_normalized)
train = train.join(means).map(lambda r: (r[1][0][0], (r[0], r[1][0][1] - r[1][1])))
train.take(5)

[(50, (4, 0.7142857142857144)),
 (260, (4, -0.2857142857142856)),
 (264, (4, -1.2857142857142856)),
 (288, (4, -0.2857142857142856)),
 (294, (4, 0.7142857142857144))]

In [46]:
# Create entries of coordinate matrix
entries = train.map(lambda r: MatrixEntry(r[0], r[1][0], r[1][1]))
# Convert to CoordinateMatrix => RowMatrix
mat = CoordinateMatrix(entries).toRowMatrix()
# Compute similarity of movie pairs
moviePairSimilarities = mat.columnSimilarities().entries.map(lambda r: ((r.i, r.j), r.value))
moviePairSimilarities.take(5)

[((533, 919), 0.09467954935718614),
 ((347, 428), 0.08199338594799069),
 ((327, 584), 0.019261412549728893),
 ((280, 467), 0.006922332749699157),
 ((112, 319), 0.13830075422401594)]

## Evaluate

In [47]:
# Load testing data
test = spark.sparkContext.textFile('ml-100k/ub.test')

# movieY => (ratingX, ratingXY)
test = test.map(lambda r: r.split()).map(lambda r: (int(r[1]), (int(r[0]), float(r[2])))).cache()
n_test = test.count()
n_test

9430

In [48]:
# Subtract rating to mean by movie
# userX => (movieY, ratingXY)
test = test.join(means).map(lambda r: (r[1][0][0], (r[0], r[1][0][1] - r[1][1])))
test.take(5)

[(11, (4, -0.2857142857142856)),
 (210, (4, -1.2857142857142856)),
 (258, (4, 0.7142857142857144)),
 (271, (4, -0.2857142857142856)),
 (300, (4, 0.7142857142857144))]

In [49]:
# userX => ((movieY, ratingXY), (movieA, ratingXA)) - A is movieID rated by userX
evaluate = test.join(train)
evaluate.take(5)

[(328, ((4, -1.2857142857142856), (12, -0.3170731707317076))),
 (328, ((4, -1.2857142857142856), (40, 0.1200000000000001))),
 (328, ((4, -1.2857142857142856), (44, 0.34751773049645385))),
 (328, ((4, -1.2857142857142856), (92, -0.24074074074074092))),
 (328, ((4, -1.2857142857142856), (100, 1.0)))]

In [50]:
# (movieY, movieA) => (i, userX, ratingXY, ratingXA) - i is index of movieY in key
def map1(row):
    userX, ((movieY, ratingXY), (movieA, ratingXA))= row
    if movieY < movieA:
        return (movieY, movieA), (0, userX, ratingXY, ratingXA)
    return (movieA, movieY), (1, userX, ratingXY, ratingXA)

evaluate = evaluate.map(map1)
evaluate.take(5)

[((4, 12), (0, 328, -1.2857142857142856, -0.3170731707317076)),
 ((4, 40), (0, 328, -1.2857142857142856, 0.1200000000000001)),
 ((4, 44), (0, 328, -1.2857142857142856, 0.34751773049645385)),
 ((4, 92), (0, 328, -1.2857142857142856, -0.24074074074074092)),
 ((4, 100), (0, 328, -1.2857142857142856, 1.0))]

In [51]:
# (movieY, movieA) => ((i, userX, ratingXY, ratingXA), simYA)
evaluate = evaluate.join(moviePairSimilarities)

# (userX, movieY, ratingXY) => [(ratingXA, simYA)]
def map2(row):
    (movieY, movieA), ((i, userX, ratingXY, ratingXA), simYA) = row
    if i == 0:
        return (userX, movieY, ratingXY), [(ratingXA, simYA)]
    return (userX, movieA, ratingXY), [(ratingXA, simYA)]

evaluate = evaluate.map(map2)
evaluate.take(5)

[((328, 4, -1.2857142857142856), [(-0.2941176470588234, 0.04969766099345801)]),
 ((50, 46, -0.2941176470588234), [(0.7142857142857144, 0.04969766099345801)]),
 ((300, 4, 0.7142857142857144), [(-1.2941176470588234, 0.04969766099345801)]),
 ((294, 46, -2.2941176470588234), [(0.7142857142857144, 0.04969766099345801)]),
 ((328, 4, -1.2857142857142856),
  [(0.34545454545454524, 0.025375575629938158)])]

In [52]:
# (userX, movieY, ratingXY) => [(ratingXA, simYA), (ratingXB, simYB), ...] - A,B is movieID rated by userX
evaluate = evaluate.reduceByKey(lambda a, b: a + b)

# Take top k movies greatest similarity
k = 30
evaluate = evaluate.mapValues(lambda l: sorted(l, key=lambda x: x[1], reverse=True)[:k])
evaluate.first()

((329, 4, 0.7142857142857144),
 [(-0.11111111111111116, 0.3683297467398866),
  (1.75, 0.22430015382565205),
  (-0.5443037974683542, 0.18288636474771272),
  (-1.0, 0.1484556231305236),
  (1.2840909090909092, 0.12151719767937197),
  (0.4920634920634921, 0.10773811615029269),
  (-0.2727272727272725, 0.09211323729436771),
  (1.8533333333333335, 0.08416800120477495),
  (-0.8780487804878048, 0.07946716792719163),
  (-1.125531914893617, 0.07083103999447238),
  (-0.5038167938931299, 0.05769022371430205),
  (-0.9333333333333331, 0.026666856534601685),
  (-0.00874635568513149, 0.02465735722819628),
  (0.8048780487804876, 0.023037328706210346),
  (1.1818181818181817, 0.020698732855892497),
  (-0.9684210526315788, 0.016204391447096702),
  (0.8936170212765959, 0.015295356037658522),
  (0.30935251798561136, 0.013201408961738362),
  (-0.8447204968944098, 0.01075828738303404),
  (0.21478873239436602, 0.007622976051280709),
  (-1.1006389776357826, 0.004049554215496449),
  (0.6928104575163401, 0.0026961

In [53]:
# Predict rating and calculating RMSE
def predict(row):
    rating_ori = row[0][2]
    val = np.array(row[1])
    ratings = val[:,0]
    sims = val[:,1]
    rating_pred = np.dot(ratings, sims) / (np.abs(sims).sum() + 1e-8)
    return (rating_pred - rating_ori)**2

evaluate = evaluate.map(predict).reduce(lambda a, b: a + b)

# Calculate RMSE
RMSE = np.sqrt(evaluate/n_test)
RMSE

0.9737616732758234