In [2]:
#import libraries
import math
import os
from time import time

from pyspark import StorageLevel
from pyspark.mllib.recommendation import ALS
from pyspark.sql import SparkSession

In [4]:
# Build the Spark session used by the rest of the notebook (getOrCreate()
# hands back an already-active session if there is one).
# The MySQL Connector/J jar is registered through `spark.jars` so the JDBC
# read further down can load the `com.mysql.cj.jdbc.Driver` class.
spark = (
    SparkSession.builder
    .appName('Calculate Model Parameter')
    .config("spark.jars", "mysql-connector-j-8.0.32.jar")
    .config("spark.driver.memory", "6g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

In [5]:
# Read the `ratings` table from MySQL over JDBC and expose it as an RDD.
#
# The table is too large to pull through a single connection, so the read is
# range-partitioned on `ratingTime`: Spark issues 23 parallel queries, i.e.
# roughly one per year between the two bounds. Per the Spark JDBC docs,
# lowerBound/upperBound only determine the partition stride -- they do not
# filter rows.
#
# Credentials are taken from the environment (MYSQL_USER / MYSQL_PASSWORD) so
# that real secrets do not have to be stored in the notebook. The fallbacks
# are the original development values, which keeps existing setups working.
mysql_user = os.environ.get("MYSQL_USER", "root")
mysql_password = os.environ.get("MYSQL_PASSWORD", "123")

ratings_data = (
    spark.read
    .format("jdbc")
    .option("driver", "com.mysql.cj.jdbc.Driver")
    .option("url", "jdbc:mysql://cap2-database/MovieLens")
    .option("dbtable", "ratings")
    .option("user", mysql_user)
    .option("password", mysql_password)
    .option('fetchSize', '10000')
    .option('partitionColumn', 'ratingTime')
    .option('lowerBound', '1995-01-09 11:46:44')
    .option('upperBound', '2018-09-26 06:59:09')
    .option('numPartitions', '23')
    .load()
    .rdd  # the cells below work on the RDD (pyspark.mllib) API
)

In [6]:
# Trim every row down to the three fields at positions 1, 2 and 3, which are
# later fed to ALS.train in (user, product, rating) order; position 0 is dropped.
# NOTE: this rebinds `ratings_data`, so the cell is not re-runnable on its own:
# applied a second time, the 3-tuples have no index 3 and the job fails when
# it is evaluated. Re-run the JDBC read cell first.
ratings_data = ratings_data.map(lambda row: (row[1], row[2], row[3]))

In [7]:
# Sanity check: pull the first 10 trimmed tuples to the driver to eyeball the
# field order and value types before splitting.
ratings_data.take(10)

[(Decimal('56769'), Decimal('1176'), 4.0),
 (Decimal('237556'), Decimal('21'), 3.0),
 (Decimal('237556'), Decimal('47'), 5.0),
 (Decimal('237556'), Decimal('1079'), 3.0),
 (Decimal('26312'), Decimal('1'), 3.0),
 (Decimal('26312'), Decimal('2'), 2.0),
 (Decimal('26312'), Decimal('20'), 3.0),
 (Decimal('26312'), Decimal('21'), 4.0),
 (Decimal('26312'), Decimal('22'), 3.0),
 (Decimal('26312'), Decimal('23'), 4.0)]

In [8]:
# Split the ratings into train / validation / test with weights 6:2:2
# (randomSplit normalises the weights, so this is a 60/20/20 split).
#
# The parent RDD is persisted first. randomSplit decides membership from a
# seeded random sequence drawn per partition *in row order*, and each split is
# evaluated lazily and independently. Without persistence every action
# re-queries MySQL, and since a query without ORDER BY has no guaranteed row
# order, the same rating could land in two splits (or in none) across
# re-reads. Persisting pins one materialised copy that all three splits share,
# and also avoids re-scanning the database for every training/evaluation pass.
# MEMORY_AND_DISK spills partitions that do not fit in memory instead of
# silently recomputing them from the database.
ratings_data.persist(StorageLevel.MEMORY_AND_DISK)

train, validation, test = ratings_data.randomSplit([6, 2, 2], seed=0)

# predictAll() expects (user, product) pairs, so strip the rating column.
validation_for_predict = validation.map(lambda x: (x[0], x[1]))
test_for_predict = test.map(lambda x: (x[0], x[1]))

In [9]:
# Sanity check: pull the first 10 tuples of the training split to the driver.
train.take(10)

[(Decimal('56769'), Decimal('1176'), 4.0),
 (Decimal('237556'), Decimal('21'), 3.0),
 (Decimal('26312'), Decimal('1'), 3.0),
 (Decimal('26312'), Decimal('20'), 3.0),
 (Decimal('26312'), Decimal('31'), 4.0),
 (Decimal('26312'), Decimal('44'), 4.0),
 (Decimal('26312'), Decimal('45'), 5.0),
 (Decimal('26312'), Decimal('47'), 5.0),
 (Decimal('26312'), Decimal('48'), 1.0),
 (Decimal('26312'), Decimal('52'), 1.0)]

In [10]:
# ALS hyper-parameters that stay fixed while the rank is being searched.
seed = 1
iterations = 10
regularization_parameter = 0.1
tolerance = 0.02  # NOTE(review): not read in this cell; kept in case another cell uses it

# Ground-truth ratings keyed by (user, product). This does not depend on the
# rank, so it is built once and cached: every candidate rank is then scored
# against the same materialised validation set instead of recomputing it.
# Ids are cast to int so the keys match those of the predictions (the recorded
# output shows the ids arriving from JDBC as Decimal).
validation_truth = validation.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).cache()

# Track the winner in code rather than by reading the printed output.
min_error = float('inf')
best_rank = -1

# Find out which 'rank' (number of latent factors) gives the lowest validation RMSE.
for rank in range(4, 17, 4):
    t0 = time()
    model = ALS.train(train, rank = rank, seed = seed, iterations = iterations, lambda_ = regularization_parameter)
    time_cost = time() - t0  # wall-clock time of the ALS.train call only

    # ((user, product), predicted_rating) for the validation pairs.
    predictions = model.predictAll(validation_for_predict).map(lambda r: ((r[0], r[1]), r[2]))
    # Inner join: pairs that appear in only one of the two RDDs are left out
    # of the RMSE.
    rates_and_preds = validation_truth.join(predictions)
    error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    print('For rank %s the RMSE is %s. Training process cost %s seconds' % (rank, error, round(time_cost,3)))

    if error < min_error:
        min_error = error
        best_rank = rank

# Release the cached ground truth; it is only needed for this search.
validation_truth.unpersist()

# `best_rank` / `min_error` now hold the rank with the lowest validation RMSE
# (rank 12 in the recorded run below).

For rank 4 the RMSE is 0.8338154411756004. Training process cost 215.481 seconds
For rank 8 the RMSE is 0.8214013518469191. Training process cost 231.918 seconds
For rank 12 the RMSE is 0.8145828983211091. Training process cost 245.818 seconds
For rank 16 the RMSE is 0.8166226126408641. Training process cost 242.284 seconds
