In [1]:
import pyspark as ps
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
import pandas as pd
import numpy as np
import math
import os
import sys
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import Row

ROOT_DIR=os.path.split(os.path.realpath(''))[0]
SRC_DIR = os.path.join(ROOT_DIR,'src')
sys.path.append(SRC_DIR)

In [2]:
spark = SparkSession.builder.master("local[4]").appName("sparkSQL_recommender").getOrCreate()
ratings_df = spark.read.csv('../data/movies/ratings.csv', header=True, inferSchema=True)
movies_df = spark.read.csv('../data/movies/movies.csv', header=True, inferSchema=True)

# print(df.show(5))
# print(df.describe)

In [3]:
train_df, test_df, holdout_df = ratings_df.randomSplit([0.6, 0.2, 0.2], seed=42)

In [4]:
# iterations = 10
# ranks = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
# step_size=[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]
# seed = 42
# rmse_list = []
# for rank in ranks:
#     als = ALS(rank=rank, maxIter=10, regParam=0.1, alpha=0.05, nonnegative=True,
#           numUserBlocks=10, numItemBlocks=10,
#           userCol="userId", itemCol="movieId", ratingCol="rating",
#           coldStartStrategy="drop")
#     model = als.fit(train_df)
#     predictions = model.transform(test_df)
#     evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
#                                 predictionCol="prediction")
#     rmse = evaluator.evaluate(predictions)
#     rmse_list.append(rmse)
#     print("Root-mean-square error = " + str(rmse))

In [5]:
# iters = np.arange(1,21)
# ranks = np.arange(1,4)
# steps = step_size=[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]

# for rank in ranks:
#     for step in steps:
#         for iterations in iters:
#             als = ALS(rank=rank, maxIter=iterations, regParam=0.1, alpha=0.05, nonnegative=True,
#                   numUserBlocks=10, numItemBlocks=10,
#                   userCol="userId", itemCol="movieId", ratingCol="rating",
#                   coldStartStrategy="drop", stepSize=step)
#             model = als.fit(train_df)
#             predictions = model.transform(validation_df)
#             evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
#                                         predictionCol="prediction")
#             rmse = evaluator.evaluate(predictions)
#             rmse_list.append(rmse)
#             print(f"Rank = {rank}, Iter = {iterations}, Learning Rate = {step}, RMSE = {rmse:.4f}")

In [6]:
train_df, holdout_df = ratings_df.randomSplit([0.8, 0.2], seed=42)

In [9]:
import timeit
rmse_list = []
seed = 42
iterations = 6
reg_param = 0.1
rank = 2

als = ALS(
    itemCol='movieId',
    userCol='userId',
    ratingCol='rating',
    nonnegative=True,
    regParam=reg_param,
    rank=rank,
    seed=seed,
    maxIter=iterations,
    coldStartStrategy="drop",
    numUserBlocks=10,
    numItemBlocks=10
    )
# start timer
start = timeit.default_timer()
train_test_df = train_df.union(test_df)
model = als.fit(train_test_df)



predictions = model.transform(holdout_df)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
# stop timer3131
stop = timeit.default_timer()

rmse_list.append(rmse)
print(f"Final Holdout Test: Rank = {rank}; Iterations = {iterations};\nRMSE = {rmse:.2f}")
print(f"Time to complete ALS: {stop-start:.2f} seconds")

Final Holdout Test: Rank = 2; Iterations = 6;
RMSE = 0.92
Time to complete ALS: 4.52 seconds


In [10]:
train_test_df.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|   1172|   4.0|1260759205|
|     1|   1263|   2.0|1260759151|
|     1|   1287|   2.0|1260759187|
|     1|   1293|   2.0|1260759148|
|     1|   1339|   3.5|1260759125|
|     1|   1371|   2.5|1260759135|
|     1|   1405|   1.0|1260759203|
|     1|   1953|   4.0|1260759191|
|     1|   2150|   3.0|1260759194|
|     1|   2193|   2.0|1260759198|
|     1|   2968|   1.0|1260759200|
|     1|   3671|   3.0|1260759117|
|     2|     10|   4.0| 835355493|
|     2|     17|   5.0| 835355681|
|     2|     39|   5.0| 835355604|
|     2|     47|   4.0| 835355552|
|     2|     50|   4.0| 835355586|
|     2|     52|   3.0| 835356031|
|     2|    110|   4.0| 835355532|
|     2|    161|   3.0| 835355493|
+------+-------+------+----------+
only showing top 20 rows

