In [0]:
#all spark imports
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import functions as F

# Create (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("Movie-Prediction")
    .getOrCreate()
)

# Use only 2 shuffle partitions for Spark SQL operations in this session.
spark.conf.set("spark.sql.shuffle.partitions", 2)

In [0]:
from pyspark.ml.feature import Imputer, StringIndexer, VectorAssembler
from pyspark.ml.linalg import SparseVector, DenseVector
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [0]:
# Location of the ratings CSV; the file has a header row and the column
# types are inferred by Spark while reading.
path = "/FileStore/tables/movies.csv"

df = (
    spark.read
    .format("csv")
    .options(inferSchema=True, header=True)
    .load(path)
)

In [0]:
# Render the loaded ratings DataFrame (movieId, rating, userId) in the notebook output.
display(df)

movieId,rating,userId
2,3,0
3,1,0
5,2,0
9,4,0
11,1,0
12,2,0
15,1,0
17,1,0
19,1,0
21,1,0


In [0]:
# Show the column names and the types that inferSchema assigned to them.
df.printSchema()

In [0]:
# Count the null values in each column: `when` yields a non-null value only
# for null cells, and `count` counts the non-null results.
null_counts = [
    F.count(F.when(F.col(name).isNull(), name)).alias(name)
    for name in df.columns
]
df.select(null_counts).show()

In [0]:
# 80/20 train/test split. The explicit seed keeps the split repeatable for the
# same input data and partitioning, so metrics can be compared between runs.
(training, test) = df.randomSplit([0.8, 0.2], seed=42)

In [0]:
# Alternative 70/30 train/test split, used to compare against the 80/20 split.
# The explicit seed keeps the split repeatable for the same input data and
# partitioning.
(training2, test2) = df.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Baseline ALS recommender with hand-picked hyper-parameters.
# coldStartStrategy="drop" removes rows whose prediction is NaN (users/movies
# not seen during fitting) so the evaluators below do not return NaN.
# The explicit seed pins ALS's random state so repeated fits are comparable.
als = ALS(
    maxIter=5,
    regParam=0.05,
    rank=15,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=42,
)

In [0]:
# One evaluator per metric, all comparing the "rating" label with the model's
# "prediction" column: eval -> rmse, eval2 -> mse, eval3 -> mae.
# NOTE(review): the name `eval` shadows Python's builtin eval() in this notebook.
eval, eval2, eval3 = (
    RegressionEvaluator(metricName=metric, labelCol="rating", predictionCol="prediction")
    for metric in ("rmse", "mse", "mae")
)

In [0]:
# Fit the baseline ALS estimator on the 80% training split.
model = als.fit(dataset=training)

In [0]:
# Score the held-out 20% split; adds a "prediction" column to the test rows.
predictions = model.transform(dataset=test)

In [0]:
# Evaluate the 80/20 predictions with each metric (rmse, mse, mae in that order).
rmse, mse, mae = (
    evaluator.evaluate(predictions)
    for evaluator in (eval, eval2, eval3)
)

In [0]:
# Print each metric label followed by its value, one item per line.
print("rmse-", rmse, "mse-", mse, "mae-", mae, sep="\n")

In [0]:
# Fit the same baseline ALS estimator on the 70% training split.
model2 = als.fit(dataset=training2)

In [0]:
# Score the held-out 30% split with the second model.
predictions2 = model2.transform(dataset=test2)

In [0]:
# Evaluate the 70/30 predictions with each metric (rmse, mse, mae in that order).
rmse2, mse2, mae2 = (
    evaluator.evaluate(predictions2)
    for evaluator in (eval, eval2, eval3)
)

In [0]:
# Print each metric label followed by its value for the 70/30 run, one item per line.
print("rmse-", rmse2, "mse-", mse2, "mae-", mae2, sep="\n")

In [0]:
parameters = (ParamGridBuilder()
             .addGrid(als.regParam, [0.01, 0.5, 1])
             .addGrid(als.maxIter, [5, 10, 20])
             .addGrid(als.rank, [5, 10, 20])
             .build())

In [0]:
als = ALS( userCol= "userId", itemCol= "movieId", ratingCol="rating", coldStartStrategy = "drop")

In [0]:
# 2-fold cross-validation over the parameter grid, selecting by RMSE (`eval`).
# The explicit seed fixes how rows are assigned to folds so that repeated
# tuning runs are comparable.
cv = CrossValidator(
    estimator=als,
    estimatorParamMaps=parameters,
    evaluator=eval,
    numFolds=2,
    seed=42,
)

In [0]:
# Run the cross-validated grid search on the 80% training split.
# Note: this rebinds `model`, replacing the baseline ALS model fitted earlier.
model = cv.fit(dataset=training)

In [0]:
# Model fitted with the best-scoring parameter combination found by the cross-validator.
best_model = model.bestModel

In [0]:
# Report the hyper-parameters of the winning model. They are read from the
# parent of the model's underlying Java object.
# NOTE(review): `_java_obj` is a private attribute and may change between Spark versions.
print("**Best Model**")
best_parent = best_model._java_obj.parent()
print("  Rank:", best_parent.getRank())
print("  MaxIter:", best_parent.getMaxIter())
print("  RegParam:", best_parent.getRegParam())

In [0]:
# Score the held-out 20% split with the tuned model.
final_predictions = best_model.transform(dataset=test)

In [0]:
# RMSE of the tuned model on the held-out split.
RMSE = eval.evaluate(dataset=final_predictions)
print(RMSE)

In [0]:
 df.select("movieId").distinct().count()

In [0]:
# Ask the tuned model for the top 100 movie recommendations for every user,
# then preview the result.
recommendations = best_model.recommendForAllUsers(numItems=100)
recommendations.show()

In [0]:
# Flatten the per-user recommendation arrays into one row per
# (userId, movieId, rating) and preview the first 10 rows.
exploded = recommendations.withColumn("rec_exp", F.explode("recommendations"))
nrecommendations = exploded.select(
    "userId",
    F.col("rec_exp.movieId"),
    F.col("rec_exp.rating"),
)
nrecommendations.limit(10).show()

In [0]:
# Recommendations produced for user 11.
user11_rec = nrecommendations.where(F.col("userId") == 11)
user11_rec.show()

In [0]:
# Ratings that user 11 already has in the source data.
user11_exist = df.where(F.col("userId") == 11)
user11_exist.show()

In [0]:
# Anti-join on movieId: keep only recommended movies for which user 11 has no
# existing rating row; show up to 15 rows without truncating values.
user11_rec.join(user11_exist, on=["movieId"], how="left_anti").show(n=15, truncate=False)

In [0]:
# Recommendations produced for user 23.
user23_rec = nrecommendations.where(F.col("userId") == 23)
user23_rec.show()

In [0]:
# Ratings that user 23 already has in the source data.
user23_exist = df.where(F.col("userId") == 23)
user23_exist.show()

In [0]:
# Anti-join on movieId: keep only recommended movies for which user 23 has no
# existing rating row; show up to 15 rows without truncating values.
user23_rec.join(user23_exist, on=["movieId"], how="left_anti").show(n=15, truncate=False)

In [0]:
# Per-movie mean rating and number of ratings.
movie_avg = (
    df.groupBy("movieId")
    .agg(F.mean("rating"), F.count("rating"))
)

In [0]:
# The 10 movies with the most ratings ("count(rating)" is the auto-generated
# name of the count aggregate column).
movie_avg.orderBy(F.col("count(rating)").desc()).limit(10).show()

In [0]:
# Per-user mean rating and number of ratings.
user_avg = (
    df.groupBy("userId")
    .agg(F.mean("rating"), F.count("rating"))
)

In [0]:
# The 10 users with the most ratings ("count(rating)" is the auto-generated
# name of the count aggregate column).
user_avg.orderBy(F.col("count(rating)").desc()).limit(10).show()