# import data from HDFS

In [1]:
rawUData=sc.textFile("hdfs://mycluster/user/oracle/data/u.data")
rawUData.count()

100000

In [2]:
rawRating = rawUData.map(lambda x : x.split("\t")[:3])
rawRating.take(5)

[['196', '242', '3'],
 ['186', '302', '3'],
 ['22', '377', '1'],
 ['244', '51', '2'],
 ['166', '346', '1']]

# Preparing data for ALS

In [3]:
#(user_id, item_id, rating)
ratingsRDD = rawRating.map(lambda x : (int(x[0]),int(x[1]),int(x[2])))
ratingsRDD.take(5)

[(196, 242, 3), (186, 302, 3), (22, 377, 1), (244, 51, 2), (166, 346, 1)]

In [4]:
from pyspark.ml.evaluation import RegressionEvaluator #evaluate ALS model
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder, TrainValidationSplitModel
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('als_model').getOrCreate()

#create training set and test set
#covert RDD to DataFrame cuz .fit() only accept dataframe
ratingsRDD = spark.createDataFrame(ratingsRDD, ["userId", "movieId", "rating"])
(training, test)=ratingsRDD.randomSplit([0.8, 0.2], 35)

#create ALS model
#coldStart-> avoid empty prediction value
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop", nonnegative = True)


In [5]:
#Tune model using ParamGridBuilder
param_grid = ParamGridBuilder()\
             .addGrid(als.rank, [12, 13, 14])\
             .addGrid(als.maxIter, [18, 19, 20])\
             .addGrid(als.regParam, [.17, .18, .19])\
             .build()

In [6]:
#Define evaluator as RMSE
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

In [7]:
#Build cross validation using TrainValidationSplit
tvs = TrainValidationSplit(
        estimator=als,
        estimatorParamMaps=param_grid,
        evaluator=evaluator)

In [8]:
#Fit ALS model to training data
model = tvs.fit(training)

# Extract the best model

In [9]:
#Extract best model from the tuning exercise using ParanGridBuilder
best_model = model.bestModel

In [23]:
#model-> TrainValidationSplitModel is a collection of ALSModels with dirfferent set of hyperparameter
type(model)

pyspark.ml.tuning.TrainValidationSplitModel

In [10]:
#best_model-> Extract the best model from the collection
type(best_model)

pyspark.ml.recommendation.ALSModel

# Evaluate model with test set

In [11]:
#Generate predictions and evaluate using RMSE
predictions = best_model.transform(test)
rmse = evaluator.evaluate(predictions)

In [13]:
print("RMSE=" +str(rmse))
print("**Best Model**")
print("  Rank:"),best_model.rank
print("  MaxIter:"), best_model._java_obj.parent().getMaxIter()
print("  RegParam"), best_model._java_obj.parent().getRegParam()

RMSE=0.9274000847657947
**Best Model**
  Rank:
  MaxIter:
  RegParam


(None, 0.17)

# Save and load ALSModel

In [14]:
#save the best model to HDFS
best_model.save("hdfs://mycluster/user/oracle/data/model")

In [17]:
#load model from HDFS and reuse it
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.tuning import TrainValidationSplit, TrainValidationSplitModel
re_model = ALSModel.load("hdfs://mycluster/user/oracle/data/model")

In [18]:
re_model

ALS_42eba90bc6fef07ad82d

# Recommend movies by UserId

In [20]:
import pandas as pd
def get_recs_for_user(recs):
    recs = recs.select("recommendations.movieId", "recommendations.rating")
    movies = recs.select("movieId").toPandas().iloc[0,0]
    ratings = recs.select("rating").toPandas().iloc[0,0]
    ratings_matrix = pd.DataFrame(movies, columns = ["movieId"])
    ratings_matrix["ratings"] = ratings
    ratings_matrix_ps = sqlContext.createDataFrame(ratings_matrix)
    return ratings_matrix_ps

In [24]:
text = input("userId : ")
topKRecs = re_model.recommendForAllUsers(5) 
get_recs_for_user(topKRecs.where(topKRecs.userId == text)).show()

userId : 481
+-------+------------------+
|movieId|           ratings|
+-------+------------------+
|   1450| 4.800578594207764|
|   1449| 4.732533931732178|
|   1398| 4.686621189117432|
|   1642| 4.589539051055908|
|   1122|4.5377373695373535|
+-------+------------------+



# Recommend users for movies

In [None]:
def get_recs_for_item(recs):
    recs = recs.select("recommendations.userId", "recommendations.rating")
    users = recs.select("userId").toPandas().iloc[0,0]
    ratings = recs.select("rating").toPandas().iloc[0,0]
    ratings_matrix = pd.DataFrame(users, columns = ["userId"])
    ratings_matrix["ratings"] = ratings
    ratings_matrix_ps = sqlContext.createDataFrame(ratings_matrix)
    return ratings_matrix_ps

In [None]:
x = input("movieId : ")
item = best_model.recommendForAllItems(5)
get_recs_for_item(item.where(item.movieId == x)).show()

# Recommend movies to all users

In [None]:
result = best_model.recommendForAllUsers(5)
result

In [None]:
result_p = result.toPandas()

In [None]:
y = result_p['userId'].tolist()
for user in y[0:5]:
    print("userId : " + str(user))
    get_recs_for_user(topKRecs.where(topKRecs.userId == user)).show()
    print("=========================================================")