In [1]:
import findspark
findspark.init()
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql import functions as F
# Passes a default-constructed SparkConf to SparkSession.builder; the builder
# returned by .config(...) is discarded here.
# NOTE(review): whether this has any effect on the session created further down
# depends on whether `SparkSession.builder` shares state between accesses in the
# installed PySpark version -- confirm, or fold it into the builder chain below.
SparkSession.builder.config(conf=SparkConf())
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator



# ----------------------------------------------------------------------
# Spark session: getOrCreate() reuses an active session or starts a new one
# ----------------------------------------------------------------------
spark = (
    SparkSession.builder
    .appName("training-model")
    .getOrCreate()
)

In [2]:
# Read the ratings JSON into a DataFrame and preview the first five rows.
movie_ratings = spark.read.json('../data/movies.json')
movie_ratings.show(n=5)

+---+--------+------+--------------------+-------+--------------------+------+------+------------+---------+--------------------+------+
|age|function|gender|               genre|movieId|                name|number|rating|release_date|timestamp|                 url|userId|
+---+--------+------+--------------------+-------+--------------------+------+------+------------+---------+--------------------+------+
| 60| retired|     M|[Animation, Child...|      1|    Toy Story (1995)| 95076|     4| 01-Jan-1995|887736532|http://us.imdb.co...|   308|
| 60| retired|     M|[Action, Comedy, ...|      4|   Get Shorty (1995)| 95076|     5| 01-Jan-1995|887737890|http://us.imdb.co...|   308|
| 60| retired|     M|[Crime, Drama, Th...|      5|      Copycat (1995)| 95076|     4| 01-Jan-1995|887739608|http://us.imdb.co...|   308|
| 60| retired|     M|     [Drama, Sci-Fi]|      7|Twelve Monkeys (1...| 95076|     4| 01-Jan-1995|887738847|http://us.imdb.co...|   308|
| 60| retired|     M|[Children's, Come...

In [3]:
# Print the schema Spark inferred for the raw JSON (column names, types, nullability).
movie_ratings.printSchema()

root
 |-- age: string (nullable = true)
 |-- function: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- genre: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- movieId: string (nullable = true)
 |-- name: string (nullable = true)
 |-- number: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- url: string (nullable = true)
 |-- userId: string (nullable = true)



In [4]:
# Projection spec, in output order: (source column, cast type or None, output name).
# A cast type of None means the column is passed through unchanged.
column_specs = [
    ("age", "int", "age"),
    ("name", "string", "name"),
    ("function", None, "function"),
    ("rating", "float", "rating"),
    ("gender", None, "gender"),
    ("genre", None, "genre"),
    ("movieId", "int", "movieId"),
    ("release_date", None, "release_date"),
    ("timestamp", None, "timestamp"),
    ("url", None, "url"),
    ("number", None, "codeZip"),  # the raw `number` field is exposed as `codeZip`
    ("userId", "int", "userId"),
]

# Build the typed frame: the ids, age and rating become numeric; the rest stay as read.
movie_df = movie_ratings.select(*[
    (F.col(source) if dtype is None else F.col(source).cast(dtype)).alias(output)
    for source, dtype, output in column_specs
])

In [5]:
# Print the schema of the projected frame to check the casts applied above.
movie_df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- function: string (nullable = true)
 |-- rating: float (nullable = true)
 |-- gender: string (nullable = true)
 |-- genre: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- movieId: integer (nullable = true)
 |-- release_date: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- url: string (nullable = true)
 |-- codeZip: string (nullable = true)
 |-- userId: integer (nullable = true)



<code>
Split the data into training (80%) and testing (20%) sets:
</code>

In [6]:
# 80/20 train/test split. A fixed seed is passed so that re-running the notebook
# selects the same rows instead of drawing a fresh random split each time
# (the original call had no seed, so every downstream number changed per run).
SPLIT_SEED = 42
(training, testing) = movie_df.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

In [7]:
# Show the training rows whose `url` is an empty string.
blank_url_rows = training.where("url == ''")
blank_url_rows.show()

+---+--------------------+-------------+------+------+---------+-------+------------+---------+---+-------+------+
|age|                name|     function|rating|gender|    genre|movieId|release_date|timestamp|url|codeZip|userId|
+---+--------------------+-------------+------+------+---------+-------+------------+---------+---+-------+------+
| 20|             unknown|         none|   5.0|     M|[unknown]|    267|            |875801239|   |  60115|   130|
| 20|             unknown|      student|   3.0|     M|[unknown]|    267|            |875441348|   |  92705|   532|
| 22|Boys in Venice (1...|   healthcare|   1.0|     F|  [Drama]|   1359| 24-Sep-1996|885549790|   |  10019|   405|
| 24|             unknown|     engineer|   3.0|     M|[unknown]|    267|            |875742077|   |  19422|   268|
| 24|             unknown|   technician|   4.0|     M|[unknown]|    267|            |875692955|   |  85711|     1|
| 26|The Deadly Cure (...|    executive|   1.0|     M| [Action]|   1358| 16-Sep-

In [8]:
# ALS initialises its factor matrices randomly; pin the seed so the fitted model
# (and the RMSE computed from it) is reproducible across notebook runs.
ALS_SEED = 42

# Collaborative-filtering estimator over (userId, movieId, rating).
als = ALS(
    maxIter=5,
    rank=20,
    regParam=0.1,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    # Per the ALS docs, "drop" removes rows whose prediction would be NaN
    # (users/items unseen during fit) when the model transforms a dataset.
    coldStartStrategy="drop",
    seed=ALS_SEED,
)

# Fit on the training split only; the testing split is held out for evaluation.
model = als.fit(training)

In [9]:
# RMSE evaluator comparing the `prediction` column against the `rating` label.
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="rating",
    metricName="rmse",
)

# Score the held-out split with the fitted model and report the error.
predictions = model.transform(testing)
rmse = evaluator.evaluate(predictions)
# Uncomment to inspect individual predictions:
# predictions.show()
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 0.9300435119609682


In [10]:
# Persist the fitted model. A plain save() raises if the target path already
# exists, so a second run of the notebook would fail here; going through the
# writer with overwrite() replaces the previously saved model instead.
model.write().overwrite().save("../model/als-model")

In [11]:
def getUsers(movieId, limit):
    """Return up to `limit` distinct user ids that have a row for `movieId`.

    Reads the notebook-level `testing` DataFrame. `movieId` is interpolated
    into a SQL filter expression. No ordering is applied before the limit,
    so this code does not specify which users are returned.

    Returns a DataFrame with a single `userId` column.
    """
    return (
        testing
        .where(f"movieId = {movieId}")
        .select("userId")
        .distinct()
        .limit(limit)
    )

In [13]:
# Take up to 5 users from the testing split who rated movie 252 ...
users = getUsers(movieId=252, limit=5)
# ... and ask the fitted model for 2 recommendations per user.
userSubsetRecs = model.recommendForUserSubset(users, 2)

In [17]:
# Collect the recommendations to the driver and display the first field of the first row.
first_row = userSubsetRecs.collect()[0]
first_row[0]

251

In [None]:
# NOTE(review): bare attribute reference -- nothing is called here, so the cell
# would only display the bound method. Looks like an unfinished scratch cell;
# complete it or remove it before sharing the notebook.
spark.createDataFrame