In [91]:
import findspark
findspark.init()
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql import functions as F
# Apply a default SparkConf to the session builder. The returned builder is not
# kept, so this line only has an effect if `SparkSession.builder` hands back a
# shared builder object on every access.
# NOTE(review): confirm for the installed PySpark version; otherwise chain
# `.config(conf=SparkConf())` onto the builder call that creates the session.
SparkSession.builder.config(conf=SparkConf())
from pyspark.ml.recommendation import ALS


# ------------------------------------------------------------------
# Spark session: reuse the active session if one exists, otherwise
# start a new one for this notebook under the name "train-model".
# ------------------------------------------------------------------
spark = (
    SparkSession.builder
    .appName("train-model")
    .getOrCreate()
)

In [92]:
# Load the raw ratings export (JSON) into a DataFrame and preview the first rows.
ratings_path = '../data/movies.json'
movie_ratings = spark.read.json(ratings_path)
movie_ratings.show(n=5)

+---+--------+------+--------------------+-------+--------------------+------+------+------------+---------+--------------------+------+
|age|function|gender|               genre|movieId|                name|number|rating|release_date|timestamp|                 url|userId|
+---+--------+------+--------------------+-------+--------------------+------+------+------------+---------+--------------------+------+
| 60| retired|     M|[Animation, Child...|      1|    Toy Story (1995)| 95076|     4| 01-Jan-1995|887736532|http://us.imdb.co...|   308|
| 60| retired|     M|[Action, Comedy, ...|      4|   Get Shorty (1995)| 95076|     5| 01-Jan-1995|887737890|http://us.imdb.co...|   308|
| 60| retired|     M|[Crime, Drama, Th...|      5|      Copycat (1995)| 95076|     4| 01-Jan-1995|887739608|http://us.imdb.co...|   308|
| 60| retired|     M|     [Drama, Sci-Fi]|      7|Twelve Monkeys (1...| 95076|     4| 01-Jan-1995|887738847|http://us.imdb.co...|   308|
| 60| retired|     M|[Children's, Come...

In [93]:
# Print the inferred schema. Apart from the `genre` array, every column is read
# as a string; the numeric fields are cast explicitly in the next step.
movie_ratings.printSchema()

root
 |-- age: string (nullable = true)
 |-- function: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- genre: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- movieId: string (nullable = true)
 |-- name: string (nullable = true)
 |-- number: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- url: string (nullable = true)
 |-- userId: string (nullable = true)



In [94]:
# Project the raw columns onto the working schema.
# Each entry is (source column, output name, cast type or None to keep as-is).
# `name` is not selected, and `number` is exposed as `codeZip`.
column_specs = [
    ("age", "age", "int"),
    ("function", "function", None),
    ("rating", "rating", "float"),
    ("gender", "gender", None),
    ("genre", "genre", None),
    ("movieId", "movieId", "int"),
    ("release_date", "release_date", None),
    ("timestamp", "timestamp", None),
    ("url", "url", None),
    ("number", "codeZip", None),
    ("userId", "userId", "int"),
]

movie_df = movie_ratings.select(
    *[
        (F.col(source) if dtype is None else F.col(source).cast(dtype)).alias(target)
        for source, target, dtype in column_specs
    ]
)

In [95]:
# Verify the casts: age, movieId and userId are now integers and rating is a float.
movie_df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- function: string (nullable = true)
 |-- rating: float (nullable = true)
 |-- gender: string (nullable = true)
 |-- genre: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- movieId: integer (nullable = true)
 |-- release_date: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- url: string (nullable = true)
 |-- codeZip: string (nullable = true)
 |-- userId: integer (nullable = true)



In [96]:
# Configure the ALS collaborative-filtering estimator on the
# (userId, movieId, rating) columns of the ratings DataFrame.
# - seed: fixed so the factor initialisation is repeatable across kernel
#   restarts instead of depending on the estimator's default seed.
# - coldStartStrategy="drop": prediction rows that would be NaN (user or movie
#   not seen during training) are dropped from the transform output.
als = ALS(
    maxIter=10,
    regParam=0.5,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=42,
)

<code>
Splitting the Data into Training and Test Sets:
</code>

In [97]:
# 80/20 train/test split. The seed is fixed so that the same rows land in each
# split on every run; without it the split (and every downstream metric)
# changes each time the cell is executed.
train, test = movie_df.randomSplit([0.8, 0.2], seed=42)

<code>
Training the Model:
</code>

In [98]:
# Fit the configured ALS estimator on the training split; the fitted model is
# bound to `alsModel` and used for scoring below.
alsModel = als.fit(train)

<code>
Generating Predictions:
</code>

In [99]:
# Score the held-out test split. The result keeps the input columns and adds a
# `prediction` column with the model's estimated rating.
prediction = alsModel.transform(test)

In [100]:
# Preview the scored rows to compare `rating` with `prediction` side by side.
prediction.show()

+---+--------+------+------+--------------------+-------+------------+---------+--------------------+-------+------+----------+
|age|function|rating|gender|               genre|movieId|release_date|timestamp|                 url|codeZip|userId|prediction|
+---+--------+------+------+--------------------+-------+------------+---------+--------------------+-------+------+----------+
| 10| student|   1.0|     M|[Animation, Child...|    432| 01-Jan-1940|889827822|http://us.imdb.co...|  77459|   471|   3.51521|
| 10| student|   1.0|     M|[Animation, Child...|    588| 01-Jan-1991|889827881|http://us.imdb.co...|  77459|   471|  3.563692|
| 10| student|   1.0|     M|[Animation, Child...|    596| 21-Jun-1996|889827881|http://us.imdb.co...|  77459|   471|  3.184797|
| 10| student|   2.0|     M|[Adventure, Child...|    151| 01-Jan-1971|889828154|http://us.imdb.co...|  77459|   471| 3.4146192|
| 10| student|   2.0|     M|[Animation, Child...|    404| 01-Jan-1940|889827757|http://us.imdb.co...|  7