In [1]:
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import pandas as pd

In [2]:
# Build (or reuse) a Spark session running in local mode with master "local[2]".
session_builder = SparkSession.builder.master("local[2]").appName('MovieRecomender')
spark = session_builder.getOrCreate()

# Echo the context and the application name to confirm the session is up.
spark_context = spark.sparkContext
print(spark_context)
print("Spark App Name : "+ spark_context.appName)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/04 23:02:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
<SparkContext master=local[2] appName=MovieRecomender>
Spark App Name : MovieRecomender


In [3]:
import os 
os.chdir ('/home/mt/MovieRecomender')
#change to your working directory
%pwd

'/home/mt/MovieRecomender'

In [4]:
def _read_dat(path, columns, **read_csv_kwargs):
    """Load a '::'-separated, header-less .dat file via pandas and return it as a Spark DataFrame.

    path            -- location of the .dat file
    columns         -- column names to assign, in file order
    read_csv_kwargs -- extra keyword arguments forwarded to pandas.read_csv
    """
    # A multi-character separator requires pandas' python parsing engine.
    pandas_frame = pd.read_csv(path, engine='python', sep='::', names=columns, **read_csv_kwargs)
    return spark.createDataFrame(pandas_frame)


# Movies: the only file read with an explicit (ISO-8859-1) encoding.
df_m = _read_dat("data/movies.dat", ["MovieID", "Title", "Genres"], encoding='ISO-8859-1')

# Ratings: one row per (user, movie) rating event.
df_r = _read_dat("data/ratings.dat", ["UserID", "MovieID", "Rating", "Timestamp"])

# Users: demographic attributes keyed by UserID.
df_u = _read_dat("data/users.dat", ["UserID", "Gender", "Age", "Occupation", "Zip-code"])

In [5]:
# Sanity-check the movies table: column types first, then the first three rows.
df_m.printSchema()
df_m.show(n=3)

root
 |-- MovieID: long (nullable = true)
 |-- Title: string (nullable = true)
 |-- Genres: string (nullable = true)



                                                                                

+-------+--------------------+--------------------+
|MovieID|               Title|              Genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Animation|Childre...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
+-------+--------------------+--------------------+
only showing top 3 rows



In [6]:
# Sanity-check the ratings table: column types first, then the first three rows.
df_r.printSchema()
df_r.show(n=3)

root
 |-- UserID: long (nullable = true)
 |-- MovieID: long (nullable = true)
 |-- Rating: long (nullable = true)
 |-- Timestamp: long (nullable = true)

22/10/04 23:02:46 WARN TaskSetManager: Stage 1 contains a task of very large size (7756 KiB). The maximum recommended task size is 1000 KiB.
+------+-------+------+---------+
|UserID|MovieID|Rating|Timestamp|
+------+-------+------+---------+
|     1|   1193|     5|978300760|
|     1|    661|     3|978302109|
|     1|    914|     3|978301968|
+------+-------+------+---------+
only showing top 3 rows



In [7]:
# Sanity-check the users table: column types first, then the first three rows.
df_u.printSchema()
df_u.show(n=3)

root
 |-- UserID: long (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: long (nullable = true)
 |-- Occupation: long (nullable = true)
 |-- Zip-code: string (nullable = true)

+------+------+---+----------+--------+
|UserID|Gender|Age|Occupation|Zip-code|
+------+------+---+----------+--------+
|     1|     F|  1|        10|   48067|
|     2|     M| 56|        16|   70072|
|     3|     M| 25|        15|   55117|
+------+------+---+----------+--------+
only showing top 3 rows



In [8]:
# Attach every rating to its movie (on MovieID), then to the rating user (on UserID).
# Both joins are inner joins, so rows without a match on either key are dropped.
movie_ratings = df_m.join(df_r, on="MovieID", how="inner")
df = movie_ratings.join(df_u, on="UserID", how="inner")

df.printSchema()
df.show(n=3)

root
 |-- UserID: long (nullable = true)
 |-- MovieID: long (nullable = true)
 |-- Title: string (nullable = true)
 |-- Genres: string (nullable = true)
 |-- Rating: long (nullable = true)
 |-- Timestamp: long (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: long (nullable = true)
 |-- Occupation: long (nullable = true)
 |-- Zip-code: string (nullable = true)

22/10/04 23:02:48 WARN TaskSetManager: Stage 4 contains a task of very large size (7756 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

+------+-------+--------------------+--------------------+------+---------+------+---+----------+--------+
|UserID|MovieID|               Title|              Genres|Rating|Timestamp|Gender|Age|Occupation|Zip-code|
+------+-------+--------------------+--------------------+------+---------+------+---+----------+--------+
|     5|     29|City of Lost Chil...|    Adventure|Sci-Fi|     5|978245065|     M| 25|        20|   55455|
|     6|   1806|       Paulie (1998)|Adventure|Childre...|     3|978236876|     F| 50|         9|   55117|
|     7|    474|In the Line of Fi...|     Action|Thriller|     5|978234842|     M| 35|         1|   06810|
+------+-------+--------------------+--------------------+------+---------+------+---+----------+--------+
only showing top 3 rows



In [9]:
# Keep only the three columns the recommender uses and discard incomplete rows.
# Spark DataFrames are immutable: `na.drop()` returns a NEW DataFrame, so its
# result must be assigned or the rows with nulls are never actually removed.
df = df.select(["UserID","MovieID","Rating"]).na.drop()
df.show(3)

22/10/04 23:02:50 WARN TaskSetManager: Stage 13 contains a task of very large size (7756 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

+------+-------+------+
|UserID|MovieID|Rating|
+------+-------+------+
|    26|   3506|     4|
|    29|    474|     2|
|    29|   2529|     5|
+------+-------+------+
only showing top 3 rows



In [10]:
# 80/20 train/test split. A fixed seed is passed so that re-running the notebook
# does not silently produce a different split (and different metrics).
(train, test) = df.randomSplit([0.8, 0.2], seed=42)

In [11]:
# Explicit-feedback ALS with non-negative factors.
# coldStartStrategy="drop": during cross-validation a user or movie may appear
# only in the held-out fold; ALS then predicts NaN for it, which makes the RMSE
# NaN and prevents the cross-validator from ranking models. Dropping those rows
# keeps the metric well-defined.
# seed: fixes the random factor initialisation so fits are repeatable.
als = ALS(
    userCol="UserID",
    itemCol="MovieID",
    ratingCol="Rating",
    nonnegative=True,
    implicitPrefs=False,
    coldStartStrategy="drop",
    seed=42,
)

In [12]:
# Hyper-parameter candidates for ALS.
# 5 ranks x 6 iteration limits x 6 regularisation strengths = 180 combinations.
rank_candidates = [10, 25, 50 ,75 , 100]
max_iter_candidates = [5, 10, 20, 40, 80, 160]
reg_param_candidates = [.001,.005,.01, .05, .1, .5]

grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(als.rank, rank_candidates)
grid_builder = grid_builder.addGrid(als.maxIter, max_iter_candidates)
grid_builder = grid_builder.addGrid(als.regParam, reg_param_candidates)
grid_search = grid_builder.build()

In [13]:
# Score predictions against the true "Rating" column using root-mean-squared error.
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="Rating",
    metricName="rmse",
)

In [14]:
# 5-fold cross-validation over every parameter combination in `grid_search`,
# scored with `evaluator`. The seed pins the random fold assignment so the
# selected model does not change between otherwise identical runs.
cv = CrossValidator(
    estimator=als,
    estimatorParamMaps=grid_search,
    evaluator=evaluator,
    numFolds=5,
    seed=42,
)

In [15]:
# Register a checkpoint directory on the SparkContext before fitting
# (presumably so ALS can checkpoint during the long-iteration fits — confirm).
# The path is relative, so it resolves against the current working directory.
checkpoint_dir = 'checkpoint/'
spark.sparkContext.setCheckpointDir(checkpoint_dir)

# Run the full cross-validated grid search on the training split.
cv_fitted = cv.fit(dataset=train)

22/10/04 23:02:53 WARN TaskSetManager: Stage 26 contains a task of very large size (7756 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/10/04 23:03:07 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/10/04 23:03:07 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
22/10/04 23:03:10 WARN TaskSetManager: Stage 95 contains a task of very large size (7756 KiB). The maximum recommended task size is 1000 KiB.


                                                                                