In [5]:
!pip install pyspark



In [8]:
# mengimport modul ALS dari Spark Machine Learning
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS

# membuat session 
appName = "Sistem Rekomender di Spark"
spark = SparkSession.builder.appName (appName).config("spark.some.config.option", "some-value").getOrCreate()

## Membuat Data dari File 

In [9]:
# membuat data file ke DataFrame 
ratings = spark.read.csv ('ratings.csv', inferSchema = True, header = True)
movies = spark.read.csv ('movies.csv', inferSchema = True, header = True)

# menggabungkan data movie dan rating berdasarkan movieID 
ratings.join (movies, "movieID").show (5)

+-------+------+------+----------+--------------------+--------------------+
|movieId|userId|rating| timestamp|               title|              genres|
+-------+------+------+----------+--------------------+--------------------+
|     31|     1|   2.5|1260759144|Dangerous Minds (...|               Drama|
|   1029|     1|   3.0|1260759179|        Dumbo (1941)|Animation|Childre...|
|   1061|     1|   3.0|1260759182|     Sleepers (1996)|            Thriller|
|   1129|     1|   2.0|1260759185|Escape from New Y...|Action|Adventure|...|
|   1172|     1|   4.0|1260759205|Cinema Paradiso (...|               Drama|
+-------+------+------+----------+--------------------+--------------------+
only showing top 5 rows



### Menyiapkan Data 

In [None]:
# memilih kolom data "userId", "movieId", dan "rating"
data = ratings.select ("userId", "movieId", "rating")

# membagi data 70% training dan 30% testing
splits = data.randomSplit ([0.7, 0.3])
train = splits[0].withColumnRenamed ("rating", "label")
test = splits [1].withColumnRenamed ("rating", "trueLabel")

# menghitung baris data training dan testing 
train_rows = train.count()
test_rows = test.count ()
print (f"Jumlah baris data training :  {train_rows}, jumlah baris data testing : {test_rows}")

Jumlah baris data training :  69813, jumlah baris data testing : 30191


### Mendefinisikan model dan mentrainingnya 

In [12]:
# mendefinisikan algoritma ALS untuk sistem rekomender kita 
als = ALS (maxIter = 19, regParam = 0.01, userCol = "userId", itemCol = "movieId", ratingCol = "label")

# mentraining model dengan fungsi ".fit()"
model = als.fit (train)
print ("Model telah selasai ditraining")

Model telah selasai ditraining


### Melakukan prediksi dengan model rekomender yang telah ditraining

In [13]:
prediction = model.transform (test)
prediction.join (movies, "movieId").select("userId", "title", "prediction", "trueLabel").show (n = 5, truncate = False)

+------+--------------------------------+----------+---------+
|userId|title                           |prediction|trueLabel|
+------+--------------------------------+----------+---------+
|463   |Out of Africa (1985)            |3.381054  |4.0      |
|85    |Hudsucker Proxy, The (1994)     |2.713788  |3.0      |
|580   |Men in Black (a.k.a. MIB) (1997)|3.1576352 |2.5      |
|580   |American Splendor (2003)        |3.2867143 |4.0      |
|580   |Ice Age 2: The Meltdown (2006)  |2.5637996 |2.5      |
+------+--------------------------------+----------+---------+
only showing top 5 rows



### Mengevaluasi seberapa akurat sistem rekomender

In [17]:
# dikarenakan ingin menghitung RMSE maka menggunaak RegressionEvaluator 
from pyspark.ml.evaluation import RegressionEvaluator 

evaluator = RegressionEvaluator (labelCol = "trueLabel", predictionCol = "prediction", metricName = "rmse")
rmse = evaluator.evaluate (prediction)
print ( f"Root Mean Square Error RMSE : {rmse}")

Root Mean Square Error RMSE : nan


In [18]:
prediction.count()
a = prediction.count()
print ("jumlah baris sebelum di hapus data kosong", a)
cleanPred = prediction.dropna (how = "any", subset = ["prediction"])
b = cleanPred.count()
print ("jumlah baris setelah di hapus data yang kosong", b)
print ("jumlah baris data kosong  : ", a-b )

jumlah baris sebelum di hapus data kosong 30191
jumlah baris setelah di hapus data yang kosong 29036
jumlah baris data kosong  :  1155


In [19]:
rmse = evaluator.evaluate (cleanPred)
print ("Root Mean Square Error (RMSE): ", rmse)

Root Mean Square Error (RMSE):  1.2358890381235188
