In [1]:
import pandas as pd
from pyspark.sql.functions import col, explode
from pyspark import SparkContext

In [2]:
from pyspark.sql import SparkSession
sc = SparkContext
# sc.setCheckpointDir('checkpoint')
spark = SparkSession.builder.appName('Recommendation').getOrCreate()

In [3]:
objekWisata = spark.read.csv("../dataset/cb_objekwisata.csv", header=True)
ratings = spark.read.csv("../dataset/data-rating-objek-wisata.csv", header=True)

In [4]:
ratings.show()

+-------+--------------------+--------------------+------+--------+--------------------+--------+
|id_user|                name|            username|rating|    date|          nama_objek|id_objek|
+-------+--------------------+--------------------+------+--------+--------------------+--------+
|   4363|        Compass73198|/Profile/Compass7...|     1|  19-Sep|Agrowisata Teba S...|     998|
|   6237|              Emc921|     /Profile/Emc921|     1|Okt 2019|Agrowisata Teba S...|     998|
|   7446|           gaddavino|  /Profile/gaddavino|     1|Agt 2019|Agrowisata Teba S...|     998|
|  10391|                 K&C| /Profile/963kristyb|     1|Okt 2019|Agrowisata Teba S...|     998|
|  12803|           Michael H|/Profile/MichaelH...|     1|  19-Jul|Agrowisata Teba S...|     998|
|  16435|                Rona|/Profile/Ronaindo...|     1|  19-Jul|Agrowisata Teba S...|     998|
|  11566|           Lindsay H|/Profile/986lindsayh|     1|  19-Jan|    Air Panas Banjar|     997|
|  11788|           

In [5]:
ratings.printSchema()

root
 |-- id_user: string (nullable = true)
 |-- name: string (nullable = true)
 |-- username: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- date: string (nullable = true)
 |-- nama_objek: string (nullable = true)
 |-- id_objek: string (nullable = true)



In [6]:
ratings = ratings.\
    withColumn('id_user', col('id_user').cast('integer')).\
    withColumn('id_objek', col('id_objek').cast('integer')).\
    withColumn('rating', col('rating').cast('integer'))
ratings.show()

+-------+--------------------+--------------------+------+--------+--------------------+--------+
|id_user|                name|            username|rating|    date|          nama_objek|id_objek|
+-------+--------------------+--------------------+------+--------+--------------------+--------+
|   4363|        Compass73198|/Profile/Compass7...|     1|  19-Sep|Agrowisata Teba S...|     998|
|   6237|              Emc921|     /Profile/Emc921|     1|Okt 2019|Agrowisata Teba S...|     998|
|   7446|           gaddavino|  /Profile/gaddavino|     1|Agt 2019|Agrowisata Teba S...|     998|
|  10391|                 K&C| /Profile/963kristyb|     1|Okt 2019|Agrowisata Teba S...|     998|
|  12803|           Michael H|/Profile/MichaelH...|     1|  19-Jul|Agrowisata Teba S...|     998|
|  16435|                Rona|/Profile/Ronaindo...|     1|  19-Jul|Agrowisata Teba S...|     998|
|  11566|           Lindsay H|/Profile/986lindsayh|     1|  19-Jan|    Air Panas Banjar|     997|
|  11788|           

In [7]:
from pyspark.sql.functions import row_number
from pyspark.sql.window import Window
w = Window.orderBy("count")
objekWisata_x = objekWisata.groupBy("type_objek").count().select("*").withColumn("id_type_objek", row_number().over(w)).drop('count')
objekWisata_x.show()

+--------------------+-------------+
|          type_objek|id_type_objek|
+--------------------+-------------+
|KONSER & PERTUNJUKAN|            1|
|SUMBER INFORMASI ...|            2|
|        TRANSPORTASI|            3|
|KEBUN BINATANG & ...|            4|
|             LAINNYA|            5|
|TAMAN AIR & TAMAN...|            6|
|              MUSEUM|            7|
|      TEMPAT BELANJA|            8|
|PEMANDANGAN & TEN...|            9|
| WISATA ALAM & TAMAN|           10|
+--------------------+-------------+



In [8]:
from pyspark.sql.functions import percentile_approx, lit
ratings_x = ratings.join(objekWisata, on="nama_objek").groupBy(["type_objek","id_user","name","username"]).agg(percentile_approx("rating", 0.5, lit(1000000)).alias("rating"))
ratings_x = ratings_x.join(objekWisata_x, on="type_objek")
ratings_x.show()

+--------------------+-------+-----------------+--------------------+------+-------------+
|          type_objek|id_user|             name|            username|rating|id_type_objek|
+--------------------+-------+-----------------+--------------------+------+-------------+
|KEBUN BINATANG & ...|   4888| Dedy I Santoso I|  /Profile/dedyis_77|     5|            4|
|KEBUN BINATANG & ...|  15239|           Putu D|   /Profile/PutuD111|     5|            4|
|KEBUN BINATANG & ...|  20928|          Zunny'$|     /Profile/ZuniI2|     5|            4|
|KONSER & PERTUNJUKAN|   7293|         fotukava|   /Profile/fotukava|     5|            1|
|             LAINNYA|  12331|     marthavietha|/Profile/marthavi...|     4|            5|
|             LAINNYA|  14844|       peterk2152| /Profile/peterk2152|     3|            5|
|             LAINNYA|  16406|         Roland V|/Profile/rolandva...|     1|            5|
|             LAINNYA|  16459|            Roo T|    /Profile/RoannaS|     2|            5|

In [9]:
numerator = ratings_x.select("rating").count()

num_users = ratings_x.select("id_user").distinct().count()
num_objekWisata =  ratings_x.select("id_type_objek").distinct().count()

denominator = num_users * num_objekWisata

sparsity = (1.0 - (numerator *1.0)/denominator)*100
print("The ratings dataframe is ", "%.2f" % sparsity + "% empty.")

The ratings dataframe is  85.52% empty.


In [10]:
# Group data by userId, count ratings
username_ratings = ratings_x.groupBy("id_user").count().orderBy('count', ascending=False)
username_ratings.show()

+-------+-----+
|id_user|count|
+-------+-----+
|  16265|   14|
|  13297|   14|
|   5799|   12|
|   5303|   11|
|   8449|    9|
|  14243|    9|
|   4739|    9|
|  20062|    9|
|   4567|    9|
|   9177|    9|
|   8509|    8|
|   9515|    8|
|    661|    8|
|  13329|    8|
|   5789|    8|
|  12011|    8|
|  13705|    8|
|   2108|    8|
|  18575|    8|
|   8574|    8|
+-------+-----+
only showing top 20 rows



In [11]:
attraction_ratings = ratings_x.groupBy("id_type_objek").count().orderBy('count', ascending=False)
attraction_ratings.show()

+-------------+-----+
|id_type_objek|count|
+-------------+-----+
|           10|11551|
|            9| 6785|
|            8| 4370|
|            7| 2460|
|            6| 2086|
|            4|  841|
|            5|  838|
|            3|  393|
|            2|  292|
|            1|  197|
+-------------+-----+



In [12]:
# Import the required functions
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [13]:
# Create test and train set
(train, test) = ratings_x.randomSplit([0.8, 0.2], seed = 1234)

# Create ALS model
als = ALS(userCol="id_user", itemCol="id_type_objek", ratingCol="rating", nonnegative = True, implicitPrefs = False, coldStartStrategy="drop")

# Confirm that a model called "als" was created
type(als)

pyspark.ml.recommendation.ALS

In [14]:
# Import the requisite items
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Add hyperparameters and their respective values to param_grid
param_grid = ParamGridBuilder() \
            .addGrid(als.rank, [10, 50, 100, 150]) \
            .addGrid(als.regParam, [.01, .05, .1, .15]) \
            .build()
            #             .addGrid(als.maxIter, [5, 50, 100, 200]) \

           
# Define evaluator as RMSE and print length of evaluator
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction") 
print ("Num models to be tested: ", len(param_grid))

Num models to be tested:  16


In [15]:
# Build cross validation using CrossValidator
cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5)

# Confirm cv was built
print(cv)

CrossValidator_8a7bf346523c


In [16]:
#Fit cross validator to the 'train' dataset
model = cv.fit(train)

#Extract best model from the cv model above
best_model = model.bestModel

In [17]:
# Print best_model
print(type(best_model))

# Complete the code below to extract the ALS model parameters
print("**Best Model**")

# # Print "Rank"
print("  Rank:", best_model._java_obj.parent().getRank())

# Print "MaxIter"
print("  MaxIter:", best_model._java_obj.parent().getMaxIter())

# Print "RegParam"
print("  RegParam:", best_model._java_obj.parent().getRegParam())

<class 'pyspark.ml.recommendation.ALSModel'>
**Best Model**
  Rank: 10
  MaxIter: 10
  RegParam: 0.15


In [18]:
# View the predictions
test_predictions = best_model.transform(test)
RMSE = evaluator.evaluate(test_predictions)
print(RMSE)

1.071140224244767


In [19]:
test_predictions.show()

+--------------------+-------+--------------------+--------------------+------+-------------+----------+
|          type_objek|id_user|                name|            username|rating|id_type_objek|prediction|
+--------------------+-------+--------------------+--------------------+------+-------------+----------+
|KONSER & PERTUNJUKAN|  12366|       MarvinSitorus|/Profile/MarvinSi...|     4|            1| 3.0587642|
|KONSER & PERTUNJUKAN|   2924|                 B G|  /Profile/245bevang|     5|            1| 3.1097438|
|KONSER & PERTUNJUKAN|  19936|           wforwiddi| /Profile/widdimouse|     4|            1| 2.0263917|
|KONSER & PERTUNJUKAN|    332|           2014FpvGt|  /Profile/2014FpvGt|     2|            1| 2.8694944|
|KONSER & PERTUNJUKAN|  10124|          Jonathan S|/Profile/P891HYjo...|     5|            1| 4.3069673|
|KONSER & PERTUNJUKAN|  15948|             Richard| /Profile/312eugenep|     5|            1| 3.3907518|
|KONSER & PERTUNJUKAN|   9778|          Jennifer D|/Pro

In [20]:
# Generate n Recommendations for all users
nrecommendations = best_model.recommendForAllUsers(10)
nrecommendations.limit(20).show()

+-------+--------------------+
|id_user|     recommendations|
+-------+--------------------+
|   1580|[{10, 4.800035}, ...|
|   4900|[{7, 4.810792}, {...|
|   5300|[{8, 4.786434}, {...|
|   6620|[{6, 4.7741694}, ...|
|   7240|[{6, 4.788414}, {...|
|   7340|[{10, 4.800035}, ...|
|   7880|[{6, 1.9153657}, ...|
|   9900|[{8, 0.9572867}, ...|
|  12940|[{9, 4.5897326}, ...|
|  13840|[{10, 3.8400283},...|
|  14450|[{3, 3.8324633}, ...|
|  14570|[{8, 4.786434}, {...|
|  15790|[{6, 4.788414}, {...|
|  17420|[{9, 4.802536}, {...|
|  18800|[{9, 4.630559}, {...|
|  19530|[{7, 4.82004}, {9...|
|    471|[{10, 2.880021}, ...|
|   1591|[{9, 3.8420284}, ...|
|   4101|[{9, 4.802536}, {...|
|  11141|[{8, 4.3202505}, ...|
+-------+--------------------+



In [21]:
nrecommendations = nrecommendations\
    .withColumn("rec_ex", explode("recommendations"))\
    .select('id_user', col("rec_ex.id_type_objek"), col("rec_ex.rating"))

nrecommendations.limit(10).show()

+-------+-------------+---------+
|id_user|id_type_objek|   rating|
+-------+-------------+---------+
|   1580|           10| 4.800035|
|   1580|            7|4.0559735|
|   1580|            3|4.0383573|
|   1580|            9| 3.965877|
|   1580|            2| 3.848159|
|   1580|            6|3.8404694|
|   1580|            5|3.8351462|
|   1580|            8|3.7979074|
|   1580|            4|3.6690586|
|   1580|            1|3.3773198|
+-------+-------------+---------+



In [22]:
nrecommendations.join(objekWisata_x, on='id_type_objek').filter('id_user = 6658').show()

+-------------+-------+---------+--------------------+
|id_type_objek|id_user|   rating|          type_objek|
+-------------+-------+---------+--------------------+
|            9|   6658|4.5590944|PEMANDANGAN & TEN...|
|            7|   6658|4.0592446|              MUSEUM|
|            8|   6658| 3.971062|      TEMPAT BELANJA|
|            3|   6658|3.9481046|        TRANSPORTASI|
|           10|   6658| 3.918055| WISATA ALAM & TAMAN|
|            2|   6658|3.8981953|SUMBER INFORMASI ...|
|            1|   6658|3.8936048|KONSER & PERTUNJUKAN|
|            6|   6658|3.8719065|TAMAN AIR & TAMAN...|
|            5|   6658|3.8427744|             LAINNYA|
|            4|   6658|3.8195112|KEBUN BINATANG & ...|
+-------------+-------+---------+--------------------+



In [23]:
def recommendation(uid):
    return nrecommendations.join(objekWisata_x, on='id_type_objek').filter('id_user = ' + str(uid)).show()

In [24]:
from random import randint
n = randint(0,len(ratings.toPandas()['id_user'].unique()))
random_user = ratings.toPandas()['id_user'].unique()[n]
print('Rekomendasi Objek Pariwisata untuk user dengan id: ', random_user)
recommendation(random_user)

Rekomendasi Objek Pariwisata untuk user dengan id:  11439
+-------------+-------+---------+--------------------+
|id_type_objek|id_user|   rating|          type_objek|
+-------------+-------+---------+--------------------+
|           10|  11439|3.8400283| WISATA ALAM & TAMAN|
|            7|  11439|3.2447789|              MUSEUM|
|            3|  11439|3.2306862|        TRANSPORTASI|
|            9|  11439|3.1727018|PEMANDANGAN & TEN...|
|            2|  11439|3.0785277|SUMBER INFORMASI ...|
|            6|  11439|3.0723753|TAMAN AIR & TAMAN...|
|            5|  11439| 3.068117|             LAINNYA|
|            8|  11439|3.0383258|      TEMPAT BELANJA|
|            4|  11439|2.9352467|KEBUN BINATANG & ...|
|            1|  11439|2.7018557|KONSER & PERTUNJUKAN|
+-------------+-------+---------+--------------------+



In [25]:
len(ratings.toPandas()['id_user'].unique())

20722

In [26]:
ratings.toPandas()['id_user'].unique()[10000]

20914

In [27]:
recommendation(20914)

+-------------+-------+---------+--------------------+
|id_type_objek|id_user|   rating|          type_objek|
+-------------+-------+---------+--------------------+
|           10|  20914|3.8400283| WISATA ALAM & TAMAN|
|            7|  20914|3.2447789|              MUSEUM|
|            3|  20914|3.2306862|        TRANSPORTASI|
|            9|  20914|3.1727018|PEMANDANGAN & TEN...|
|            2|  20914|3.0785277|SUMBER INFORMASI ...|
|            6|  20914|3.0723753|TAMAN AIR & TAMAN...|
|            5|  20914| 3.068117|             LAINNYA|
|            8|  20914|3.0383258|      TEMPAT BELANJA|
|            4|  20914|2.9352467|KEBUN BINATANG & ...|
|            1|  20914|2.7018557|KONSER & PERTUNJUKAN|
+-------------+-------+---------+--------------------+

