In [1]:
import pandas as pd
from pyspark.sql.functions import col, explode
from pyspark import SparkContext

In [2]:
from pyspark.sql import SparkSession
sc = SparkContext
# sc.setCheckpointDir('checkpoint')
spark = SparkSession.builder.appName('Recommendation').getOrCreate()

In [3]:
objekWisata = spark.read.csv("../dataset/cb_objekwisata.csv", header=True)
ratings = spark.read.csv("../dataset/data-rating-objek-wisata.csv", header=True)

In [4]:
ratings.show()

+-------+--------------------+--------------------+------+--------+--------------------+--------+
|id_user|                name|            username|rating|    date|          nama_objek|id_objek|
+-------+--------------------+--------------------+------+--------+--------------------+--------+
|   4363|        Compass73198|/Profile/Compass7...|     1|  19-Sep|Agrowisata Teba S...|     998|
|   6237|              Emc921|     /Profile/Emc921|     1|Okt 2019|Agrowisata Teba S...|     998|
|   7446|           gaddavino|  /Profile/gaddavino|     1|Agt 2019|Agrowisata Teba S...|     998|
|  10391|                 K&C| /Profile/963kristyb|     1|Okt 2019|Agrowisata Teba S...|     998|
|  12803|           Michael H|/Profile/MichaelH...|     1|  19-Jul|Agrowisata Teba S...|     998|
|  16435|                Rona|/Profile/Ronaindo...|     1|  19-Jul|Agrowisata Teba S...|     998|
|  11566|           Lindsay H|/Profile/986lindsayh|     1|  19-Jan|    Air Panas Banjar|     997|
|  11788|           

In [5]:
ratings.printSchema()

root
 |-- id_user: string (nullable = true)
 |-- name: string (nullable = true)
 |-- username: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- date: string (nullable = true)
 |-- nama_objek: string (nullable = true)
 |-- id_objek: string (nullable = true)



In [6]:
ratings = ratings.\
    withColumn('id_user', col('id_user').cast('integer')).\
    withColumn('id_objek', col('id_objek').cast('integer')).\
    withColumn('rating', col('rating').cast('integer'))
ratings.show()

+-------+--------------------+--------------------+------+--------+--------------------+--------+
|id_user|                name|            username|rating|    date|          nama_objek|id_objek|
+-------+--------------------+--------------------+------+--------+--------------------+--------+
|   4363|        Compass73198|/Profile/Compass7...|     1|  19-Sep|Agrowisata Teba S...|     998|
|   6237|              Emc921|     /Profile/Emc921|     1|Okt 2019|Agrowisata Teba S...|     998|
|   7446|           gaddavino|  /Profile/gaddavino|     1|Agt 2019|Agrowisata Teba S...|     998|
|  10391|                 K&C| /Profile/963kristyb|     1|Okt 2019|Agrowisata Teba S...|     998|
|  12803|           Michael H|/Profile/MichaelH...|     1|  19-Jul|Agrowisata Teba S...|     998|
|  16435|                Rona|/Profile/Ronaindo...|     1|  19-Jul|Agrowisata Teba S...|     998|
|  11566|           Lindsay H|/Profile/986lindsayh|     1|  19-Jan|    Air Panas Banjar|     997|
|  11788|           

In [7]:
numerator = ratings.select("rating").count()

num_users = ratings.select("id_user").distinct().count()
num_objekWisata =  ratings.select("id_objek").distinct().count()

denominator = num_users * num_objekWisata

sparsity = (1.0 - (numerator *1.0)/denominator)*100
print("The ratings dataframe is ", "%.2f" % sparsity + "% empty.")

The ratings dataframe is  99.52% empty.


In [8]:
# Group data by userId, count ratings
username_ratings = ratings.groupBy("id_user").count().orderBy('count', ascending=False)
username_ratings.show()

+-------+-----+
|id_user|count|
+-------+-----+
|   2108|  105|
|  18575|   73|
|   2509|   72|
|  18895|   62|
|   9515|   55|
|   8424|   53|
|   8158|   50|
|  11169|   49|
|   4567|   49|
|   1282|   46|
|   8574|   46|
|   9177|   42|
|   5303|   42|
|   6338|   41|
|   9242|   39|
|  20858|   38|
|  16265|   38|
|   7910|   38|
|   3228|   38|
|   9409|   36|
+-------+-----+
only showing top 20 rows



In [9]:
attraction_ratings = ratings.groupBy("id_objek").count().orderBy('count', ascending=False)
attraction_ratings.show()

+--------+-----+
|id_objek|count|
+--------+-----+
|     858|  100|
|     623|  100|
|     897|  100|
|     737|  100|
|     883|  100|
|     879|  100|
|     808|  100|
|     580|  100|
|     898|  100|
|     804|  100|
|     588|  100|
|     970|  100|
|     799|  100|
|     833|  100|
|     853|  100|
|     918|  100|
|     744|  100|
|     683|  100|
|     950|  100|
|     976|  100|
+--------+-----+
only showing top 20 rows



In [10]:
# Import the required functions
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [11]:
# Create test and train set
(train, test) = ratings.randomSplit([0.8, 0.2], seed = 1234)

# Create ALS model
als = ALS(userCol="id_user", itemCol="id_objek", ratingCol="rating", nonnegative = True, implicitPrefs = False, coldStartStrategy="drop")

# Confirm that a model called "als" was created
type(als)

pyspark.ml.recommendation.ALS

In [12]:
# Import the requisite items
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Add hyperparameters and their respective values to param_grid
param_grid = ParamGridBuilder() \
            .addGrid(als.rank, [10, 50, 100, 150]) \
            .addGrid(als.regParam, [.01, .05, .1, .15]) \
            .build()
            #             .addGrid(als.maxIter, [5, 50, 100, 200]) \

           
# Define evaluator as RMSE and print length of evaluator
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction") 
print ("Num models to be tested: ", len(param_grid))

Num models to be tested:  16


In [13]:
# Build cross validation using CrossValidator
cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5)

# Confirm cv was built
print(cv)

CrossValidator_ecff50af31c7


In [14]:
#Fit cross validator to the 'train' dataset
model = cv.fit(train)

#Extract best model from the cv model above
best_model = model.bestModel

In [15]:
# Print best_model
print(type(best_model))

# Complete the code below to extract the ALS model parameters
print("**Best Model**")

# # Print "Rank"
print("  Rank:", best_model._java_obj.parent().getRank())

# Print "MaxIter"
print("  MaxIter:", best_model._java_obj.parent().getMaxIter())

# Print "RegParam"
print("  RegParam:", best_model._java_obj.parent().getRegParam())

<class 'pyspark.ml.recommendation.ALSModel'>
**Best Model**
  Rank: 150
  MaxIter: 10
  RegParam: 0.05


In [16]:
# View the predictions
test_predictions = best_model.transform(test)
RMSE = evaluator.evaluate(test_predictions)
print(RMSE)

0.8045359720452593


In [17]:
test_predictions.show()

+-------+---------------+--------------------+------+--------+--------------------+--------+----------+
|id_user|           name|            username|rating|    date|          nama_objek|id_objek|prediction|
+-------+---------------+--------------------+------+--------+--------------------+--------+----------+
|   8130|     Hardi Prya|  /Profile/239hardip|     4|  18-Jan|         Kuta Square|     833|  4.278912|
|  18738|        ths23id|    /Profile/ths23id|     3|  16-Jul|         Kuta Square|     833| 3.1145837|
|    417|   588chiquitap|/Profile/588chiqu...|     4|  16-Jul|         Kuta Square|     833| 4.0171876|
|   2967|  Bagus Perbawa| /Profile/bawa167799|     5|  19-Sep|         Kuta Square|     833| 4.3657417|
|   7559|      Genaddi P|/Profile/genaddib...|     3|Okt 2017|         Kuta Square|     833| 3.9579992|
|  18361|            T S|   /Profile/ts200777|     3|  17-Nov|         Kuta Square|     833| 2.9907172|
|   8652|        HUSIN G| /Profile/husing2017|     4|  18-Jun|  

In [18]:
# Generate n Recommendations for all users
nrecommendations = best_model.recommendForAllUsers(10)
nrecommendations.limit(20).show()

+-------+--------------------+
|id_user|     recommendations|
+-------+--------------------+
|   4900|[{679, 5.333799},...|
|   5300|[{679, 5.2635303}...|
|   6620|[{679, 4.98906}, ...|
|   7240|[{573, 4.982403},...|
|   7340|[{933, 5.014875},...|
|   7880|[{915, 4.9204884}...|
|   9900|[{679, 1.0867035}...|
|  12940|[{679, 5.3998413}...|
|  13840|[{898, 4.9549756}...|
|  14450|[{933, 4.2564707}...|
|  14570|[{679, 5.0594}, {...|
|  15790|[{873, 4.980567},...|
|  17420|[{679, 5.2401023}...|
|  18800|[{803, 4.9375763}...|
|  19530|[{940, 5.6422114}...|
|    471|[{679, 3.152154},...|
|   1591|[{679, 4.0610113}...|
|   4101|[{679, 4.993885},...|
|  11141|[{688, 4.898542},...|
|  16861|[{772, 2.9904046}...|
+-------+--------------------+



In [19]:
nrecommendations = nrecommendations\
    .withColumn("rec_ex", explode("recommendations"))\
    .select('id_user', col("rec_ex.id_objek"), col("rec_ex.rating"))

nrecommendations.limit(10).show()

+-------+--------+---------+
|id_user|id_objek|   rating|
+-------+--------+---------+
|   4900|     679| 5.333799|
|   4900|     655| 5.292748|
|   4900|     933|5.2675667|
|   4900|     745| 5.244437|
|   4900|     680|5.2405047|
|   4900|     621| 5.216584|
|   4900|     620| 5.212301|
|   4900|     940| 5.191921|
|   4900|     681|5.1876693|
|   4900|     747| 5.164889|
+-------+--------+---------+



In [20]:
nrecommendations.join(objekWisata, on='id_objek').filter('id_user = 6658').show()

+--------+-------+---------+--------------------+--------------------+
|id_objek|id_user|   rating|          nama_objek|          type_objek|
+--------+-------+---------+--------------------+--------------------+
|     933|   6658|4.8566356|       Desa Wae Rebo|PEMANDANGAN & TEN...|
|     679|   6658|4.8421435|      Pulau Pahawang| WISATA ALAM & TAMAN|
|     805|   6658| 4.831583|        Mesjid Ampel|PEMANDANGAN & TEN...|
|     621|   6658| 4.806752|Taman Nasional Br...| WISATA ALAM & TAMAN|
|     680|   6658| 4.801929|         Pulau Padar| WISATA ALAM & TAMAN|
|     620|   6658|4.7955565|Taman Nasional Gu...| WISATA ALAM & TAMAN|
|     940|   6658|4.7388425|Danau Kelimutu (D...| WISATA ALAM & TAMAN|
|     745|   6658| 4.697554|Pantai Kuta - Lombok| WISATA ALAM & TAMAN|
|     655|   6658|4.6645856|Rumah Pengasingan...|              MUSEUM|
|     747|   6658|4.6462793|   Pantai Kelingking| WISATA ALAM & TAMAN|
+--------+-------+---------+--------------------+--------------------+



In [27]:
def recommendation(uid):
    return nrecommendations.join(objekWisata, on='id_objek').filter('id_user = ' + str(uid)).show()

In [37]:
from random import randint
n = randint(0,len(ratings.toPandas()['id_user'].unique()))
random_user = ratings.toPandas()['id_user'].unique()[n]
print('Rekomendasi Objek Pariwisata untuk user dengan id: ', random_user)
recommendation(random_user)

Rekomendasi Objek Pariwisata untuk user dengan id:  5903
+--------+-------+---------+--------------------+--------------------+
|id_objek|id_user|   rating|          nama_objek|          type_objek|
+--------+-------+---------+--------------------+--------------------+
|     852|   5903|3.9383802|  Kebun Raya Cibodas| WISATA ALAM & TAMAN|
|     679|   5903|3.7754612|      Pulau Pahawang| WISATA ALAM & TAMAN|
|     680|   5903|3.6981337|         Pulau Padar| WISATA ALAM & TAMAN|
|     745|   5903|3.6457784|Pantai Kuta - Lombok| WISATA ALAM & TAMAN|
|     747|   5903|3.6433063|   Pantai Kelingking| WISATA ALAM & TAMAN|
|     620|   5903|3.6326773|Taman Nasional Gu...| WISATA ALAM & TAMAN|
|     621|   5903|3.6325064|Taman Nasional Br...| WISATA ALAM & TAMAN|
|     940|   5903|3.6228771|Danau Kelimutu (D...| WISATA ALAM & TAMAN|
|     933|   5903|3.6226022|       Desa Wae Rebo|PEMANDANGAN & TEN...|
|     681|   5903|3.5974474|     Pulau Menjangan| WISATA ALAM & TAMAN|
+--------+-------+--

In [38]:
len(ratings.toPandas()['id_user'].unique())

20722

In [41]:
ratings.toPandas()['id_user'].unique()[10000]

20914

In [42]:
recommendation(20914)

+--------+-------+---------+--------------------+--------------------+
|id_objek|id_user|   rating|          nama_objek|          type_objek|
+--------+-------+---------+--------------------+--------------------+
|     679|  20914|4.1329937|      Pulau Pahawang| WISATA ALAM & TAMAN|
|     933|  20914|4.1092157|       Desa Wae Rebo|PEMANDANGAN & TEN...|
|     940|  20914|4.0939517|Danau Kelimutu (D...| WISATA ALAM & TAMAN|
|     862|  20914|4.0822477|          Kawah Ijen| WISATA ALAM & TAMAN|
|     680|  20914| 4.072837|         Pulau Padar| WISATA ALAM & TAMAN|
|     621|  20914|4.0598607|Taman Nasional Br...| WISATA ALAM & TAMAN|
|     620|  20914|4.0535502|Taman Nasional Gu...| WISATA ALAM & TAMAN|
|     745|  20914|4.0518937|Pantai Kuta - Lombok| WISATA ALAM & TAMAN|
|     747|  20914| 4.029588|   Pantai Kelingking| WISATA ALAM & TAMAN|
|     692|  20914| 3.998792|Pesta Makanan Pak...|      TEMPAT BELANJA|
+--------+-------+---------+--------------------+--------------------+

