In [1]:
# Configure access to the Azure Blob Storage account and load every CSV of the
# restaurant/consumer-ratings dataset into a dict of DataFrames keyed by name.
#
# SECURITY: the storage account key must never be committed in the notebook.
# It is read from the AZURE_STORAGE_ACCOUNT_KEY environment variable instead
# (a KeyError is raised if it is missing). The key that was previously
# hardcoded here should be considered leaked and be rotated.
# NOTE(review): on Databricks a secret scope (dbutils.secrets.get) is the usual
# alternative to an environment variable -- confirm which one this workspace uses.
import os

spark.conf.set(
  "fs.azure.account.key.storagestudent.blob.core.windows.net",
  os.environ["AZURE_STORAGE_ACCOUNT_KEY"]
)

# One DataFrame per CSV file; the first line of each file is used as the header.
datasets = {
  dataset: spark.read.load(
    "wasbs://default@storagestudent.blob.core.windows.net/datasets/S8-4/Exo/restaurant-data-with-consumer-ratings/{0}.csv".format(dataset),
    format="csv",
    header="true"
  )
  for dataset in [
    "chefmozaccepts",
    "chefmozcuisine",
    "chefmozhours4",
    "chefmozparking",
    "geoplaces2",
    "rating_final",
    "usercuisine",
    "userpayment",
    "userprofile"
  ]
}

In [2]:
# Short aliases for the three tables used in the rest of the notebook:
# user ratings, payment methods accepted by restaurants, and payment methods
# used by users.
rating, restopayment, userpayment = (
  datasets[name]
  for name in ("rating_final", "chefmozaccepts", "userpayment")
)

In [3]:
# Preview the raw ratings as loaded; userID still carries its "U" prefix here.
display(rating)

userID,placeID,rating,food_rating,service_rating
U1077,135085,2,2,2
U1077,135038,2,2,1
U1077,132825,2,2,2
U1077,135060,1,2,2
U1068,135104,1,1,2
U1068,132740,0,0,0
U1068,132663,1,1,1
U1068,132732,0,0,0
U1068,132630,1,1,1
U1067,132584,2,2,2


In [4]:
from pyspark.sql.types import *
from pyspark.sql import functions as F

# Build the numeric ratings table expected by ALS.
# userID looks like "U1077": substr(2, 20) keeps the characters after the
# leading "U" so the remainder can be cast to an integer. The other columns
# are cast to integers as they are.
ratings = rating.select(
  F.col("userID").substr(2, 20).cast(IntegerType()).alias("userID"),
  *[
    F.col(name).cast(IntegerType())
    for name in ("placeID", "rating", "food_rating", "service_rating")
  ]
)

In [5]:
display(ratings)

userID,placeID,rating,food_rating,service_rating
1077,135085,2,2,2
1077,135038,2,2,1
1077,132825,2,2,2
1077,135060,1,2,2
1068,135104,1,1,2
1068,132740,0,0,0
1068,132663,1,1,1
1068,132732,0,0,0
1068,132630,1,1,1
1067,132584,2,2,2


In [6]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Hold out 30% of the ratings for the final evaluation. The seed is fixed so
# that the split is the same on every run of the notebook.
train, test = ratings.randomSplit([0.7, 0.3], seed=42)

# Explicit-feedback ALS on the overall "rating" column.
# coldStartStrategy="drop" removes rows whose prediction is NaN (users or
# places unseen during training) so that the RMSE below stays defined.
# The seed fixes the random initialisation of the factor matrices.
als = ALS(
  userCol="userID",
  itemCol="placeID",
  ratingCol="rating",
  coldStartStrategy="drop",
  seed=42
)

# Root-mean-square error between the true rating and the model prediction.
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

# 3 x 3 grid over the number of latent factors and the number of iterations.
# regParam (e.g. [0.3, 0.1, 0.01]) is a further candidate for tuning; alpha
# (e.g. [2.0, 3.0]) only has an effect when ALS runs with implicitPrefs=True,
# which is not the case here.
paramGrid = ParamGridBuilder()\
                    .addGrid(als.rank, [1, 5, 10])\
                    .addGrid(als.maxIter, [1, 5, 10])\
                    .build()

In [7]:
# 5-fold cross-validation over the parameter grid, selecting the combination
# with the best value of the evaluator's metric. The seed fixes how rows are
# assigned to folds so that model selection is reproducible across runs.
cv = CrossValidator(
  estimator=als,
  evaluator=evaluator,
  estimatorParamMaps=paramGrid,
  numFolds=5,
  seed=42
)

# Fits one model per (fold, parameter combination), then refits the best
# combination on the whole training split.
model = cv.fit(train)

In [8]:
# Score the held-out split with the model selected by cross-validation and
# report the evaluator's metric (RMSE) as the cell output.
predictions = model.transform(test)
evaluator.evaluate(predictions)

In [9]:
# Typed payment tables with the same integer keys as `ratings`.
#
# Both tables are derived from the raw `datasets` entries rather than from
# their own previous value: re-running this cell would otherwise apply
# substr(2, 20) to an already-numeric userID and silently drop its first digit
# (1077 -> 77).

# Payment methods accepted by each restaurant.
restopayment = datasets["chefmozaccepts"].select(
  F.col("placeID").cast(IntegerType()),
  F.col("Rpayment")
)

# Payment methods used by each user; the leading "U" of userID is stripped
# before the cast, exactly as for `ratings`.
userpayment = datasets["userpayment"].select(
  F.col("userID").substr(2, 20).cast(IntegerType()).alias("userID"),
  F.col("Upayment")
)

In [10]:
# Attach to every rating the payment methods of its restaurant and of its
# user. Both joins are inner joins, and the result has one row per
# (rating, restaurant payment, user payment) combination.
user_resto_rating_j = (
  ratings
  .join(restopayment, on="placeID", how="inner")
  .join(userpayment, on="userID", how="inner")
)

display(user_resto_rating_j)

userID,placeID,rating,food_rating,service_rating,Rpayment,Upayment
1041,135109,1,2,1,cash,VISA
1041,135109,1,2,1,cash,American_Express
1041,135109,1,2,1,cash,cash
1041,135109,1,2,1,cash,bank_debit_cards
1051,135109,1,1,1,cash,cash
1051,135109,1,1,1,cash,VISA
1020,135109,2,2,1,cash,bank_debit_cards
1020,135109,2,2,1,cash,cash
1030,135109,0,0,0,cash,cash
1002,135106,1,1,1,cash,cash


In [11]:
# Keep only the combinations where the restaurant accepts the very payment
# method the user uses.
same_payment = F.col("Rpayment") == F.col("Upayment")
user_resto_rating_j = user_resto_rating_j.where(same_payment)
display(user_resto_rating_j)

userID,placeID,rating,food_rating,service_rating,Rpayment,Upayment
1041,135109,1,2,1,cash,cash
1051,135109,1,1,1,cash,cash
1020,135109,2,2,1,cash,cash
1030,135109,0,0,0,cash,cash
1002,135106,1,1,1,cash,cash
1078,135106,2,2,2,cash,cash
1135,135106,0,0,0,cash,cash
1101,135106,0,0,0,cash,cash
1016,135106,2,2,2,cash,cash
1106,135106,0,0,0,cash,cash


In [12]:
# Ratings restricted to (user, restaurant) pairs that share a payment method.
#
# A pair sharing several methods (e.g. both "cash" and "VISA") survives the
# payment filter once per shared method, so after the payment columns are
# dropped the same rating would appear several times and be over-weighted in
# the RMSE. distinct() keeps each rating row once.
#
# NOTE(review): these rows come from the full `ratings` table, so they include
# ratings that were part of `train`; the metric computed on them is therefore
# not a pure held-out score. This assignment also replaces the earlier `test`
# split from randomSplit -- confirm both are intended.
test = user_resto_rating_j.select(
  "userID",
  "placeID",
  "rating",
  "food_rating",
  "service_rating"
).distinct()
display(test)

userID,placeID,rating,food_rating,service_rating
1041,135109,1,2,1
1051,135109,1,1,1
1020,135109,2,2,1
1030,135109,0,0,0
1002,135106,1,1,1
1078,135106,2,2,2
1135,135106,0,0,0
1101,135106,0,0,0
1016,135106,2,2,2
1106,135106,0,0,0


In [13]:
# Score the payment-matched subset (the `test` built in the previous cell) with
# the same cross-validated model and report the evaluator's metric (RMSE).
predictions = model.transform(test)
evaluator.evaluate(predictions)