<a href="https://colab.research.google.com/github/tbazzi/Data-Science---Fullstack-Bootcamp/blob/master/S9_5_RecommandationYelp_V2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 0. Chargement des données

In [0]:
# Register the storage-account access key so Spark can read the datasets over
# wasbs://. The key is read from the environment instead of being written in
# the notebook: a key stored in a shared or committed notebook is visible to
# every reader. On Databricks a secret scope (dbutils.secrets.get) is an
# equivalent option.
# NOTE(review): the key that used to be inlined here should be rotated.
import os

spark.conf.set(
  "fs.azure.account.key.storagestudent.blob.core.windows.net",
  os.environ["AZURE_STORAGE_ACCOUNT_KEY"]  # KeyError if the variable is not set
)

In [0]:
# Build a {file base name: DataFrame} mapping, one entry per CSV file of the
# restaurant / consumer-ratings dataset. Each file is read with its header row
# as column names; no schema option is passed.
datasets = dict(
  (
    name,
    spark.read
      .format("csv")
      .option("header", "true")
      .load(
        "wasbs://default@storagestudent.blob.core.windows.net/datasets/S8-5/Exo/restaurant-data-with-consumer-ratings/{0}.csv".format(name)
      ),
  )
  for name in (
    "chefmozaccepts",
    "chefmozcuisine",
    "chefmozhours4",
    "chefmozparking",
    "geoplaces2",
    "rating_final",
    "usercuisine",
    "userpayment",
    "userprofile",
  )
)

In [0]:
# Show the dataset-name -> DataFrame mapping as the cell output.
datasets

In [0]:
# Work on the ratings table (one row per user/place pair with its ratings).
rating = datasets["rating_final"]

In [0]:
# Preview the raw ratings table with the notebook's `display` helper.
display(rating)

userID,placeID,rating,food_rating,service_rating
U1077,135085,2,2,2
U1077,135038,2,2,1
U1077,132825,2,2,2
U1077,135060,1,2,2
U1068,135104,1,1,2
U1068,132740,0,0,0
U1068,132663,1,1,1
U1068,132732,0,0,0
U1068,132630,1,1,1
U1067,132584,2,2,2


# 1. Preprocessing

In [0]:
from pyspark.ml.feature import StringIndexer

# Encode the string user ids as numeric indices in a new "userIdIndex" column
# (the ALS model below is configured with userCol="userIdIndex").
#
# Both fit and transform read the untouched source table rather than `rating`
# itself, which keeps the cell re-runnable: once `rating` already carries
# "userIdIndex" (or, after the next cell, no longer has "userID"), indexing
# `rating` again would raise.
rating_raw = datasets["rating_final"]

userIdIndexer = StringIndexer(inputCol="userID", outputCol="userIdIndex").fit(rating_raw)

rating = userIdIndexer.transform(rating_raw)

display(rating)

userID,placeID,rating,food_rating,service_rating,userIdIndex
U1077,135085,2,2,2,112.0
U1077,135038,2,2,1,112.0
U1077,132825,2,2,2,112.0
U1077,135060,1,2,2,112.0
U1068,135104,1,1,2,81.0
U1068,132740,0,0,0,81.0
U1068,132663,1,1,1,81.0
U1068,132732,0,0,0,81.0
U1068,132630,1,1,1,81.0
U1067,132584,2,2,2,99.0


In [0]:
from pyspark.sql.types import *
from pyspark.sql import functions as F

# Narrow the table to the three columns used for training. The user index is
# passed through unchanged; the place id and the rating are cast to integers.
rating = rating.select(
  F.col("userIdIndex"),
  *[F.col(name).cast(IntegerType()) for name in ("placeID", "rating")]
)

display(rating)

userIdIndex,placeID,rating
112.0,135085,2
112.0,135038,2
112.0,132825,2
112.0,135060,1
81.0,135104,1
81.0,132740,0
81.0,132663,1
81.0,132732,0
81.0,132630,1
99.0,132584,2


# 2. Entraînement

In [0]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import BinaryClassificationEvaluator, RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Hold out roughly 20% of the ratings for the final evaluation. The split is
# seeded: without a seed Spark draws a new random one on every call, so the
# train/test rows - and every metric computed below - would change from run to
# run.
train, test = rating.randomSplit([0.8, 0.2], seed=42)

In [0]:
# ALS collaborative filtering on (userIdIndex, placeID, rating).
# - coldStartStrategy="drop" removes rows whose prediction is NaN (user or
#   place absent from the data the model was fitted on) so the RMSE stays
#   defined.
# - nonnegative=True constrains the factorisation to non-negative factors.
# - seed fixes the random initialisation so that reruns fit the same model.
als = ALS(
  userCol="userIdIndex",
  itemCol="placeID",
  ratingCol="rating",
  coldStartStrategy="drop",
  nonnegative=True,
  seed=42
)

# Root-mean-square error between the observed rating and the ALS prediction.
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

# 3 x 3 grid: every combination of latent rank and number of ALS iterations.
paramGrid = (
  ParamGridBuilder()
  .addGrid(als.rank, [1, 5, 10])
  .addGrid(als.maxIter, [1, 5, 10])
  .build()
)

# 5-fold cross-validation over the grid. Seeded so that the fold assignment,
# and therefore the selected parameters, are reproducible.
cv = CrossValidator(
  estimator=als,
  evaluator=evaluator,
  estimatorParamMaps=paramGrid,
  numFolds=5,
  seed=42
)

In [0]:
# Run the cross-validated grid search on the training split only; the test
# split is kept aside for the evaluation section.
model = cv.fit(train)

In [0]:
bestModel = model.bestModel

# Recover the winning hyper-parameters from the cross-validation results
# rather than from bestModel.extractParamMap(): the fitted ALS model's param
# map does not carry the estimator's "rank" / "maxIter" params, so filtering it
# on those names produces an empty dict.
# avgMetrics[i] is the mean fold metric obtained with getEstimatorParamMaps()[i].
avg_metrics = model.avgMetrics
larger_is_better = evaluator.isLargerBetter()

# Explicit arg-best loop (first best wins on ties). Built-ins such as min/max
# are avoided on purpose: a later cell star-imports pyspark.sql.functions,
# which shadows them if this cell is re-run afterwards.
best_index = 0
for index, metric in enumerate(avg_metrics):
  if larger_is_better:
    is_better = metric > avg_metrics[best_index]
  else:
    is_better = metric < avg_metrics[best_index]
  if is_better:
    best_index = index

bestModelParameters = {
  key.name: value
  for key, value in model.getEstimatorParamMaps()[best_index].items()
  if key.name in ["rank", "maxIter"]
}

In [0]:
# Show the hyper-parameter values retained for the best model.
print(bestModelParameters)

# 3. Évaluation et analyse

In [0]:
# Score the held-out split with the cross-validated model; the result carries
# an extra "prediction" column next to the observed rating.
predictions_test = model.transform(test)

In [0]:
# Preview observed ratings next to the model predictions on the test split.
display(predictions_test)

userIdIndex,placeID,rating,prediction
122.0,135000,0,1.3397467
48.0,135027,2,1.0132016
107.0,135027,0,0.63504094
47.0,135066,1,2.0617023
45.0,135066,1,0.8988468
134.0,132663,0,0.0
66.0,135108,0,0.0
35.0,135071,1,1.0277802
38.0,135071,1,0.7917879
83.0,135071,0,0.0


# 4. Post-processing

In [0]:
from pyspark.ml.feature import IndexToString

# Map the numeric user index back to the original string id, then drop the
# index column. The labels are passed explicitly from the fitted StringIndexer
# instead of relying on the column metadata having survived the select, the
# split and the ALS transform applied since indexing.
predictions_test = (
  IndexToString(
    inputCol="userIdIndex",
    outputCol="userID",
    labels=userIdIndexer.labels
  )
  .transform(predictions_test)
  .drop("userIdIndex")
)

In [0]:
# Payment methods declared by users and payment methods accepted by places.
userpayment, chefmozaccepts = (
  datasets[name] for name in ('userpayment', 'chefmozaccepts')
)

In [0]:
# Distinct (userID, placeID) pairs for which the place accepts at least one of
# the user's payment methods.
payment_matching = (
  userpayment
  .join(
    chefmozaccepts,
    userpayment['Upayment'] == chefmozaccepts['Rpayment'],
    how='inner'
  )
  .select("userID", "placeID")
  .distinct()
)

In [0]:
# Preview the payment-compatible (userID, placeID) pairs.
display(payment_matching)

userID,placeID
U1081,135110
U1006,135107
U1042,135107
U1029,135105
U1070,135104
U1102,135103
U1034,135103
U1027,135100
U1104,135099
U1086,135094


In [0]:
# Number of scored test rows before any compatibility filter is applied.
predictions_test.count()

In [0]:
# Keep only the test predictions whose (userID, placeID) pair is
# payment-compatible, then report how many rows remain.
predictions_test_filtered = predictions_test.join(payment_matching, on=["userID", "placeID"], how='inner')

predictions_test_filtered.count()

In [0]:
# Preview the predictions that passed the payment filter.
display(predictions_test_filtered)

userID,placeID,rating,prediction
U1072,135000,0,1.3397467
U1132,135027,2,1.0132016
U1076,135027,0,0.63504094
U1058,135066,1,2.0617023
U1018,135066,1,0.8988468
U1037,135071,1,1.0277802
U1124,135071,1,0.7917879
U1094,135071,0,0.0
U1096,132723,1,1.4948902
U1137,132723,2,1.8555685


In [0]:
# Evaluator metric (configured as RMSE) on the payment-filtered predictions.
evaluator.evaluate(predictions_test_filtered)

In [0]:
# Ask the evaluator whether a larger value of its metric means a better model,
# i.e. in which direction the scores above should be read.
evaluator.isLargerBetter()

In [0]:
# Cuisine preferences of users and cuisines served by places.
usercuisine, restocuisine = (
  datasets[name] for name in ("usercuisine", "chefmozcuisine")
)

In [0]:
# Distinct (userID, placeID) pairs for which the place serves at least one
# cuisine the user declared; both tables are matched on their "Rcuisine"
# column.
cuisine_matching = (
  usercuisine
  .join(restocuisine, on='Rcuisine', how='inner')
  .select("userID", "placeID")
  .distinct()
)

In [0]:
# Narrow the payment-filtered predictions further to cuisine-compatible pairs
# (so both conditions hold), then report how many rows remain.
predictions_test_filtered2 = predictions_test_filtered.join(cuisine_matching, on=["userID", "placeID"], how='inner')

predictions_test_filtered2.count()

In [0]:
# Preview the predictions that passed both the payment and cuisine filters.
display(predictions_test_filtered2)

userID,placeID,rating,prediction
U1132,135027,2,1.0132016
U1076,135027,0,0.63504094
U1096,132723,1,1.4948902
U1137,132723,2,1.8555685
U1114,132723,0,0.26690057
U1026,132706,0,2.3614423
U1067,132732,1,0.76187825
U1135,132856,0,0.0
U1028,132740,1,0.7672801
U1060,132740,1,1.1700937


In [0]:
# Evaluator metric (configured as RMSE) on the payment + cuisine filtered rows.
evaluator.evaluate(predictions_test_filtered2)

In [0]:
# Descriptive attributes of places ("geoplaces2") and of users ("userprofile").
restoprofile, userprofile = (
  datasets[name] for name in ("geoplaces2", "userprofile")
)

In [0]:
# Preview the raw place attributes before feature encoding.
display(restoprofile)

placeID,latitude,longitude,the_geom_meter,name,address,city,state,country,fax,zip,alcohol,smoking_area,dress_code,accessibility,price,url,Rambience,franchise,area,other_services
134999,18.915421,-99.184871,0101000020957F000088568DE356715AC138C0A525FC464A41,Kiku Cuernavaca,Revolucion,Cuernavaca,Morelos,Mexico,?,?,No_Alcohol_Served,none,informal,no_accessibility,medium,kikucuernavaca.com.mx,familiar,f,closed,none
132825,22.1473922,-100.983092,0101000020957F00001AD016568C4858C1243261274BA54B41,puesto de tacos,esquina santos degollado y leon guzman,s.l.p.,s.l.p.,mexico,?,78280,No_Alcohol_Served,none,informal,completely,low,?,familiar,f,open,none
135106,22.1497088,-100.9760928,0101000020957F0000649D6F21634858C119AE9BF528A34B41,El Rinc�n de San Francisco,Universidad 169,San Luis Potosi,San Luis Potosi,Mexico,?,78000,Wine-Beer,only at bar,informal,partially,medium,?,familiar,f,open,none
132667,23.7526973,-99.1633594,0101000020957F00005D67BCDDED8157C1222A2DC8D84D4941,little pizza Emilio Portes Gil,calle emilio portes gil,victoria,tamaulipas,?,?,?,No_Alcohol_Served,none,informal,completely,low,?,familiar,t,closed,none
132613,23.7529035,-99.165076,0101000020957F00008EBA2D06DC8157C194E03B7B504E4941,carnitas_mata,lic. Emilio portes gil,victoria,Tamaulipas,Mexico,?,?,No_Alcohol_Served,permitted,informal,completely,medium,?,familiar,t,closed,none
135040,22.135617,-100.969709,0101000020957F00001B552189B84A58C15A2AAEFD2CA24B41,Restaurant los Compadres,Camino a Simon Diaz 155 Centro,San Luis Potosi,SLP,Mexico,?,74000,Wine-Beer,none,informal,no_accessibility,high,?,familiar,f,closed,none
132732,23.7543569,-99.171288,0101000020957F00008A20E615808157C16272FECBF84F4941,Taqueria EL amigo,Calle Mezquite Fracc Framboyanes,Cd Victoria,Tamaulipas,Mexico,?,87018,No_Alcohol_Served,none,casual,completely,low,?,familiar,f,open,none
132875,22.1499013,-100.9937793,0101000020957F00008A2A0747DE4758C11EB31D2A31A84B41,shi ro ie,?,?,?,?,?,?,Wine-Beer,section,informal,no_accessibility,high,?,familiar,t,open,Internet
132609,23.7602683,-99.1658646,0101000020957F0000A478418BBA8057C133851EB22C4E4941,Pollo_Frito_Buenos_Aires,tampico,victoria,Tamaulipas,Mexico,?,?,No_Alcohol_Served,not permitted,informal,completely,low,?,quiet,t,closed,none
135082,22.151448,-100.915099,0101000020957F0000A29FAF95CD4958C1FEEEBB73A9914B41,la Estrella de Dimas,Villa de Pozos 192 Villa de Pozos,San Luis Potosi,SLP,Mexico,?,78421,No_Alcohol_Served,none,informal,no_accessibility,medium,?,familiar,f,closed,none


In [0]:
from  pyspark.sql import functions as F
# Encode the place attributes that are later matched against user profiles.
#
# The selection reads the untouched source table rather than `restoprofile`
# itself, so the cell can be re-run: selecting from the already-encoded frame
# would fail because "smoking_area" and "Rambience" no longer exist in it.
#
# - latitude / longitude: absolute value, rounded to 2 decimals.
# - alcohol:  0 when "No_Alcohol_Served", 1 otherwise.
# - smoker:   0 when smoking is "not permitted", 1 otherwise.
#   NOTE(review): every other value, including "none", is encoded as 1 -
#   confirm that this is the intended reading of "none".
# - price:    kept as the raw category.
# - ambience: 0 when "quiet", 1 otherwise.
# The dress code is not used as a matching feature.
restoprofile = datasets["geoplaces2"].select(
  F.col("placeID"),
  F.round(F.abs(F.col("latitude").cast(FloatType())), 2).alias("latitude"),
  F.round(F.abs(F.col("longitude").cast(FloatType())), 2).alias("longitude"),
  F.when(F.col("alcohol") == "No_Alcohol_Served", 0).otherwise(1).alias("alcohol"),
  F.when(F.col("smoking_area") == "not permitted", 0).otherwise(1).alias("smoker"),
  F.col("price"),
  F.when(F.col("Rambience") == "quiet", 0).otherwise(1).alias("ambience")
)

In [0]:
# Preview the encoded place features.
display(restoprofile)

placeID,latitude,longitude,alcohol,smoker,price,ambience
134999,18.92,99.18,0,1,medium,1
132825,22.15,100.98,0,1,low,1
135106,22.15,100.98,1,1,medium,1
132667,23.75,99.16,0,1,low,1
132613,23.75,99.17,0,1,medium,1
135040,22.14,100.97,1,1,high,1
132732,23.75,99.17,0,1,low,1
132875,22.15,100.99,1,1,high,1
132609,23.76,99.17,0,0,low,0
135082,22.15,100.92,0,1,medium,1


In [0]:
# Preview the raw user attributes before feature encoding.
display(userprofile)

userID,latitude,longitude,smoker,drink_level,dress_preference,ambience,transport,marital_status,hijos,birth_year,interest,personality,religion,activity,color,weight,budget,height
U1001,22.139997,-100.978803,false,abstemious,informal,family,on foot,single,independent,1989,variety,thrifty-protector,none,student,black,69,medium,1.77
U1002,22.150087,-100.983325,false,abstemious,informal,family,public,single,independent,1990,technology,hunter-ostentatious,Catholic,student,red,40,low,1.87
U1003,22.119847,-100.946527,false,social drinker,formal,family,public,single,independent,1989,none,hard-worker,Catholic,student,blue,60,low,1.69
U1004,18.867,-99.183,false,abstemious,informal,family,public,single,independent,1940,variety,hard-worker,none,professional,green,44,medium,1.53
U1005,22.183477,-100.959891,false,abstemious,no preference,family,public,single,independent,1992,none,thrifty-protector,Catholic,student,black,65,medium,1.69
U1006,22.15,-100.983,true,social drinker,no preference,friends,car owner,single,independent,1989,variety,hard-worker,none,student,blue,75,medium,1.8
U1007,22.118464,-100.938256,false,casual drinker,informal,solitary,public,single,independent,1989,variety,thrifty-protector,Catholic,student,purple,60,low,1.59
U1008,22.122989,-100.923811,false,social drinker,formal,solitary,public,single,independent,1989,technology,hard-worker,Catholic,student,green,68,low,1.72
U1009,22.159427,-100.990448,false,abstemious,formal,family,on foot,single,kids,1991,variety,thrifty-protector,Catholic,student,green,75,medium,1.78
U1010,22.190889,-100.998669,false,social drinker,no preference,friends,car owner,married,kids,1987,technology,hard-worker,none,student,green,40,medium,1.67


In [0]:
# Encode the user attributes with the same column names and value conventions
# as the place features, so the two tables can be joined on them.
#
# The selection reads the untouched source table rather than `userprofile`
# itself, so the cell can be re-run: selecting from the already-encoded frame
# would fail because "drink_level" and "budget" no longer exist in it.
#
# - latitude / longitude: absolute value, rounded to 2 decimals.
# - smoker:   0 when "false", 1 otherwise.
# - alcohol:  0 when the drink level is "abstemious", 1 otherwise.
# - ambience: 0 when "solitary", 1 otherwise.
# - price:    the user's budget, with the placeholder "?" replaced by "medium".
# The dress preference is not used as a matching feature.
userprofile = datasets["userprofile"].select(
  F.col("userID"),
  F.round(F.abs(F.col("latitude").cast(FloatType())), 2).alias("latitude"),
  F.round(F.abs(F.col("longitude").cast(FloatType())), 2).alias("longitude"),
  F.when(F.col("smoker") == "false", 0).otherwise(1).alias("smoker"),
  F.when(F.col("drink_level") == "abstemious", 0).otherwise(1).alias("alcohol"),
  F.when(F.col("ambience") == "solitary", 0).otherwise(1).alias("ambience"),
  F.when(F.col("budget") == "?", "medium").otherwise(F.col("budget")).alias("price")
)

In [0]:
# Preview the encoded user features.
display(userprofile)

userID,latitude,longitude,smoker,alcohol,ambience,price
U1001,22.14,100.98,0,0,1,medium
U1002,22.15,100.98,0,0,1,low
U1003,22.12,100.95,0,1,1,low
U1004,18.87,99.18,0,0,1,medium
U1005,22.18,100.96,0,0,1,medium
U1006,22.15,100.98,1,1,1,medium
U1007,22.12,100.94,0,1,0,low
U1008,22.12,100.92,0,1,0,low
U1009,22.16,100.99,0,0,1,medium
U1010,22.19,101.0,0,1,1,medium


In [0]:
from  pyspark.sql.functions import *
# Distinct (userID, placeID) pairs whose encoded profiles agree on every
# feature, rounded coordinates included.
profile_matching = (
  restoprofile
  .join(
    userprofile,
    on=['latitude', 'longitude', 'smoker', 'alcohol', 'ambience', 'price'],
    how='inner'
  )
  .select("userID", "placeID")
  .distinct()
)

# Restrict the payment-filtered predictions to those pairs and count them.
predictions_test_filtered3 = predictions_test_filtered.join(profile_matching, on=["userID", "placeID"], how='inner')

predictions_test_filtered3.count()


In [0]:
# Preview the (userID, placeID) pairs with fully matching profiles.
display(profile_matching)

userID,placeID
U1099,135076
U1082,132613
U1024,135062
U1123,132584
U1013,132872
U1015,135071
U1104,135106
U1006,135041
U1019,132925
U1014,132584


In [0]:
# Preview the predictions kept by the payment + full-profile filter.
display(predictions_test_filtered3)

userID,placeID,rating,prediction
U1026,132706,0,2.3614423
U1032,132872,1,0.7341482
U1082,132613,0,0.0
U1099,135076,1,1.1364857


In [0]:
# Evaluator metric (configured as RMSE) on the payment + full-profile rows.
evaluator.evaluate(predictions_test_filtered3)

In [0]:
# Same profile match as before but without the coordinates: pairs only need to
# agree on smoker, alcohol, ambience and price.
profile_matching2 = (
  restoprofile
  .join(
    userprofile,
    on=['smoker', 'alcohol', 'ambience', 'price'],
    how='inner'
  )
  .select("userID", "placeID")
  .distinct()
)

# Restrict the payment-filtered predictions to those pairs.
predictions_test_filtered4 = predictions_test_filtered.join(profile_matching2, on=["userID", "placeID"], how='inner')



In [0]:

# Preview the predictions kept by the payment + location-free profile filter.
display(predictions_test_filtered4)

userID,placeID,rating,prediction
U1026,132706,0,2.3614423
U1097,132954,1,1.8988886
U1095,132872,1,1.0832087
U1032,132872,1,0.7341482
U1097,135086,2,1.0537778
U1063,135104,1,1.2146893
U1082,132613,0,0.0
U1035,135018,1,1.9549072
U1115,135069,1,0.51389927
U1111,132854,2,1.2930962


In [0]:
# Number of test predictions left after the location-free profile filter.
predictions_test_filtered4.count()

In [0]:

# Evaluator metric (configured as RMSE) on the location-free profile rows.
evaluator.evaluate(predictions_test_filtered4)