# Recommend Products using SparkML

This notebook is written in **Python** so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` syntax. Python, Scala, SQL, and R are all supported.

To run this notebook, you will need to register for a free Databricks
[Community Edition account](https://community.cloud.databricks.com/).

## Import modules

In [3]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
import pyspark.sql.functions as F

from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

## Design Schema

In [5]:
# Spark DDL-formatted schema strings (back-quoted column name followed by its SQL type).
# rate_schema is applied when reading rating.csv and accos_schema when reading accommodation.csv.
rate_schema = "`userid` string, `accoid` string, `rating` INT"
accos_schema = "`id` string, `title` string, `location` string, `price` int, `rooms` int, `rating` float, `type` string"
# Fixed: the type of `accoid` was misspelled "srting", which is not a valid Spark SQL type
# and would fail to parse as soon as this schema string is used.
recommend_schema = "`userid` string, `accoid` string, `prediction` float"

## Read CSV files into Spark DataFrames

In [7]:
# Load the ratings CSV with the explicit schema declared above (no type inference).
rating_file_location = '/FileStore/tables/rating.csv'
dfRates = spark.read.schema(rate_schema).csv(rating_file_location)

# Sanity check: print the total number of ratings, then preview the first 10 rows.
print(dfRates.count())
display(dfRates.take(10))

userid,accoid,rating
10,1,1
18,1,2
13,1,1
7,2,2
4,2,2
13,2,3
19,2,2
12,2,1
11,2,1
1,2,2


In [8]:
# Print the number of rows in the ratings DataFrame.
# NOTE(review): the loading cell already prints this same count; this cell looks redundant.
print(dfRates.count())

In [9]:
# Load the accommodations CSV with the explicit schema declared above (no type inference).
accos_file_location = "/FileStore/tables/accommodation.csv"
dfAccos = spark.read.schema(accos_schema).csv(accos_file_location)

# Sanity check: print the total number of accommodations, then preview the first 10 rows.
print(dfAccos.count())
display(dfAccos.take(10))

id,title,location,price,rooms,rating,type
1,Comfy Quiet Chalet,Vancouver,50,3,3.0999999046325684,cottage
2,Cozy Calm Hut,London,65,2,4.099999904632568,cottage
3,Agreable Calm Place,London,65,4,4.800000190734863,house
4,Colossal Quiet Chateau,Paris,3400,16,2.700000047683716,castle
5,Homy Quiet Shack,Paris,50,1,1.100000023841858,cottage
6,Pleasant Quiet Place,Dublin,35,5,4.300000190734863,house
7,Vast Peaceful Fortress,Seattle,3200,24,1.899999976158142,castle
8,Giant Quiet Fortress,San Francisco,3400,12,4.099999904632568,castle
9,Giant Peaceful Palace,London,1500,20,3.5,castle
10,Sizable Calm Country House,Auckland,650,9,4.900000095367432,mansion


## Aggregations

In [11]:
# Whole-table summary of the ratings (a single output row, no grouping keys):
# number of ratings, number of distinct raters, and max / min / mean rating.
df_agg = dfRates.groupBy().agg(
    F.count('userid').alias('num_ratings'),
    F.countDistinct('userid').alias('distinct_users_rating'),
    F.max('rating').alias('best_rating'),
    F.min('rating').alias('worst_rating'),
    F.avg('rating').alias('avg_rating'),
)
display(df_agg)

num_ratings,distinct_users_rating,best_rating,worst_rating,avg_rating
1187,25,5,1,2.4667228306655438


## Left Join

In [13]:
# Left join: keep every accommodation, attaching its user ratings where they exist.
# The accommodation's own 'rating' column is renamed to 'ratings' first so that it
# does not collide with the 'rating' column coming from dfRates.
df_leftjoined = (
    dfAccos.withColumnRenamed('rating', 'ratings')
           .join(dfRates, on=(dfAccos.id == dfRates.accoid), how='left')
)
print(df_leftjoined.count())
display(df_leftjoined.take(10))

id,title,location,price,rooms,ratings,type,userid,accoid,rating
1,Comfy Quiet Chalet,Vancouver,50,3,3.0999999046325684,cottage,13,1,1
1,Comfy Quiet Chalet,Vancouver,50,3,3.0999999046325684,cottage,18,1,2
1,Comfy Quiet Chalet,Vancouver,50,3,3.0999999046325684,cottage,10,1,1
2,Cozy Calm Hut,London,65,2,4.099999904632568,cottage,3,2,1
2,Cozy Calm Hut,London,65,2,4.099999904632568,cottage,2,2,4
2,Cozy Calm Hut,London,65,2,4.099999904632568,cottage,20,2,2
2,Cozy Calm Hut,London,65,2,4.099999904632568,cottage,1,2,2
2,Cozy Calm Hut,London,65,2,4.099999904632568,cottage,11,2,1
2,Cozy Calm Hut,London,65,2,4.099999904632568,cottage,12,2,1
2,Cozy Calm Hut,London,65,2,4.099999904632568,cottage,19,2,2


In [14]:
# Left-join rows for accommodation id '100' (all columns kept).
display(df_leftjoined.filter(df_leftjoined.id == '100'))

id,title,location,price,rooms,ratings,type,userid,accoid,rating
100,Villa Staden,Stockholm,80000,4,4.2,house,,,


In [15]:
# Left-join rows for accommodation id '101' (all columns kept).
display(df_leftjoined.filter(df_leftjoined.id == '101'))

id,title,location,price,rooms,ratings,userid,accoid,rating


## Right Join

In [17]:
# Right join: keep every rating, attaching the accommodation details where they exist.
# The accommodation's own 'rating' column is renamed to 'ratings' first so that it
# does not collide with the 'rating' column coming from dfRates.
df_rightjoined = (
    dfAccos.withColumnRenamed('rating', 'ratings')
           .join(dfRates, on=(dfAccos.id == dfRates.accoid), how='right')
)
print(df_rightjoined.count())
display(df_rightjoined.take(10))

id,title,location,price,rooms,ratings,type,userid,accoid,rating
1,Comfy Quiet Chalet,Vancouver,50,3,3.0999999046325684,cottage,10,1,1
1,Comfy Quiet Chalet,Vancouver,50,3,3.0999999046325684,cottage,18,1,2
1,Comfy Quiet Chalet,Vancouver,50,3,3.0999999046325684,cottage,13,1,1
2,Cozy Calm Hut,London,65,2,4.099999904632568,cottage,7,2,2
2,Cozy Calm Hut,London,65,2,4.099999904632568,cottage,4,2,2
2,Cozy Calm Hut,London,65,2,4.099999904632568,cottage,13,2,3
2,Cozy Calm Hut,London,65,2,4.099999904632568,cottage,19,2,2
2,Cozy Calm Hut,London,65,2,4.099999904632568,cottage,12,2,1
2,Cozy Calm Hut,London,65,2,4.099999904632568,cottage,11,2,1
2,Cozy Calm Hut,London,65,2,4.099999904632568,cottage,1,2,2


In [18]:
# Right-join rows whose accommodation id is '100' (all columns kept).
display(df_rightjoined.filter(df_rightjoined.id == '100'))

id,title,location,price,rooms,ratings,userid,accoid,rating


In [19]:
# Right-join rows whose rated accommodation id (accoid) is '101' (all columns kept).
display(df_rightjoined.filter(df_rightjoined.accoid == '101'))

id,title,location,price,rooms,ratings,type,userid,accoid,rating
,,,,,,,23,101,5


## Inner Join

In [21]:
# Inner join: keep only accommodation/rating pairs that match on id == accoid.
# The accommodation's own 'rating' column is renamed to 'ratings' first so that it
# does not collide with the 'rating' column coming from dfRates.
df_innerjoined = (
    dfAccos.withColumnRenamed('rating', 'ratings')
           .join(dfRates, on=(dfAccos.id == dfRates.accoid), how='inner')
)
print(df_innerjoined.count())
display(df_innerjoined.take(10))

id,title,location,price,rooms,ratings,type,userid,accoid,rating
1,Comfy Quiet Chalet,Vancouver,50,3,3.0999999046325684,cottage,10,1,1
1,Comfy Quiet Chalet,Vancouver,50,3,3.0999999046325684,cottage,18,1,2
1,Comfy Quiet Chalet,Vancouver,50,3,3.0999999046325684,cottage,13,1,1
2,Cozy Calm Hut,London,65,2,4.099999904632568,cottage,7,2,2
2,Cozy Calm Hut,London,65,2,4.099999904632568,cottage,4,2,2
2,Cozy Calm Hut,London,65,2,4.099999904632568,cottage,13,2,3
2,Cozy Calm Hut,London,65,2,4.099999904632568,cottage,19,2,2
2,Cozy Calm Hut,London,65,2,4.099999904632568,cottage,12,2,1
2,Cozy Calm Hut,London,65,2,4.099999904632568,cottage,11,2,1
2,Cozy Calm Hut,London,65,2,4.099999904632568,cottage,1,2,2


In [22]:
# Inner-join rows whose accommodation id is '100' (all columns kept).
display(df_innerjoined.filter(df_innerjoined.id == '100'))

id,title,location,price,rooms,ratings,userid,accoid,rating


In [23]:
# Inner-join rows whose rated accommodation id (accoid) is '101' (all columns kept).
display(df_innerjoined.filter(df_innerjoined.accoid == '101'))

id,title,location,price,rooms,ratings,userid,accoid,rating


## Train the model and recommend products with the model

### Train the model

In [26]:
# Train an explicit-feedback ALS matrix-factorization model on the
# (userid, accoid, rating) rows of dfRates.
# rank=20 latent factors and iterations=20 are the values originally passed positionally.
# A fixed seed is supplied so that "Restart & Run All" reproduces the same model;
# without it the factor initialisation is random and predictions change between runs.
# NOTE(review): userid/accoid are read as strings; this presumably relies on ALS
# converting them to the integer ids its Rating type expects - confirm all ids are numeric.
model = ALS.train(dfRates.rdd, rank=20, iterations=20, seed=42)
print(type(model))

### Save the model

In [28]:
# Persist the trained model under the given path.
# NOTE(review): `sc` is not defined in this notebook - presumably the SparkContext
# supplied by the Databricks runtime; confirm.
# NOTE(review): re-running this cell presumably fails if the target path already
# exists (the "_v3" suffix suggests the name was bumped by hand) - confirm.
model.save(sc,'/FileStore/model_recommendations_v3.ml' )

### Use the trained model to predict which accommodations each user might be interested in

In [30]:
# Build one flat list of (userid, accoid, prediction) tuples holding the top-5
# predicted accommodations for every candidate user id in 0..99.
# The accumulator starts as an empty list, so no None sentinel or `== None` check is needed.
allPredictions = []
for USER_ID in range(0, 100):
  # Ids of accommodations this user has already rated; kept as a set so that the
  # membership test inside the filter below is O(1) per accommodation.
  dfUserRatings = set(dfRates.filter(dfRates.userid == USER_ID).rdd.map(lambda r: r.accoid).collect())
  # Candidate accommodations = those the user has not rated yet (x[0] is the accommodation id).
  rddPotential  = dfAccos.rdd.filter(lambda x: x[0] not in dfUserRatings)
  pairsPotential = rddPotential.map(lambda x: (USER_ID, x[0]))
  # Score every (user, accommodation) pair and normalise to (str, str, float) tuples.
  predictions = model.predictAll(pairsPotential).map(lambda p: (str(p[0]), str(p[1]), float(p[2])))
  # Negating the score makes takeOrdered (ascending) return the 5 highest predictions.
  predictions = predictions.takeOrdered(5, key=lambda x: -x[2]) # top 5
  print("predicted for user={0}".format(USER_ID))
  allPredictions.extend(predictions)

### 5 Recommendations for every user

In [32]:
# Show the accumulated list of (userid, accoid, prediction) tuples built by the loop above.
allPredictions

### Prediction for user '1'

In [34]:
# Ids of the accommodations that user '1' has already rated.
dfUserRatings = dfRates.where(dfRates['userid'] == '1').rdd.map(lambda rating_row: rating_row['accoid']).collect()
# Candidate accommodations: everything user '1' has not rated yet.
rddPotential  = dfAccos.rdd.filter(lambda acco_row: acco_row['id'] not in dfUserRatings)
# (user, accommodation) pairs to be scored by the model.
pairsPotential = rddPotential.map(lambda acco_row: ('1', acco_row['id']))

In [35]:
# Materialise and show the ('1', accommodation id) pairs that will be scored.
pairsPotential.collect()

In [36]:
# Score each candidate pair and normalise every result to a (str, str, float) tuple
# of (user id, accommodation id, predicted rating).
predictions = model.predictAll(pairsPotential).map(
    lambda scored: (str(scored[0]), str(scored[1]), float(scored[2]))
)

In [37]:
# Take the 5 tuples with the highest predicted rating (index 2); negating the key
# turns takeOrdered's ascending order into a descending one.
predictions_top5 = predictions.takeOrdered(5, key=lambda x: -x[2]) # top 5

In [38]:
# Show the top-5 (userid, accoid, prediction) tuples for user '1'.
predictions_top5

### The top five accommodations are recommended to user '1'. Note that the quality of the recommendations is not great because the dataset is so small (the predicted ratings are not very high). Still, this lab illustrates the process you would go through to create product recommendations.

In [40]:
# All predictions for user '1', ordered from highest to lowest predicted rating.
predictions.sortBy(keyfunc=lambda scored: scored[2], ascending=False).collect()

### Create a DataFrame named dfRecommendation

In [42]:
# Turn the list of (userid, accoid, prediction) tuples into a DataFrame;
# only column names are supplied, so the column types are inferred from the data.
dfRecommendation = spark.createDataFrame(
    allPredictions,
    schema=['userid', 'accoid', 'prediction'],
)
display(dfRecommendation.take(10))

userid,accoid,prediction
0,76,3.634036205302968
0,66,3.5387555809515
0,75,3.5161224476648
0,49,3.4598124917225226
0,39,3.452309519311184
1,85,2.794932087521952
1,18,2.231116724409522
1,43,1.993245523585799
1,6,1.839350063497429
1,38,1.8062653885433055


### Find the recommendations for a specified user

#### dfRecommendation.accoid and dfAccos.id hold the same value in the joined DataFrame, so one of the two columns can be dropped

In [45]:
# Enrich each recommendation with the details of the recommended accommodation.
# After the join, dfAccos.id duplicates dfRecommendation.accoid, so it is dropped.
df_joined_rec_acco = (
    dfRecommendation
    .join(dfAccos, on=(dfRecommendation.accoid == dfAccos.id), how='inner')
    .drop(dfAccos.id)
)
display(df_joined_rec_acco.take(10))

userid,accoid,prediction,title,location,price,rooms,rating,type
0,76,3.634036205302968,Pleasant Calm Villa,Berlin,30,2,2.400000095367432,house
0,66,3.5387555809515,Beautiful Private Villa,London,80,2,2.400000095367432,house
0,75,3.5161224476648,Large Private Place,Berlin,50,4,3.5999999046325684,house
0,49,3.4598124917225226,Big Private Villa,NYC,90,2,4.800000190734863,house
0,39,3.452309519311184,Beautiful Calm Villa,Vancouver,50,3,3.5,house
1,85,2.794932087521952,Nice Private Shack,Auckland,55,1,4.900000095367432,cottage
1,18,2.231116724409522,Big Peaceful Hut,Melbourne,60,2,2.400000095367432,cottage
1,43,1.993245523585799,Nice Private Hut,Melbourne,60,3,2.799999952316284,cottage
1,6,1.839350063497429,Pleasant Quiet Place,Dublin,35,5,4.300000190734863,house
1,38,1.8062653885433055,Big Private House,San Francisco,70,4,2.900000095367432,house


In [46]:
# Recommendations, with accommodation details, for user '10' (all columns kept).
display(df_joined_rec_acco.filter(df_joined_rec_acco.userid == '10'))

userid,accoid,prediction,title,location,price,rooms,rating,type
10,41,1.557513021864533,Big Calm Manor,Seattle,800,11,2.7,mansion
10,31,1.5217864118887516,Colossal Private Castle,Buenos Aires,1400,15,3.3,castle
10,74,1.5191292231831937,Giant Calm Fort,Melbourne,2400,12,2.3,castle
10,46,1.463981723574951,Colossal Private Castle,San Francisco,1900,15,3.7,castle
10,35,1.4013996611501864,Colossal Quiet Chateau,NYC,2300,14,4.6,castle
