# Recommend Products using SparkML

This notebook is written in **Python** so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` syntax. Python, Scala, SQL, and R are all supported.

To run this notebook you will need to register for a free Databricks
[Community Edition account](https://community.cloud.databricks.com/).

## Import modules

In [3]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
import pyspark.sql.functions as F

from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

## Design Schema

In [5]:
# Schema strings handed to spark.read.csv for the two input files.
# Ratings: which user gave which accommodation which integer rating.
rate_schema = (
    "`userid` string, `accoid` string, `rating` INT"
)
# Accommodations: one listing per row, including its own float `rating`.
accos_schema = (
    "`id` string, `title` string, `location` string, `price` int, `rooms` int, `rating` float"
)

## Read CSV files into Spark DataFrames

In [7]:
# Load the ratings CSV using the explicit ratings schema.
rating_file_location = '/FileStore/tables/rating-1.csv'
dfRates = spark.read.schema(rate_schema).csv(rating_file_location)

# Show the row count first, then a preview of the table.
rating_row_count = dfRates.count()
print(rating_row_count)
display(dfRates)

userid,accoid,rating
10,1,1
18,1,2
13,1,1
7,2,2
4,2,2
13,2,3
19,2,2
12,2,1
11,2,1
1,2,2


In [8]:
# Number of rows in the ratings DataFrame.
n_rating_rows = dfRates.count()
print(n_rating_rows)

In [9]:
# Load the accommodations CSV using the explicit accommodations schema.
accos_file_location = "/FileStore/tables/accommodation-1.csv"
dfAccos = spark.read.schema(accos_schema).csv(accos_file_location)

# Show the row count first, then a preview of the table.
accos_row_count = dfAccos.count()
print(accos_row_count)
display(dfAccos)

id,title,location,price,rooms,rating
1,Comfy Quiet Chalet,Vancouver,50,3,3.1
2,Cozy Calm Hut,London,65,2,4.1
3,Agreable Calm Place,London,65,4,4.8
4,Colossal Quiet Chateau,Paris,3400,16,2.7
5,Homy Quiet Shack,Paris,50,1,1.1
6,Pleasant Quiet Place,Dublin,35,5,4.3
7,Vast Peaceful Fortress,Seattle,3200,24,1.9
8,Giant Quiet Fortress,San Francisco,3400,12,4.1
9,Giant Peaceful Palace,London,1500,20,3.5
10,Sizable Calm Country House,Auckland,650,9,4.9


## Aggregations

In [11]:
# Preview the ratings table that the aggregation below summarises.
display(dfRates)

userid,accoid,rating
10,1,1
18,1,2
13,1,1
7,2,2
4,2,2
13,2,3
19,2,2
12,2,1
11,2,1
1,2,2


In [12]:
# Whole-table summary of the ratings: how many ratings there are, how many
# distinct users gave them, and the max / min / mean rating value.
rating_summary_exprs = [
    F.count('userid').alias('num_ratings'),
    F.countDistinct('userid').alias('distinct_users_rating'),
    F.max('rating').alias('best_rating'),
    F.min('rating').alias('worst_rating'),
    F.avg('rating').alias('avg_rating'),
]
df_agg = dfRates.agg(*rating_summary_exprs)
display(df_agg)

num_ratings,distinct_users_rating,best_rating,worst_rating,avg_rating
1187,25,5,1,2.4667228306655438


## Left Join

In [14]:
# Left join: every accommodation is kept, paired with each of its ratings.
# The accommodation's own `rating` column is renamed to `ratings` so it does not
# collide with the per-user `rating` column contributed by dfRates.
accos_for_left = dfAccos.withColumnRenamed('rating', 'ratings')
left_join_cond = dfAccos.id == dfRates.accoid
df_leftjoined = accos_for_left.join(dfRates, left_join_cond, how='left')

left_row_count = df_leftjoined.count()
print(left_row_count)
display(df_leftjoined)

id,title,location,price,rooms,ratings,userid,accoid,rating
1,Comfy Quiet Chalet,Vancouver,50,3,3.1,13,1,1
1,Comfy Quiet Chalet,Vancouver,50,3,3.1,18,1,2
1,Comfy Quiet Chalet,Vancouver,50,3,3.1,10,1,1
2,Cozy Calm Hut,London,65,2,4.1,3,2,1
2,Cozy Calm Hut,London,65,2,4.1,2,2,4
2,Cozy Calm Hut,London,65,2,4.1,20,2,2
2,Cozy Calm Hut,London,65,2,4.1,1,2,2
2,Cozy Calm Hut,London,65,2,4.1,11,2,1
2,Cozy Calm Hut,London,65,2,4.1,12,2,1
2,Cozy Calm Hut,London,65,2,4.1,19,2,2


In [15]:
# Look up accommodation '100' in the left join; the recorded output shows it
# with empty userid / accoid / rating columns.
left_acco_100 = df_leftjoined.select('*').where(df_leftjoined.id == '100')
display(left_acco_100)

id,title,location,price,rooms,ratings,userid,accoid,rating
100,Villa Staden,Stockholm,80000,4,4.2,,,


## Right Join

In [17]:
# Right join: every rating is kept, paired with its accommodation when one exists.
# As in the other joins, the accommodation's `rating` column is renamed to
# `ratings` to keep it apart from the per-user `rating` column.
accos_for_right = dfAccos.withColumnRenamed('rating', 'ratings')
right_join_cond = dfAccos.id == dfRates.accoid
df_rightjoined = accos_for_right.join(dfRates, right_join_cond, how='right')

right_row_count = df_rightjoined.count()
print(right_row_count)
display(df_rightjoined)

id,title,location,price,rooms,ratings,userid,accoid,rating
1,Comfy Quiet Chalet,Vancouver,50,3,3.1,10,1,1
1,Comfy Quiet Chalet,Vancouver,50,3,3.1,18,1,2
1,Comfy Quiet Chalet,Vancouver,50,3,3.1,13,1,1
2,Cozy Calm Hut,London,65,2,4.1,7,2,2
2,Cozy Calm Hut,London,65,2,4.1,4,2,2
2,Cozy Calm Hut,London,65,2,4.1,13,2,3
2,Cozy Calm Hut,London,65,2,4.1,19,2,2
2,Cozy Calm Hut,London,65,2,4.1,12,2,1
2,Cozy Calm Hut,London,65,2,4.1,11,2,1
2,Cozy Calm Hut,London,65,2,4.1,1,2,2


In [18]:
# Look up accommodation '100' in the right join and print the result as text.
right_acco_100 = df_rightjoined.select('*').where(df_rightjoined.id == '100')
right_acco_100.show()

In [19]:
# Look up ratings for accommodation id '101' in the right join; the recorded
# output shows a rating row whose accommodation columns are empty.
right_rating_101 = df_rightjoined.select('*').where(df_rightjoined.accoid == '101')
display(right_rating_101)

id,title,location,price,rooms,rating,userid,accoid,rating.1
,,,,,,23,101,5


## Inner Join

In [21]:
# Inner join: only accommodation/rating pairs whose ids match on both sides.
# The accommodation's `rating` column is renamed to `ratings` to avoid a clash
# with the per-user `rating` column from dfRates.
accos_for_inner = dfAccos.withColumnRenamed('rating', 'ratings')
inner_join_cond = dfAccos.id == dfRates.accoid
df_innerjoined = accos_for_inner.join(dfRates, inner_join_cond, how='inner')

inner_row_count = df_innerjoined.count()
print(inner_row_count)
display(df_innerjoined)

id,title,location,price,rooms,ratings,userid,accoid,rating
1,Comfy Quiet Chalet,Vancouver,50,3,3.1,10,1,1
1,Comfy Quiet Chalet,Vancouver,50,3,3.1,18,1,2
1,Comfy Quiet Chalet,Vancouver,50,3,3.1,13,1,1
2,Cozy Calm Hut,London,65,2,4.1,7,2,2
2,Cozy Calm Hut,London,65,2,4.1,4,2,2
2,Cozy Calm Hut,London,65,2,4.1,13,2,3
2,Cozy Calm Hut,London,65,2,4.1,19,2,2
2,Cozy Calm Hut,London,65,2,4.1,12,2,1
2,Cozy Calm Hut,London,65,2,4.1,11,2,1
2,Cozy Calm Hut,London,65,2,4.1,1,2,2


In [22]:
# Look up accommodation '100' in the inner join and print the result as text.
inner_acco_100 = df_innerjoined.select('*').where(df_innerjoined.id == '100')
inner_acco_100.show()

In [23]:
# Look up ratings for accommodation id '101' in the inner join.
# The filter column must come from df_innerjoined itself: the previous
# reference to `df_joined` named a DataFrame that is never defined in this
# notebook, so the cell failed with a NameError on a fresh kernel.
df_innerjoined.select('*').where(df_innerjoined.accoid == '101').show()

## Train the model and recommend products with the model

### Train the model

In [26]:
# Train an ALS matrix-factorization model on the (userid, accoid, rating) rows.
# rank and iterations are passed by keyword so the two 20s cannot be confused,
# and a fixed seed makes the training repeatable when the notebook is re-run.
model = ALS.train(dfRates.rdd, rank=20, iterations=20, seed=42)
print(type(model))

### Save the model

In [28]:
# Persist the trained model to the file store.
# NOTE(review): saving to a path that already exists may fail on a re-run —
# confirm, and clear the target first if so.
model_path = '/FileStore/model_recommendation.ml'
model.save(sc, model_path)

### Use the trained model to predict which accommodations each user might be interested in

In [30]:
# Build one flat list with the top-5 predicted accommodations of every user id.
# Starting from an empty list removes the None sentinel (and its `== None`
# comparison) together with the first-iteration special case.
allPredictions = []
for USER_ID in range(0, 100):
  # Ids of the accommodations this user already rated. Despite the `df` prefix
  # this is a plain Python set, which keeps the membership test in the filter
  # below constant-time instead of scanning a list for every accommodation.
  dfUserRatings = set(dfRates.filter(dfRates.userid == USER_ID).rdd.map(lambda r: r.accoid).collect())
  # Candidates are the accommodations the user has not rated yet.
  rddPotential  = dfAccos.rdd.filter(lambda x: x[0] not in dfUserRatings)
  pairsPotential = rddPotential.map(lambda x: (USER_ID, x[0]))
  # Convert each prediction into plain (user, accommodation, score) values.
  predictions = model.predictAll(pairsPotential).map(lambda p: (str(p[0]), str(p[1]), float(p[2])))
  # Negating the score makes takeOrdered return the 5 highest-scored entries.
  predictions = predictions.takeOrdered(5, key=lambda x: -x[2]) # top 5
  print("predicted for user={0}".format(USER_ID))
  allPredictions.extend(predictions)

In [31]:
# Show the Python type of the accumulated predictions.
type(allPredictions)

In [32]:
# Display the accumulated (user, accommodation, score) predictions.
allPredictions

### Prediction for user '1'

In [34]:
# Accommodation ids that user '1' has already rated, gathered on the driver.
user_1_rows = dfRates.filter(dfRates.userid == '1').collect()
dfUserRatings = [row.accoid for row in user_1_rows]
# Accommodations user '1' has not rated yet are the recommendation candidates.
rddPotential = dfAccos.rdd.filter(lambda acco: acco[0] not in dfUserRatings)
# (user, accommodation) pairs in the shape model.predictAll expects.
pairsPotential = rddPotential.map(lambda acco: ('1', acco[0]))

In [35]:
# Materialise the candidate (user, accommodation) pairs for inspection.
pairsPotential.collect()

In [36]:
# Score every candidate pair, then convert each result into plain
# (user, accommodation, score) values.
raw_predictions = model.predictAll(pairsPotential)
predictions = raw_predictions.map(
    lambda scored: (str(scored[0]), str(scored[1]), float(scored[2]))
)

In [37]:
# Keep the five highest-scored predictions; negating the score turns
# takeOrdered's ordering into "largest first".
# NOTE(review): this rebinds `predictions` from an RDD to a list, so running
# this cell a second time without re-running the previous one will fail.
top_n = 5
predictions = predictions.takeOrdered(top_n, key=lambda scored: -scored[2])

#### These are the five accommodations that are recommended to user '1'. Note that the quality of the recommendations is not great because the dataset is so small (the predicted ratings are not very high). Still, this lab illustrates the process you'd go through to create product recommendations.

In [39]:
# Display the top-5 recommendations computed for user '1'.
predictions