# Spark Recommender System

In [48]:
# import necessary modules
import os
import shutil
import pyspark as ps
from pyspark.ml import Pipeline, Transformer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.sql import Row
from pyspark.sql.types import DoubleType

In [49]:
# create spark context
spark = (ps.sql.SparkSession.builder
        .appName("ALS model")
        .getOrCreate()
        )
sc = spark.sparkContext
print(spark.version)

3.5.0


## Read in Data

We will use user clusters as part of user features for our model here. 

In [50]:
data_dir = os.path.join("modified_data", "")
file = os.path.join(data_dir, "item_features_clustered.csv")

file

'modified_data\\item_features_clustered.csv'

In [51]:
# source data from prior step
data_dir = os.path.join("clean_data", "")
file = os.path.join(data_dir, "master.csv")

# options are specified to read in data without error
df_master = spark.read.format("csv")\
               .option("multiline", "true")\
               .option("quote", '"')\
               .option("header", "true")\
               .option("escape", "\\")\
               .option("escape", '"')\
               .option("inferSchema", "true")\
               .load(file)

## Create use and item feature matrices

In [52]:
user_features = df_master.select(df_master['customer_unique_id'], 
                               df_master['product_id'], 
                               df_master['review_score'])

In [53]:
user_features.show(4)

+--------------------+--------------------+------------+
|  customer_unique_id|          product_id|review_score|
+--------------------+--------------------+------------+
|eb28e67c4c0b83846...|e5f2d52b802189ee6...|           4|
|635d9ac1680f03288...|8d4f2bb7e93e6710a...|           4|
|6457be0b331148fb5...|4fa33915031a8cde0...|           1|
|98e71752819916789...|08279c494018541f7...|           1|
+--------------------+--------------------+------------+
only showing top 4 rows



In [54]:
user_features = user_features.sort("customer_unique_id")
user_features.count()

108455

In [55]:
from pyspark.sql.functions import mean

user_features = user_features.groupBy("customer_unique_id","product_id") \
    .agg(mean('review_score').alias('review_score'))
    # .mean("review_score").alias("review_score")

user_features.count()




97686

In [56]:
user_features.show()

+--------------------+--------------------+------------+
|  customer_unique_id|          product_id|review_score|
+--------------------+--------------------+------------+
|f507945d757fe904e...|f3b8bfa5b86249e75...|         1.0|
|e59d98970fee89f12...|abc3432ff825d8e52...|         5.0|
|f7089cec59a3515cb...|9615c720e219d641f...|         4.0|
|5097528b417fe105b...|2d65aa8c163f7a8dc...|         5.0|
|c3ec2ad05ad7887e8...|bddc1407637849d18...|         4.0|
|56018f90d63660400...|71da6d6632902431c...|         5.0|
|5745d934dfb11a294...|b184461ba53ea15b7...|         5.0|
|ba4ed9f6b174012ec...|50e9e70a21b874311...|         5.0|
|057f54779f0908b7b...|98a5e061c4731a1d4...|         2.0|
|886d5f561ce675d06...|9b968712a8768d8e2...|         4.0|
|99be275a1517effe1...|bc4a074ab7a7ff14e...|         5.0|
|b067d457d34c2533b...|2ed4ea5a5394cfcb0...|         2.0|
|97107c892093c359b...|bb9a7bcdb8c9facc7...|         1.0|
|4d7ad8005bec85fd6...|8a2c3b7ed6d098de8...|         5.0|
|1b277f8bf9377e4d8...|d315b738a

## Index user and product ids

In [57]:
from pyspark.ml.feature import StringIndexer

# create object of StringIndexer class and specify input and output column
SI_customer = StringIndexer(inputCol='customer_unique_id',outputCol='customer_index')
SI_product = StringIndexer(inputCol='product_id',outputCol='product_index')

# transform the data
user_features = SI_customer.fit(user_features).transform(user_features)
user_features = SI_product.fit(user_features).transform(user_features)

# view the transformed data
user_features.select('customer_unique_id', 'customer_index', 'product_id', 'product_index').show(10)

+--------------------+--------------+--------------------+-------------+
|  customer_unique_id|customer_index|          product_id|product_index|
+--------------------+--------------+--------------------+-------------+
|0000366f3b9a7992b...|        5166.0|372645c7439f9661f...|        451.0|
|0000b849f77a49e4a...|        5167.0|5099f7000472b634f...|       2313.0|
|0000f46a3911fa3c0...|        5168.0|64b488de448a5324c...|       3486.0|
|0000f6ccb0745a6a4...|        5169.0|2345a354a6f203360...|       5592.0|
|0004aac84e0df4da2...|        5170.0|c72e18b3fe2739b8d...|      27381.0|
|0004bd2a26a76fe21...|        5171.0|25cf184645f3fae66...|       5607.0|
|00050ab1314c0e55a...|        5172.0|8cefe1c6f2304e7e6...|       2900.0|
|00053a61a98854899...|         726.0|62984ea1bba7fcea1...|       4526.0|
|00053a61a98854899...|         726.0|58727e154e8e85d84...|       1134.0|
|0005e1862207bf6cc...|        5173.0|e24f73b7631ee3fbb...|        887.0|
+--------------------+--------------+--------------

In [58]:
from pyspark.sql.types import IntegerType
# convert columns to integer types
user_features = user_features.withColumn("review_score",
                                        user_features["review_score"].cast(IntegerType()))

In [59]:
user_features.where((user_features.review_score > 5) | (user_features.review_score < 1)).show()

+------------------+----------+------------+--------------+-------------+
|customer_unique_id|product_id|review_score|customer_index|product_index|
+------------------+----------+------------+--------------+-------------+
+------------------+----------+------------+--------------+-------------+



## Model Training

In [60]:
# split 80-20
(training, test) = user_features.randomSplit([0.8, 0.2], seed= 42)

In [61]:
# train the recommender with als
als_alg = ALS(maxIter=5, 
              regParam=0.01, 
              userCol='customer_index', 
              itemCol="product_index", 
              ratingCol='review_score',
              coldStartStrategy='drop', 
              seed = 3)

model=als_alg.fit(training)

# evaluate with the holdout set
predictions = model.transform(test)

evaluator = RegressionEvaluator(metricName='rmse',
                                labelCol='review_score',
                                predictionCol='prediction')
rmse = evaluator.evaluate(predictions)

print("Root-mean-squared-error = " + str(round(rmse, 3)))

Root-mean-squared-error = 0.767


## Generate user and product recommendations

These can be sampled from to output predictions for specific users.

In [62]:
# generate top 5 product recommendations for user
user_recs = model.recommendForAllUsers(5)


In [63]:
user_recs.where(user_recs.customer_index == 728).select("recommendations.product_index", "recommendations.rating").collect()

[Row(product_index=[3122, 6583, 2815, 1364, 2351], rating=[8.74084186553955, 8.381982803344727, 8.03279972076416, 7.954357147216797, 7.855741500854492])]

In [64]:
product_recs = model.recommendForAllItems(10)

In [65]:
user_recs.show(truncate=False)

+--------------+-----------------------------------------------------------------------------------------------+
|customer_index|recommendations                                                                                |
+--------------+-----------------------------------------------------------------------------------------------+
|28            |[{1079, 8.19504}, {2325, 8.102985}, {1979, 7.702739}, {3122, 7.689831}, {1744, 7.307098}]      |
|31            |[{2758, 19.74845}, {1700, 19.039661}, {2165, 18.61437}, {984, 17.944609}, {951, 17.652035}]    |
|34            |[{358, 10.140588}, {1668, 10.120234}, {1283, 9.923436}, {1254, 9.726533}, {1281, 9.715166}]    |
|53            |[{4163, 8.803051}, {2150, 8.466472}, {2615, 8.005125}, {723, 7.9910007}, {2041, 7.9739823}]    |
|65            |[{5262, 9.820147}, {2327, 9.441342}, {1732, 9.422139}, {3258, 9.079272}, {2456, 9.049652}]     |
|78            |[{3151, 10.416695}, {1147, 10.159479}, {1103, 10.02924}, {908, 9.4026985}, {2993

In [66]:
product_recs.show(truncate=False)

+-------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|product_index|recommendations                                                                                                                                                                                    |
+-------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|28           |[{345, 6.918575}, {61, 6.63913}, {106, 6.1025157}, {2778, 4.9998403}, {3882, 4.9998255}, {90959, 4.9958224}, {89923, 4.9958224}, {89310, 4.9958224}, {88108, 4.9958224}, {87193, 4.9958224}]       |
|31           |[{61, 5.4109335}, {4144, 4.999881}, {91003, 4.995648}, {90270, 4.995648}, {89932, 4.995648}, {88773, 4.995648}, {88074, 4.995648}, {87809

In [67]:
users = user_features.select(als_alg.getUserCol()).distinct().limit(3)
user_subset_recs = model.recommendForUserSubset(users, 10)
user_subset_recs.show(n=4)

+--------------+--------------------+
|customer_index|     recommendations|
+--------------+--------------------+
|          5360|[{1858, 4.999555}...|
|          5776|[{660, 6.7424064}...|
|          5858|[{1829, 31.782501...|
+--------------+--------------------+



### Parameter Tuning

In [102]:
# Import the required functions
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [103]:
# train the recommender with als
als = ALS(maxIter=5, 
              regParam=0.01, 
              userCol='customer_index', 
              itemCol="product_index", 
              ratingCol='review_score',
              coldStartStrategy='drop')

In [104]:
# Import the requisite packages
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

# Add hyperparameters and their respective values to param_grid
param_grid = ParamGridBuilder() \
            .addGrid(als.rank, [10, 50, 100, 150]) \
            .addGrid(als.regParam, [.01, .05, .1, .15]) \
            .build()

In [105]:
evaluator = RegressionEvaluator(
           metricName="rmse", 
           labelCol="review_score", 
           predictionCol="prediction") 
print ("Num models to be tested: ", len(param_grid))

Num models to be tested:  16


In [106]:
# Build cross validation using CrossValidator
cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=3)

In [107]:
#Fit cross validator to the 'train' dataset
model = cv.fit(training)
#Extract best model from the cv model above
best_model = model.bestModel
# View the predictions
test_predictions = best_model.transform(test)
RMSE = evaluator.evaluate(test_predictions)
print(RMSE)

0.7172052405614309


In [108]:
print("**Best Model**")
# Print "Rank"
print("  Rank:", best_model._java_obj.parent().getRank())
# Print "MaxIter"
print("  MaxIter:", best_model._java_obj.parent().getMaxIter())
# Print "RegParam"
print("  RegParam:", best_model._java_obj.parent().getRegParam())

**Best Model**
  Rank: 150
  MaxIter: 5
  RegParam: 0.1


In [25]:
# Generate n Recommendations for all users
recommendations = best_model.recommendForAllUsers(5)
recommendations.show()

+--------------+--------------------+
|customer_index|     recommendations|
+--------------+--------------------+
|           148|[[1358, 5.983632]...|
|           463|[[586, 5.0032835]...|
|           471|[[353, 4.9666333]...|
|           496|[[14701, 3.994572...|
|           833|[[5976, 3.981684]...|
|          1088|[[8835, 4.0529366...|
|          1238|[[19280, 1.956733...|
|          1342|[[3733, 2.9766014...|
|          1580|[[2116, 2.9813814...|
|          1591|[[30130, 3.586678...|
|          1645|[[8578, 2.9761343...|
|          1829|[[8550, 2.9761338...|
|          1959|[[23256, 3.067747...|
|          2122|[[460, 1.9372059]...|
|          2142|[[13069, 3.092081...|
|          2366|[[23114, 3.457238...|
|          2659|[[1923, 2.969991]...|
|          2866|[[678, 3.32295], ...|
|          3175|[[3776, 1.9789387...|
|          3749|[[5388, 1.3674556...|
+--------------+--------------------+
only showing top 20 rows



### Re-run model for all users

In [26]:
# train the recommender with als
als_alg = ALS(rank=best_model._java_obj.parent().getRank(),
              maxIter=best_model._java_obj.parent().getMaxIter(), 
              regParam=best_model._java_obj.parent().getRegParam(), 
              userCol='customer_index', 
              itemCol="product_index", 
              ratingCol='product_category_count',
              coldStartStrategy='drop')

final_model=als_alg.fit(user_features)

In [27]:
# generate top_n product recommendations for user
nrecommend = 5
user_recs = final_model.recommendForAllUsers(nrecommend)
user_recs.show(4)

+--------------+--------------------+
|customer_index|     recommendations|
+--------------+--------------------+
|           148|[[1358, 5.9854245...|
|           463|[[586, 5.0025682]...|
|           471|[[353, 4.9697766]...|
|           496|[[14701, 3.999740...|
+--------------+--------------------+
only showing top 4 rows



In [28]:
recs = user_recs.toPandas()

## Recommender Function

In [29]:
# Generate pandas df for accessing products in recommender function
products = item_features.toPandas()

In [32]:
user_features_df = user_features.toPandas()

In [33]:
def user_recommendations(user_id, top_n = 3):
    
    if top_n > nrecommend:
        print("Please select up to {} items to recommend".format(nrecommend))
        return; 
    
    prior_purchases = user_features_df[user_features_df['customer_unique_id'] == user_id]\
                                                                                        ['product_id'].unique()
    num_items = len(prior_purchases)
    
    if num_items < 3:
        items = num_items
    else:
        items = 3
    
    print("User: {}\n".format(user_id))
    print("Known positives: ")
    for n in range(items):
        known_like_product = user_features_df[user_features_df['customer_unique_id'] == user_id]\
                                                            ['product_id'].unique()[n]
        known_like_category = products[products['product_id'] == known_like_product]\
                                                            ['product_category_name'].unique()[0]
    
        print("\t", known_like_product)
        print("\t", known_like_category, "\n")
    
    
    customer_index = user_features_df[user_features_df['customer_unique_id'] == user_id]\
                                                            ['customer_index'].unique()[0]
    print("Top {} Recommendations: \n".format(top_n))
    rec_products = []
    
    for n in range(top_n):
        
        rec_products.append(list(recs[recs['customer_index'] == customer_index]['recommendations'])[0][n][0])
        
        print("{}.\n".format(n+1), products[products['product_index'] == rec_products[n]]\
                                                  [['product_id', 'product_category_name']].iloc[0][0])
        
        print(products[products['product_index'] == rec_products[n]]\
                                                  [['product_id', 'product_category_name']].iloc[0][1])

__Test for customer_id = 'c8ed31310fc440a3f8031b177f9842c3'__

In [34]:
user_recommendations('c8ed31310fc440a3f8031b177f9842c3', top_n=5)

User: c8ed31310fc440a3f8031b177f9842c3

Known positives: 
	 1065e0ebef073787a7bf691924c60eeb
	 construction_tools_construction 

	 0cf2faf9749f53924cea652a09d8e327
	 construction_tools_construction 

	 309dd69eb83cea38c51709d62befe1a4
	 construction_tools_construction 

Top 5 Recommendations: 

1.
 15b1f9b06d0e709552d7d8638387e09b
furniture_decor
2.
 7189fb70393a0b87189f93f19655f8db
toys
3.
 3e7ec3672e5549ba74cf635752bfc70b
furniture_decor
4.
 14ad6805c263d8d758d648f46a06570e
baby
5.
 329c661807f085964b1877bfeca6ff73
furniture_decor


__Test for customer_id = '698e1cf81d01a3d389d96145f7fa6df8'__

In [35]:
user_recommendations('698e1cf81d01a3d389d96145f7fa6df8', top_n=5)

User: 698e1cf81d01a3d389d96145f7fa6df8

Known positives: 
	 9571759451b1d780ee7c15012ea109d4
	 auto 

Top 5 Recommendations: 

1.
 0a4f9f421af66d2ea061fbb8883419f7
health_beauty
2.
 fdd84aefb08c8f8225e0b8c97429d53b
health_beauty
3.
 12485f9cdebb6ca179826ede539554ad
air_conditioning
4.
 616042729c11849827291496b18e9ec5
sports_leisure
5.
 7a5c07212703b5f01ee199d29a29a587
cool_stuff


__Test for customer_id = '89be58cbdd6ef318e3ed93fdb22be178'__

In [36]:
user_recommendations('89be58cbdd6ef318e3ed93fdb22be178', top_n=5)

User: 89be58cbdd6ef318e3ed93fdb22be178

Known positives: 
	 3fdb534dccf5bc9ab0406944b913787d
	 diapers_and_hygiene 

Top 5 Recommendations: 

1.
 779dd392d4fbe5ca656bf3ceabecbf0b
construction_tools_construction
2.
 bdcf6a834e8faa30dac3886c7a58e92e
health_beauty
3.
 91b08d34d0ba4db44da2dc382867ba49
telephony
4.
 1b8ee158f59c098470fad33f39660964
furniture_living_room
5.
 d9339c5714743c460a9470730f79f6c5
computers_accessories


## Summary of Results

It's clear after producing a simple recommendation system with matrix factorization using only users prior purchase history that this dataset simply does not have the data necessary to give accurate results. Many of the attempted recommendations produced results that are clearly not relevant for the user. As can be seen in the suggestions above, many of the top items are from categories very different from the original purchase. 

Comparing these same recommendations with LightFM shows how well hybrid recommenders can do for data sets like this one, with very few return users. 