# Spark Recommender System

In [None]:
# import necessary modules
import os
import shutil
import pyspark as ps
from pyspark.ml import Pipeline, Transformer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.sql import Row
from pyspark.sql.types import DoubleType

In [None]:
# Start (or attach to an already running) Spark session named "ALS model"
spark = ps.sql.SparkSession.builder.appName("ALS model").getOrCreate()
# Keep a handle on the session's SparkContext as well
sc = spark.sparkContext
# Report which Spark version this session is running on
print(spark.version)

## Read in Data

We will use user clusters as part of user features for our model here. 

In [None]:
# Input CSV comes from the prior step and lives under model_data/
data_dir = os.path.join("model_data", "")
file = os.path.join(data_dir, "user_model.csv")

# Reader options chosen so the file loads without error:
# multi-line records are allowed, the first row is the header,
# and column types are inferred from the data.
user_features = (
    spark.read.format("csv")
    .option("multiline", "true")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(file)
)

In [None]:
# Print the column names and types of the loaded user feature table
user_features.printSchema()

## Model Training

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
training, test = user_features.randomSplit([0.8, 0.2], seed=42)

### Parameter Tuning

In [None]:
# Import the required functions
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [None]:
# Configure the ALS estimator (nothing is fitted in this cell)
als = ALS(
    userCol="customer_index",
    itemCol="product_index",
    ratingCol="review_score",
    maxIter=10,  # number of ALS iterations
    # regParam is deliberately not set here (an explicit 0.01 was disabled)
    coldStartStrategy="drop",
)

In [None]:
# Import the requisite packages
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

# Hyperparameter grid: candidate values for ALS rank and regParam
param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, [10, 50, 100, 150])
    .addGrid(als.regParam, [.01, .05, .1, .15])
    .build()
)

In [None]:
# Compare the "prediction" column against the true "review_score" using RMSE
evaluator = RegressionEvaluator(
    labelCol="review_score",
    predictionCol="prediction",
    metricName="rmse",
)
# Size of the grid, i.e. how many parameter combinations will be tried
print("Num models to be tested: ", len(param_grid))

In [None]:
# Build cross validation using CrossValidator
numfolds = 5
cv = CrossValidator(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=numfolds,
    # Fixed seed so the fold assignment is repeatable between runs, in line
    # with the seeded train/test split used in this notebook.
    seed=42,
)

In [None]:
# Fit cross validator to the 'train' dataset
model = cv.fit(training)
# Extract best model from the cv model above
best_model = model.bestModel
# Predict on the held-out test split with the best model
test_predictions = best_model.transform(test)
# Score those predictions with the evaluator and report the value
RMSE = evaluator.evaluate(test_predictions)
print(RMSE)

In [None]:
# Persist the fitted cross-validation model, overwriting any earlier save at this path
model.write().overwrite().save("./model/cv")

In [None]:
print("**Best Model**")
# The tuned values are read through the model's private JVM handle
# (presumably the estimator that produced the model); fetch it once and reuse it.
_best_estimator = best_model._java_obj.parent()
print("  Rank:", _best_estimator.getRank())
print("  MaxIter:", _best_estimator.getMaxIter())
print("  RegParam:", _best_estimator.getRegParam())

In [None]:
# Generate n Recommendations for all users (n = 5) from the cross-validated best model
recommendations = best_model.recommendForAllUsers(5)
# Preview the resulting table
recommendations.show()

### Re-run model for all users

In [None]:
# Refit ALS on the complete dataset, reusing the hyperparameters of the best
# cross-validated model (read once through its private JVM handle).
_tuned = best_model._java_obj.parent()
als_alg = ALS(
    rank=_tuned.getRank(),
    maxIter=_tuned.getMaxIter(),
    regParam=_tuned.getRegParam(),
    userCol="customer_index",
    itemCol="product_index",
    ratingCol="review_score",
    coldStartStrategy="drop",
)

final_model = als_alg.fit(user_features)


In [None]:
# Persist the model fitted on all data, overwriting any earlier save at this path
final_model.write().overwrite().save("./model/bestmodel")

In [None]:
# generate top_n product recommendations for every user from the final model
nrecommend = 5  # number of recommendations requested per user
user_recs = final_model.recommendForAllUsers(nrecommend)
# Preview only the first 4 rows
user_recs.show(4)

In [None]:
# Convert the Spark recommendations to a pandas DataFrame for the lookups below
recs = user_recs.toPandas()

In [None]:
import pandas as pd

# pandas copies of the user and item tables, used by user_recommendations() for lookups
user_features_df = pd.read_csv("./model_data/user_model.csv")
products = pd.read_csv("./model_data/item_model.csv")

In [None]:
nrecommend = 5


def user_recommendations(user_id, top_n=3):
    """Print a customer's known purchases followed by their top recommendations.

    Reads three notebook-level pandas DataFrames:

    * ``user_features_df`` -- columns ``customer_unique_id``, ``product_id``
      and ``customer_index`` are used.
    * ``products`` -- columns ``product_id``, ``product_index`` and
      ``product_category_name`` are used.
    * ``recs`` -- columns ``customer_index`` and ``recommendations`` are used;
      each ``recommendations`` cell is a sequence whose items carry the
      recommended ``product_index`` at position 0.

    Args:
        user_id: Value matched against ``customer_unique_id``.
        top_n: How many recommendations to print. Must not exceed the
            module-level ``nrecommend``.

    Returns:
        None. Everything is reported through ``print``. Unknown users, users
        without a recommendation row and products missing from ``products``
        produce a message instead of raising ``IndexError``.
    """
    if top_n > nrecommend:
        print("Please select up to {} items to recommend".format(nrecommend))
        return

    # Filter the interaction table once and reuse the result for every lookup.
    user_rows = user_features_df[user_features_df['customer_unique_id'] == user_id]
    if user_rows.empty:
        print("User {} not found".format(user_id))
        return

    print("User: {}\n".format(user_id))
    print("Known positives: ")
    # Show at most three distinct previously purchased products.
    for known_like_product in user_rows['product_id'].unique()[:3]:
        categories = products.loc[
            products['product_id'] == known_like_product, 'product_category_name'
        ].unique()
        known_like_category = categories[0] if len(categories) else "unknown"

        print("\t", known_like_product)
        print("\t", known_like_category, "\n")

    customer_index = user_rows['customer_index'].unique()[0]
    rec_lists = recs.loc[recs['customer_index'] == customer_index, 'recommendations']
    if rec_lists.empty:
        print("No recommendations available for user {}".format(user_id))
        return

    print("Top {} Recommendations: \n".format(top_n))
    # Slice rather than index so a short recommendation list cannot overrun.
    ranked = list(rec_lists.iloc[0])[:max(top_n, 0)]
    for position, rec in enumerate(ranked, start=1):
        product_index = rec[0]
        matches = products.loc[
            products['product_index'] == product_index,
            ['product_id', 'product_category_name'],
        ]
        if matches.empty:
            print("{}.\n".format(position),
                  "unknown product (index {})".format(product_index))
            continue

        # Select by column label; positional [0]/[1] on a labelled row is
        # deprecated in pandas.
        first_match = matches.iloc[0]
        print("{}.\n".format(position), first_match['product_id'])
        print(first_match['product_category_name'])

In [None]:
# Example: print known purchases and 5 recommendations for one customer id
user_recommendations('c8ed31310fc440a3f8031b177f9842c3', top_n=5)

In [None]:
# Example: same report for a second customer id
user_recommendations('b56d31572e47b1e6d1b88d3128f2226b', top_n=5)

In [None]:
# Example: same report for a third customer id
user_recommendations('89be58cbdd6ef318e3ed93fdb22be178', top_n=5)

In [None]:
# Generate pandas df for accessing products in recommender function
# products = item_features.toPandas()

In [None]:
# recs.to_csv('./model_data/recs.csv', index=False)

In [None]:
# products.to_csv('./model_data/products.csv', index=False)

In [None]:
# user_features_df = user_features.toPandas()

In [None]:
# user_features_df.to_csv('./model_data/users.csv', index=False)