# Spark Recommender System

In [19]:
# import necessary modules
import os
import shutil
import pyspark as ps
from pyspark.ml import Pipeline, Transformer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.sql import Row
from pyspark.sql.types import DoubleType

In [20]:
# create spark context
spark = (ps.sql.SparkSession.builder
        .appName("ALS model")
        .getOrCreate()
        )
sc = spark.sparkContext
print(spark.version)

3.5.0


## Read in Data

We will use user clusters as part of user features for our model here. 

In [21]:
# source data from prior step
data_dir = os.path.join("clean_data", "")
file = os.path.join(data_dir, "spark_master.csv")

# options are specified to read in data without error
df_master = spark.read.format("csv")\
               .option("multiline", "true")\
               .option("header", "true")\
               .option("inferSchema", "true")\
               .load(file)

## Create use and item feature matrices

In [22]:
user_features = df_master.select(df_master['customer_unique_id'], 
                               df_master['product_id'], 
                               df_master['review_score'])
user_features.dropna()

DataFrame[customer_unique_id: string, product_id: string, review_score: int]

In [23]:
user_features = user_features.sort("customer_unique_id")
user_features.count()

108455

In [24]:
from pyspark.sql.functions import mean

user_features = user_features.groupBy("customer_unique_id","product_id") \
    .agg(mean('review_score').alias('review_score'))
    # .mean("review_score").alias("review_score")

user_features.count()

97686

In [25]:
user_features.show()

+--------------------+--------------------+------------+
|  customer_unique_id|          product_id|review_score|
+--------------------+--------------------+------------+
|f507945d757fe904e...|f3b8bfa5b86249e75...|         1.0|
|e59d98970fee89f12...|abc3432ff825d8e52...|         5.0|
|f7089cec59a3515cb...|9615c720e219d641f...|         4.0|
|5097528b417fe105b...|2d65aa8c163f7a8dc...|         5.0|
|c3ec2ad05ad7887e8...|bddc1407637849d18...|         4.0|
|56018f90d63660400...|71da6d6632902431c...|         5.0|
|5745d934dfb11a294...|b184461ba53ea15b7...|         5.0|
|ba4ed9f6b174012ec...|50e9e70a21b874311...|         5.0|
|057f54779f0908b7b...|98a5e061c4731a1d4...|         2.0|
|886d5f561ce675d06...|9b968712a8768d8e2...|         4.0|
|99be275a1517effe1...|bc4a074ab7a7ff14e...|         5.0|
|b067d457d34c2533b...|2ed4ea5a5394cfcb0...|         2.0|
|97107c892093c359b...|bb9a7bcdb8c9facc7...|         1.0|
|4d7ad8005bec85fd6...|8a2c3b7ed6d098de8...|         5.0|
|1b277f8bf9377e4d8...|d315b738a

In [26]:
item_features = df_master.select(df_master['product_id'], 
                               df_master['product_category_name'])
item_features = item_features.dropna()

## Index user and product ids

In [27]:
from pyspark.ml.feature import StringIndexer

SI_product = StringIndexer(inputCol='product_id',outputCol='product_index')

item_features = SI_product.fit(item_features).transform(item_features)

item_features.select('product_id', 'product_index').show(10)

+--------------------+-------------+
|          product_id|product_index|
+--------------------+-------------+
|e5f2d52b802189ee6...|      29774.0|
|8d4f2bb7e93e6710a...|       7523.0|
|4fa33915031a8cde0...|        653.0|
|08279c494018541f7...|      14836.0|
|9545d45c37449ccbc...|        161.0|
|19421075ae0b585f2...|        128.0|
|e819fddd6622f30e8...|       8461.0|
|7583d9a579408cb84...|       3335.0|
|d64e758afad411049...|      28752.0|
|eaa26a34984cbfedc...|       1839.0|
+--------------------+-------------+
only showing top 10 rows



23/12/22 22:42:48 WARN DAGScheduler: Broadcasting large task binary with size 1815.7 KiB


In [28]:

# create object of StringIndexer class and specify input and output column
SI_customer = StringIndexer(inputCol='customer_unique_id',outputCol='customer_index')
SI_product = StringIndexer(inputCol='product_id',outputCol='product_index')

# transform the data
user_features = SI_customer.fit(user_features).transform(user_features)
user_features = SI_product.fit(user_features).transform(user_features)

# view the transformed data
user_features.select('customer_unique_id', 'customer_index', 'product_id', 'product_index').show(10)

+--------------------+--------------+--------------------+-------------+
|  customer_unique_id|customer_index|          product_id|product_index|
+--------------------+--------------+--------------------+-------------+
|0000366f3b9a7992b...|        5166.0|372645c7439f9661f...|        451.0|
|0000b849f77a49e4a...|        5167.0|5099f7000472b634f...|       2313.0|
|0000f46a3911fa3c0...|        5168.0|64b488de448a5324c...|       3486.0|
|0000f6ccb0745a6a4...|        5169.0|2345a354a6f203360...|       5592.0|
|0004aac84e0df4da2...|        5170.0|c72e18b3fe2739b8d...|      27381.0|
|0004bd2a26a76fe21...|        5171.0|25cf184645f3fae66...|       5607.0|
|00050ab1314c0e55a...|        5172.0|8cefe1c6f2304e7e6...|       2900.0|
|00053a61a98854899...|         726.0|62984ea1bba7fcea1...|       4526.0|
|00053a61a98854899...|         726.0|58727e154e8e85d84...|       1134.0|
|0005e1862207bf6cc...|        5173.0|e24f73b7631ee3fbb...|        887.0|
+--------------------+--------------+--------------

23/12/22 22:42:50 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB


In [29]:
from pyspark.sql.types import IntegerType
# convert columns to integer types
user_features = user_features.withColumn("review_score",
                                        user_features["review_score"].cast(IntegerType()))

In [30]:
user_features.where((user_features.review_score > 5) | (user_features.review_score < 1)).show()

23/12/22 22:42:51 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/22 22:42:52 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB


+------------------+----------+------------+--------------+-------------+
|customer_unique_id|product_id|review_score|customer_index|product_index|
+------------------+----------+------------+--------------+-------------+
+------------------+----------+------------+--------------+-------------+



23/12/22 22:42:52 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB


In [31]:
# change to your clone prj folder
owd = "/home/huynhminhphuoc01/olist_recommend_sys"

def write_df_as_csv_file(df, path,csvsavename, mode="overwrite", header=True, inferschema = True ):
    import shutil

    df = df.coalesce(1)  # join partitions to produce 1 csv file

    header = "true" if header else "false"
    inferschema = "true" if inferschema else "false"

    dfw = df.write.format("csv")\
                    .option("header", header)\
                    .mode(mode)\
                    .option("timestampFormat", "yyyy-MM-dd HH:mm:ss")\
                    .option("inferSchema", inferschema)\
                    .option("multipleLine", "false")
    dfw.save(path + csvsavename)


    csv_filenames = [filename for filename in os.listdir(path + csvsavename) if filename.endswith(".csv")]

    os.chdir(owd)
    source_path = path + csvsavename + "/" + csv_filenames[0]
    destination_path = path +"/"+ csvsavename

    shutil.copyfile(source_path, destination_path)

    shutil.rmtree(path + csvsavename)

In [32]:
savepath = owd + "/model_data"


# delete exist file with same name in model_data folder
#  before running this cell
# write_df_as_csv_file(user_features, savepath, "user_model.csv")
# write_df_as_csv_file(item_features, savepath, "item_model.csv")

## Model Training

In [33]:
# split 80-20
(training, test) = user_features.randomSplit([0.8, 0.2], seed= 42)

In [34]:
# train the recommender with als
als_alg = ALS(maxIter=5, 
              regParam=0.01, 
              userCol='customer_index', 
              itemCol="product_index", 
              ratingCol='review_score',
              coldStartStrategy='drop', 
              seed = 3)

model=als_alg.fit(training)

# evaluate with the holdout set
predictions = model.transform(test)

evaluator = RegressionEvaluator(metricName='rmse',
                                labelCol='review_score',
                                predictionCol='prediction')
rmse = evaluator.evaluate(predictions)

print("Root-mean-squared-error = " + str(round(rmse, 3)))

23/12/22 22:42:54 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/22 22:42:54 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/22 22:42:54 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/22 22:42:55 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/22 22:42:55 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/22 22:42:56 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/22 22:42:56 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/22 22:42:57 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/22 22:42:57 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/22 22:42:58 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/22 22:42:58 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/22 22:42:58 WARN DAGScheduler: Broadcasting larg

Root-mean-squared-error = 0.803


23/12/22 22:43:03 WARN DAGScheduler: Broadcasting large task binary with size 6.5 MiB


In [35]:
from pyspark.ml.evaluation import RegressionEvaluator
import pyspark.sql.functions as F
RATING_COL = "review_score"
def evaluate(model, data, verbose=0):
    """
    Evaluate the model by computing rmse, mae, r2 and variance over the data.
    """

    predictions = model.transform(data).withColumn(
        "prediction", F.col("prediction").cast("double")
    )

    if verbose > 1:
        # Show 10 predictions
        predictions.select("_user_id", "_item_id", RATING_COL, "prediction").limit(
            10
        ).show()

    # Initialize the regression evaluator
    evaluator = RegressionEvaluator(predictionCol="prediction", labelCol=RATING_COL)

    _evaluator = lambda metric: evaluator.setMetricName(metric).evaluate(predictions)
    rmse = _evaluator("rmse")
    mae = _evaluator("mae")
    r2 = _evaluator("r2")
    var = _evaluator("var")

    if verbose > 0:
        print(f"RMSE score = {rmse}")
        print(f"MAE score = {mae}")
        print(f"R2 score = {r2}")
        print(f"Explained variance = {var}")

    return predictions, (rmse, mae, r2, var)

In [63]:
predictions, (rmse, mae, r2, var) = evaluate(model, test)
print(f"RMSE score = {rmse}")
print(f"MAE score = {mae}")
print(f"R2 score = {r2}")
print(f"Explained variance = {var}")

23/12/23 00:15:28 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/23 00:15:28 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/23 00:15:29 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/23 00:15:29 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/23 00:15:30 WARN DAGScheduler: Broadcasting large task binary with size 6.5 MiB
23/12/23 00:15:31 WARN DAGScheduler: Broadcasting large task binary with size 6.5 MiB
23/12/23 00:15:31 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/23 00:15:31 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/23 00:15:32 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/23 00:15:33 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/23 00:15:34 WARN DAGScheduler: Broadcasting large task binary with size 6.5 MiB
23/12/23 00:15:34 WARN DAGScheduler: Broadcasting larg

RMSE score = 0.7008349607035208
MAE score = 0.14912946911570826
R2 score = 0.7095023922332659
Explained variance = 2.1305195765753084


23/12/23 00:15:42 WARN DAGScheduler: Broadcasting large task binary with size 6.5 MiB
                                                                                

## Generate user and product recommendations

These can be sampled from to output predictions for specific users.

In [37]:
# generate top 5 product recommendations for user
user_recs = model.recommendForAllUsers(5)


In [38]:
user_recs.show(truncate=False)

23/12/22 22:43:12 WARN DAGScheduler: Broadcasting large task binary with size 6.5 MiB
23/12/22 22:43:30 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB


+--------------+-----------------------------------------------------------------------------------------------+
|customer_index|recommendations                                                                                |
+--------------+-----------------------------------------------------------------------------------------------+
|26            |[{1060, 8.688657}, {1115, 8.409449}, {1669, 8.333566}, {1097, 7.9857073}, {3950, 7.61391}]     |
|27            |[{7272, 9.522753}, {1724, 8.645385}, {2376, 8.347814}, {1485, 8.288625}, {1552, 8.246329}]     |
|28            |[{2041, 9.059436}, {1670, 8.859715}, {1018, 8.547615}, {1364, 8.508508}, {5012, 8.396306}]     |
|31            |[{3150, 25.464808}, {7840, 24.426056}, {3092, 22.928698}, {3775, 21.879086}, {6870, 21.037989}]|
|34            |[{1448, 9.866649}, {2230, 9.369169}, {1650, 8.924972}, {1793, 8.812243}, {4624, 8.780968}]     |
|44            |[{1116, 34.668804}, {1510, 34.501568}, {3444, 31.701674}, {724, 30.150785}, {660

                                                                                

In [39]:
user_recs.where(user_recs.customer_index == 728).select("recommendations.product_index", "recommendations.rating").collect()

23/12/22 22:43:30 WARN DAGScheduler: Broadcasting large task binary with size 6.5 MiB
23/12/22 22:43:47 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
                                                                                

[Row(product_index=[2030, 920, 1405, 1178, 873], rating=[8.747422218322754, 8.393131256103516, 8.392362594604492, 8.351383209228516, 8.327407836914062])]

In [40]:
# generate top 10 user recommendations for product
product_recs = model.recommendForAllItems(10)

In [41]:
product_recs.printSchema()

root
 |-- product_index: integer (nullable = false)
 |-- recommendations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- customer_index: integer (nullable = true)
 |    |    |-- rating: float (nullable = true)



In [42]:
product_recs.show(truncate=False)

23/12/22 22:43:48 WARN DAGScheduler: Broadcasting large task binary with size 6.5 MiB
23/12/22 22:44:05 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB


+-------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|product_index|recommendations                                                                                                                                                                                         |
+-------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|26           |[{599, 10.294521}, {4881, 9.385081}, {3137, 9.385081}, {2613, 9.385081}, {2762, 8.972782}, {4093, 8.45564}, {1284, 7.9222527}, {2994, 7.638472}, {91294, 6.8603377}, {91002, 6.8603377}]                |
|27           |[{2773, 8.055581}, {599, 6.6936707}, {5078, 5.7567425}, {2620, 5.572836}, {106, 5.3618097}, {4663, 5.093957}, {3584, 

                                                                                

In [43]:
# generate top 10 product recommendations for subset user
users = user_features.select(als_alg.getUserCol()).distinct().limit(3)
user_subset_recs = model.recommendForUserSubset(users, 10)
user_subset_recs.show(truncate=False)

23/12/22 22:44:06 WARN DAGScheduler: Broadcasting large task binary with size 4.6 MiB
23/12/22 22:44:07 WARN DAGScheduler: Broadcasting large task binary with size 4.6 MiB
23/12/22 22:44:07 WARN DAGScheduler: Broadcasting large task binary with size 6.5 MiB

+--------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|customer_index|recommendations                                                                                                                                                                              |
+--------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|5360          |[{1858, 4.9995756}, {1490, 4.545968}, {2183, 4.2266192}, {921, 4.061932}, {1293, 4.02992}, {1291, 3.9712772}, {1615, 3.9584608}, {2237, 3.6884427}, {1453, 3.686512}, {1746, 3.6833885}]     |
|5776          |[{1277, 6.2545342}, {992, 6.234127}, {2412, 6.174125}, {1593, 6.0062885}, {989, 5.851141}, {3459, 5.8301096}, {1051, 5.6890745}, {3776, 5.615653}, {1873, 5.

23/12/22 22:44:11 WARN DAGScheduler: Broadcasting large task binary with size 6.5 MiB
                                                                                

### Parameter Tuning

In [44]:
# Import the required functions
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [45]:
# train the recommender with als
als = ALS(maxIter=5, 
              regParam=0.01, 
              userCol='customer_index', 
              itemCol="product_index", 
              ratingCol='review_score',
              coldStartStrategy='drop')

In [46]:
# Import the requisite packages
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

# Add hyperparameters and their respective values to param_grid
param_grid = ParamGridBuilder() \
            .addGrid(als.rank, [10, 50, 100, 150]) \
            .addGrid(als.regParam, [.01, .05, .1, .15]) \
            .build()

In [47]:
evaluator = RegressionEvaluator(
           metricName="rmse", 
           labelCol="review_score", 
           predictionCol="prediction") 
print ("Num models to be tested: ", len(param_grid))

Num models to be tested:  16


In [48]:
# Build cross validation using CrossValidator
cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5)

In [49]:
#Fit cross validator to the 'train' dataset
model = cv.fit(training)
#Extract best model from the cv model above
best_model = model.bestModel
# View the predictions
test_predictions = best_model.transform(test)
RMSE = evaluator.evaluate(test_predictions)
print(RMSE)

23/12/22 22:44:12 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/22 22:44:13 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/22 22:44:13 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/22 22:44:14 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/22 22:44:14 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/22 22:44:14 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/22 22:44:15 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/22 22:44:15 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/22 22:44:15 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/22 22:44:16 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/22 22:44:16 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/22 22:44:17 WARN DAGScheduler: Broadcasting larg

0.6873359143193


23/12/22 23:37:19 WARN DAGScheduler: Broadcasting large task binary with size 6.5 MiB
                                                                                

In [64]:
predictions, (rmse, mae, r2, var) = evaluate(best_model, test)
print(f"RMSE score = {rmse}")
print(f"MAE score = {mae}")
print(f"R2 score = {r2}")
print(f"Explained variance = {var}")

23/12/23 00:18:39 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/23 00:18:40 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/23 00:18:40 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/23 00:18:41 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/23 00:18:42 WARN DAGScheduler: Broadcasting large task binary with size 6.5 MiB
23/12/23 00:18:42 WARN DAGScheduler: Broadcasting large task binary with size 6.5 MiB
23/12/23 00:18:43 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/23 00:18:43 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/23 00:18:44 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/23 00:18:45 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/23 00:18:46 WARN DAGScheduler: Broadcasting large task binary with size 6.5 MiB
23/12/23 00:18:46 WARN DAGScheduler: Broadcasting larg

RMSE score = 0.6953290955708991
MAE score = 0.1516022316809063
R2 score = 0.685070562737336
Explained variance = 2.1353352254191353


23/12/23 00:18:53 WARN DAGScheduler: Broadcasting large task binary with size 6.5 MiB
                                                                                

In [51]:
model.write().overwrite().save("./model/cv")

23/12/22 23:37:34 WARN DAGScheduler: Broadcasting large task binary with size 6.6 MiB
23/12/22 23:37:35 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
23/12/22 23:37:37 WARN DAGScheduler: Broadcasting large task binary with size 6.6 MiB
23/12/22 23:37:38 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
                                                                                

In [52]:
print("**Best Model**")
# Print "Rank"
print("  Rank:", best_model._java_obj.parent().getRank())
# Print "MaxIter"
print("  MaxIter:", best_model._java_obj.parent().getMaxIter())
# Print "RegParam"
print("  RegParam:", best_model._java_obj.parent().getRegParam())

**Best Model**
  Rank: 150
  MaxIter: 5
  RegParam: 0.1


In [53]:
# Generate n Recommendations for all users
recommendations = best_model.recommendForAllUsers(5)
recommendations.show()

23/12/22 23:37:38 WARN DAGScheduler: Broadcasting large task binary with size 6.5 MiB

+--------------+--------------------+
|customer_index|     recommendations|
+--------------+--------------------+
|            26|[{4369, 1.236779}...|
|            27|[{5077, 4.9843864...|
|            28|[{12701, 1.211870...|
|            31|[{2857, 4.9634705...|
|            34|[{879, 4.9770007}...|
|            44|[{639, 4.945782},...|
|            53|[{11275, 4.999411...|
|            65|[{31272, 4.961011...|
|            76|[{926, 4.944134},...|
|            78|[{12156, 4.990938...|
|            81|[{27303, 1.920084...|
|            85|[{29113, 4.975178...|
|           101|[{1343, 4.948754}...|
|           103|[{9312, 4.010607}...|
|           108|[{1785, 2.9941223...|
|           115|[{1511, 4.991942}...|
|           126|[{22489, 4.961010...|
|           133|[{2791, 0.9946561...|
|           137|[{4907, 0.9915466...|
|           148|[{8138, 5.008093}...|
+--------------+--------------------+
only showing top 20 rows



23/12/22 23:38:59 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
                                                                                

### Re-run model for all users

In [54]:
# train the recommender with als
als_alg = ALS(rank=best_model._java_obj.parent().getRank(),
              maxIter=best_model._java_obj.parent().getMaxIter(), 
              regParam=best_model._java_obj.parent().getRegParam(), 
              userCol='customer_index', 
              itemCol="product_index", 
              ratingCol='review_score',
              coldStartStrategy='drop')

final_model=als_alg.fit(user_features)


23/12/22 23:39:01 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/22 23:39:01 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/22 23:39:02 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/22 23:39:02 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/22 23:39:03 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/22 23:39:03 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/22 23:39:03 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/22 23:39:04 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/22 23:39:11 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/22 23:39:28 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/22 23:39:34 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
23/12/22 23:39:51 WARN DAGScheduler: Broadcasting larg

In [55]:
final_model.write().overwrite().save("./model/bestmodel")

23/12/22 23:41:09 WARN DAGScheduler: Broadcasting large task binary with size 6.6 MiB
23/12/22 23:41:09 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
23/12/22 23:41:11 WARN DAGScheduler: Broadcasting large task binary with size 6.6 MiB
23/12/22 23:41:11 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
                                                                                

In [56]:
# generate top_n product recommendations for user
nrecommend = 5
user_recs = final_model.recommendForAllUsers(nrecommend)
user_recs.show(4)

23/12/22 23:41:11 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB

+--------------+--------------------+
|customer_index|     recommendations|
+--------------+--------------------+
|            26|[{4332, 1.1560177...|
|            27|[{7645, 5.0136065...|
|            28|[{8766, 1.2823216...|
|            31|[{2857, 4.93124},...|
+--------------+--------------------+
only showing top 4 rows



23/12/22 23:43:05 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
                                                                                

In [57]:
# recs = user_recs.toPandas()

In [58]:
# Generate pandas df for accessing products in recommender function
# products = item_features.toPandas()

In [59]:
# recs.to_csv('./model_data/recs.csv', index=False)

In [60]:
# products.to_csv('./model_data/products.csv', index=False)

In [61]:
# user_features_df = user_features.toPandas()

In [62]:
# user_features_df.to_csv('./model_data/users.csv', index=False)