In [1]:
!pip3 install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 47 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 44.1 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=1dc470ff63779647f9a386160cba2e89601a49c9c7721cdad6efc552896ba13c
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.1


Import Libraries

In [2]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

from pyspark.sql import SparkSession

Initialise Spark Session

In [3]:
spark = SparkSession.builder.appName("amazon_reviews_test").getOrCreate()


Read subset of data in.

In [5]:
df = spark.read.options(delimiter='\t').csv('/content/drive/MyDrive/amazon_short.tsv', inferSchema=True, header=True)

In [6]:
df.show(5)

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+------+-------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|rating|helpful_votes|total_votes|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+------+-------------+-----------+
|         US|   12068140|R341Z2FE4RPDY8|0786868821|     913926982|Cad: Confessions ...|           Books|     1|            2|          3|
|         US|   12068447|R1XHTJ69WFA79N|0240807316|     209406469|Sports Media:  Re...|           Books|     5|            8|          8|
|         US|   12069735|R3GBNWUL5TX127|B0009A0GXW|     355154473|The Secret Goldfi...|           Books|     5|            3|          3|
|         US|   12070636|R3VK6TOXURV3Q2|0316107719|     529689635|The Baby Sleep Bo...|           Books|     3|          111|        120|
|         US|   12070768| R5SZVXLZ

In [7]:
df.describe().show()

+-------+-----------+-----------------+--------------+--------------------+--------------------+------------------------------+----------------+------------------+------------------+-----------------+
|summary|marketplace|      customer_id|     review_id|          product_id|      product_parent|                 product_title|product_category|            rating|     helpful_votes|      total_votes|
+-------+-----------+-----------------+--------------+--------------------+--------------------+------------------------------+----------------+------------------+------------------+-----------------+
|  count|      29998|            29998|         29998|               29998|               29998|                         29998|           29998|             29998|             29998|            29998|
|   mean|       null|5206875.412727515|          null|1.0196748957946323E9| 4.925178649852657E8|                        1569.5|            null| 4.329588639242616|2.7935862390826056|4.541102740182

Split data into train and test

In [8]:
train, test = df.randomSplit([0.8, 0.2], seed=42)

Build ALS model.

itemCol is an interesting problem. Values have to be numeric so we can't use product_id as is.

coldStartStrategy = 'drop' means to drop new users as they have no reference to recommend.


In [47]:
als = ALS(maxIter=10, regParam=0.3, userCol="customer_id", itemCol="product_parent", ratingCol='rating', coldStartStrategy='drop')

In [48]:
model = als.fit(train)

In [49]:
pred = model.transform(test)

In [28]:
pred.show(10)

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+------+-------------+-----------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|rating|helpful_votes|total_votes| prediction|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+------+-------------+-----------+-----------+
|         US|      10206| R1ALPEI2SO0D1|B00HNYWFMC|     805852390| Far Cry Compliation|     Video Games|     5|            0|          0|  -0.988089|
|         US|      10206| RLQIPPSIW9KQ0|B0053WVBSA|     603885070|Happy Feet Two: T...|     Video Games|     3|            0|          1| -0.5765556|
|         US|     312383|R1UOSG99YL2EF9|B00DPXV33M|     950721730|Playstation 3 Bla...|     Video Games|     5|            3|          6| -1.5499941|
|         US|     798134| RYJH4YCB8CAF7|B0050SXKU4|     277576962|  Grand Theft Auto V|     Video Ga

In [50]:
eval = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol='prediction')

The root mean squared error of the model. This is absolutely awful. Maybe we need to set a threshold for the amount of ratings a user has to have.

In [51]:
rmse = eval.evaluate(pred)
print(rmse)

4.578921959143033


In [31]:
user_1 = test.filter(test['customer_id'] == 10206).select(['product_parent', 'customer_id'])

In [33]:
user_1.show()

+-----------+-----------+-------------+----------+--------------+--------------------+----------------+------+-------------+-----------+
|marketplace|customer_id|    review_id|product_id|product_parent|       product_title|product_category|rating|helpful_votes|total_votes|
+-----------+-----------+-------------+----------+--------------+--------------------+----------------+------+-------------+-----------+
|         US|      10206|R1ALPEI2SO0D1|B00HNYWFMC|     805852390| Far Cry Compliation|     Video Games|     5|            0|          0|
|         US|      10206|RLQIPPSIW9KQ0|B0053WVBSA|     603885070|Happy Feet Two: T...|     Video Games|     3|            0|          1|
+-----------+-----------+-------------+----------+--------------+--------------------+----------------+------+-------------+-----------+



In [34]:
rec=model.transform(user_1)

In [35]:
rec.orderBy('prediction', ascending=False).show()

+-----------+-----------+-------------+----------+--------------+--------------------+----------------+------+-------------+-----------+----------+
|marketplace|customer_id|    review_id|product_id|product_parent|       product_title|product_category|rating|helpful_votes|total_votes|prediction|
+-----------+-----------+-------------+----------+--------------+--------------------+----------------+------+-------------+-----------+----------+
|         US|      10206|RLQIPPSIW9KQ0|B0053WVBSA|     603885070|Happy Feet Two: T...|     Video Games|     3|            0|          1|-0.5765556|
|         US|      10206|R1ALPEI2SO0D1|B00HNYWFMC|     805852390| Far Cry Compliation|     Video Games|     5|            0|          0| -0.988089|
+-----------+-----------+-------------+----------+--------------+--------------------+----------------+------+-------------+-----------+----------+



From here down is actually making a recommendation. 

It's taking 3 users and recommending 3 products each to them.

In [40]:
users = df.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 3)

In [41]:
userSubsetRecs.show()

+-----------+--------------------+
|customer_id|     recommendations|
+-----------+--------------------+
|   12134452|[{40467685, 2.801...|
|   12182893|[{843965595, 5.08...|
|   12226553|[{92090520, 4.945...|
+-----------+--------------------+

