# PROJECT 2 : RECOMMENDER SYSTEM (COLLABORATIVE FILTERING - PYSPARK)

## Import libraries, setup enviroment

In [1]:
!apt update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.3.0/spark-3.3.0-bin-hadoop3.tgz
!tar -xvf spark-3.3.0-bin-hadoop3.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.3.0-bin-hadoop3"

[33m0% [Working][0m            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [70.9 kB]
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:8 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,381 kB]
Get:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:10 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,686 kB]
Hit:11 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:12 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [3,092 kB]
Hit

In [2]:
import findspark
findspark.init()

In [4]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [5]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('collaborative_filtering_pyspark').getOrCreate()

In [6]:
from pyspark.sql import SQLContext
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import *

## EDA

In [7]:
data=spark.read.csv('/content/gdrive/MyDrive/DL07_K302_NguyenNhatToTran_NguyenVuMaiPhuong/PROJECT 2/cung_cap_HV/Products_ThoiTrangNam_rating_raw.csv', inferSchema=True, header=True, sep='\t')
data.show(5)

+----------+-------+------------------+------+
|product_id|user_id|              user|rating|
+----------+-------+------------------+------+
|       190|      1|      karmakyun2nd|     5|
|       190|      2|  tranquangvinh_vv|     5|
|       190|      3|nguyenquoctoan2005|     5|
|       190|      4|    nguyenthuyhavi|     5|
|       190|      5|      luonganh5595|     5|
+----------+-------+------------------+------+
only showing top 5 rows



In [8]:
data.printSchema()

root
 |-- product_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user: string (nullable = true)
 |-- rating: integer (nullable = true)



In [9]:
data.count()

1024482

In [10]:
data_sub=data.select(['product_id','user_id','rating'])
data_sub.show(5)

+----------+-------+------+
|product_id|user_id|rating|
+----------+-------+------+
|       190|      1|     5|
|       190|      2|     5|
|       190|      3|     5|
|       190|      4|     5|
|       190|      5|     5|
+----------+-------+------+
only showing top 5 rows



In [11]:
data_sub.select([count(when(isnull(c), c)).alias(c) for c in data_sub.columns]).toPandas().T

Unnamed: 0,0
product_id,0
user_id,0
rating,0


In [12]:
data_sub.select([count(when(isnan(c), c)).alias(c) for c in data_sub.columns]).toPandas().T

Unnamed: 0,0
product_id,0
user_id,0
rating,0


In [13]:
data_sub=data_sub.dropDuplicates()
data_sub.count()

999815

- Loại bỏ 2.4% dữ liệu trùng

In [14]:
# Distinct users and products
users = data_sub.select("user_id").distinct().count()
products = data_sub.select("product_id").distinct().count()
numerator = data_sub.count()
display(numerator, users, products)

999815

650636

31267

In [15]:
# Number of ratings matrix could contain if no empty cells
denominator = users * products
denominator

20343435812

In [16]:
#Calculating sparsity
sparsity = 1 - (numerator*1.0 / denominator)
print ("Sparsity: "), sparsity

Sparsity: 


(None, 0.9999508531887514)

## BUILD MODEL ALS

In [17]:
train, test = data_sub.randomSplit([0.8, 0.2])

In [None]:
als = ALS(maxIter=10,
          regParam=0.1,
          rank = 15,
          userCol="user_id",
          itemCol="product_id",
          ratingCol="rating",
          coldStartStrategy="drop",
          nonnegative=True)
model = als.fit(train)

In [None]:
predictions = model.transform(test)
predictions.show(5)

+----------+-------+------+----------+
|product_id|user_id|rating|prediction|
+----------+-------+------+----------+
|     18852|     13|     5| 4.5961437|
|       143|     31|     2| 3.7201061|
|       207|     31|     5| 2.5797834|
|       254|     31|     4| 2.9208674|
|      1367|     31|     5| 4.0626254|
+----------+-------+------+----------+
only showing top 5 rows



In [None]:
evaluator=RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')
rmse=evaluator.evaluate(predictions)
rmse

1.222326248523703

### Tunning parameter

In [None]:
for regParam in [0.01, 0.1, 1]:
    for rank in [10, 20, 30, 40]:
        als = ALS(maxIter=10,
          regParam=regParam,
          rank = rank,
          userCol="user_id",
          itemCol="product_id",
          ratingCol="rating",
          coldStartStrategy="drop",
          nonnegative=True)

        model = als.fit(train)

        predictions=model.transform(test)

        evaluator=RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')
        rmse=evaluator.evaluate(predictions)
        print('With regParam =', regParam, ', rank =', rank, ': RSME =', rmse)

With regParam = 0.01 , rank = 10 : RSME = 1.740031765175746
With regParam = 0.01 , rank = 20 : RSME = 1.5625361551509533
With regParam = 0.01 , rank = 30 : RSME = 1.4411923672958247
With regParam = 0.01 , rank = 40 : RSME = 1.346831082140375
With regParam = 0.1 , rank = 10 : RSME = 1.2354277507801557
With regParam = 0.1 , rank = 20 : RSME = 1.2176063769364174
With regParam = 0.1 , rank = 30 : RSME = 1.213568245355742
With regParam = 0.1 , rank = 40 : RSME = 1.1999399671776902
With regParam = 1 , rank = 10 : RSME = 1.4009550752293
With regParam = 1 , rank = 20 : RSME = 1.402385256245753
With regParam = 1 , rank = 30 : RSME = 1.4037030566660966
With regParam = 1 , rank = 40 : RSME = 1.4042780710765987


In [None]:
for maxIter in [15, 20, 25]:
  for rank in [20, 25, 30]:
    als = ALS(maxIter=10,
            regParam=0.1,
            rank = rank,
            userCol="user_id",
            itemCol="product_id",
            ratingCol="rating",
            coldStartStrategy="drop",
            nonnegative=True)

    model = als.fit(train)

    predictions=model.transform(test)

    evaluator=RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')
    rmse=evaluator.evaluate(predictions)
    print('With maxIter =', maxIter, ', regParam =', 0.1, ', rank =', rank, ': RSME =', rmse)

With maxIter = 15 , regParam = 0.1 , rank = 20 : RSME = 1.2176063769364174
With maxIter = 15 , regParam = 0.1 , rank = 25 : RSME = 1.2125209373708428
With maxIter = 15 , regParam = 0.1 , rank = 30 : RSME = 1.213568245355742
With maxIter = 20 , regParam = 0.1 , rank = 20 : RSME = 1.2176063769364174
With maxIter = 20 , regParam = 0.1 , rank = 25 : RSME = 1.2125209373708428
With maxIter = 20 , regParam = 0.1 , rank = 30 : RSME = 1.213568245355742
With maxIter = 25 , regParam = 0.1 , rank = 20 : RSME = 1.2176063769364174
With maxIter = 25 , regParam = 0.1 , rank = 25 : RSME = 1.2125209373708428
With maxIter = 25 , regParam = 0.1 , rank = 30 : RSME = 1.213568245355742


In [18]:
als = ALS(maxIter=15,
          regParam=0.1,
          rank = 20,
          userCol="user_id",
          itemCol="product_id",
          ratingCol="rating",
          coldStartStrategy="drop",
          nonnegative=True)

model = als.fit(train)

predictions=model.transform(test)

evaluator=RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')
rmse=evaluator.evaluate(predictions)
print('With regParam =', 0.1, ', rank =', 20, ', maxIter =', 15 , ': RSME =', rmse)

With regParam = 0.1 , rank = 20 , maxIter = 15 : RSME = 1.1617218779544844


### Make recommendations

In [19]:
user_recs = model.recommendForAllUsers(10)

In [None]:
user_recs.show(10, False)

+-------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user_id|recommendations                                                                                                                                                                                             |
+-------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|26     |[{233550, 5.725358}, {173568, 5.643593}, {233297, 5.568351}, {11755, 5.5326004}, {234487, 5.5163403}, {12382, 5.4999223}, {12677, 5.4939094}, {172416, 5.4856267}, {172697, 5.478857}, {171242, 5.444896}]  |
|27     |[{233550, 5.881158}, {173568, 5.8217535}, {11755, 5.7310367}, {233297, 5.724292}, {12677, 5.708291}, {234487, 5.7076883}, {172416, 

In [None]:
for user in user_recs.head(10):
    print(user)

Row(user_id=26, recommendations=[Row(product_id=233550, rating=5.725358009338379), Row(product_id=173568, rating=5.643592834472656), Row(product_id=233297, rating=5.568350791931152), Row(product_id=11755, rating=5.532600402832031), Row(product_id=234487, rating=5.516340255737305), Row(product_id=12382, rating=5.499922275543213), Row(product_id=12677, rating=5.4939093589782715), Row(product_id=172416, rating=5.485626697540283), Row(product_id=172697, rating=5.478857040405273), Row(product_id=171242, rating=5.444896221160889)])
Row(user_id=27, recommendations=[Row(product_id=233550, rating=5.881157875061035), Row(product_id=173568, rating=5.82175350189209), Row(product_id=11755, rating=5.73103666305542), Row(product_id=233297, rating=5.724291801452637), Row(product_id=12677, rating=5.708291053771973), Row(product_id=234487, rating=5.707688331604004), Row(product_id=172416, rating=5.698760032653809), Row(product_id=12382, rating=5.649702072143555), Row(product_id=172697, rating=5.62389707

In [20]:
# Recommend for users
for user_id in [5, 10, 15]:
    find_user_rec = user_recs.filter(user_recs['user_id'] == user_id)
    user = find_user_rec.first()
    dic_user_rec = {'user_id' : user.user_id, 'recommendations' : user.recommendations}
    print(dic_user_rec)

{'user_id': 5, 'recommendations': [Row(product_id=20754, rating=6.339175701141357), Row(product_id=111938, rating=6.224900722503662), Row(product_id=173007, rating=6.217190742492676), Row(product_id=251318, rating=6.205512046813965), Row(product_id=232867, rating=6.131865978240967), Row(product_id=10166, rating=6.08130407333374), Row(product_id=163486, rating=6.068048477172852), Row(product_id=151339, rating=6.060095310211182), Row(product_id=233529, rating=6.059097766876221), Row(product_id=19176, rating=6.046304702758789)]}
{'user_id': 10, 'recommendations': [Row(product_id=151483, rating=6.34671688079834), Row(product_id=16700, rating=6.346645355224609), Row(product_id=211407, rating=6.314743995666504), Row(product_id=251318, rating=6.285700798034668), Row(product_id=231098, rating=6.260807514190674), Row(product_id=233948, rating=6.247946739196777), Row(product_id=232611, rating=6.164119720458984), Row(product_id=241187, rating=6.136471271514893), Row(product_id=233550, rating=6.12