In [60]:
# Spark init
!wget -q https://dlcdn.apache.org/spark/spark-3.3.2/spark-3.3.2-bin-hadoop2.tgz
!tar xf spark-3.3.2-bin-hadoop2.tgz
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/default-java"
os.environ["SPARK_HOME"] = "/content/spark-3.3.2-bin-hadoop2"
!pip install -q findspark
import findspark
findspark.init()

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [61]:
!wget --no-check-certificate 'https://drive.google.com/u/0/uc?id=1TIHlxcb61tDvXg9mtOaypQIPCTeFW0dH' -O '/content/drive/MyDrive/data/rating-Alaska.csv.gz'
!gunzip -d '/content/drive/MyDrive/data/rating-Alaska.csv.gz'

--2023-03-10 01:27:23--  https://drive.google.com/u/0/uc?id=1TIHlxcb61tDvXg9mtOaypQIPCTeFW0dH
Resolving drive.google.com (drive.google.com)... 142.251.163.139, 142.251.163.113, 142.251.163.138, ...
Connecting to drive.google.com (drive.google.com)|142.251.163.139|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://drive.google.com/uc?id=1TIHlxcb61tDvXg9mtOaypQIPCTeFW0dH [following]
--2023-03-10 01:27:23--  https://drive.google.com/uc?id=1TIHlxcb61tDvXg9mtOaypQIPCTeFW0dH
Reusing existing connection to drive.google.com:443.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-0o-14-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/d9dsfvq7e401viv3i4d0knfqet0shlba/1678411575000/10931378669845637259/*/1TIHlxcb61tDvXg9mtOaypQIPCTeFW0dH?uuid=6241d265-1802-4e19-bc2b-bab9f6bf6e33 [following]
--2023-03-10 01:27:26--  https://doc-0o-14-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/d9dsfv

In [62]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession

spark = SparkSession.builder\
        .master("local")\
        .appName("Colab")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

In [63]:
df = spark.read.csv("/content/drive/MyDrive/data/rating-Alaska.csv",header=True)
df.show()

+--------------------+--------------------+------+-------------+
|            business|                user|rating|    timestamp|
+--------------------+--------------------+------+-------------+
|0x56b646ed2220b77...|10912980484268620...|     5|1566331951619|
|0x56b646ed2220b77...|10823390834518466...|     5|1503373018846|
|0x56b646ed2220b77...|11271909818428319...|     5|1410062370985|
|0x56b646ed2220b77...|11142389119045307...|     5|1495241580499|
|0x56b646ed2220b77...|11324092640575896...|     5|1504917982385|
|0x56b646ed2220b77...|11304483789114125...|     4|1474765901185|
|0x56b646ed2220b77...|10588996714814731...|     5|1499178889954|
|0x56b646ed2220b77...|11032915514759203...|     4|1472858535682|
|0x56b646ed2220b77...|10898963490860201...|     5|1529649811341|
|0x56b646ed2220b77...|11596257491256948...|     1|1565638584279|
|0x56b646ed2220b77...|10777630865613708...|     5|1461267716241|
|0x56b646ed2220b77...|10500297978546557...|     5|1529681946388|
|0x56b646ed2220b77...|106

In [19]:
!pip install pandas matplotlib seaborn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [67]:
#Find the number of 5-rating users
num_5_star_reviews = len(df[df['rating'] == 5])

# Print the results
print("A string that includes the number of reviews receiving the rating of '5' is:", num_5_star_reviews)

A string that includes the number of reviews receiving the rating of '5' is: 658262


In [68]:
# Find the number of unique users
num_unique_users = len(df['user'].unique())

# Print the results
print("The number of unique users is:", num_unique_users)

The number of unique users is: 278695


In [96]:
# Import the required functions
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer

index = StringIndexer(inputCol="user", outputCol="UserIndex")

index_model=index.fit(df)

df_index=index_model.transform(df)

In [97]:
index_business = StringIndexer(inputCol="business", outputCol="BusinessIndex")

index_model_business=index_business.fit(df)

df_index=index_model_business.transform(df_index)

In [98]:
rating_df = df_index
rating_df = rating_df.withColumn("user", col("user").cast("integer"))
rating_df.printSchema()

root
 |-- business: string (nullable = true)
 |-- user: integer (nullable = true)
 |-- rating: long (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- UserIndex: double (nullable = false)
 |-- BusinessIndex: double (nullable = false)



In [101]:
# Split the data into training,test sets
(training_df, test_df) = rating_df.randomSplit([0.8, 0.2])

# Train the ALS model
df_als = ALS(maxIter=5, regParam=0.01, userCol="UserIndex", itemCol="BusinessIndex", 
    ratingCol="rating", coldStartStrategy="drop")

In [102]:
model = df_als.fit(training_df)

In [104]:
# Evaluate the model on the test set
predictions = model.transform(test_df)
df_eval = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")

In [106]:
root_mean = df_eval.evaluate(predictions)
num_of_users = training_df.select("UserIndex").distinct().count()
num_of_business = training_df.select("BusinessIndex").distinct().count()

In [109]:
print("The total number of users of the training data = " + str(num_of_users))
print("The total number of businesses of the training data = " + str(num_of_business))
print("Root-mean-square error of the ALS model = " + str(root_mean))

The total number of users of the training data = 243825
The total number of businesses of the training data = 12525
Root-mean-square error of the ALS model = 3.1186199541485617


In [108]:
users = rating_df.select(df_als.getUserCol()).distinct().limit(10)
user_recommends = model.recommendForUserSubset(users, 5)
user_recommends.show(truncate=False)

+---------+-----------------------------------------------------------------------------------------------+
|UserIndex|recommendations                                                                                |
+---------+-----------------------------------------------------------------------------------------------+
|130842   |[{7112, 5.208185}, {9082, 4.999231}, {10906, 4.573681}, {8873, 4.3016367}, {5765, 4.142447}]   |
|113493   |[{6317, 15.026649}, {9033, 13.70389}, {9194, 13.49011}, {4584, 13.316617}, {5305, 12.84112}]   |
|17633    |[{7854, 36.970997}, {7257, 36.962933}, {8410, 36.224564}, {7673, 34.626022}, {8003, 34.27674}] |
|255693   |[{5434, 4.9994793}, {9327, 4.8389263}, {10005, 3.9507933}, {8099, 3.8697686}, {8713, 3.778717}]|
|205965   |[{8277, 4.999713}, {5712, 3.3875847}, {9449, 3.1230187}, {7734, 2.871557}, {6134, 2.7197313}]  |
|223675   |[{6841, 4.999462}, {10888, 4.3699794}, {5932, 3.6705213}, {6362, 3.6574922}, {8325, 3.6037424}]|
|44506    |[{7854, 29.55307}