In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import row_number
from pyspark.sql.window import Window
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
 
# Create the Spark session for this notebook (or reuse one that is already
# running in this process) under the application name 'yelp'.
spark = (
    SparkSession.builder
    .appName('yelp')
    .getOrCreate()
)

In [2]:
# Load the Yelp review dump into a DataFrame. The path is relative to the
# notebook's working directory; the schema is inferred by the JSON reader.
df = spark.read.json('data/yelp_academic_dataset_review.json')

In [3]:
# Build one single-column lookup table per key: the distinct user ids and the
# distinct business ids that appear in the reviews.
uuid, buid = (
    df.select(key_col).distinct()
    for key_col in ('user_id', 'business_id')
)

In [4]:
# Give every distinct string key a consecutive integer id, because the ALS
# model configured later in this notebook uses 'uuid' / 'buid' as its
# user / item columns and those must be numeric.
#
# NOTE(review): the original cell was garbled into a single, syntactically
# invalid line. Both statements are reconstructed from the schema shown after
# the joins (`uuid: int, buid: int`) -- confirm against the original notebook.
#
# A Window with orderBy but no partitionBy moves all rows into one partition.
# That is tolerable for these distinct-key tables, but it is the slow step here.
uuid = uuid.withColumn('uuid', row_number().over(Window.orderBy('user_id')))
buid = buid.withColumn('buid', row_number().over(Window.orderBy('business_id')))

In [5]:
# Attach the integer ids to every review: first the user id table on
# 'user_id', then the business id table on 'business_id'.
df = (
    df
    .join(uuid, on='user_id')
    .join(buid, on='business_id')
)

In [6]:
# Bare expression: display the joined DataFrame's column/type summary to check
# that the integer `uuid` and `buid` columns are present after the joins.
df

DataFrame[business_id: string, user_id: string, cool: bigint, date: string, funny: bigint, review_id: string, stars: double, text: string, useful: bigint, uuid: int, buid: int]

In [None]:
# Randomly split the reviews 80% / 20% into training and evaluation sets.
# A fixed seed is passed so the split (and therefore the reported RMSE) does
# not change from one run of the notebook to the next.
train, test = df.randomSplit([0.8, 0.2], seed=42)

In [8]:
# Collaborative-filtering estimator: integer user ids in 'uuid', integer item
# ids in 'buid', explicit ratings in 'stars'.
# coldStartStrategy='drop' is set so that rows which cannot be scored do not
# end up as NaN predictions in the evaluation metric.
# An explicit seed is supplied so the factor initialisation is repeatable.
# NOTE(review): the RMSE recorded in this notebook (~3.8) is very high for a
# star rating -- presumably a 1-5 scale, TODO confirm. Consider tuning
# regParam / rank / maxIter.
als = ALS(
    userCol='uuid',
    itemCol='buid',
    ratingCol='stars',
    coldStartStrategy='drop',
    maxIter=5,
    regParam=0.01,
    seed=42,
)

# RMSE between the model's 'prediction' column and the true 'stars' label.
evaluator = RegressionEvaluator(
    metricName='rmse',
    predictionCol='prediction',
    labelCol='stars',
)

In [9]:
# Fit the ALS estimator on the training split, then score the held-out split.
# `predictions` is `test` as transformed by the fitted model; the evaluator
# configured earlier reads its 'prediction' column.
model = als.fit(train)
predictions = model.transform(test)

In [10]:
# Evaluate the held-out predictions with the RMSE evaluator
# ('prediction' vs. 'stars').
rmse = evaluator.evaluate(predictions)

In [11]:
# Report the test-set RMSE computed in the previous cell.
print(f"Root Mean Squared Error: {rmse}")

Root Mean Squared Error: 3.7951497787258592


In [None]:
# Ask the fitted model for 10 recommended items per user.
# NOTE(review): the result is only assigned, never displayed or saved in the
# visible cells -- presumably inspected later; confirm.
recom = model.recommendForAllUsers(10)